mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-02-03 13:00:37 +01:00
radv,aco: Use function call structure for RT programs
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29580>
This commit is contained in:
parent
c5d796c902
commit
0a1911b220
17 changed files with 706 additions and 136 deletions
|
|
@ -188,15 +188,12 @@ struct ac_shader_args {
|
|||
|
||||
/* RT */
|
||||
struct {
|
||||
struct ac_arg uniform_shader_addr;
|
||||
struct ac_arg sbt_descriptors;
|
||||
struct ac_arg launch_sizes[3];
|
||||
struct ac_arg launch_size_addr;
|
||||
struct ac_arg launch_ids[3];
|
||||
struct ac_arg dynamic_callable_stack_base;
|
||||
struct ac_arg traversal_shader_addr;
|
||||
struct ac_arg shader_addr;
|
||||
struct ac_arg shader_record;
|
||||
struct ac_arg payload_offset;
|
||||
} rt;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1837,7 +1837,8 @@ emit_program(Program* program, std::vector<uint32_t>& code, std::vector<struct a
|
|||
(uint32_t*)(program->constant_data.data() + program->constant_data.size()));
|
||||
|
||||
program->config->scratch_bytes_per_wave =
|
||||
align(program->config->scratch_bytes_per_wave, program->dev.scratch_alloc_granule);
|
||||
align(program->config->scratch_bytes_per_wave + program->scratch_arg_size,
|
||||
program->dev.scratch_alloc_granule);
|
||||
program->config->wgp_mode = program->wgp_mode;
|
||||
|
||||
return exec_size;
|
||||
|
|
|
|||
|
|
@ -575,6 +575,10 @@ kill(wait_imm& imm, depctr_wait& depctr, Instruction* instr, wait_ctx& ctx,
|
|||
*/
|
||||
force_waitcnt(ctx, imm);
|
||||
}
|
||||
if (instr->opcode == aco_opcode::s_swappc_b64) {
|
||||
u_foreach_bit (i, ctx.nonzero & ~counter_vs)
|
||||
imm[i] = 0;
|
||||
}
|
||||
|
||||
check_instr(ctx, imm, instr);
|
||||
|
||||
|
|
|
|||
|
|
@ -273,8 +273,8 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_
|
|||
void
|
||||
aco_compile_rt_prolog(const struct aco_compiler_options* options,
|
||||
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
|
||||
const struct ac_shader_args* out_args, aco_callback* build_prolog,
|
||||
void** binary)
|
||||
const struct ac_arg* descriptors, unsigned raygen_param_count,
|
||||
nir_parameter* raygen_params, aco_callback* build_prolog, void** binary)
|
||||
{
|
||||
init();
|
||||
|
||||
|
|
@ -285,7 +285,8 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options,
|
|||
program->debug.func = NULL;
|
||||
program->debug.private_data = NULL;
|
||||
|
||||
select_rt_prolog(program.get(), &config, options, info, in_args, out_args);
|
||||
select_rt_prolog(program.get(), &config, options, info, in_args, descriptors, raygen_param_count,
|
||||
raygen_params);
|
||||
validate(program.get());
|
||||
insert_waitcnt(program.get());
|
||||
insert_NOPs(program.get());
|
||||
|
|
|
|||
|
|
@ -18,6 +18,8 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct nir_parameter;
|
||||
typedef struct nir_parameter nir_parameter;
|
||||
struct ac_shader_config;
|
||||
struct aco_shader_info;
|
||||
struct aco_vs_prolog_info;
|
||||
|
|
@ -42,8 +44,8 @@ void aco_compile_shader(const struct aco_compiler_options* options,
|
|||
|
||||
void aco_compile_rt_prolog(const struct aco_compiler_options* options,
|
||||
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
|
||||
const struct ac_shader_args* out_args, aco_callback* build_prolog,
|
||||
void** binary);
|
||||
const struct ac_arg* descriptors, unsigned raygen_param_count,
|
||||
nir_parameter* raygen_params, aco_callback* build_prolog, void** binary);
|
||||
|
||||
void aco_compile_vs_prolog(const struct aco_compiler_options* options,
|
||||
const struct aco_shader_info* info,
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@
|
|||
#include <vector>
|
||||
|
||||
typedef struct nir_shader nir_shader;
|
||||
typedef struct nir_parameter nir_parameter;
|
||||
|
||||
namespace aco {
|
||||
|
||||
|
|
@ -2337,6 +2338,7 @@ public:
|
|||
bool has_call = false;
|
||||
ABI callee_abi = {};
|
||||
RegisterDemand callee_param_demand = RegisterDemand();
|
||||
unsigned scratch_arg_size = 0;
|
||||
|
||||
struct {
|
||||
monotonic_buffer_resource memory;
|
||||
|
|
@ -2409,7 +2411,8 @@ void select_trap_handler_shader(Program* program, ac_shader_config* config,
|
|||
void select_rt_prolog(Program* program, ac_shader_config* config,
|
||||
const struct aco_compiler_options* options,
|
||||
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
|
||||
const struct ac_shader_args* out_args);
|
||||
const struct ac_arg* descriptors, unsigned raygen_param_count,
|
||||
nir_parameter* raygen_params);
|
||||
void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo,
|
||||
ac_shader_config* config, const struct aco_compiler_options* options,
|
||||
const struct aco_shader_info* info, const struct ac_shader_args* args);
|
||||
|
|
|
|||
|
|
@ -438,7 +438,10 @@ validate_ir(Program* program)
|
|||
((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
|
||||
(instr->isScratch() && i == 0) || (instr->isDS() && i == 0) ||
|
||||
(instr->opcode == aco_opcode::p_init_scratch && i == 0) ||
|
||||
(instr_disables_wqm(instr.get()) && i + 2 >= instr->operands.size());
|
||||
(instr_disables_wqm(instr.get()) && i + 2 >= instr->operands.size()) ||
|
||||
((instr->opcode == aco_opcode::p_return ||
|
||||
instr->opcode == aco_opcode::p_reload_preserved) &&
|
||||
i == 0);
|
||||
check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
|
||||
} else {
|
||||
check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||
|
||||
|
|
|
|||
|
|
@ -285,12 +285,18 @@ void create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_
|
|||
const struct aco_export_mrt* mrt1);
|
||||
Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset);
|
||||
void build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs);
|
||||
Instruction* add_startpgm(struct isel_context* ctx);
|
||||
Instruction* add_startpgm(struct isel_context* ctx, bool is_callee = false);
|
||||
void finish_program(isel_context* ctx);
|
||||
|
||||
struct callee_info get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count,
|
||||
const nir_parameter* parameters, Program* program,
|
||||
RegisterDemand reg_limit);
|
||||
void load_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param,
|
||||
Temp stack_ptr, unsigned scratch_param_size, Temp dst);
|
||||
void store_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param,
|
||||
Temp stack_ptr, unsigned scratch_param_size, Temp data);
|
||||
|
||||
void emit_reload_preserved(isel_context* ctx);
|
||||
|
||||
#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
|
||||
|
||||
|
|
|
|||
|
|
@ -45,14 +45,8 @@ append_logical_end(isel_context* ctx, bool append_reload_preserved)
|
|||
{
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
|
||||
if (append_reload_preserved && ctx->program->is_callee && ctx->block->loop_nest_depth == 0) {
|
||||
Operand stack_ptr_op;
|
||||
if (ctx->program->gfx_level >= GFX9)
|
||||
stack_ptr_op = Operand(ctx->callee_info.stack_ptr.def.getTemp());
|
||||
else
|
||||
stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
|
||||
bld.pseudo(aco_opcode::p_reload_preserved, bld.def(bld.lm), bld.def(s1, scc), stack_ptr_op);
|
||||
}
|
||||
if (append_reload_preserved && ctx->program->is_callee && ctx->block->loop_nest_depth == 0)
|
||||
emit_reload_preserved(ctx);
|
||||
|
||||
bld.pseudo(aco_opcode::p_logical_end);
|
||||
}
|
||||
|
|
@ -676,8 +670,10 @@ build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
|
|||
}
|
||||
|
||||
Instruction*
|
||||
add_startpgm(struct isel_context* ctx)
|
||||
add_startpgm(struct isel_context* ctx, bool is_callee)
|
||||
{
|
||||
ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size;
|
||||
|
||||
unsigned def_count = 0;
|
||||
for (unsigned i = 0; i < ctx->args->arg_count; i++) {
|
||||
if (ctx->args->args[i].skip)
|
||||
|
|
@ -689,6 +685,15 @@ add_startpgm(struct isel_context* ctx)
|
|||
def_count++;
|
||||
}
|
||||
|
||||
if (is_callee) {
|
||||
/* We do not support shader args in callees. */
|
||||
assert(def_count == 0);
|
||||
def_count += ctx->callee_info.reg_param_count;
|
||||
/* Add system parameters separately - they aren't counted by reg_param_count */
|
||||
assert(ctx->callee_info.stack_ptr.is_reg && ctx->callee_info.return_address.is_reg);
|
||||
def_count += 2;
|
||||
}
|
||||
|
||||
Instruction* startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
|
||||
ctx->block->instructions.emplace_back(startpgm);
|
||||
for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
|
||||
|
|
@ -721,6 +726,22 @@ add_startpgm(struct isel_context* ctx)
|
|||
}
|
||||
}
|
||||
|
||||
if (is_callee) {
|
||||
unsigned def_idx = 0;
|
||||
if (ctx->program->gfx_level >= GFX9)
|
||||
ctx->program->stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
|
||||
else
|
||||
ctx->program->static_scratch_rsrc = ctx->callee_info.stack_ptr.def.getTemp();
|
||||
startpgm->definitions[def_idx++] = ctx->callee_info.stack_ptr.def;
|
||||
startpgm->definitions[def_idx++] = ctx->callee_info.return_address.def;
|
||||
|
||||
for (auto& info : ctx->callee_info.param_infos) {
|
||||
if (!info.is_reg)
|
||||
continue;
|
||||
startpgm->definitions[def_idx++] = info.def;
|
||||
}
|
||||
}
|
||||
|
||||
/* epilog has no scratch */
|
||||
if (ctx->args->scratch_offset.used) {
|
||||
if (ctx->program->gfx_level < GFX9) {
|
||||
|
|
@ -1074,6 +1095,15 @@ get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count,
|
|||
|
||||
find_param_regs(program, abi, info, assignment_infos, reg_limit);
|
||||
|
||||
/* The call target parameters are special - they are marked as discardable to allow us
|
||||
* to overwrite the parameter values within each callee for the divergent dispatch logic.
|
||||
* However, we still need to explicitly write back the new values to the ABI-assigned registers
|
||||
* when jumping to the next divergent callee/returning. Therefore, mark them as needing explicit
|
||||
* preservation.
|
||||
*/
|
||||
info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC].needs_explicit_preservation = true;
|
||||
info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC].needs_explicit_preservation = true;
|
||||
|
||||
/* Explicitly preserve the stack pointer. spill_preserved() can ensure correctness on its own,
|
||||
* but it can only spill the initial stack pointer value to a linear VGPR, the inactive lanes of
|
||||
* which would in turn need to be spilled to scratch. Explicitly preserving the stack pointer's
|
||||
|
|
@ -1084,4 +1114,16 @@ get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count,
|
|||
return info;
|
||||
}
|
||||
|
||||
void
|
||||
emit_reload_preserved(isel_context* ctx)
|
||||
{
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
Operand stack_ptr_op;
|
||||
if (ctx->program->gfx_level >= GFX9)
|
||||
stack_ptr_op = Operand(ctx->program->stack_ptr);
|
||||
else
|
||||
stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
|
||||
bld.pseudo(aco_opcode::p_reload_preserved, bld.def(bld.lm), Operand(), stack_ptr_op);
|
||||
}
|
||||
|
||||
} // namespace aco
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include "aco_instruction_selection.h"
|
||||
#include "aco_interface.h"
|
||||
#include "aco_nir_call_attribs.h"
|
||||
|
||||
#include "nir_builder.h"
|
||||
#include "nir_control_flow.h"
|
||||
|
|
@ -238,8 +239,11 @@ setup_nir(isel_context* ctx, nir_shader* nir)
|
|||
nir_opt_dce(nir);
|
||||
}
|
||||
|
||||
nir_function_impl* func = nir_shader_get_entrypoint(nir);
|
||||
nir_index_ssa_defs(func);
|
||||
/* nir_shader_get_entrypoint returns NULL for RT shaders, but there should only be
|
||||
* one impl at this stage.
|
||||
*/
|
||||
nir_foreach_function_impl (func, nir)
|
||||
nir_index_ssa_defs(func);
|
||||
}
|
||||
|
||||
/* Returns true if we can skip uniformization of a merge phi. This makes the destination divergent,
|
||||
|
|
@ -348,6 +352,13 @@ void
|
|||
init_context(isel_context* ctx, nir_shader* shader)
|
||||
{
|
||||
nir_function_impl* impl = nir_shader_get_entrypoint(shader);
|
||||
if (!impl) {
|
||||
/* RT shaders have no NIR entrypoint, but only one function impl exists at this stage */
|
||||
nir_foreach_function_impl (func, shader) {
|
||||
impl = func;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ctx->shader = shader;
|
||||
|
||||
assert(shader->info.max_subgroup_size >= ctx->program->wave_size);
|
||||
|
|
@ -613,7 +624,17 @@ init_context(isel_context* ctx, nir_shader* shader)
|
|||
case nir_intrinsic_ddx_fine:
|
||||
case nir_intrinsic_ddy_fine:
|
||||
case nir_intrinsic_ddx_coarse:
|
||||
case nir_intrinsic_ddy_coarse: type = RegType::vgpr; break;
|
||||
case nir_intrinsic_ddy_coarse:
|
||||
case nir_intrinsic_load_return_param_amd: {
|
||||
type = RegType::vgpr;
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_param: {
|
||||
nir_parameter* param =
|
||||
&impl->function->params[nir_intrinsic_param_idx(intrinsic)];
|
||||
type = param->is_uniform ? RegType::sgpr : RegType::vgpr;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
|
||||
i++) {
|
||||
|
|
@ -773,8 +794,17 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
|
|||
assert(ctx.program->config->lds_size <= ctx.program->dev.lds_limit);
|
||||
|
||||
unsigned nir_num_blocks = 0;
|
||||
for (unsigned i = 0; i < shader_count; i++)
|
||||
nir_num_blocks += nir_shader_get_entrypoint(shaders[i])->num_blocks;
|
||||
for (unsigned i = 0; i < shader_count; i++) {
|
||||
nir_function_impl* entrypoint = nir_shader_get_entrypoint(shaders[i]);
|
||||
if (!entrypoint) {
|
||||
/* RT shaders have no NIR entrypoint, but only one function impl exists at this stage */
|
||||
nir_foreach_function_impl (func, shaders[i]) {
|
||||
entrypoint = func;
|
||||
break;
|
||||
}
|
||||
}
|
||||
nir_num_blocks += entrypoint->num_blocks;
|
||||
}
|
||||
ctx.program->blocks.reserve(nir_num_blocks * 2);
|
||||
ctx.block = ctx.program->create_and_insert_block();
|
||||
ctx.block->kind = block_kind_top_level;
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
#include "aco_builder.h"
|
||||
#include "aco_instruction_selection.h"
|
||||
#include "aco_ir.h"
|
||||
#include "aco_nir_call_attribs.h"
|
||||
|
||||
#include "amdgfxregs.h"
|
||||
#include <array>
|
||||
|
|
@ -788,6 +789,141 @@ visit_jump(isel_context* ctx, nir_jump_instr* instr)
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
visit_call(isel_context* ctx, nir_call_instr* instr)
|
||||
{
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
|
||||
ABI abi;
|
||||
/* TODO: callable abi? */
|
||||
switch (instr->callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) {
|
||||
case ACO_NIR_CALL_ABI_RT_RECURSIVE: abi = rtRaygenABI; break;
|
||||
case ACO_NIR_CALL_ABI_TRAVERSAL: abi = rtTraversalABI; break;
|
||||
case ACO_NIR_CALL_ABI_AHIT_ISEC: abi = rtAnyHitABI; break;
|
||||
default: UNREACHABLE("invalid abi");
|
||||
}
|
||||
|
||||
RegisterDemand limit = get_addr_regs_from_waves(ctx->program, ctx->program->min_waves);
|
||||
|
||||
struct callee_info info =
|
||||
get_callee_info(ctx->program->gfx_level, abi, instr->callee->num_params,
|
||||
instr->callee->params, nullptr, limit);
|
||||
std::vector<parameter_info> return_infos;
|
||||
|
||||
/* Before setting up the call itself, set up parameters stored in scratch memory.
|
||||
* The stack layout during a call looks something like this:
|
||||
* -------------------------------------------------------------------
|
||||
* | caller stack area | callee's scratch params | callee stack area
|
||||
* -------------------------------------------------------------------
|
||||
* ^ caller's stack ptr ^ callee's stack ptr
|
||||
*
|
||||
* Since we don't know how big our own stack area is yet (spilling and register preservation may
|
||||
* add to the stack size), we query the callee's stack pointer using p_callee_stack_ptr and use
|
||||
* negative offsets to index into the scratch parameter area (similar to how the callee will load
|
||||
* the parameters as well).
|
||||
*/
|
||||
|
||||
Temp stack_ptr, param_stack_ptr;
|
||||
if (info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9) {
|
||||
param_stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), bld.def(s1, scc),
|
||||
Operand::c32(info.scratch_param_size),
|
||||
Operand(ctx->callee_info.stack_ptr.def.getTemp()));
|
||||
stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
|
||||
} else {
|
||||
param_stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
|
||||
Operand::c32(info.scratch_param_size));
|
||||
stack_ptr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1), Operand::c32(0));
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < info.param_infos.size(); ++i) {
|
||||
if (info.param_infos[i].is_reg)
|
||||
continue;
|
||||
|
||||
store_scratch_param(ctx, bld, info.param_infos[i], param_stack_ptr, info.scratch_param_size,
|
||||
get_ssa_temp(ctx, instr->params[i].ssa));
|
||||
}
|
||||
|
||||
unsigned extra_def_count = 1;
|
||||
unsigned extra_param_count = 2;
|
||||
|
||||
unsigned param_size = info.scratch_param_size;
|
||||
if (ctx->program->gfx_level < GFX9)
|
||||
param_size *= ctx->program->wave_size;
|
||||
|
||||
assert(info.param_infos[0].is_reg);
|
||||
Instruction* call_instr = create_instruction(aco_opcode::p_call, Format::PSEUDO_CALL,
|
||||
info.reg_param_count + extra_param_count,
|
||||
info.reg_discardable_param_count + extra_def_count);
|
||||
call_instr->call().abi = abi;
|
||||
if (ctx->program->gfx_level >= GFX9) {
|
||||
call_instr->operands[0] = Operand(stack_ptr, info.stack_ptr.def.physReg());
|
||||
} else {
|
||||
call_instr->operands[0] = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
|
||||
call_instr->operands[0].setPrecolored(info.stack_ptr.def.physReg());
|
||||
}
|
||||
|
||||
call_instr->operands[1] = Operand::c32(param_size);
|
||||
call_instr->definitions[0] = Definition(bld.tmp(s2), info.return_address.def.physReg());
|
||||
|
||||
/* Set up parameters stored in registers. Every parameter corresponds to an operand,
|
||||
* and parameters that may have their value clobbered (i.e. discardable and return params)
|
||||
* also have a definition.
|
||||
*/
|
||||
unsigned reg_param_idx = 0;
|
||||
unsigned reg_discardable_param_idx = 0;
|
||||
for (unsigned i = 0; i < info.param_infos.size(); ++i) {
|
||||
if (!info.param_infos[i].is_reg) {
|
||||
/* While setting up parameters, also capture information about where return parameters
|
||||
* are stored, in order to reload them later.
|
||||
* Since return_infos stores return parameters contiguously, and return parameters in
|
||||
* scratch may be at any position in the parameter list, we need to add information about
|
||||
* returned scratch parameters in the same loop as returned parameters stored in registers.
|
||||
*/
|
||||
if (instr->callee->params[i].is_return) {
|
||||
parameter_info return_info = {};
|
||||
return_info.is_reg = false;
|
||||
return_info.scratch_offset = info.param_infos[i].scratch_offset;
|
||||
return_infos.emplace_back(return_info);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
Operand& op = call_instr->operands[reg_param_idx + extra_param_count];
|
||||
op.setPrecolored(info.param_infos[i].def.physReg());
|
||||
|
||||
if (instr->callee->params[i].is_uniform)
|
||||
op.setTemp(bld.as_uniform(get_ssa_temp(ctx, instr->params[i].ssa)));
|
||||
else
|
||||
op.setTemp(as_vgpr(ctx, get_ssa_temp(ctx, instr->params[i].ssa)));
|
||||
|
||||
if ((instr->callee->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE) ||
|
||||
instr->callee->params[i].is_return) {
|
||||
Definition def = bld.def(op.regClass(), op.physReg());
|
||||
call_instr->definitions[extra_def_count + reg_discardable_param_idx++] = def;
|
||||
if (instr->callee->params[i].is_return) {
|
||||
assert(!instr->callee->params[i].is_uniform);
|
||||
parameter_info return_info = {};
|
||||
return_info.is_reg = true;
|
||||
return_info.def = def;
|
||||
return_infos.emplace_back(return_info);
|
||||
}
|
||||
}
|
||||
|
||||
++reg_param_idx;
|
||||
}
|
||||
|
||||
ctx->block->instructions.emplace_back(static_cast<Instruction*>(call_instr));
|
||||
|
||||
ctx->call_infos.emplace_back(call_info{
|
||||
instr,
|
||||
call_instr,
|
||||
std::move(return_infos),
|
||||
info.scratch_param_size,
|
||||
});
|
||||
ctx->block->kind |= block_kind_contains_call;
|
||||
ctx->program->has_call = true;
|
||||
}
|
||||
|
||||
void
|
||||
visit_debug_info(isel_context* ctx, nir_instr_debug_info* instr_info)
|
||||
{
|
||||
|
|
@ -839,6 +975,7 @@ visit_block(isel_context* ctx, nir_block* block)
|
|||
case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
|
||||
case nir_instr_type_deref: break;
|
||||
case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
|
||||
case nir_instr_type_call: visit_call(ctx, nir_instr_as_call(instr)); break;
|
||||
default: isel_err(instr, "Unknown NIR instr type");
|
||||
}
|
||||
}
|
||||
|
|
@ -1152,32 +1289,52 @@ merged_wave_info_to_mask(isel_context* ctx, unsigned i)
|
|||
}
|
||||
|
||||
void
|
||||
insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
|
||||
insert_return(isel_context& ctx)
|
||||
{
|
||||
unsigned src_count = 0;
|
||||
for (unsigned i = 0; i < ctx.args->arg_count; i++)
|
||||
src_count += !!BITSET_TEST(ctx.output_args, i);
|
||||
assert(ctx.callee_info.stack_ptr.needs_explicit_preservation);
|
||||
assert(
|
||||
ctx.callee_info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC].needs_explicit_preservation);
|
||||
assert(
|
||||
ctx.callee_info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC].needs_explicit_preservation);
|
||||
|
||||
/* stack_ptr always needs to be explicitly preserved */
|
||||
unsigned preserved_param_count = 1;
|
||||
if (ctx.callee_info.return_address.needs_explicit_preservation)
|
||||
++preserved_param_count;
|
||||
for (auto param_info : ctx.callee_info.param_infos) {
|
||||
if (!param_info.is_reg || !param_info.needs_explicit_preservation)
|
||||
continue;
|
||||
++preserved_param_count;
|
||||
}
|
||||
unsigned src_count = preserved_param_count + 1;
|
||||
Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
|
||||
ctx.block->instructions.emplace_back(ret);
|
||||
|
||||
src_count = 0;
|
||||
for (unsigned i = 0; i < ctx.args->arg_count; i++) {
|
||||
if (!BITSET_TEST(ctx.output_args, i))
|
||||
unsigned def_idx = 0;
|
||||
ret->operands[def_idx++] = Operand();
|
||||
|
||||
Operand stack_op = Operand(ctx.callee_info.stack_ptr.def.getTemp());
|
||||
stack_op.setPrecolored(ctx.callee_info.stack_ptr.def.physReg());
|
||||
ret->operands[def_idx++] = stack_op;
|
||||
|
||||
for (unsigned i = 0; i < ctx.callee_info.param_infos.size(); ++i) {
|
||||
const auto& param_info = ctx.callee_info.param_infos[i];
|
||||
if (!param_info.is_reg || !param_info.needs_explicit_preservation)
|
||||
continue;
|
||||
|
||||
enum ac_arg_regfile file = ctx.args->args[i].file;
|
||||
unsigned size = ctx.args->args[i].size;
|
||||
unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
|
||||
RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
|
||||
Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
|
||||
: Operand(PhysReg{reg}, type);
|
||||
ret->operands[src_count] = op;
|
||||
src_count++;
|
||||
Temp param_temp = param_info.def.getTemp();
|
||||
if (i == ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC)
|
||||
param_temp = ctx.next_divergent_pc;
|
||||
else if (i == ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC)
|
||||
param_temp = ctx.next_pc;
|
||||
Operand op = Operand(param_temp);
|
||||
op.setPrecolored(param_info.def.physReg());
|
||||
ret->operands[def_idx++] = op;
|
||||
}
|
||||
if (ctx.callee_info.return_address.needs_explicit_preservation) {
|
||||
Operand op = Operand(ctx.callee_info.return_address.def.getTemp());
|
||||
op.setPrecolored(ctx.callee_info.return_address.def.physReg());
|
||||
ret->operands[def_idx++] = op;
|
||||
}
|
||||
|
||||
Builder bld(ctx.program, ctx.block);
|
||||
bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -1194,20 +1351,45 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
|
|||
init_context(&ctx, nir);
|
||||
setup_fp_mode(&ctx, nir);
|
||||
|
||||
Instruction* startpgm = add_startpgm(&ctx);
|
||||
nir_function_impl* impl = NULL;
|
||||
nir_foreach_function_impl (func, nir) {
|
||||
impl = func;
|
||||
break;
|
||||
}
|
||||
|
||||
ABI abi;
|
||||
/* TODO: callable abi? */
|
||||
switch (impl->function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) {
|
||||
case ACO_NIR_CALL_ABI_RT_RECURSIVE: abi = rtRaygenABI; break;
|
||||
case ACO_NIR_CALL_ABI_TRAVERSAL: abi = rtTraversalABI; break;
|
||||
case ACO_NIR_CALL_ABI_AHIT_ISEC: abi = rtAnyHitABI; break;
|
||||
default: UNREACHABLE("invalid abi");
|
||||
}
|
||||
|
||||
RegisterDemand limit = get_addr_regs_from_waves(ctx.program, ctx.program->min_waves);
|
||||
|
||||
ctx.callee_abi = abi;
|
||||
ctx.program->callee_abi = ctx.callee_abi;
|
||||
ctx.callee_info =
|
||||
get_callee_info(ctx.program->gfx_level, ctx.callee_abi, impl->function->num_params,
|
||||
impl->function->params, ctx.program, limit);
|
||||
ctx.program->is_callee = true;
|
||||
|
||||
Instruction* startpgm = add_startpgm(&ctx, true);
|
||||
|
||||
append_logical_start(ctx.block);
|
||||
split_arguments(&ctx, startpgm);
|
||||
visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
|
||||
append_logical_end(&ctx);
|
||||
visit_cf_list(&ctx, &impl->body);
|
||||
/* This block doesn't need a p_reload_preserved, we add it manually after p_return */
|
||||
append_logical_end(&ctx, false);
|
||||
ctx.block->kind |= block_kind_uniform;
|
||||
|
||||
/* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
|
||||
* shader without shader calls.
|
||||
*/
|
||||
if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
|
||||
insert_rt_jump_next(ctx, args);
|
||||
else
|
||||
if (ctx.next_pc != Temp()) {
|
||||
insert_return(ctx);
|
||||
Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc));
|
||||
} else {
|
||||
Builder(ctx.program, ctx.block).sopp(aco_opcode::s_endpgm);
|
||||
}
|
||||
|
||||
cleanup_context(&ctx);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1136,9 +1136,9 @@ get_buffer_store_op(unsigned bytes)
|
|||
}
|
||||
|
||||
void
|
||||
split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
|
||||
Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
|
||||
Temp* write_datas, unsigned* offsets)
|
||||
split_buffer_store(isel_context* ctx, unsigned align_mul, unsigned align_offset, bool smem,
|
||||
RegType dst_type, Temp data, unsigned writemask, int swizzle_element_size,
|
||||
unsigned* write_count, Temp* write_datas, unsigned* offsets)
|
||||
{
|
||||
unsigned write_count_with_skips = 0;
|
||||
bool skips[16];
|
||||
|
|
@ -1168,11 +1168,9 @@ split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, Reg
|
|||
byte = 8;
|
||||
|
||||
/* dword or larger stores have to be dword-aligned */
|
||||
unsigned align_mul = nir_intrinsic_align_mul(instr);
|
||||
unsigned align_offset = nir_intrinsic_align_offset(instr) + offset;
|
||||
bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
|
||||
bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
|
||||
if (!dword_aligned)
|
||||
byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
|
||||
byte = MIN2(byte, ((align_offset + offset) % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
|
||||
|
||||
bytes[write_count_with_skips] = byte;
|
||||
advance_write_mask(&todo, offset, byte);
|
||||
|
|
@ -2291,8 +2289,8 @@ visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
|
|||
unsigned write_count = 0;
|
||||
Temp write_datas[32];
|
||||
unsigned offsets[32];
|
||||
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, max_size, &write_count,
|
||||
write_datas, offsets);
|
||||
split_buffer_store(ctx, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), false,
|
||||
RegType::vgpr, data, writemask, max_size, &write_count, write_datas, offsets);
|
||||
|
||||
/* GFX6-7 are affected by a hw bug that prevents address clamping to work
|
||||
* correctly when the SGPR offset is used.
|
||||
|
|
@ -2457,8 +2455,8 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
|
|||
unsigned write_count = 0;
|
||||
Temp write_datas[32];
|
||||
unsigned offsets[32];
|
||||
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
|
||||
write_datas, offsets);
|
||||
split_buffer_store(ctx, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), false,
|
||||
RegType::vgpr, data, writemask, 16, &write_count, write_datas, offsets);
|
||||
|
||||
Temp addr, offset;
|
||||
uint32_t const_offset;
|
||||
|
|
@ -2830,7 +2828,8 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
|
|||
unsigned write_count = 0;
|
||||
Temp write_datas[32];
|
||||
unsigned offsets[32];
|
||||
split_buffer_store(ctx, intrin, false, RegType::vgpr, store_src, write_mask,
|
||||
split_buffer_store(ctx, nir_intrinsic_align_mul(intrin), nir_intrinsic_align_offset(intrin),
|
||||
false, RegType::vgpr, store_src, write_mask,
|
||||
swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
|
||||
write_datas, offsets);
|
||||
|
||||
|
|
@ -3339,8 +3338,9 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
|
|||
Temp write_datas[32];
|
||||
unsigned offsets[32];
|
||||
unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
|
||||
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
|
||||
&write_count, write_datas, offsets);
|
||||
split_buffer_store(ctx, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), false,
|
||||
RegType::vgpr, data, writemask, swizzle_component_size, &write_count,
|
||||
write_datas, offsets);
|
||||
|
||||
if (ctx->program->gfx_level >= GFX9) {
|
||||
uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
|
||||
|
|
@ -3890,6 +3890,106 @@ emit_ds_bvh_stack_push8_pop1_rtn(isel_context* ctx, nir_intrinsic_instr* instr,
|
|||
|
||||
} // namespace
|
||||
|
||||
void
|
||||
load_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr,
|
||||
unsigned scratch_param_size, Temp dst)
|
||||
{
|
||||
int32_t const_offset = param.scratch_offset - scratch_param_size;
|
||||
|
||||
LoadEmitInfo info = {Operand(v1), dst, dst.size(), 4};
|
||||
info.align_mul = 4;
|
||||
info.align_offset = 0;
|
||||
info.cache = get_cache_flags(ctx, ACCESS_IS_SWIZZLED_AMD, ac_access_type_load);
|
||||
info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
|
||||
info.sync = memory_sync_info(storage_scratch, semantic_private);
|
||||
if (ctx->program->gfx_level >= GFX9) {
|
||||
if (const_offset < ctx->program->dev.scratch_global_offset_min) {
|
||||
stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
|
||||
stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
|
||||
Operand::c32(const_offset));
|
||||
const_offset = 0;
|
||||
}
|
||||
info.offset = stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr);
|
||||
info.const_offset = const_offset;
|
||||
EmitLoadParameters params = scratch_flat_load_params;
|
||||
params.max_const_offset = ctx->program->dev.scratch_global_offset_max;
|
||||
emit_load(ctx, bld, info, params);
|
||||
} else {
|
||||
info.resource = load_scratch_resource(
|
||||
ctx->program, bld, ctx->program->private_segment_buffers.size() - 1, false);
|
||||
if (stack_ptr.id()) {
|
||||
info.soffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), stack_ptr,
|
||||
Operand::c32(-const_offset * ctx->program->wave_size));
|
||||
} else {
|
||||
info.soffset =
|
||||
bld.copy(bld.def(s1), Operand::c32(-const_offset * ctx->program->wave_size));
|
||||
}
|
||||
emit_load(ctx, bld, info, scratch_mubuf_load_params);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
store_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr,
|
||||
unsigned scratch_param_size, Temp data)
|
||||
{
|
||||
int32_t const_base_offset = param.scratch_offset - scratch_param_size;
|
||||
unsigned byte_size = data.bytes();
|
||||
unsigned write_count = 0;
|
||||
Temp write_datas[32];
|
||||
unsigned offsets[32];
|
||||
unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
|
||||
split_buffer_store(ctx, 4, 0, false, RegType::vgpr, as_vgpr(ctx, data),
|
||||
u_bit_consecutive(0, byte_size), swizzle_component_size, &write_count,
|
||||
write_datas, offsets);
|
||||
|
||||
if (ctx->program->gfx_level < GFX9) {
|
||||
Temp scratch_rsrc = load_scratch_resource(ctx->program, bld, -1u, false);
|
||||
for (unsigned i = 0; i < write_count; i++) {
|
||||
Temp soffset;
|
||||
if (stack_ptr.id()) {
|
||||
soffset =
|
||||
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), stack_ptr,
|
||||
Operand::c32(-const_base_offset * ctx->program->wave_size + offsets[i]));
|
||||
} else {
|
||||
soffset =
|
||||
bld.copy(bld.def(s1),
|
||||
Operand::c32(-const_base_offset * ctx->program->wave_size + offsets[i]));
|
||||
}
|
||||
assert(write_datas[i].bytes() == 4);
|
||||
|
||||
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, scratch_rsrc, Operand(v1),
|
||||
Operand(soffset), write_datas[i], 0, false);
|
||||
instr->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
|
||||
instr->mubuf().cache.value = ac_swizzled;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < write_count; i++) {
|
||||
int32_t const_offset = const_base_offset + offsets[i];
|
||||
|
||||
if (const_offset < ctx->program->dev.scratch_global_offset_min) {
|
||||
stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
|
||||
stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
|
||||
Operand::c32(const_offset));
|
||||
const_offset = 0;
|
||||
}
|
||||
|
||||
aco_opcode op;
|
||||
switch (write_datas[i].bytes()) {
|
||||
case 4: op = aco_opcode::scratch_store_dword; break;
|
||||
case 8: op = aco_opcode::scratch_store_dwordx2; break;
|
||||
case 12: op = aco_opcode::scratch_store_dwordx3; break;
|
||||
case 16: op = aco_opcode::scratch_store_dwordx4; break;
|
||||
default: UNREACHABLE("Unexpected param size");
|
||||
}
|
||||
|
||||
bld.scratch(op, Operand(v1), stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr),
|
||||
write_datas[i], (int16_t)const_offset,
|
||||
memory_sync_info(storage_scratch, semantic_private));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
||||
{
|
||||
|
|
@ -4965,6 +5065,81 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
|||
}
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_set_next_call_pc_amd:
|
||||
ctx->next_divergent_pc = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
|
||||
ctx->next_pc = get_ssa_temp(ctx, instr->src[1].ssa);
|
||||
break;
|
||||
case nir_intrinsic_load_call_return_address_amd:
|
||||
bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
|
||||
Operand(ctx->callee_info.return_address.def.getTemp()));
|
||||
break;
|
||||
case nir_intrinsic_load_return_param_amd: {
|
||||
call_info& info = ctx->call_infos[nir_intrinsic_call_idx(instr)];
|
||||
|
||||
unsigned idx = nir_intrinsic_param_idx(instr);
|
||||
assert(idx < info.nir_instr->callee->num_params);
|
||||
assert(info.nir_instr->callee->params[idx].is_return);
|
||||
|
||||
unsigned index_in_return_params = 0u;
|
||||
for (unsigned i = 0; i < idx; ++i) {
|
||||
if (info.nir_instr->callee->params[i].is_return)
|
||||
++index_in_return_params;
|
||||
}
|
||||
|
||||
if (info.return_info[index_in_return_params].is_reg) {
|
||||
bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
|
||||
Operand(info.return_info[index_in_return_params].def.getTemp()));
|
||||
} else {
|
||||
Temp stack_ptr;
|
||||
if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9)
|
||||
stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), bld.def(s1, scc),
|
||||
Operand::c32(info.scratch_param_size),
|
||||
Operand(ctx->callee_info.stack_ptr.def.getTemp()));
|
||||
else
|
||||
stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
|
||||
Operand::c32(info.scratch_param_size));
|
||||
load_scratch_param(ctx, bld, info.return_info[index_in_return_params], stack_ptr,
|
||||
info.scratch_param_size, get_ssa_temp(ctx, &instr->def));
|
||||
}
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_param: {
|
||||
const auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)];
|
||||
Temp dst = get_ssa_temp(ctx, &instr->def);
|
||||
if (param.is_reg) {
|
||||
bld.copy(Definition(dst), Operand(param.def.getTemp()));
|
||||
|
||||
auto vec_it = ctx->allocated_vec.find(param.def.tempId());
|
||||
if (vec_it != ctx->allocated_vec.end())
|
||||
ctx->allocated_vec.emplace(dst.id(), vec_it->second);
|
||||
} else {
|
||||
Temp stack_ptr = Temp();
|
||||
if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9)
|
||||
stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
|
||||
load_scratch_param(ctx, bld, param, stack_ptr, ctx->callee_info.scratch_param_size, dst);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_store_param_amd: {
|
||||
nir_intrinsic_instr* parent = nir_def_as_intrinsic_or_null(instr->src[0].ssa);
|
||||
if (parent && parent->intrinsic == nir_intrinsic_load_param &&
|
||||
nir_intrinsic_param_idx(parent) == nir_intrinsic_param_idx(instr))
|
||||
break;
|
||||
|
||||
auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)];
|
||||
if (param.is_reg) {
|
||||
param.def.setTemp(param.def.regClass().type() == RegType::vgpr
|
||||
? as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa))
|
||||
: bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)));
|
||||
} else {
|
||||
Temp stack_ptr = Temp();
|
||||
if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9)
|
||||
stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
|
||||
store_scratch_param(ctx, bld, param, stack_ptr, ctx->callee_info.scratch_param_size,
|
||||
get_ssa_temp(ctx, instr->src[0].ssa));
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
isel_err(&instr->instr, "Unimplemented intrinsic instr");
|
||||
abort();
|
||||
|
|
|
|||
|
|
@ -8,13 +8,18 @@
|
|||
#include "aco_instruction_selection.h"
|
||||
#include "aco_interface.h"
|
||||
#include "aco_ir.h"
|
||||
#include "aco_nir_call_attribs.h"
|
||||
|
||||
#include "ac_descriptors.h"
|
||||
#include "sid.h"
|
||||
|
||||
namespace aco {
|
||||
|
||||
void
|
||||
select_rt_prolog(Program* program, ac_shader_config* config,
|
||||
const struct aco_compiler_options* options, const struct aco_shader_info* info,
|
||||
const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
|
||||
const struct ac_shader_args* in_args, const struct ac_arg* descriptors,
|
||||
unsigned raygen_param_count, nir_parameter* raygen_params)
|
||||
{
|
||||
init_program(program, compute_cs, info, options, config);
|
||||
Block* block = program->create_and_insert_block();
|
||||
|
|
@ -24,8 +29,13 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|||
calc_min_waves(program);
|
||||
Builder bld(program, block);
|
||||
block->instructions.reserve(32);
|
||||
unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
|
||||
unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
|
||||
unsigned num_sgprs = in_args->num_sgprs_used;
|
||||
unsigned num_vgprs = in_args->num_vgprs_used;
|
||||
|
||||
RegisterDemand limit = get_addr_regs_from_waves(program, program->min_waves);
|
||||
|
||||
struct callee_info raygen_info = get_callee_info(program->gfx_level, rtRaygenABI,
|
||||
raygen_param_count, raygen_params, NULL, limit);
|
||||
|
||||
/* Inputs:
|
||||
* Ring offsets: s[0-1]
|
||||
|
|
@ -41,9 +51,12 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|||
* Local invocation IDs: v[0-2]
|
||||
*/
|
||||
PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
|
||||
PhysReg in_descriptors = get_arg_reg(in_args, *descriptors);
|
||||
PhysReg in_push_constants = get_arg_reg(in_args, in_args->push_constants);
|
||||
PhysReg in_dynamic_descriptors = get_arg_reg(in_args, in_args->dynamic_descriptors);
|
||||
PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
|
||||
PhysReg in_traversal_addr = get_arg_reg(in_args, in_args->rt.traversal_shader_addr);
|
||||
PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
|
||||
PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
|
||||
PhysReg in_wg_id_x;
|
||||
PhysReg in_wg_id_y;
|
||||
PhysReg in_wg_id_z;
|
||||
|
|
@ -77,15 +90,48 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|||
* Shader VA: v[4-5]
|
||||
* Shader Record Ptr: v[6-7]
|
||||
*/
|
||||
PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
|
||||
PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]);
|
||||
PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]);
|
||||
PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]);
|
||||
assert(raygen_info.stack_ptr.is_reg);
|
||||
assert(raygen_info.return_address.is_reg);
|
||||
assert(raygen_info.param_infos[0].is_reg);
|
||||
assert(raygen_info.param_infos[1].is_reg);
|
||||
assert(raygen_info.param_infos[RT_ARG_LAUNCH_ID + 2].is_reg);
|
||||
assert(raygen_info.param_infos[RT_ARG_LAUNCH_SIZE + 2].is_reg);
|
||||
assert(raygen_info.param_infos[RT_ARG_DESCRIPTORS + 2].is_reg);
|
||||
assert(raygen_info.param_infos[RT_ARG_PUSH_CONSTANTS + 2].is_reg);
|
||||
assert(raygen_info.param_infos[RT_ARG_SBT_DESCRIPTORS + 2].is_reg);
|
||||
assert(raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].is_reg);
|
||||
assert(raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].is_reg);
|
||||
PhysReg out_stack_ptr_param = raygen_info.stack_ptr.def.physReg();
|
||||
PhysReg out_return_shader_addr = raygen_info.return_address.def.physReg();
|
||||
PhysReg out_divergent_shader_addr = raygen_info.param_infos[0].def.physReg();
|
||||
PhysReg out_uniform_shader_addr = raygen_info.param_infos[1].def.physReg();
|
||||
PhysReg out_launch_size_x = raygen_info.param_infos[RT_ARG_LAUNCH_SIZE + 2].def.physReg();
|
||||
PhysReg out_launch_size_y = out_launch_size_x.advance(4);
|
||||
PhysReg out_launch_size_z = out_launch_size_y.advance(4);
|
||||
PhysReg out_launch_ids[3];
|
||||
for (unsigned i = 0; i < 3; i++)
|
||||
out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]);
|
||||
PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
|
||||
PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
|
||||
out_launch_ids[0] = raygen_info.param_infos[RT_ARG_LAUNCH_ID + 2].def.physReg();
|
||||
for (unsigned i = 1; i < 3; i++)
|
||||
out_launch_ids[i] = out_launch_ids[i - 1].advance(4);
|
||||
PhysReg out_descriptors = raygen_info.param_infos[RT_ARG_DESCRIPTORS + 2].def.physReg();
|
||||
PhysReg out_push_constants = raygen_info.param_infos[RT_ARG_PUSH_CONSTANTS + 2].def.physReg();
|
||||
PhysReg out_dynamic_descriptors =
|
||||
raygen_info.param_infos[RT_ARG_DYNAMIC_DESCRIPTORS + 2].def.physReg();
|
||||
PhysReg out_sbt_descriptors = raygen_info.param_infos[RT_ARG_SBT_DESCRIPTORS + 2].def.physReg();
|
||||
PhysReg out_traversal_addr =
|
||||
raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].def.physReg();
|
||||
PhysReg out_record_ptr = raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].def.physReg();
|
||||
|
||||
unsigned param_idx = 0;
|
||||
for (auto& param_info : raygen_info.param_infos) {
|
||||
unsigned byte_size =
|
||||
align(raygen_params[param_idx].bit_size, 32) / 8 * raygen_params[param_idx].num_components;
|
||||
if (raygen_params[param_idx].is_uniform)
|
||||
num_sgprs = std::max(num_sgprs, param_info.def.physReg().reg() + byte_size / 4);
|
||||
else
|
||||
num_vgprs = std::max(num_vgprs, param_info.def.physReg().reg() - 256 + byte_size / 4);
|
||||
++param_idx;
|
||||
}
|
||||
num_sgprs = std::max(num_sgprs, raygen_info.stack_ptr.def.physReg().reg());
|
||||
|
||||
/* Temporaries: */
|
||||
PhysReg tmp_wg_start_x = PhysReg{num_sgprs};
|
||||
|
|
@ -94,18 +140,26 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|||
num_sgprs++;
|
||||
PhysReg tmp_swizzle_bound_y = PhysReg{num_sgprs};
|
||||
num_sgprs++;
|
||||
PhysReg tmp_wg_id_y;
|
||||
if (program->gfx_level >= GFX12) {
|
||||
tmp_wg_id_y = PhysReg{num_sgprs};
|
||||
num_sgprs++;
|
||||
} else {
|
||||
tmp_wg_id_y = in_wg_id_y;
|
||||
}
|
||||
PhysReg tmp_wg_id_y = PhysReg{num_sgprs};
|
||||
num_sgprs++;
|
||||
num_sgprs = align(num_sgprs, 2);
|
||||
PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
|
||||
num_sgprs += 2;
|
||||
PhysReg tmp_launch_size_addr = PhysReg{num_sgprs};
|
||||
num_sgprs += 2;
|
||||
PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
|
||||
num_sgprs += 2;
|
||||
PhysReg tmp_sbt_desc = PhysReg{num_sgprs};
|
||||
if (program->gfx_level < GFX9)
|
||||
num_sgprs += 2;
|
||||
PhysReg tmp_traversal_addr = PhysReg{num_sgprs};
|
||||
num_sgprs += 1;
|
||||
PhysReg tmp_push_constants = PhysReg{num_sgprs};
|
||||
num_sgprs++;
|
||||
PhysReg tmp_descriptors = PhysReg{num_sgprs};
|
||||
num_sgprs++;
|
||||
PhysReg tmp_dynamic_descriptors = PhysReg{num_sgprs};
|
||||
num_sgprs++;
|
||||
|
||||
PhysReg tmp_swizzled_id_x = PhysReg{256 + num_vgprs++};
|
||||
PhysReg tmp_swizzled_id_y = PhysReg{256 + num_vgprs++};
|
||||
|
|
@ -113,40 +167,66 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|||
PhysReg tmp_swizzled_id_shifted_y = PhysReg{256 + num_vgprs++};
|
||||
|
||||
/* Confirm some assumptions about register aliasing */
|
||||
assert(in_ring_offsets == out_uniform_shader_addr);
|
||||
assert(get_arg_reg(in_args, in_args->push_constants) ==
|
||||
get_arg_reg(out_args, out_args->push_constants));
|
||||
assert(get_arg_reg(in_args, in_args->dynamic_descriptors) ==
|
||||
get_arg_reg(out_args, out_args->dynamic_descriptors));
|
||||
assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
|
||||
get_arg_reg(out_args, out_args->rt.sbt_descriptors));
|
||||
assert(get_arg_reg(in_args, in_args->rt.traversal_shader_addr) ==
|
||||
get_arg_reg(out_args, out_args->rt.traversal_shader_addr));
|
||||
assert(in_launch_size_addr == out_launch_size_x);
|
||||
assert(in_stack_base == out_launch_size_z);
|
||||
assert(in_local_id == out_launch_ids[0]);
|
||||
|
||||
/* <gfx9 reads in_scratch_offset at the end of the prolog to write out the scratch_offset
|
||||
* arg. Make sure no other outputs have overwritten it by then.
|
||||
*/
|
||||
assert(options->gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used);
|
||||
if (program->gfx_level >= GFX9) {
|
||||
if (program->gfx_level < GFX12) {
|
||||
assert(in_wg_id_z == out_launch_size_y);
|
||||
assert(in_wg_id_y == out_launch_size_x);
|
||||
}
|
||||
assert(in_sbt_desc == out_sbt_descriptors);
|
||||
assert(in_traversal_addr == out_descriptors);
|
||||
} else {
|
||||
assert(out_launch_size_x == in_wg_id_y);
|
||||
assert(out_sbt_descriptors == in_launch_size_addr);
|
||||
}
|
||||
|
||||
/* load raygen sbt */
|
||||
bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
|
||||
Operand::c32(0u));
|
||||
|
||||
bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_launch_size_addr, s2),
|
||||
Operand(in_launch_size_addr, s2));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_traversal_addr, s1),
|
||||
Operand(in_traversal_addr, s1));
|
||||
|
||||
/* On GFX8-, the out push constant/descriptor parameters alias WG IDs, so we copy these
|
||||
* parameters only after we're done calculating the launch IDs.
|
||||
*/
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_push_constants, s1),
|
||||
Operand(in_push_constants, s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_dynamic_descriptors, s1),
|
||||
Operand(in_dynamic_descriptors, s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_descriptors, s1), Operand(in_descriptors, s1));
|
||||
|
||||
if (options->gfx_level < GFX9)
|
||||
bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_sbt_desc, s2), Operand(in_sbt_desc, s2));
|
||||
|
||||
/* init scratch */
|
||||
if (options->gfx_level < GFX9) {
|
||||
/* Copy the ring offsets to a temporary location. */
|
||||
bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
|
||||
Operand(in_ring_offsets, s2));
|
||||
/* Unconditionally apply the scratch offset to scratch_rsrc so we just have
|
||||
* to pass the rsrc through to callees.
|
||||
*/
|
||||
bld.sop2(aco_opcode::s_add_u32, Definition(tmp_ring_offsets, s1), Definition(scc, s1),
|
||||
Operand(in_ring_offsets, s1), Operand(in_scratch_offset, s1));
|
||||
bld.sop2(aco_opcode::s_addc_u32, Definition(tmp_ring_offsets.advance(4), s1),
|
||||
Definition(scc, s1), Operand(in_ring_offsets.advance(4), s1), Operand::c32(0),
|
||||
Operand(scc, s1));
|
||||
} else if (options->gfx_level < GFX11) {
|
||||
hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
|
||||
Operand(in_scratch_offset, s1));
|
||||
}
|
||||
|
||||
/* set stack ptr */
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
|
||||
/* Set up the Z launch ID, as well as setting up workgroup Y IDs. On gfx11-, the setup consists
|
||||
* of backing the ID up as the load for the ray launch sizes will overwrite it.
|
||||
*/
|
||||
if (options->gfx_level >= GFX12) {
|
||||
bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
|
||||
Operand(in_wg_id_y, s1));
|
||||
bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(tmp_wg_id_y, s1), Operand(in_wg_id_y, s1),
|
||||
Operand::c32(0));
|
||||
} else {
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(tmp_wg_id_y, s1), Operand(in_wg_id_y, s1));
|
||||
}
|
||||
|
||||
/* load raygen address */
|
||||
bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
|
||||
|
|
@ -156,22 +236,12 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|||
assert(out_launch_size_x.reg() % 4 == 0);
|
||||
if (options->gfx_level >= GFX12) {
|
||||
bld.smem(aco_opcode::s_load_dwordx3, Definition(out_launch_size_x, s3),
|
||||
Operand(in_launch_size_addr, s2), Operand::c32(0u));
|
||||
Operand(tmp_launch_size_addr, s2), Operand::c32(0u));
|
||||
} else {
|
||||
bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
|
||||
Operand(in_launch_size_addr, s2), Operand::c32(8u));
|
||||
Operand(tmp_launch_size_addr, s2), Operand::c32(8u));
|
||||
bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
|
||||
Operand(in_launch_size_addr, s2), Operand::c32(0u));
|
||||
}
|
||||
|
||||
/* calculate ray launch ids */
|
||||
if (options->gfx_level >= GFX12) {
|
||||
bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
|
||||
Operand(in_wg_id_y, s1));
|
||||
bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(tmp_wg_id_y, s1), Operand(in_wg_id_y, s1),
|
||||
Operand::c32(0));
|
||||
} else {
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
|
||||
Operand(tmp_launch_size_addr, s2), Operand::c32(0u));
|
||||
}
|
||||
|
||||
/* Swizzle ray launch IDs. We dispatch a 1D 32x1/64x1 workgroup natively. Many games dispatch
|
||||
|
|
@ -313,13 +383,61 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|||
bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
|
||||
Operand(tmp_raygen_sbt.advance(4), s1));
|
||||
|
||||
if (options->gfx_level < GFX9) {
|
||||
/* write scratch/ring offsets to outputs, if needed */
|
||||
bld.sop1(aco_opcode::s_mov_b32,
|
||||
Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
|
||||
Operand(in_scratch_offset, s1));
|
||||
bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
|
||||
Operand(tmp_ring_offsets, s2));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr, s1),
|
||||
Operand(tmp_traversal_addr, s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr.advance(4), s1),
|
||||
Operand::c32(options->address32_hi));
|
||||
|
||||
if (program->gfx_level < GFX8)
|
||||
bld.vop3(aco_opcode::v_lshr_b64, Definition(out_divergent_shader_addr, v2),
|
||||
Operand(out_uniform_shader_addr, s2), Operand::c32(0));
|
||||
else
|
||||
bld.vop3(aco_opcode::v_lshrrev_b64, Definition(out_divergent_shader_addr, v2),
|
||||
Operand::c32(0), Operand(out_uniform_shader_addr, s2));
|
||||
|
||||
/* Launch IDs are calculated, so copy the push constant/sbt descriptor parameters.
|
||||
* Do this here before other parameters overwrite the inputs.
|
||||
*/
|
||||
if (program->gfx_level < GFX9) {
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors, s1),
|
||||
Operand(tmp_sbt_desc, s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors.advance(4), s1),
|
||||
Operand(tmp_sbt_desc.advance(4), s1));
|
||||
}
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_push_constants, s1),
|
||||
Operand(tmp_push_constants, s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_dynamic_descriptors, s1),
|
||||
Operand(tmp_dynamic_descriptors, s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_descriptors, s1), Operand(tmp_descriptors, s1));
|
||||
|
||||
bld.sop1(aco_opcode::s_mov_b64, Definition(out_return_shader_addr, s2), Operand::c32(0));
|
||||
|
||||
if (program->gfx_level >= GFX9) {
|
||||
bld.sopk(aco_opcode::s_movk_i32, Definition(out_stack_ptr_param, s1), 0);
|
||||
} else {
|
||||
/* Construct the scratch_rsrc here and pass it to the callees to use directly. */
|
||||
struct ac_buffer_state ac_state = {0};
|
||||
uint32_t desc[4];
|
||||
|
||||
ac_state.size = 0xffffffff;
|
||||
ac_state.format = PIPE_FORMAT_R32_FLOAT;
|
||||
for (int i = 0; i < 4; i++)
|
||||
ac_state.swizzle[i] = PIPE_SWIZZLE_0;
|
||||
ac_state.element_size = 1u;
|
||||
ac_state.index_stride = program->wave_size == 64 ? 3u : 2u;
|
||||
ac_state.add_tid = true;
|
||||
ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
|
||||
|
||||
ac_build_buffer_descriptor(program->gfx_level, &ac_state, desc);
|
||||
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param, s1),
|
||||
Operand(tmp_ring_offsets, s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param.advance(4), s1),
|
||||
Operand(tmp_ring_offsets.advance(4), s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param.advance(8), s1),
|
||||
Operand::c32(desc[2]));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(out_stack_ptr_param.advance(12), s1),
|
||||
Operand::c32(desc[3]));
|
||||
}
|
||||
|
||||
/* jump to raygen */
|
||||
|
|
|
|||
|
|
@ -931,9 +931,13 @@ static void
|
|||
compile_rt_prolog(struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline)
|
||||
{
|
||||
const struct radv_physical_device *pdev = radv_device_physical(device);
|
||||
struct nir_function raygen_stub = {};
|
||||
uint32_t push_constant_size = 0;
|
||||
|
||||
pipeline->prolog = radv_create_rt_prolog(device);
|
||||
/* Create a dummy function signature for raygen shaders in order to pass parameter info to the prolog */
|
||||
radv_nir_init_rt_function_params(&raygen_stub, MESA_SHADER_RAYGEN, 0);
|
||||
radv_nir_lower_callee_signature(&raygen_stub);
|
||||
pipeline->prolog = radv_create_rt_prolog(device, raygen_stub.num_params, raygen_stub.params);
|
||||
|
||||
/* create combined config */
|
||||
struct ac_shader_config *config = &pipeline->prolog->config;
|
||||
|
|
|
|||
|
|
@ -3408,13 +3408,12 @@ radv_aco_build_shader_part(void **bin, uint32_t num_sgprs, uint32_t num_vgprs, c
|
|||
}
|
||||
|
||||
struct radv_shader *
|
||||
radv_create_rt_prolog(struct radv_device *device)
|
||||
radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count, nir_parameter *raygen_params)
|
||||
{
|
||||
const struct radv_physical_device *pdev = radv_device_physical(device);
|
||||
const struct radv_instance *instance = radv_physical_device_instance(pdev);
|
||||
struct radv_shader *prolog;
|
||||
struct radv_shader_args in_args = {0};
|
||||
struct radv_shader_args out_args = {0};
|
||||
struct radv_nir_compiler_options options = {0};
|
||||
radv_fill_nir_compiler_options(&options, device, NULL, false, instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS,
|
||||
radv_device_fault_detection_enabled(device), false);
|
||||
|
|
@ -3435,7 +3434,6 @@ radv_create_rt_prolog(struct radv_device *device)
|
|||
info.cs.uses_block_id[i] = true;
|
||||
|
||||
radv_declare_shader_args(device, NULL, &info, MESA_SHADER_COMPUTE, MESA_SHADER_NONE, &in_args);
|
||||
radv_declare_rt_shader_args(options.info->gfx_level, &out_args);
|
||||
info.user_sgprs_locs = in_args.user_sgprs_locs;
|
||||
|
||||
#if AMD_LLVM_AVAILABLE
|
||||
|
|
@ -3449,8 +3447,8 @@ radv_create_rt_prolog(struct radv_device *device)
|
|||
struct aco_compiler_options ac_opts;
|
||||
radv_aco_convert_shader_info(&ac_info, &info, &in_args, &device->cache_key, options.info->gfx_level);
|
||||
radv_aco_convert_opts(&ac_opts, &options, &in_args, &stage_key);
|
||||
aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &out_args.ac, &radv_aco_build_shader_binary,
|
||||
(void **)&binary);
|
||||
aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &in_args.descriptors[0], raygen_param_count, raygen_params,
|
||||
&radv_aco_build_shader_binary, (void **)&binary);
|
||||
binary->info = info;
|
||||
|
||||
radv_postprocess_binary_config(device, binary, &in_args);
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@ struct radv_shader_args;
|
|||
struct radv_shader_args;
|
||||
struct radv_serialized_shader_arena_block;
|
||||
struct vk_pipeline_robustness_state;
|
||||
struct nir_parameter;
|
||||
typedef struct nir_parameter nir_parameter;
|
||||
|
||||
#define RADV_GRAPHICS_STAGE_BITS \
|
||||
(VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT | VK_SHADER_STAGE_TASK_BIT_EXT)
|
||||
|
|
@ -549,7 +551,8 @@ void radv_free_shader_memory(struct radv_device *device, union radv_shader_arena
|
|||
|
||||
struct radv_shader *radv_create_trap_handler_shader(struct radv_device *device);
|
||||
|
||||
struct radv_shader *radv_create_rt_prolog(struct radv_device *device);
|
||||
struct radv_shader *radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count,
|
||||
nir_parameter *raygen_params);
|
||||
|
||||
struct radv_shader_part *radv_shader_part_create(struct radv_device *device, struct radv_shader_part_binary *binary,
|
||||
unsigned wave_size);
|
||||
|
|
|
|||
|
|
@ -362,6 +362,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|||
case nir_intrinsic_load_urb_input_handle_intel:
|
||||
case nir_intrinsic_load_urb_output_handle_intel:
|
||||
case nir_intrinsic_load_ray_query_global_intel:
|
||||
case nir_intrinsic_load_call_return_address_amd:
|
||||
is_divergent = false;
|
||||
break;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue