diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 3c3cb899d28..92121b2aa42 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2324,6 +2324,8 @@ public: bool is_callee = false; bool has_call = false; bool bypass_reg_preservation = false; + ABI callee_abi = {}; + RegisterDemand callee_param_demand = RegisterDemand(); /* set by find_param_regs() */ struct { monotonic_buffer_resource memory; diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h index f0e34ab7ba7..18f817dd338 100644 --- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h +++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h @@ -18,6 +18,31 @@ namespace aco { +struct parameter_info { /* where one parameter lives: a register or a scratch slot */ + bool discardable; + bool is_reg; /* true: def is valid; false: scratch_offset is valid */ + union { + Definition def; + unsigned scratch_offset; /* in bytes */ + }; +}; + +struct call_info { + nir_call_instr* nir_instr; + Instruction* aco_instr; + std::vector<parameter_info> return_info; /* NOTE(review): element type inferred - confirm */ + unsigned scratch_param_size; +}; + +struct callee_info { /* parameter placement computed by get_callee_info() */ + std::vector<parameter_info> param_infos; + parameter_info return_address; + parameter_info stack_ptr; + unsigned reg_param_count = 0; /* system parameters are not counted */ + unsigned reg_discardable_param_count = 0; + unsigned scratch_param_size = 0; /* in bytes */ +}; + enum aco_color_output_type { ACO_TYPE_ANY32, ACO_TYPE_FLOAT16, @@ -135,6 +160,13 @@ struct isel_context { uint32_t wqm_instruction_idx; BITSET_DECLARE(output_args, AC_MAX_ARGS); + + /* Function information */ + ABI callee_abi; + struct callee_info callee_info; + std::vector<call_info> call_infos; /* NOTE(review): element type inferred - confirm */ + Temp next_divergent_pc; + Temp next_pc; }; inline Temp @@ -257,6 +289,10 @@ void build_end_with_regs(isel_context* ctx, std::vector& regs); Instruction* add_startpgm(struct isel_context* ctx); void finish_program(isel_context* ctx); +struct callee_info get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count, + const nir_parameter* parameters, Program* program, + RegisterDemand reg_limit); + #define isel_err(...) 
_isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__) void _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr, diff --git a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp index 7f0dee20edb..2139274c875 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp @@ -8,9 +8,12 @@ #include "aco_builder.h" #include "aco_instruction_selection.h" #include "aco_ir.h" +#include "aco_nir_call_attribs.h" #include "util/memstream.h" +#include <optional> + namespace aco { void @@ -820,4 +823,275 @@ finish_program(isel_context* ctx) } } +struct param_assignment_info { /* one parameter waiting for find_param_regs() to place it */ + uint16_t required_alignment; /* the register index must be a multiple of this */ + uint16_t provided_alignment; /* sort key: larger values are assigned first */ + RegClass rc; + parameter_info* dst_info; /* receives the chosen location */ + bool is_return_param; + /* If true, this parameter shouldn't count toward the callee info's reg_param_count because it + * receives special handling (e.g. the call return address being a definition instead of an + * operand). + */ + bool is_system_param; + /* This parameter must reside in a register. Used for stack pointers as well as s_swappc + * operands. 
+ */ + bool force_reg; +}; +/* Returns the first run of rc.size() set bits of regs inside rc's register range, if any. */ +std::optional<PhysReg> +find_reg(BITSET_WORD* regs, RegClass rc) +{ + uint16_t start = 0; + uint16_t size = 128; + if (rc.type() == RegType::vgpr) { + start = 256; + size = 256; + } + + uint16_t contiguous_size = 0; + for (uint16_t i = 0; i < size; ++i) { + if (!BITSET_TEST(regs, start + i)) { + contiguous_size = 0; + continue; + } + if (++contiguous_size >= rc.size()) + return PhysReg{(unsigned)(start + i - contiguous_size + 1)}; + } + return {}; +} +/* Gives every entry of params a register or, failing that, a scratch offset; empties params. */ +void +find_param_regs(Program* program, const ABI& abi, callee_info& info, + std::vector<param_assignment_info>& params, RegisterDemand reg_limit) +{ + unsigned scratch_param_bytes = 0; + RegisterDemand param_demand = RegisterDemand(); + + BITSET_DECLARE(preserved_regs, 512); + BITSET_DECLARE(clobbered_regs, 512); + abi.preservedRegisters(preserved_regs, reg_limit); + BITSET_COPY(clobbered_regs, preserved_regs); + BITSET_NOT(clobbered_regs); + bool has_preserved_regs = !BITSET_IS_EMPTY(preserved_regs); + + std::stable_sort(params.begin(), params.end(), + [](const param_assignment_info& first, const param_assignment_info& second) + { + /* Assign parameters with larger alignments first so we can use parameters + * with smaller alignments as padding + */ + return first.provided_alignment > second.provided_alignment; + }); + std::stable_sort(params.begin(), params.end(), + [](const param_assignment_info& first, const param_assignment_info& second) + { + /* Move parameters forced into registers to the very front so we assign + * them first. 
+ */ + return first.force_reg && !second.force_reg; + }); + for (size_t i = 1; i < params.size(); ++i) { + assert(!params[i].force_reg || params[i - 1].force_reg); + } + /* Reverse parameters and start from the end, to make erasing elements cheap */ + std::reverse(params.begin(), params.end()); + + while (!params.empty()) { + RegClass rc = params.back().rc; + bool discardable = params.back().dst_info->discardable || params.back().is_return_param; + + BITSET_WORD* regs; + if (has_preserved_regs && !discardable) + regs = preserved_regs; + else + regs = clobbered_regs; + + auto next_reg = find_reg(regs, rc); + /* Force parameter into scratch if it exceeds the ABI's maximum parameter demand */ + if (abi.max_param_demand != RegisterDemand() && + (param_demand + Temp(0, rc)).exceeds(abi.max_param_demand)) + next_reg = {}; + + if (next_reg && next_reg->reg() % params.back().required_alignment) { + /* We found a register, but it's not aligned properly. Check if we can add some padding + * (and ideally stuff a different parameter in there). + */ + uint16_t required_padding = + params.back().required_alignment - (next_reg->reg() % params.back().required_alignment); + uint16_t aligned_size = rc.size() + required_padding; + for (unsigned i = 0; i < aligned_size; ++i) { + /* The added padding exceeds the size of the register range. Just bail out at this + * point. + * TODO: we could probably try finding a new register, but then we'd need to reevaluate + * alignment etc... 
+ */ + if (!BITSET_TEST(regs, next_reg->advance(i * 4).reg())) { + next_reg = {}; + break; + } + } + + /* Try finding a small parameter to put inside the padding space */ + for (auto it2 = std::next(params.rbegin()); next_reg && it2 != params.rend(); ++it2) { + if (it2->rc.type() != params.back().rc.type() || + it2->dst_info->discardable != discardable) + continue; + if (it2->rc.size() > required_padding || (next_reg->reg() % it2->required_alignment)) + continue; /* does not fit the gap, or the gap start is misaligned for it */ + + param_demand += Temp(0, it2->rc); + + it2->dst_info->def.setPrecolored(*next_reg); + for (unsigned i = 0; i < it2->rc.size(); ++i) + BITSET_CLEAR(regs, next_reg->reg() + i); + if (!it2->is_system_param) { + ++info.reg_param_count; + if (discardable) + ++info.reg_discardable_param_count; + } + params.erase(std::prev(it2.base())); /* forward iterator to *it2 */ + break; + } + if (next_reg) + next_reg = next_reg->advance(required_padding * 4); + } + if (next_reg) { + param_demand += Temp(0, params.back().rc); + params.back().dst_info->def.setPrecolored(*next_reg); + BITSET_CLEAR_RANGE(regs, next_reg->reg(), next_reg->reg() + params.back().rc.size() - 1); + if (!params.back().is_system_param) { + ++info.reg_param_count; + if (discardable) + ++info.reg_discardable_param_count; + } + } else { + assert(!params.back().force_reg); + params.back().dst_info->is_reg = false; + params.back().dst_info->scratch_offset = scratch_param_bytes; + scratch_param_bytes += rc.size() * 4; + } + params.pop_back(); + } + + info.scratch_param_size = scratch_param_bytes; + if (program) + program->callee_param_demand = param_demand; +} +/* Collects return address, stack pointer and the NIR parameters, then assigns their locations. */ +struct callee_info +get_callee_info(amd_gfx_level gfx_level, const ABI& abi, unsigned param_count, + const nir_parameter* parameters, Program* program, RegisterDemand reg_limit) +{ + struct callee_info info = {}; + info.param_infos.reserve(param_count); + + std::vector<param_assignment_info> assignment_infos; + assignment_infos.reserve(param_count + 2); + + Temp return_addr = program ? 
program->allocateTmp(s2) : Temp(); + Definition return_def = Definition(return_addr); + info.return_address = {}; + info.return_address.discardable = false; + info.return_address.is_reg = true; + info.return_address.def = return_def; + + param_assignment_info return_def_info = {}; + return_def_info.required_alignment = 2; + return_def_info.provided_alignment = 2; + return_def_info.rc = s2; + return_def_info.dst_info = &info.return_address; + return_def_info.is_return_param = false; + return_def_info.is_system_param = true; + return_def_info.force_reg = true; + assignment_infos.push_back(return_def_info); + + if (gfx_level >= GFX9) { + Temp stack_ptr = program ? program->allocateTmp(s1) : Temp(); + Definition stack_def = Definition(stack_ptr); + info.stack_ptr = {}; + info.stack_ptr.discardable = false; + info.stack_ptr.is_reg = true; + info.stack_ptr.def = stack_def; + + param_assignment_info stack_ptr_info = {}; + stack_ptr_info.required_alignment = 1; + stack_ptr_info.provided_alignment = 1; + stack_ptr_info.rc = s1; + stack_ptr_info.dst_info = &info.stack_ptr; + stack_ptr_info.is_return_param = false; + stack_ptr_info.is_system_param = true; + stack_ptr_info.force_reg = true; + assignment_infos.push_back(stack_ptr_info); + } else { + Temp scratch_rsrc = program ? program->allocateTmp(s4) : Temp(); + Definition rsrc_def = Definition(scratch_rsrc); + info.stack_ptr = {}; + info.stack_ptr.discardable = false; + info.stack_ptr.is_reg = true; + info.stack_ptr.def = rsrc_def; + + param_assignment_info rsrc_info = {}; + rsrc_info.required_alignment = 4; + rsrc_info.provided_alignment = 4; + rsrc_info.rc = s4; + rsrc_info.dst_info = &info.stack_ptr; + rsrc_info.is_return_param = false; + rsrc_info.is_system_param = true; + rsrc_info.force_reg = true; + assignment_infos.push_back(rsrc_info); + } + + size_t info_base = assignment_infos.size(); /* index of the first NIR parameter's entry */ + + for (unsigned i = 0; i < param_count; ++i) { + RegType type = parameters[i].is_uniform ? 
RegType::sgpr : RegType::vgpr; + unsigned byte_size = align(parameters[i].bit_size, 32) / 8 * parameters[i].num_components; + RegClass rc = RegClass(type, byte_size / 4); + + Temp dst = program ? program->allocateTmp(rc) : Temp(); + Definition def = Definition(dst); + + parameter_info param_info = {}; + param_info.discardable = + !!(parameters[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE); + param_info.is_reg = true; + param_info.def = def; + info.param_infos.push_back(param_info); + + uint16_t required_alignment = 1; + uint16_t provided_alignment = 1; + + if (rc.type() == RegType::sgpr) { + if (rc.size() > 2) + required_alignment = 4; + else if (rc.size() > 1) + required_alignment = 2; + } + if (rc.size() % 4 == 0) + provided_alignment = 4; + else if (rc.size() % 2 == 0) + provided_alignment = 2; + + param_assignment_info assignment_info = {}; + assignment_info.required_alignment = required_alignment; + assignment_info.provided_alignment = provided_alignment; + assignment_info.rc = rc; + assignment_info.is_return_param = parameters[i].is_return; + /* Force the first two parameters (callee addresses) into registers - they're assumed to be + * accessible through a temp. 
+ */ + assignment_info.force_reg = i <= 1; + assignment_infos.push_back(assignment_info); + } + /* Link entries to param_infos only now that the vector has stopped growing */ + for (unsigned i = 0; i < param_count; ++i) + assignment_infos[info_base + i].dst_info = &info.param_infos[i]; + + find_param_regs(program, abi, info, assignment_infos, reg_limit); + + return info; +} + } // namespace aco diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp index cec4e97b4c3..b12487a3e00 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp @@ -414,6 +414,8 @@ init_context(isel_context* ctx, nir_shader* shader) ctx->program->allocateRange(impl->ssa_alloc); RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; + unsigned call_count = 0; + /* TODO: make this recursive to improve compile times */ bool done = false; while (!done) { @@ -702,12 +704,18 @@ init_context(isel_context* ctx, nir_shader* shader) regclasses[phi->def.index] = rc; break; } + case nir_instr_type_call: { + ++call_count; /* NOTE(review): may recount on repeated passes; only sizes a reserve() */ + break; + } default: break; } } } } + ctx->call_infos.reserve(call_count); + ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena; ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr;