diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 2929ba29b00..16c341a55b1 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -849,8 +849,8 @@ std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx, } /* collect variables from a register area and clear reg_file */ -std::set<std::pair<unsigned, unsigned>> collect_vars(ra_ctx& ctx, RegisterFile& reg_file, - const PhysRegInterval reg_interval) +std::set<std::pair<unsigned, unsigned>> find_vars(ra_ctx& ctx, RegisterFile& reg_file, + const PhysRegInterval reg_interval) { std::set<std::pair<unsigned, unsigned>> vars; for (PhysReg j : reg_interval) { @@ -862,21 +862,29 @@ std::set<std::pair<unsigned, unsigned>> collect_vars(ra_ctx& ctx, RegisterFile& if (id) { assignment& var = ctx.assignments[id]; vars.emplace(var.rc.bytes(), id); - reg_file.clear(var.reg, var.rc); - if (!reg_file[j]) - break; } } } else if (reg_file[j] != 0) { unsigned id = reg_file[j]; assignment& var = ctx.assignments[id]; vars.emplace(var.rc.bytes(), id); - reg_file.clear(var.reg, var.rc); } } return vars; } +/* collect variables from a register area and clear reg_file */ +std::set<std::pair<unsigned, unsigned>> collect_vars(ra_ctx& ctx, RegisterFile& reg_file, + const PhysRegInterval reg_interval) +{ + std::set<std::pair<unsigned, unsigned>> vars = find_vars(ctx, reg_file, reg_interval); + for (std::pair<unsigned, unsigned> size_id : vars) { + assignment& var = ctx.assignments[size_id.second]; + reg_file.clear(var.reg, var.rc); + } + return vars; +} + bool get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, std::vector<std::pair<Operand, Definition>>& parallelcopies, @@ -1232,7 +1240,7 @@ bool get_reg_specified(ra_ctx& ctx, return true; } -void increase_register_file(ra_ctx& ctx, RegType type) { +bool increase_register_file(ra_ctx& ctx, RegType type) { uint16_t max_addressible_sgpr = ctx.program->sgpr_limit; uint16_t max_addressible_vgpr = ctx.program->vgpr_limit; if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) { @@ -1240,10 +1248,82 @@ void increase_register_file(ra_ctx& ctx, RegType type) { update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); } else if (type == RegType::sgpr && 
ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) { update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1)); } else { - //FIXME: if nothing helps, shift-rotate the registers to make space - aco_err(ctx.program, "Failed to allocate registers during shader compilation."); - abort(); + return false; } + return true; +} + +struct IDAndRegClass { + IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {} + + unsigned id; + RegClass rc; +}; + +struct IDAndInfo { + IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {} + + unsigned id; + DefInfo info; +}; + +/* Reallocates vars by sorting them and placing each variable after the previous + * one. If one of the variables has 0xffffffff as an ID, the register assigned + * for that variable will be returned. + */ +PhysReg compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars, + std::vector<std::pair<Operand, Definition>>& parallelcopies, + PhysReg start) +{ + /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword + * temporary sizes to dwords. + */ + std::vector<IDAndInfo> sorted; + for (IDAndRegClass var : vars) { + DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1); + sorted.emplace_back(var.id, info); + } + + std::sort(sorted.begin(), sorted.end(), [&ctx](const IDAndInfo& a, + const IDAndInfo& b) { + unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4); + unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4); + if (a_stride > b_stride) + return true; + if (a_stride < b_stride) + return false; + if (a.id == 0xffffffff || b.id == 0xffffffff) + return a.id == 0xffffffff; /* place 0xffffffff before others if possible, not for any reason */ + return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg; + }); + + PhysReg next_reg = start; + PhysReg space_reg; + for (IDAndInfo& var : sorted) { + unsigned stride = var.info.rc.is_subdword() ? 
var.info.stride : var.info.stride * 4; + next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4)); + + /* 0xffffffff is a special variable ID used reserve a space for killed + * operands and definitions. + */ + if (var.id != 0xffffffff) { + if (next_reg != ctx.assignments[var.id].reg) { + RegClass rc = ctx.assignments[var.id].rc; + Temp tmp(var.id, rc); + + Operand pc_op(tmp); + pc_op.setFixed(ctx.assignments[var.id].reg); + Definition pc_def(next_reg, rc); + parallelcopies.emplace_back(pc_op, pc_def); + } + } else { + space_reg = next_reg; + } + + next_reg = next_reg.advance(var.info.rc.size() * 4); + } + + return space_reg; +} bool is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction *instr) @@ -1369,9 +1449,48 @@ PhysReg get_reg(ra_ctx& ctx, * too many moves. */ assert(reg_file.count_zero(info.bounds) >= info.size); - //FIXME: if nothing helps, shift-rotate the registers to make space + if (!increase_register_file(ctx, info.rc.type())) { + /* fallback algorithm: reallocate all variables at once */ + unsigned def_size = info.rc.size(); + for (Definition def : instr->definitions) { + if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type()) + def_size += def.regClass().size(); + } + + unsigned killed_op_size = 0; + for (Operand op : instr->operands) { + if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type()) + killed_op_size += op.regClass().size(); + } + + const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type()); + + /* reallocate passthrough variables and non-killed operands */ + std::vector<IDAndRegClass> vars; + for (const std::pair<unsigned, unsigned>& var : find_vars(ctx, reg_file, regs)) + vars.emplace_back(var.second, ctx.assignments[var.second].rc); + vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size))); + + PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo()); + + /* reallocate killed operands */ + std::vector<IDAndRegClass> killed_op_vars; + for (Operand op : 
instr->operands) { + if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type()) + killed_op_vars.emplace_back(op.tempId(), op.regClass()); + } + compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space); + + /* reallocate definitions */ + std::vector<IDAndRegClass> def_vars; + for (Definition def : instr->definitions) { + if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type()) + def_vars.emplace_back(def.tempId(), def.regClass()); + } + def_vars.emplace_back(0xffffffff, info.rc); + return compact_relocate_vars(ctx, def_vars, parallelcopies, space); + } - increase_register_file(ctx, info.rc.type()); return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index); } @@ -1504,7 +1623,10 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, success = get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval { best_pos, size }); if (!success) { - increase_register_file(ctx, temp.type()); + if (!increase_register_file(ctx, temp.type())) { + /* use the fallback algorithm in get_reg() */ + return get_reg(ctx, reg_file, temp, parallelcopies, instr); + } return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr); }