diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 1af93c99fec..5fb867c7a57 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1918,9 +1918,6 @@ bool operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg, RegClass rc) { - if (instr->operands[idx].isFixed()) - return instr->operands[idx].physReg() == reg; - bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64; if (gfx_level <= GFX9 && is_writelane && idx <= 1) { @@ -1952,37 +1949,77 @@ operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsign } } +void +handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, + std::vector<std::pair<Operand, Definition>>& parallelcopy, + aco_ptr<Instruction>& instr) +{ + assert(instr->operands.size() <= 64); + + RegisterFile tmp_file(register_file); + + uint64_t mask = 0; + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand& op = instr->operands[i]; + + if (!op.isTemp() || !op.isFixed()) + continue; + + PhysReg src = ctx.assignments[op.tempId()].reg; + + if (op.physReg() == src) { + tmp_file.block(op.physReg(), op.regClass()); + continue; + } + + bool found = false; + u_foreach_bit64 (j, mask) { + if (instr->operands[j].tempId() == op.tempId() && + instr->operands[j].physReg() == op.physReg()) { + found = true; + break; + } + } + if (found) + continue; /* the copy is already added to the list */ + + /* clear from register_file so fixed operands are not collected by collect_vars() */ + tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src + + mask |= (uint64_t)1 << i; + + Operand pc_op(instr->operands[i].getTemp(), src); + Definition pc_def = Definition(op.physReg(), pc_op.regClass()); + parallelcopy.emplace_back(pc_op, pc_def); + } + + if (!mask) + return; + + std::vector<unsigned> blocking_vars; + u_foreach_bit64 (i, mask) { + Operand& op = instr->operands[i]; 
+ PhysRegInterval target{op.physReg(), op.size()}; + std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target); + blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end()); + + /* prevent get_regs_for_copies() from using these registers */ + tmp_file.block(op.physReg(), op.regClass()); + } + + get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval()); + update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops); +} + void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, std::vector<std::pair<Operand, Definition>>& parallelcopy, aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index) { - /* check if the operand is fixed */ + /* clear the operand in case it's only a stride mismatch */ PhysReg src = ctx.assignments[operand.tempId()].reg; - PhysReg dst; - if (operand.isFixed()) { - assert(operand.physReg() != src); - - /* check if target reg is blocked, and move away the blocking var */ - if (register_file.test(operand.physReg(), operand.bytes())) { - PhysRegInterval target{operand.physReg(), operand.size()}; - - RegisterFile tmp_file(register_file); - - std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, target); - - tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src - tmp_file.block(operand.physReg(), operand.regClass()); - - get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval()); - } - dst = operand.physReg(); - - } else { - /* clear the operand in case it's only a stride mismatch */ - register_file.clear(src, operand.regClass()); - dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index); - } + register_file.clear(src, operand.regClass()); + PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index); Operand pc_op = operand; pc_op.setFixed(src); @@ -2757,6 +2794,7 @@ register_allocation(Program* program, std::vector<IDSet>& 
live_out_per_block, ra bool temp_in_scc = register_file[scc]; /* handle operands */ + bool fixed = false; for (unsigned i = 0; i < instr->operands.size(); ++i) { auto& operand = instr->operands[i]; if (!operand.isTemp()) @@ -2766,6 +2804,18 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra operand.setTemp(read_variable(ctx, operand.getTemp(), block.index)); assert(ctx.assignments[operand.tempId()].assigned); + fixed |= + operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg(); + } + + if (fixed) + handle_fixed_operands(ctx, register_file, parallelcopy, instr); + + for (unsigned i = 0; i < instr->operands.size(); ++i) { + auto& operand = instr->operands[i]; + if (!operand.isTemp() || operand.isFixed()) + continue; + PhysReg reg = ctx.assignments[operand.tempId()].reg; if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass())) operand.setFixed(reg); diff --git a/src/amd/compiler/tests/test_regalloc.cpp b/src/amd/compiler/tests/test_regalloc.cpp index da4f1e205ce..28861203a14 100644 --- a/src/amd/compiler/tests/test_regalloc.cpp +++ b/src/amd/compiler/tests/test_regalloc.cpp @@ -89,7 +89,7 @@ BEGIN_TEST(regalloc.precolor.swap) //! s2: %op1:s[2-3] = p_unit_test Temp op1 = bld.pseudo(aco_opcode::p_unit_test, bld.def(s2)); - //! s2: %op1_2:s[0-1], s2: %op0_2:s[2-3] = p_parallelcopy %op1:s[2-3], %op0:s[0-1] + //! s2: %op0_2:s[2-3], s2: %op1_2:s[0-1] = p_parallelcopy %op0:s[0-1], %op1:s[2-3] //! p_unit_test %op0_2:s[2-3], %op1_2:s[0-1] Operand op(inputs[0]); op.setFixed(PhysReg(2)); @@ -103,7 +103,7 @@ BEGIN_TEST(regalloc.precolor.blocking_vector) if (!setup_cs("s2 s1", GFX10)) return; - //! s2: %tmp0_2:s[2-3], s1: %tmp1_2:s[1] = p_parallelcopy %tmp0:s[0-1], %tmp1:s[2] + //! s1: %tmp1_2:s[1], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp1:s[2], %tmp0:s[0-1] //! 
p_unit_test %tmp1_2:s[1] Operand op(inputs[1]); op.setFixed(PhysReg(1)); @@ -120,7 +120,7 @@ BEGIN_TEST(regalloc.precolor.vector.test) if (!setup_cs("s2 s1 s1", GFX10)) return; - //! s1: %tmp2_2:s[0], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp2:s[3], %tmp0:s[0-1] + //! s2: %tmp0_2:s[2-3], s1: %tmp2_2:s[0] = p_parallelcopy %tmp0:s[0-1], %tmp2:s[3] //! p_unit_test %tmp0_2:s[2-3] Operand op(inputs[0]); op.setFixed(PhysReg(2)); @@ -137,7 +137,7 @@ BEGIN_TEST(regalloc.precolor.vector.collect) if (!setup_cs("s2 s1 s1", GFX10)) return; - //! s1: %tmp1_2:s[0], s1: %tmp2_2:s[1], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp1:s[2], %tmp2:s[3], %tmp0:s[0-1] + //! s2: %tmp0_2:s[2-3], s1: %tmp1_2:s[0], s1: %tmp2_2:s[1] = p_parallelcopy %tmp0:s[0-1], %tmp1:s[2], %tmp2:s[3] //! p_unit_test %tmp0_2:s[2-3] Operand op(inputs[0]); op.setFixed(PhysReg(2)); @@ -154,13 +154,40 @@ BEGIN_TEST(regalloc.precolor.vgpr_move) if (!setup_cs("v1 v1", GFX10)) return; - //! v1: %tmp0_2:v[1], v1: %tmp1_2:v[0] = p_parallelcopy %tmp0:v[0], %tmp1:v[1] + //! v1: %tmp1_2:v[0], v1: %tmp0_2:v[1] = p_parallelcopy %tmp1:v[1], %tmp0:v[0] //! p_unit_test %tmp0_2:v[1], %tmp1_2:v[0] bld.pseudo(aco_opcode::p_unit_test, inputs[0], Operand(inputs[1], PhysReg(256))); finish_ra_test(ra_test_policy()); END_TEST +BEGIN_TEST(regalloc.precolor.multiple_operands) + //>> v1: %tmp0:v[0], v1: %tmp1:v[1], v1: %tmp2:v[2], v1: %tmp3:v[3] = p_startpgm + if (!setup_cs("v1 v1 v1 v1", GFX10)) + return; + + //! v1: %tmp3_2:v[0], v1: %tmp0_2:v[1], v1: %tmp1_2:v[2], v1: %tmp2_2:v[3] = p_parallelcopy %tmp3:v[3], %tmp0:v[0], %tmp1:v[1], %tmp2:v[2] + //! 
p_unit_test %tmp3_2:v[0], %tmp0_2:v[1], %tmp1_2:v[2], %tmp2_2:v[3] + bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[3], PhysReg(256+0)), + Operand(inputs[0], PhysReg(256+1)), Operand(inputs[1], PhysReg(256+2)), + Operand(inputs[2], PhysReg(256+3))); + + finish_ra_test(ra_test_policy()); +END_TEST + +BEGIN_TEST(regalloc.precolor.different_regs) + //>> v1: %tmp0:v[0] = p_startpgm + if (!setup_cs("v1", GFX10)) + return; + + //! v1: %tmp1:v[1], v1: %tmp2:v[2] = p_parallelcopy %tmp0:v[0], %tmp0:v[0] + //! p_unit_test %tmp0:v[0], %tmp1:v[1], %tmp2:v[2] + bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[0], PhysReg(256+0)), + Operand(inputs[0], PhysReg(256+1)), Operand(inputs[0], PhysReg(256+2))); + + finish_ra_test(ra_test_policy()); +END_TEST + BEGIN_TEST(regalloc.scratch_sgpr.create_vector) if (!setup_cs("v1 s1", GFX7)) return;