diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 6562d9e25d0..2ca9e14c91d 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1936,8 +1936,11 @@ handle_operands(std::map& copy_map, lower_context* ctx, continue; } - if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr) - assert(!(it->second.def.physReg() == pi->scratch_sgpr)); + if (it->second.def.getTemp().type() == RegType::sgpr) { + assert(it->second.def.physReg() != pi->scratch_sgpr); + assert(pi->needs_scratch_reg); + assert(!preserve_scc || pi->scratch_sgpr != scc); + } /* to resolve the cycle, we have to swap the src reg with the dst reg */ copy_operation swap = it->second; diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 7ff35c079e2..78f13f8a79a 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -2039,12 +2039,17 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) reads_linear = true; } - if (!writes_linear || !reads_linear || !reg_file[scc]) + if (!writes_linear || !reads_linear) return; instr->pseudo().needs_scratch_reg = true; instr->pseudo().tmp_in_scc = reg_file[scc]; + if (!reg_file[scc]) { + instr->pseudo().scratch_sgpr = scc; + return; + } + int reg = ctx.max_used_sgpr; for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--) ; @@ -2933,18 +2938,16 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vectoroperands[i] = parallelcopy[i].first; @@ -2974,6 +2977,7 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vectorpseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr; pc->pseudo().tmp_in_scc = false; + pc->pseudo().scratch_sgpr = scc; } instructions.emplace_back(std::move(pc)); @@ -3064,7 +3068,6 @@ register_allocation(Program* program, ra_test_policy policy) for (; instr_it != block.instructions.end(); ++instr_it) { aco_ptr& instr = *instr_it; std::vector> parallelcopy; - bool temp_in_scc = register_file[scc]; if (instr->opcode == aco_opcode::p_branch) { /* unconditional branches are handled after phis of the target */ @@ -3121,6 +3124,7 @@ register_allocation(Program* program, ra_test_policy policy) ctx.war_hint.set(operand.physReg().reg() + j); } } + bool temp_in_scc = register_file[scc]; /* remove dead vars from register file */ for (const Operand& op : instr->operands) { diff --git a/src/amd/compiler/tests/test_regalloc.cpp b/src/amd/compiler/tests/test_regalloc.cpp index 5902b780ca7..ad9cd311592 100644 --- a/src/amd/compiler/tests/test_regalloc.cpp +++ b/src/amd/compiler/tests/test_regalloc.cpp @@ -555,7 +555,7 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_def) finish_ra_test(ra_test_policy()); //~gfx8_cbranch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:1 scratch:s1 - //~gfx8_branch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:0 scratch:s0 + //~gfx8_branch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:0 scratch:s253 aco_ptr& parallelcopy = program->blocks[0].instructions[6]; aco_print_instr(program->gfx_level, parallelcopy.get(), output); if (parallelcopy->isPseudo()) {