diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 539b35a8181..fdb5de268b2 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1712,8 +1712,7 @@ struct Export_instruction : public Instruction { static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); struct Pseudo_instruction : public Instruction { - PhysReg scratch_sgpr; /* might not be valid if it's not needed */ - bool tmp_in_scc; + PhysReg scratch_sgpr; /* might not be valid if it's not needed */ bool needs_scratch_reg; /* if scratch_sgpr/scc can be written, initialized by RA. */ }; static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 2ca9e14c91d..1b5c073a656 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1711,7 +1711,7 @@ handle_operands(std::map& copy_map, lower_context* ctx, if (it->second.def.physReg() == scc) writes_scc = true; - assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr)); + assert(!pi->needs_scratch_reg || it->second.def.physReg() != pi->scratch_sgpr); /* if src and dst reg are the same, remove operation */ if (it->first == it->second.op.physReg()) { @@ -1753,7 +1753,7 @@ handle_operands(std::map& copy_map, lower_context* ctx, } /* first, handle paths in the location transfer graph */ - bool preserve_scc = pi->tmp_in_scc && !writes_scc; + bool preserve_scc = pi->needs_scratch_reg && pi->scratch_sgpr != scc && !writes_scc; bool skip_partial_copies = true; for (auto it = copy_map.begin();;) { if (copy_map.empty()) { @@ -2056,23 +2056,24 @@ handle_operands_linear_vgpr(std::map& copy_map, lower_c std::map second_map(copy_map); handle_operands(second_map, ctx, gfx_level, pi); - bool tmp_in_scc = pi->tmp_in_scc; - if (tmp_in_scc) { - bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1)); - pi->tmp_in_scc = false; + assert(pi->needs_scratch_reg); + PhysReg scratch_sgpr = pi->scratch_sgpr; + if (scratch_sgpr != scc) { + bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1)); + pi->scratch_sgpr = scc; } bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm)); handle_operands(copy_map, ctx, gfx_level, pi); bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm)); - if (tmp_in_scc) { - bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1), + if (scratch_sgpr != scc) { + bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1), Operand::zero()); - pi->tmp_in_scc = true; + pi->scratch_sgpr = scratch_sgpr; } - ctx->program->statistics[aco_statistic_copies] += tmp_in_scc ? 4 : 2; + ctx->program->statistics[aco_statistic_copies] += scratch_sgpr == scc ? 2 : 4; } void diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index 26242b2079f..cb2f19f7477 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -139,8 +139,6 @@ save_reg_writes(pr_opt_ctx& ctx, aco_ptr& instr) ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size, idx); } if (instr->isPseudo() && instr->pseudo().needs_scratch_reg) { - if (!instr->pseudo().tmp_in_scc) - ctx.instr_idx_by_regs[ctx.current_block->index][scc] = overwritten_unknown_instr; ctx.instr_idx_by_regs[ctx.current_block->index][instr->pseudo().scratch_sgpr] = overwritten_unknown_instr; } diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index dbc963f115f..3de1a813487 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -2043,7 +2043,6 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) return; instr->pseudo().needs_scratch_reg = true; - instr->pseudo().tmp_in_scc = reg_file[scc]; if (!reg_file[scc]) { instr->pseudo().scratch_sgpr = scc; @@ -2322,7 +2321,6 @@ get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file, } for (aco_ptr& phi : instructions) { - phi->pseudo().tmp_in_scc = register_file[scc]; phi->pseudo().scratch_sgpr = scratch_reg; phi->pseudo().needs_scratch_reg = true; } @@ -3000,7 +2998,6 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vectorpseudo().needs_scratch_reg = may_swap_sgprs || linear_vgpr; - pc->pseudo().tmp_in_scc = false; pc->pseudo().scratch_sgpr = scc; } diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp index 5c518d766af..e08b8448820 100644 --- a/src/amd/compiler/aco_ssa_elimination.cpp +++ b/src/amd/compiler/aco_ssa_elimination.cpp @@ -81,8 +81,7 @@ insert_parallelcopies(ssa_elimination_ctx& ctx) pc->operands[i] = phi_info.op; i++; } - /* this shouldn't be needed since we're only copying vgprs */ - pc->pseudo().tmp_in_scc = false; + pc->pseudo().needs_scratch_reg = false; block.instructions.insert(it, std::move(pc)); } @@ -102,7 +101,6 @@ insert_parallelcopies(ssa_elimination_ctx& ctx) pc->operands[i] = phi_info.op; i++; } - pc->pseudo().tmp_in_scc = succ.instructions[0]->pseudo().tmp_in_scc; pc->pseudo().scratch_sgpr = succ.instructions[0]->pseudo().scratch_sgpr; pc->pseudo().needs_scratch_reg = succ.instructions[0]->pseudo().needs_scratch_reg; auto it = std::prev(block.instructions.end()); diff --git a/src/amd/compiler/tests/test_regalloc.cpp b/src/amd/compiler/tests/test_regalloc.cpp index ad9cd311592..17dba8c2c53 100644 --- a/src/amd/compiler/tests/test_regalloc.cpp +++ b/src/amd/compiler/tests/test_regalloc.cpp @@ -554,12 +554,13 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_def) finish_ra_test(ra_test_policy()); - //~gfx8_cbranch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:1 scratch:s1 - //~gfx8_branch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] scc:0 scratch:s253 + //~gfx8_cbranch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] needs_scratch:1 scratch:s1 + //~gfx8_branch>> lv1: %ltmp2_2:v[29] = p_parallelcopy %ltmp2:v[28] needs_scratch:1 scratch:s253 aco_ptr& parallelcopy = program->blocks[0].instructions[6]; aco_print_instr(program->gfx_level, parallelcopy.get(), output); if (parallelcopy->isPseudo()) { - fprintf(output, " scc:%u scratch:s%u\n", parallelcopy->pseudo().tmp_in_scc, + fprintf(output, " needs_scratch:%d scratch:s%u\n", + parallelcopy->pseudo().needs_scratch_reg, parallelcopy->pseudo().scratch_sgpr.reg()); } else { fprintf(output, "\n"); diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index 574209f396d..c0e7fd6ca8e 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -637,7 +637,7 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_scc) Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1.as_linear()), Operand(v1_lo, v1.as_linear())); instr->pseudo().scratch_sgpr = m0; - instr->pseudo().tmp_in_scc = true; + instr->pseudo().needs_scratch_reg = true; finish_to_hw_instr_test(); END_TEST @@ -660,7 +660,8 @@ BEGIN_TEST(to_hw_instr.swap_linear_vgpr) Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear), Definition(reg_v1, v1_linear), Operand(reg_v1, v1_linear), Operand(reg_v0, v1_linear)); - instr->pseudo().scratch_sgpr = m0; + instr->pseudo().scratch_sgpr = scc; + instr->pseudo().needs_scratch_reg = true; finish_to_hw_instr_test(); END_TEST @@ -684,7 +685,8 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_v3) //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v3_linear), Operand(reg_v4, v3_linear)); - instr->pseudo().scratch_sgpr = m0; + instr->pseudo().scratch_sgpr = scc; + instr->pseudo().needs_scratch_reg = true; finish_to_hw_instr_test(); END_TEST @@ -709,7 +711,8 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_coalesce) Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear), Definition(reg_v1, v1_linear), Operand(reg_v4, v1_linear), Operand(reg_v5, v1_linear)); - instr->pseudo().scratch_sgpr = m0; + instr->pseudo().scratch_sgpr = scc; + instr->pseudo().needs_scratch_reg = true; finish_to_hw_instr_test(); END_TEST