aco: allow live-range splits of linear vgprs in top-level blocks

Fixes dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.46 on GFX8.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12172>
2026-05-06 05:08:08 +02:00 · 2021-06-09 14:33:24 +01:00 · 2021-06-09 14:33:24 +01:00 · 6ed18749de
commit 6ed18749de
parent 8d50385bbd
3 changed files with 140 additions and 32 deletions
--- a/src/amd/ci/deqp-radv-polaris10-aco-fails.txt
+++ b/src/amd/ci/deqp-radv-polaris10-aco-fails.txt
@ -1,2 +0,0 @@
-# ACO crash
-dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.46,Crash
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@ -56,6 +56,7 @@ struct assignment {
 struct ra_ctx {

   Program* program;
+   Block* block = NULL;
   std::vector<assignment> assignments;
   std::vector<std::unordered_map<unsigned, Temp>> renames;
   std::vector<uint32_t> loop_header;
@ -1074,8 +1075,9 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
               n++;
               continue;
            }
-            /* we cannot split live ranges of linear vgprs */
-            if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
+            /* we cannot split live ranges of linear vgprs inside control flow */
+            if (!(ctx.block->kind & block_kind_top_level) &&
+                ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
               found = false;
               break;
            }
@ -1221,8 +1223,10 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
            break;
         }

-         /* we cannot split live ranges of linear vgprs */
-         if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
+         /* we cannot split live ranges of linear vgprs inside control flow */
+         //TODO: ensure that live range splits inside control flow are never necessary
+         if (!(ctx.block->kind & block_kind_top_level) &&
+             ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
            found = false;
            break;
         }
@ -1627,7 +1631,7 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,

   PhysReg best_pos{0xFFF};
   unsigned num_moves = 0xFF;
-   bool best_war_hint = true;
+   bool best_avoid = true;

   /* test for each operand which definition placement causes the least shuffle instructions */
   for (unsigned i = 0, offset = 0; i < instr->operands.size();
@ -1661,14 +1665,9 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

-      /* count variables to be moved and check war_hint */
-      bool war_hint = false;
-      bool linear_vgpr = false;
+      /* count variables to be moved and check "avoid" */
+      bool avoid = false;
      for (PhysReg j : reg_win) {
-         if (linear_vgpr) {
-            break;
-         }
-
         if (reg_file[j] != 0) {
            if (reg_file[j] == 0xF0000000) {
               PhysReg reg;
@ -1678,14 +1677,18 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
                  k += reg_file.test(reg, 1);
            } else {
               k += 4;
-               /* we cannot split live ranges of linear vgprs */
-               if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr())
-                  linear_vgpr = true;
+               /* we cannot split live ranges of linear vgprs inside control flow */
+               if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
+                  if (ctx.block->kind & block_kind_top_level)
+                     avoid = true;
+                  else
+                     break;
+               }
            }
         }
-         war_hint |= ctx.war_hint[j];
+         avoid |= ctx.war_hint[j];
      }
-      if (linear_vgpr || (war_hint && !best_war_hint))
+      if (avoid && !best_avoid)
         continue;

      /* count operands in wrong positions */
@ -1703,7 +1706,7 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,

      best_pos = reg_win.lo();
      num_moves = k;
-      best_war_hint = war_hint;
+      best_avoid = avoid;
   }

   if (num_moves >= bytes)
@ -1775,24 +1778,22 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
   default: return;
   }

-   /* if all definitions are vgpr, no need to care for SCC */
-   bool writes_sgpr = false;
+   bool writes_linear = false;
+   /* if all definitions are logical vgpr, no need to care for SCC */
   for (Definition& def : instr->definitions) {
-      if (def.getTemp().type() == RegType::sgpr) {
-         writes_sgpr = true;
-         break;
-      }
+      if (def.getTemp().regClass().is_linear())
+         writes_linear = true;
   }
   /* if all operands are constant, no need to care either */
-   bool reads_sgpr = false;
+   bool reads_linear = false;
   bool reads_subdword = false;
   for (Operand& op : instr->operands) {
-      if (op.isTemp() && op.getTemp().type() == RegType::sgpr)
-         reads_sgpr = true;
+      if (op.isTemp() && op.getTemp().regClass().is_linear())
+         reads_linear = true;
      if (op.isTemp() && op.regClass().is_subdword())
         reads_subdword = true;
   }
-   bool needs_scratch_reg = (writes_sgpr && reads_sgpr && reg_file[scc]) ||
+   bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
                            (ctx.program->chip_class <= GFX7 && reads_subdword);
   if (!needs_scratch_reg)
      return;
@ -1911,7 +1912,7 @@ Temp
 handle_live_in(ra_ctx& ctx, Temp val, Block* block)
 {
   std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
-   if (preds.size() == 0 || val.regClass().is_linear_vgpr())
+   if (preds.size() == 0)
      return val;

   if (preds.size() == 1) {
@ -1934,6 +1935,8 @@ handle_live_in(ra_ctx& ctx, Temp val, Block* block)
   }

   if (needs_phi) {
+      assert(!val.regClass().is_linear_vgpr());
+
      /* the variable has been renamed differently in the predecessors: we need to insert a phi */
      aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
      aco_ptr<Instruction> phi{
@ -2243,6 +2246,8 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
   std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());

   for (Block& block : program->blocks) {
+      ctx.block = &block;
+
      /* initialize register file */
      RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
      ctx.war_hint.reset();
@ -2646,9 +2651,12 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
            pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,
                                                            Format::PSEUDO, parallelcopy.size(),
                                                            parallelcopy.size()));
+            bool linear_vgpr = false;
            bool sgpr_operands_alias_defs = false;
            uint64_t sgpr_operands[4] = {0, 0, 0, 0};
            for (unsigned i = 0; i < parallelcopy.size(); i++) {
+               linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
+
               if (temp_in_scc && parallelcopy[i].first.isTemp() &&
                   parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
                  if (!sgpr_operands_alias_defs) {
@ -2676,7 +2684,7 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
               ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
            }

-            if (temp_in_scc && sgpr_operands_alias_defs) {
+            if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
               /* disable definitions and re-enable operands */
               RegisterFile tmp_file(register_file);
               for (const Definition& def : instr->definitions) {
--- a/src/amd/compiler/tests/test_regalloc.cpp
+++ b/src/amd/compiler/tests/test_regalloc.cpp
@ -184,3 +184,105 @@ BEGIN_TEST(regalloc.scratch_sgpr.create_vector_sgpr_operand)

   finish_ra_test(ra_test_policy(), true);
 END_TEST
+
+BEGIN_TEST(regalloc.linear_vgpr.live_range_split.fixed_def)
+   /* A linear VGPR sits in v0 while a later definition is fixed to v0; the patterns below expect the linear temp to be copied to v1. */
+   //>> p_startpgm
+   if (!setup_cs("", GFX10))
+      return; /* bail out when setup_cs() reports failure */
+
+   PhysReg reg_v0{256}; /* printed as v[0] in the expected output */
+
+   //! lv1: %tmp1:v[0] = p_unit_test
+   Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v0)); /* linear VGPR definition fixed to reg_v0 */
+
+   //! lv1: %tmp2:v[1] = p_parallelcopy %tmp1:v[0]
+   //! v1: %_:v[0] = p_unit_test
+   bld.pseudo(aco_opcode::p_unit_test, Definition(reg_v0, v1)); /* second definition fixed to the same register, while tmp is still used below */
+
+   //! p_unit_test %tmp2:v[1]
+   bld.pseudo(aco_opcode::p_unit_test, tmp); /* keeps the linear temp live across the fixed definition */
+
+   finish_ra_test(ra_test_policy());
+END_TEST
+
+BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_impl)
+   //>> p_startpgm
+   if (!setup_cs("", GFX10)) /* test: a linear VGPR in v1 is in the way of a v2 definition under a 3-VGPR limit; also checks the resulting copy's scc/scratch fields */
+      return; /* bail out when setup_cs() reports failure */
+
+   program->dev.vgpr_limit = 3; /* cap the VGPR budget; the expected output only uses v[0]..v[2] */
+
+   PhysReg reg_v1{257}; /* printed as v[1] in the expected output */
+
+   //! s1: %scc_tmp:scc, s1: %1:s[0] = p_unit_test
+   Temp s0_tmp = bld.tmp(s1);
+   Temp scc_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(s1, scc), Definition(s0_tmp.id(), PhysReg{0}, s1)); /* defines one temp in scc and one fixed to PhysReg{0}; both are read again by the last p_unit_test */
+
+   //! lv1: %tmp1:v[1] = p_unit_test
+   Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v1)); /* linear VGPR definition fixed to reg_v1 */
+
+   //! lv1: %tmp2:v[2] = p_parallelcopy %tmp1:v[1]
+   //! v2: %_:v[0-1] = p_unit_test
+   bld.pseudo(aco_opcode::p_unit_test, bld.def(v2)); /* v2 definition with no fixed register; expected at v[0-1] with the linear temp copied to v[2] */
+
+   //! p_unit_test %tmp2:v[2], %scc_tmp:scc, %1:s[0]
+   bld.pseudo(aco_opcode::p_unit_test, tmp, scc_tmp, s0_tmp); /* keeps the linear temp, the scc temp and the s[0] temp live across the copy */
+
+   finish_ra_test(ra_test_policy());
+
+   //>> lv1: %5:v[2] = p_parallelcopy %3:v[1] scc:1 scratch:s1
+   Pseudo_instruction& parallelcopy = program->blocks[0].instructions[3]->pseudo(); /* index 3 is expected to be the inserted p_parallelcopy (see the pattern above) */
+   aco_print_instr(&parallelcopy, output);
+   fprintf(output, " scc:%u scratch:s%u\n", parallelcopy.tmp_in_scc, parallelcopy.scratch_sgpr.reg()); /* append the copy's tmp_in_scc flag and scratch SGPR so the pattern above can check them */
+END_TEST
+
+BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_regs_for_copies)
+   //>> p_startpgm
+   if (!setup_cs("", GFX10)) /* test: placing a v3 definition under a 6-VGPR limit is expected to move both a logical v2 temp and a linear VGPR */
+      return; /* bail out when setup_cs() reports failure */
+
+   program->dev.vgpr_limit = 6; /* cap the VGPR budget; the expected output only uses v[0]..v[5] */
+
+   PhysReg reg_v2{258}; /* printed as v[2] in the expected output */
+   PhysReg reg_v4{260}; /* printed as v[4] in the expected output */
+
+   //! lv1: %lin_tmp1:v[4] = p_unit_test
+   Temp lin_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v4)); /* linear VGPR definition fixed to reg_v4 */
+   //! v2: %log_tmp1:v[2-3] = p_unit_test
+   Temp log_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v2, reg_v2)); /* logical v2 definition fixed to start at reg_v2 */
+
+   //! lv1: %lin_tmp2:v[0], v2: %log_tmp2:v[4-5] = p_parallelcopy %lin_tmp1:v[4], %log_tmp1:v[2-3]
+   //! v3: %_:v[1-3] = p_unit_test
+   bld.pseudo(aco_opcode::p_unit_test, bld.def(v3)); /* v3 definition with no fixed register; expected at v[1-3] after one parallelcopy moves both earlier temps */
+
+   //! p_unit_test %log_tmp2:v[4-5], %lin_tmp2:v[0]
+   bld.pseudo(aco_opcode::p_unit_test, log_tmp, lin_tmp); /* keeps both temps live across the v3 definition */
+
+   finish_ra_test(ra_test_policy());
+END_TEST
+
+BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_create_vector)
+   //>> p_startpgm
+   if (!setup_cs("", GFX10)) /* test: a p_create_vector result is expected to take v[0-1], displacing the linear VGPR that was in v0 */
+      return; /* bail out when setup_cs() reports failure */
+
+   program->dev.vgpr_limit = 4; /* cap the VGPR budget; the expected output only uses v[0]..v[2] */
+
+   PhysReg reg_v0{256}; /* printed as v[0] in the expected output */
+   PhysReg reg_v1{257}; /* printed as v[1] in the expected output */
+
+   //! lv1: %lin_tmp1:v[0] = p_unit_test
+   Temp lin_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v0)); /* linear VGPR definition fixed to reg_v0 */
+   //! v1: %log_tmp:v[1] = p_unit_test
+   Temp log_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1, reg_v1)); /* logical VGPR definition fixed to reg_v1 */
+
+   //! lv1: %lin_tmp2:v[2] = p_parallelcopy %lin_tmp1:v[0]
+   //! v2: %_:v[0-1] = p_create_vector v1: undef, %log_tmp:v[1]
+   bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(v1), log_tmp); /* second operand already sits in v1, so the expected placement is v[0-1] with the linear temp copied to v[2] */
+
+   //! p_unit_test %lin_tmp2:v[2]
+   bld.pseudo(aco_opcode::p_unit_test, lin_tmp); /* keeps the linear temp live across the p_create_vector */
+
+   finish_ra_test(ra_test_policy());
+END_TEST