aco/ra: rework linear VGPR allocation

We allocate linear VGPRs at the end of the register file and keep them
separate from normal VGPRs. This is for two reasons:
- Because we only ever move linear VGPRs into an empty space or into a space
  previously occupied by another linear VGPR, we never have to swap a normal
  VGPR and a linear one. This simplifies copy lowering.
- Because linear VGPRs' live ranges only start and end at top-level blocks,
  we never have to move a linear VGPR while inside control flow.
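
As a rough editorial sketch of the resulting layout (not ACO code; the Bounds
struct and split_vgpr_space() helper are made-up names for illustration):
normal VGPRs grow upwards from v0 while linear VGPRs occupy the top
num_linear_vgprs registers of the current VGPR window, mirroring what
get_reg_bounds() computes in the diff below.

#include <cassert>
#include <cstdint>
#include <utility>

/* Illustrative types only; the real allocator uses PhysRegInterval and
 * offsets VGPR numbers by 256. */
struct Bounds {
   uint16_t lo;   /* first register index, relative to v0 */
   uint16_t size; /* number of registers */
};

/* vgpr_bounds: size of the VGPR window currently in use.
 * num_linear_vgprs: registers reserved for linear VGPRs at the top of it. */
std::pair<Bounds, Bounds>
split_vgpr_space(uint16_t vgpr_bounds, uint16_t num_linear_vgprs)
{
   assert(num_linear_vgprs <= vgpr_bounds);
   uint16_t linear_start = vgpr_bounds - num_linear_vgprs;
   Bounds normal = {0, linear_start};                /* v0 .. v(linear_start - 1) */
   Bounds linear = {linear_start, num_linear_vgprs}; /* top of the window */
   return {normal, linear};
}

New linear VGPRs extend this window downwards from the top of the file, and
compact_linear_vgprs() moves them back up when holes appear, so normal and
linear VGPRs never need to trade places.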

fossil-db (navi31):
Totals from 5493 (6.93% of 79242) affected shaders:
MaxWaves: 150365 -> 150343 (-0.01%)
Instrs: 7974740 -> 7976073 (+0.02%); split: -0.06%, +0.08%
CodeSize: 41296024 -> 41299024 (+0.01%); split: -0.06%, +0.06%
VGPRs: 283192 -> 329560 (+16.37%)
Latency: 64267936 -> 64268414 (+0.00%); split: -0.17%, +0.17%
InvThroughput: 10954037 -> 10951735 (-0.02%); split: -0.09%, +0.07%
VClause: 132792 -> 132956 (+0.12%); split: -0.06%, +0.18%
SClause: 223854 -> 223841 (-0.01%); split: -0.01%, +0.01%
Copies: 559574 -> 561395 (+0.33%); split: -0.24%, +0.56%
Branches: 179630 -> 179636 (+0.00%); split: -0.02%, +0.02%
VALU: 4572683 -> 4574487 (+0.04%); split: -0.03%, +0.07%
SALU: 772076 -> 772111 (+0.00%); split: -0.01%, +0.01%
VOPD: 1095 -> 1099 (+0.37%); split: +0.73%, -0.37%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27697>
Rhys Perry, 2024-02-14 19:55:59 +00:00 (committed by Marge Bot)
commit b7738de4f9, parent 2d49c79c7e
6 changed files with 216 additions and 36 deletions


@@ -176,7 +176,7 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
validate(program.get());
/* Register Allocation */
aco::register_allocation(program.get(), live_vars.live_out);
aco::register_allocation(program.get(), live_vars);
if (aco::validate_ra(program.get())) {
aco_print_program(program.get(), stderr);


@@ -2221,8 +2221,7 @@ void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
ra_test_policy = {});
void register_allocation(Program* program, live& live_vars, ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);


@@ -89,6 +89,10 @@ struct ra_ctx {
uint16_t vgpr_limit;
std::bitset<512> war_hint;
uint16_t sgpr_bounds;
uint16_t vgpr_bounds;
uint16_t num_linear_vgprs;
ra_test_policy policy;
ra_ctx(Program* program_, ra_test_policy policy_)
@@ -101,6 +105,10 @@ struct ra_ctx {
create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
sgpr_bounds = program->max_reg_demand.sgpr;
vgpr_bounds = program->max_reg_demand.vgpr;
num_linear_vgprs = 0;
}
};
@@ -196,10 +204,13 @@ get_stride(RegClass rc)
PhysRegInterval
get_reg_bounds(ra_ctx& ctx, RegType type, bool linear_vgpr)
{
if (type == RegType::vgpr) {
return {PhysReg{256}, (unsigned)ctx.program->max_reg_demand.vgpr};
uint16_t linear_vgpr_start = ctx.vgpr_bounds - ctx.num_linear_vgprs;
if (type == RegType::vgpr && linear_vgpr) {
return PhysRegInterval{PhysReg(256 + linear_vgpr_start), ctx.num_linear_vgprs};
} else if (type == RegType::vgpr) {
return PhysRegInterval{PhysReg(256), linear_vgpr_start};
} else {
return {PhysReg{0}, (unsigned)ctx.program->max_reg_demand.sgpr};
return PhysRegInterval{PhysReg(0), ctx.sgpr_bounds};
}
}
@@ -252,7 +263,7 @@ struct DefInfo {
assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");
if (imageGather4D16Bug)
bounds.size -= rc.bytes() / 4;
bounds.size -= MAX2(rc.bytes() / 4 - ctx.num_linear_vgprs, 0);
}
}
};
@@ -1294,7 +1305,8 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
}
}
assert(regs_free >= size);
assert((regs_free + ctx.num_linear_vgprs) >= size);
/* we might have to move dead operands to dst in order to make space */
unsigned op_moves = 0;
@@ -1458,15 +1470,22 @@ get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc,
bool
increase_register_file(ra_ctx& ctx, RegClass rc)
{
if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
ctx.program->max_reg_demand.sgpr));
if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 &&
ctx.vgpr_bounds < ctx.vgpr_limit) {
/* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */
update_vgpr_sgpr_demand(
ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr));
ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr;
} else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
ctx.program->max_reg_demand.sgpr + 1));
update_vgpr_sgpr_demand(
ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1));
ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr;
} else {
return false;
}
return true;
}
@@ -1632,6 +1651,124 @@ get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<Ins
return {};
}
bool
compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
std::vector<std::pair<Operand, Definition>>& parallelcopies)
{
PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
int zeros = reg_file.count_zero(linear_vgpr_bounds);
if (zeros == 0)
return false;
std::vector<IDAndRegClass> vars;
for (unsigned id : find_vars(ctx, reg_file, linear_vgpr_bounds))
vars.emplace_back(id, ctx.assignments[id].rc);
ctx.num_linear_vgprs -= zeros;
compact_relocate_vars(ctx, vars, parallelcopies, get_reg_bounds(ctx, RegType::vgpr, true).lo());
return true;
}
/* Allocates a linear VGPR. We allocate them at the end of the register file and keep them separate
* from normal VGPRs. This is for two reasons:
* - Because we only ever move linear VGPRs into an empty space or a space previously occupied by a
* linear one, we never have to swap a normal VGPR and a linear one.
* - As linear VGPRs' live ranges only start and end on top-level blocks, we never have to move a
* linear VGPR in control flow.
*/
PhysReg
alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr,
std::vector<std::pair<Operand, Definition>>& parallelcopies)
{
assert(instr->opcode == aco_opcode::p_start_linear_vgpr);
assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0);
RegClass rc = instr->definitions[0].regClass();
/* Try to choose an unused space in the linear VGPR bounds. */
for (unsigned i = rc.size(); i <= ctx.num_linear_vgprs; i++) {
PhysReg reg(256 + ctx.vgpr_bounds - i);
if (!reg_file.test(reg, rc.bytes())) {
adjust_max_used_regs(ctx, rc, reg);
return reg;
}
}
PhysRegInterval old_normal_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
/* Compact linear VGPRs, grow the bounds if necessary, and choose a space at the beginning: */
compact_linear_vgprs(ctx, reg_file, parallelcopies);
PhysReg reg(256 + ctx.vgpr_bounds - (ctx.num_linear_vgprs + rc.size()));
/* Space that was for normal VGPRs, but is now for linear VGPRs. */
PhysRegInterval new_win = PhysRegInterval::from_until(reg, MAX2(old_normal_bounds.hi(), reg));
RegisterFile tmp_file(reg_file);
PhysRegInterval reg_win{reg, rc.size()};
std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, new_win);
/* Re-enable killed operands */
for (Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKillBeforeDef())
tmp_file.fill(op);
}
/* Find new assignments for blocking vars. */
std::vector<std::pair<Operand, Definition>> pc;
if (!ctx.policy.skip_optimistic_path &&
get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) {
parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
} else {
/* Fallback algorithm: reallocate all variables at once. */
std::vector<IDAndRegClass> vars;
for (unsigned id : find_vars(ctx, reg_file, old_normal_bounds))
vars.emplace_back(id, ctx.assignments[id].rc);
compact_relocate_vars(ctx, vars, parallelcopies, PhysReg(256));
std::vector<IDAndRegClass> killed_op_vars;
for (Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == RegType::vgpr)
killed_op_vars.emplace_back(op.tempId(), op.regClass());
}
compact_relocate_vars(ctx, killed_op_vars, parallelcopies, reg_win.lo());
}
/* If this is updated earlier, a killed operand can't be placed inside the definition. */
ctx.num_linear_vgprs += rc.size();
adjust_max_used_regs(ctx, rc, reg);
return reg;
}
bool
should_compact_linear_vgprs(ra_ctx& ctx, live& live_vars, const RegisterFile& reg_file)
{
if (!(ctx.block->kind & block_kind_top_level) || ctx.block->linear_succs.empty())
return false;
/* Since we won't be able to copy linear VGPRs to make space when in control flow, we have to
* ensure in advance that there is enough space for normal VGPRs. */
unsigned max_vgpr_usage = 0;
unsigned next_toplevel = ctx.block->index + 1;
for (; !(ctx.program->blocks[next_toplevel].kind & block_kind_top_level); next_toplevel++) {
max_vgpr_usage =
MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].register_demand.vgpr);
}
std::vector<aco_ptr<Instruction>>& instructions =
ctx.program->blocks[next_toplevel].instructions;
if (!instructions.empty() && is_phi(instructions[0])) {
max_vgpr_usage =
MAX2(max_vgpr_usage, (unsigned)live_vars.register_demand[next_toplevel][0].vgpr);
}
for (unsigned tmp : find_vars(ctx, reg_file, get_reg_bounds(ctx, RegType::vgpr, true)))
max_vgpr_usage -= ctx.assignments[tmp].rc.size();
return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size;
}
PhysReg
get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
@@ -1694,14 +1831,29 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
if (res)
return *res;
/* try using more registers */
/* try compacting the linear vgprs to make more space */
std::vector<std::pair<Operand, Definition>> pc;
if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) &&
compact_linear_vgprs(ctx, reg_file, pc)) {
parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
/* We don't need to fill the copy definitions in because we don't care about the linear VGPR
* space here. */
RegisterFile tmp_file(reg_file);
for (std::pair<Operand, Definition>& copy : pc)
tmp_file.clear(copy.first);
return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index);
}
/* We should only fail here because keeping under the limit would require
* too many moves. */
assert(reg_file.count_zero(info.bounds) >= info.size);
/* try using more registers */
if (!increase_register_file(ctx, info.rc)) {
/* fallback algorithm: reallocate all variables at once */
/* fallback algorithm: reallocate all variables at once (linear VGPRs should already be
* compact at the end) */
unsigned def_size = info.rc.size();
for (Definition def : instr->definitions) {
if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
@@ -2803,8 +2955,9 @@ emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& par
} /* end namespace */
void
register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
register_allocation(Program* program, live& live_vars, ra_test_policy policy)
{
std::vector<IDSet>& live_out_per_block = live_vars.live_out;
ra_ctx ctx(program, policy);
get_affinities(ctx, live_out_per_block);
@@ -2882,19 +3035,17 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
continue;
}
/* unconditional branches are handled after phis of the target */
std::vector<std::pair<Operand, Definition>> parallelcopy;
bool temp_in_scc = register_file[scc];
if (instr->opcode == aco_opcode::p_branch) {
/* last instruction of the block */
/* unconditional branches are handled after phis of the target */
instructions.emplace_back(std::move(instr));
break;
}
std::vector<std::pair<Operand, Definition>> parallelcopy;
assert(!is_phi(instr));
bool temp_in_scc = register_file[scc];
/* handle operands */
bool fixed = false;
for (unsigned i = 0; i < instr->operands.size(); ++i) {
@@ -3004,7 +3155,11 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
continue;
/* find free reg */
if (instr->opcode == aco_opcode::p_split_vector) {
if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
/* Allocation of linear VGPRs is special. */
definition->setFixed(alloc_linear_vgpr(ctx, register_file, instr, parallelcopy));
update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
} else if (instr->opcode == aco_opcode::p_split_vector) {
PhysReg reg = instr->operands[0].physReg();
RegClass rc = definition->regClass();
for (unsigned j = 0; j < i; j++)
@@ -3162,6 +3317,26 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
} /* end for Instr */
if ((block.kind & block_kind_top_level) && block.linear_succs.empty()) {
/* Reset this for block_kind_resume. */
ASSERTED PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
assert(register_file.count_zero(linear_vgpr_bounds) == linear_vgpr_bounds.size);
ctx.num_linear_vgprs = 0;
} else if (should_compact_linear_vgprs(ctx, live_vars, register_file)) {
aco_ptr<Instruction> br = std::move(instructions.back());
instructions.pop_back();
bool temp_in_scc =
register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc);
std::vector<std::pair<Operand, Definition>> parallelcopy;
compact_linear_vgprs(ctx, register_file, parallelcopy);
update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops);
emit_parallel_copy(ctx, parallelcopy, br, instructions, temp_in_scc, register_file);
instructions.push_back(std::move(br));
}
block.instructions = std::move(instructions);
} /* end for BB */


@@ -211,7 +211,7 @@ finish_ra_test(ra_test_policy policy, bool lower)
program->workgroup_size = program->wave_size;
aco::live live_vars = aco::live_var_analysis(program.get());
aco::register_allocation(program.get(), live_vars.live_out, policy);
aco::register_allocation(program.get(), live_vars, policy);
if (aco::validate_ra(program.get())) {
fail_test("Validation after register allocation failed");


@@ -62,8 +62,8 @@ BEGIN_TEST(d3d11_derivs.simple)
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> image_sample v[#_:#_], v[#rx:#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
END_TEST
@@ -101,8 +101,9 @@ BEGIN_TEST(d3d11_derivs.constant)
//>> p_end_linear_vgpr (latekill)(kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#ry, -0.5 ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> image_sample v[#_:#_], v[#rx:#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
END_TEST
@@ -173,12 +174,12 @@ BEGIN_TEST(d3d11_derivs.bias)
//>> p_end_linear_vgpr (latekill)(kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> BB1:
//>> image_sample_b v[#_:#_], [v2, v#rx, v#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_ $_
//>> image_sample_b v[#_:#_], [v#rb, v#rx, v#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
END_TEST
@@ -265,9 +266,9 @@ BEGIN_TEST(d3d11_derivs.array)
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
//>> BB1:
//; success = rx+1 == ry and rx+2 == rl
//>> image_sample v[#_:#_], v[#rx:#rl], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; $_ $_
@@ -353,8 +354,9 @@ BEGIN_TEST(d3d11_derivs._1d_gfx9)
//>> p_end_linear_vgpr (latekill)(kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//; success = rx+1 == ry
//>> image_sample v[#_:#_], v#rx, s[#_:#_], s[#_:#_] dmask:0xf ; $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
@@ -398,8 +400,8 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
//>> BB1:
//; success = rx+1 == ry and rx+2 == rl
//>> image_sample v[#_:#_], v#rx, s[#_:#_], s[#_:#_] dmask:0xf da ; $_ $_
@@ -442,10 +444,11 @@ BEGIN_TEST(d3d11_derivs.cube)
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_cubeid_f32 v#rf_tmp, v#_, v#_, v#_ ; $_ $_
//>> v_fmaak_f32 v#rx, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_mov_b32_e32 v#rf, v#rf_tmp ; $_
//>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//; success = rx+1 == ry and rx+2 == rf
//>> image_sample v[#_:#_], v[#rx:#rf], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");


@@ -232,6 +232,8 @@ BEGIN_TEST(regalloc.branch_def_phis_at_merge_block)
if (!setup_cs("", GFX10))
return;
program->blocks[0].kind &= ~block_kind_top_level;
//! s2: %_:s[2-3] = p_branch
bld.branch(aco_opcode::p_branch, bld.def(s2));
@@ -277,6 +279,7 @@ BEGIN_TEST(regalloc.branch_def_phis_at_branch_block)
bld.reset(program->create_and_insert_block());
program->blocks[3].linear_preds.push_back(1);
program->blocks[3].linear_preds.push_back(2);
program->blocks[3].kind |= block_kind_top_level;
finish_ra_test(ra_test_policy());
END_TEST