diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index c42bcb8724a..7e418caaa55 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -176,7 +176,7 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
       validate(program.get());
 
       /* Register Allocation */
-      aco::register_allocation(program.get(), live_vars.live_out);
+      aco::register_allocation(program.get(), live_vars);
 
       if (aco::validate_ra(program.get())) {
          aco_print_program(program.get(), stderr);
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 7d80ad03ae8..8ee4caeed47 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2221,8 +2221,7 @@ void optimize(Program* program);
 void optimize_postRA(Program* program);
 void setup_reduce_temp(Program* program);
 void lower_to_cssa(Program* program, live& live_vars);
-void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
-                         ra_test_policy = {});
+void register_allocation(Program* program, live& live_vars, ra_test_policy = {});
 void ssa_elimination(Program* program);
 void lower_to_hw_instr(Program* program);
 void schedule_program(Program* program, live& live_vars);
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 78ba695e0c1..36ec765f760 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -89,6 +89,10 @@ struct ra_ctx {
    uint16_t vgpr_limit;
    std::bitset<512> war_hint;
 
+   uint16_t sgpr_bounds;
+   uint16_t vgpr_bounds;
+   uint16_t num_linear_vgprs;
+
    ra_test_policy policy;
 
    ra_ctx(Program* program_, ra_test_policy policy_)
@@ -101,6 +105,10 @@ struct ra_ctx {
         create_instruction<Instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
       sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
       vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
+
+      sgpr_bounds = program->max_reg_demand.sgpr;
+      vgpr_bounds = program->max_reg_demand.vgpr;
+      num_linear_vgprs = 0;
    }
 };
 
@@ -196,10 +204,13 @@ get_stride(RegClass rc)
 PhysRegInterval
 get_reg_bounds(ra_ctx& ctx, RegType type, bool linear_vgpr)
 {
-   if (type == RegType::vgpr) {
-      return {PhysReg{256}, (unsigned)ctx.program->max_reg_demand.vgpr};
+   uint16_t linear_vgpr_start = ctx.vgpr_bounds - ctx.num_linear_vgprs;
+   if (type == RegType::vgpr && linear_vgpr) {
+      return PhysRegInterval{PhysReg(256 + linear_vgpr_start), ctx.num_linear_vgprs};
+   } else if (type == RegType::vgpr) {
+      return PhysRegInterval{PhysReg(256), linear_vgpr_start};
    } else {
-      return {PhysReg{0}, (unsigned)ctx.program->max_reg_demand.sgpr};
+      return PhysRegInterval{PhysReg(0), ctx.sgpr_bounds};
    }
 }
 
@@ -252,7 +263,7 @@ struct DefInfo {
          assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");
 
          if (imageGather4D16Bug)
-            bounds.size -= rc.bytes() / 4;
+            bounds.size -= MAX2(rc.bytes() / 4 - ctx.num_linear_vgprs, 0);
       }
    }
 };
 
@@ -1294,7 +1305,8 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
       }
    }
 
-   assert(regs_free >= size);
+   assert((regs_free + ctx.num_linear_vgprs) >= size);
+
    /* we might have to move dead operands to dst in order to make space */
    unsigned op_moves = 0;
 
@@ -1458,15 +1470,22 @@ get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc,
 
 bool
 increase_register_file(ra_ctx& ctx, RegClass rc)
 {
-   if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
-      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
-                                                          ctx.program->max_reg_demand.sgpr));
+   if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 &&
+       ctx.vgpr_bounds < ctx.vgpr_limit) {
+      /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */
+      update_vgpr_sgpr_demand(
+         ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr));
+
+      ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr;
    } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
-      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
-                                                          ctx.program->max_reg_demand.sgpr + 1));
+      update_vgpr_sgpr_demand(
+         ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1));
+
+      ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr;
    } else {
       return false;
    }
+
    return true;
 }
 
@@ -1632,6 +1651,124 @@ get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<In
    return PhysReg{0};
 }
 
+bool
+compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
+                     std::vector<std::pair<Operand, Definition>>& parallelcopies)
+{
+   PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
+   int zeros = reg_file.count_zero(linear_vgpr_bounds);
+   if (zeros == 0)
+      return false;
+
+   std::vector<IDAndRegClass> vars;
+   for (unsigned id : find_vars(ctx, reg_file, linear_vgpr_bounds))
+      vars.emplace_back(id, ctx.assignments[id].rc);
+
+   ctx.num_linear_vgprs -= zeros;
+   compact_relocate_vars(ctx, vars, parallelcopies, get_reg_bounds(ctx, RegType::vgpr, true).lo());
+
+   return true;
+}
+
+/* Allocates a linear VGPR. We allocate them at the end of the register file and keep them separate
+ * from normal VGPRs. This is for two reasons:
+ * - Because we only ever move linear VGPRs into an empty space or a space previously occupied by a
+ *   linear one, we never have to swap a normal VGPR and a linear one.
+ * - As linear VGPRs' live ranges only start and end on top-level blocks, we never have to move a
+ *   linear VGPR in control flow.
+ */
+PhysReg
+alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr,
+                  std::vector<std::pair<Operand, Definition>>& parallelcopies)
+{
+   assert(instr->opcode == aco_opcode::p_start_linear_vgpr);
+   assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0);
+
+   RegClass rc = instr->definitions[0].regClass();
+
+   /* Try to choose an unused space in the linear VGPR bounds. */
+   for (unsigned i = rc.size(); i <= ctx.num_linear_vgprs; i++) {
+      PhysReg reg(256 + ctx.vgpr_bounds - i);
+      if (!reg_file.test(reg, rc.bytes())) {
+         adjust_max_used_regs(ctx, rc, reg);
+         return reg;
+      }
+   }
+
+   PhysRegInterval old_normal_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
+
+   /* Compact linear VGPRs, grow the bounds if necessary, and choose a space at the beginning: */
+   compact_linear_vgprs(ctx, reg_file, parallelcopies);
+
+   PhysReg reg(256 + ctx.vgpr_bounds - (ctx.num_linear_vgprs + rc.size()));
+   /* Space that was for normal VGPRs, but is now for linear VGPRs. */
+   PhysRegInterval new_win = PhysRegInterval::from_until(reg, MAX2(old_normal_bounds.hi(), reg));
+
+   RegisterFile tmp_file(reg_file);
+   PhysRegInterval reg_win{reg, rc.size()};
+   std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, new_win);
+
+   /* Re-enable killed operands */
+   for (Operand& op : instr->operands) {
+      if (op.isTemp() && op.isFirstKillBeforeDef())
+         tmp_file.fill(op);
+   }
+
+   /* Find new assignments for blocking vars. */
+   std::vector<std::pair<Operand, Definition>> pc;
+   if (!ctx.policy.skip_optimistic_path &&
+       get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) {
+      parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
+   } else {
+      /* Fallback algorithm: reallocate all variables at once. */
+      std::vector<IDAndRegClass> vars;
+      for (unsigned id : find_vars(ctx, reg_file, old_normal_bounds))
+         vars.emplace_back(id, ctx.assignments[id].rc);
+      compact_relocate_vars(ctx, vars, parallelcopies, PhysReg(256));
+
+      std::vector<IDAndRegClass> killed_op_vars;
+      for (Operand& op : instr->operands) {
+         if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == RegType::vgpr)
+            killed_op_vars.emplace_back(op.tempId(), op.regClass());
+      }
+      compact_relocate_vars(ctx, killed_op_vars, parallelcopies, reg_win.lo());
+   }
+
+   /* If this is updated earlier, a killed operand can't be placed inside the definition. */
+   ctx.num_linear_vgprs += rc.size();
+
+   adjust_max_used_regs(ctx, rc, reg);
+   return reg;
+}
+
+bool
+should_compact_linear_vgprs(ra_ctx& ctx, live& live_vars, const RegisterFile& reg_file)
+{
+   if (!(ctx.block->kind & block_kind_top_level) || ctx.block->linear_succs.empty())
+      return false;
+
+   /* Since we won't be able to copy linear VGPRs to make space when in control flow, we have to
+    * ensure in advance that there is enough space for normal VGPRs. */
+   unsigned max_vgpr_usage = 0;
+   unsigned next_toplevel = ctx.block->index + 1;
+   for (; !(ctx.program->blocks[next_toplevel].kind & block_kind_top_level); next_toplevel++) {
+      max_vgpr_usage =
+         MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].register_demand.vgpr);
+   }
+
+   std::vector<aco_ptr<Instruction>>& instructions =
+      ctx.program->blocks[next_toplevel].instructions;
+   if (!instructions.empty() && is_phi(instructions[0])) {
+      max_vgpr_usage =
+         MAX2(max_vgpr_usage, (unsigned)live_vars.register_demand[next_toplevel][0].vgpr);
+   }
+
+   for (unsigned tmp : find_vars(ctx, reg_file, get_reg_bounds(ctx, RegType::vgpr, true)))
+      max_vgpr_usage -= ctx.assignments[tmp].rc.size();
+
+   return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size;
+}
+
 PhysReg
 get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
         std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
@@ -1694,14 +1831,29 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
    if (res)
       return *res;
 
-   /* try using more registers */
+   /* try compacting the linear vgprs to make more space */
+   std::vector<std::pair<Operand, Definition>> pc;
+   if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) &&
+       compact_linear_vgprs(ctx, reg_file, pc)) {
+      parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
+
+      /* We don't need to fill the copy definitions in because we don't care about the linear VGPR
+       * space here. */
+      RegisterFile tmp_file(reg_file);
+      for (std::pair<Operand, Definition>& copy : pc)
+         tmp_file.clear(copy.first);
+
+      return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index);
+   }
 
    /* We should only fail here because keeping under the limit would require
    * too many moves. */
    assert(reg_file.count_zero(info.bounds) >= info.size);
 
+   /* try using more registers */
    if (!increase_register_file(ctx, info.rc)) {
-      /* fallback algorithm: reallocate all variables at once */
+      /* fallback algorithm: reallocate all variables at once (linear VGPRs should already be
+       * compact at the end) */
       unsigned def_size = info.rc.size();
       for (Definition def : instr->definitions) {
          if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
@@ -2803,8 +2955,9 @@ emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& par
 } /* end namespace */
 
 void
-register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
+register_allocation(Program* program, live& live_vars, ra_test_policy policy)
 {
+   std::vector<IDSet>& live_out_per_block = live_vars.live_out;
    ra_ctx ctx(program, policy);
    get_affinities(ctx, live_out_per_block);
 
@@ -2882,19 +3035,17 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
             continue;
          }
 
-         /* unconditional branches are handled after phis of the target */
+         std::vector<std::pair<Operand, Definition>> parallelcopy;
+         bool temp_in_scc = register_file[scc];
+
          if (instr->opcode == aco_opcode::p_branch) {
-            /* last instruction of the block */
+            /* unconditional branches are handled after phis of the target */
             instructions.emplace_back(std::move(instr));
             break;
          }
 
-         std::vector<std::pair<Operand, Definition>> parallelcopy;
-
         assert(!is_phi(instr));
 
-         bool temp_in_scc = register_file[scc];
-
         /* handle operands */
         bool fixed = false;
         for (unsigned i = 0; i < instr->operands.size(); ++i) {
@@ -3004,7 +3155,11 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                continue;
 
             /* find free reg */
-            if (instr->opcode == aco_opcode::p_split_vector) {
+            if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
+               /* Allocation of linear VGPRs is special. */
+               definition->setFixed(alloc_linear_vgpr(ctx, register_file, instr, parallelcopy));
+               update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
+            } else if (instr->opcode == aco_opcode::p_split_vector) {
                PhysReg reg = instr->operands[0].physReg();
               RegClass rc = definition->regClass();
               for (unsigned j = 0; j < i; j++)
@@ -3162,6 +3317,26 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
       } /* end for Instr */
 
+      if ((block.kind & block_kind_top_level) && block.linear_succs.empty()) {
+         /* Reset this for block_kind_resume. */
+         ASSERTED PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
+         assert(register_file.count_zero(linear_vgpr_bounds) == linear_vgpr_bounds.size);
+         ctx.num_linear_vgprs = 0;
+      } else if (should_compact_linear_vgprs(ctx, live_vars, register_file)) {
+         aco_ptr<Instruction> br = std::move(instructions.back());
+         instructions.pop_back();
+
+         bool temp_in_scc =
+            register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc);
+
+         std::vector<std::pair<Operand, Definition>> parallelcopy;
+         compact_linear_vgprs(ctx, register_file, parallelcopy);
+         update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops);
+         emit_parallel_copy(ctx, parallelcopy, br, instructions, temp_in_scc, register_file);
+
+         instructions.push_back(std::move(br));
+      }
+
       block.instructions = std::move(instructions);
    } /* end for BB */
 
diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp
index 7334182d11f..b4b52617fa6 100644
--- a/src/amd/compiler/tests/helpers.cpp
+++ b/src/amd/compiler/tests/helpers.cpp
@@ -211,7 +211,7 @@ finish_ra_test(ra_test_policy policy, bool lower)
    program->workgroup_size = program->wave_size;
 
    aco::live live_vars = aco::live_var_analysis(program.get());
-   aco::register_allocation(program.get(), live_vars.live_out, policy);
+   aco::register_allocation(program.get(), live_vars, policy);
 
    if (aco::validate_ra(program.get())) {
       fail_test("Validation after register allocation failed");
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index 0b68803f979..001dbcda8f5 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -62,8 +62,8 @@ BEGIN_TEST(d3d11_derivs.simple)
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
    //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
-   //>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
    //>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
+   //>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
    //>> image_sample v[#_:#_], v[#rx:#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
 END_TEST
@@ -101,8 +101,9 @@ BEGIN_TEST(d3d11_derivs.constant)
    //>> p_end_linear_vgpr (latekill)(kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
-   //>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x ; $_
+   //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
    //>> v_mov_b32_e32 v#ry, -0.5 ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
    //>> image_sample v[#_:#_], v[#rx:#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
 END_TEST
@@ -173,12 +174,12 @@ BEGIN_TEST(d3d11_derivs.bias)
    //>> p_end_linear_vgpr (latekill)(kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
-   //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
-   //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
-   //>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
-   //>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
+   //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
+   //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
+   //>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
 
    //>> BB1:
-   //>> image_sample_b v[#_:#_], [v2, v#rx, v#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_ $_
+   //>> image_sample_b v[#_:#_], [v#rb, v#rx, v#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_ $_
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
 END_TEST
@@ -265,9 +266,9 @@ BEGIN_TEST(d3d11_derivs.array)
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
    //>> v_interp_p2_f32_e32 v#ry_tmp, v#_, attr0.y ; $_
    //>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
-   //>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
    //>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
    //>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
+   //>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
    //>> BB1:
    //; success = rx+1 == ry and rx+2 == rl
    //>> image_sample v[#_:#_], v[#rx:#rl], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; $_ $_
@@ -353,8 +354,9 @@ BEGIN_TEST(d3d11_derivs._1d_gfx9)
    //>> p_end_linear_vgpr (latekill)(kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
-   //>> v_interp_p2_f32_e32 v#rx, v#_, attr0.x ; $_
+   //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
    //>> v_mov_b32_e32 v#ry, 0.5 ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
    //; success = rx+1 == ry
    //>> image_sample v[#_:#_], v#rx, s[#_:#_], s[#_:#_] dmask:0xf ; $_ $_
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
@@ -398,8 +400,8 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
    //>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
    //>> v_mov_b32_e32 v#ry, 0.5 ; $_
-   //>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
    //>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
+   //>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
    //>> BB1:
    //; success = rx+1 == ry and rx+2 == rl
    //>> image_sample v[#_:#_], v#rx, s[#_:#_], s[#_:#_] dmask:0xf da ; $_ $_
@@ -442,10 +444,11 @@ BEGIN_TEST(d3d11_derivs.cube)
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_cubeid_f32 v#rf_tmp, v#_, v#_, v#_ ; $_ $_
-   //>> v_fmaak_f32 v#rx, v#_, v#_, 0x3fc00000 ; $_ $_
-   //>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
    //>> v_mov_b32_e32 v#rf, v#rf_tmp ; $_
+   //>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
+   //>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
    //>> v_mov_b32_e32 v#ry, v#ry_tmp ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
    //; success = rx+1 == ry and rx+2 == rf
    //>> image_sample v[#_:#_], v[#rx:#rf], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; $_ $_
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
diff --git a/src/amd/compiler/tests/test_regalloc.cpp b/src/amd/compiler/tests/test_regalloc.cpp
index 60f33d63028..27558940177 100644
--- a/src/amd/compiler/tests/test_regalloc.cpp
+++ b/src/amd/compiler/tests/test_regalloc.cpp
@@ -232,6 +232,8 @@ BEGIN_TEST(regalloc.branch_def_phis_at_merge_block)
    if (!setup_cs("", GFX10))
       return;
 
+   program->blocks[0].kind &= ~block_kind_top_level;
+
    //! s2: %_:s[2-3] = p_branch
    bld.branch(aco_opcode::p_branch, bld.def(s2));
 
@@ -277,6 +279,7 @@ BEGIN_TEST(regalloc.branch_def_phis_at_branch_block)
    bld.reset(program->create_and_insert_block());
    program->blocks[3].linear_preds.push_back(1);
    program->blocks[3].linear_preds.push_back(2);
+   program->blocks[3].kind |= block_kind_top_level;
 
    finish_ra_test(ra_test_policy());
 END_TEST
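
Editorial note, not part of the patch: the core bookkeeping above is that `ra_ctx` now splits the VGPR file in two — normal VGPRs live in [v0, vgpr_bounds - num_linear_vgprs) and linear VGPRs are packed downward from the top in [vgpr_bounds - num_linear_vgprs, vgpr_bounds). The following standalone C++ sketch is a toy model of that split; all names (`Bounds`, `Interval`, `alloc_linear`) are invented, and only the arithmetic mirrors `get_reg_bounds()` and the first-fit loop in `alloc_linear_vgpr()`:

   #include <cassert>
   #include <cstdint>
   #include <cstdio>

   /* Toy model of the partitioned VGPR file; not ACO code. */
   struct Interval {
      unsigned lo;   /* first register in the interval */
      unsigned size; /* number of registers */
      unsigned hi() const { return lo + size; }
   };

   struct Bounds {
      uint16_t vgpr_bounds = 24;     /* current ceiling of the VGPR file */
      uint16_t num_linear_vgprs = 0; /* registers reserved at the top */

      /* Mirrors get_reg_bounds(): [0, linear_start) holds normal VGPRs,
       * [linear_start, vgpr_bounds) holds linear ones. */
      Interval bounds(bool linear) const {
         unsigned linear_start = vgpr_bounds - num_linear_vgprs;
         return linear ? Interval{linear_start, num_linear_vgprs} : Interval{0, linear_start};
      }

      /* Reserve `size` registers for a new linear VGPR by growing the top
       * partition downward, shrinking the space left for normal VGPRs. */
      unsigned alloc_linear(unsigned size) {
         assert(num_linear_vgprs + size <= vgpr_bounds);
         num_linear_vgprs += size;
         return vgpr_bounds - num_linear_vgprs;
      }
   };

   int main() {
      Bounds b;
      unsigned a = b.alloc_linear(2); /* e.g. a v2 linear VGPR -> v22 */
      unsigned c = b.alloc_linear(1); /* a v1 linear VGPR -> v21 */
      Interval normal = b.bounds(false), linear = b.bounds(true);
      printf("linear VGPRs at v%u and v%u\n", a, c);
      printf("normal: [v0, v%u), linear: [v%u, v%u)\n", normal.hi(), linear.lo, linear.hi());
      return 0;
   }

Because fresh linear VGPRs always take the lowest free slot of the top partition, the partition stays contiguous until one is freed, which is what makes the compaction step below cheap.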
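A second hedged sketch for that compaction step: `compact_linear_vgprs()` counts the holes that freed linear VGPRs left behind, shrinks the linear region by that amount, and re-packs the survivors from the new region start, handing the reclaimed registers back to normal VGPRs — the reason `should_compact_linear_vgprs()` runs before entering control flow, where linear VGPRs can no longer be moved. Register contents are modeled as plain ints here; this is an illustration of the idea, not the ACO implementation:

   #include <cstdio>
   #include <vector>

   int main() {
      /* Linear region of a 16-register file: [v12, v16). Var A occupies
       * v12-v13, v14 is a hole left by a freed linear VGPR, var B is at v15. */
      unsigned vgpr_bounds = 16, num_linear = 4;
      std::vector<int> file(vgpr_bounds, 0); /* 0 = free slot */
      file[12] = file[13] = 1;               /* A */
      file[15] = 2;                          /* B */

      /* Count free slots in the linear region (reg_file.count_zero() in the patch). */
      unsigned lo = vgpr_bounds - num_linear, holes = 0;
      for (unsigned r = lo; r < vgpr_bounds; r++)
         holes += file[r] == 0;

      /* Shrink the region and re-pack survivors from its new start, preserving
       * order. Linear VGPRs only ever move into free or formerly-linear slots,
       * so no swap with a normal VGPR is ever needed. */
      std::vector<int> survivors;
      for (unsigned r = lo; r < vgpr_bounds; r++)
         if (file[r] != 0)
            survivors.push_back(file[r]);
      num_linear -= holes;
      for (unsigned r = lo; r < vgpr_bounds; r++)
         file[r] = 0;
      for (unsigned i = 0; i < survivors.size(); i++)
         file[vgpr_bounds - num_linear + i] = survivors[i];

      printf("linear region shrank to [v%u, v%u); v%u returned to normal VGPRs\n",
             vgpr_bounds - num_linear, vgpr_bounds, lo);
      return 0;
   }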