From dedfff9dbf04b2f8b649fb63ac880411bede4c39 Mon Sep 17 00:00:00 2001
From: Georg Lehmann
Date: Thu, 25 Jul 2024 17:15:15 +0200
Subject: [PATCH] aco: only set latekill in live_var_analysis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cleaner to have this all in one place, in my opinion.

Reviewed-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Part-of:
---
 .../compiler/aco_instruction_selection.cpp    | 60 ++++---------
 src/amd/compiler/aco_live_var_analysis.cpp    | 33 +++++++++-
 src/amd/compiler/aco_reduce_assign.cpp        | 12 +---
 src/amd/compiler/aco_spill.cpp                |  6 +-
 src/amd/compiler/aco_validate.cpp             |  5 --
 src/amd/compiler/tests/test_d3d11_derivs.cpp  | 24 ++++----
 src/amd/compiler/tests/test_reduce_assign.cpp |  4 +-
 src/amd/compiler/tests/test_regalloc.cpp      | 52 ++++++++--------
 8 files changed, 87 insertions(+), 109 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 3cfb2f52f71..c788dcb1269 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -180,13 +180,8 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
 
    if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
       /* GFX6-7: there is no bpermute instruction */
-      Operand index_op(index);
-      Operand input_data(data);
-      index_op.setLateKill(true);
-      input_data.setLateKill(true);
-
       return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
-                        bld.def(bld.lm, vcc), index_op, input_data);
+                        bld.def(bld.lm, vcc), index, data);
    } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
       /* GFX10 wave64 mode: emulate full-wave bpermute */
@@ -199,11 +194,6 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
       Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                                      index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
       Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
-      Operand input_data(data);
-
-      index_x4.setLateKill(true);
-      input_data.setLateKill(true);
-      same_half.setLateKill(true);
 
       if (ctx->options->gfx_level <= GFX10_3) {
          /* We need one pair of shared VGPRs:
@@ -212,11 +202,10 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
          ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
 
          return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
-                           bld.def(s1, scc), index_x4, input_data, same_half);
+                           bld.def(s1, scc), index_x4, data, same_half);
       } else {
          return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
-                           bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data,
-                           same_half);
+                           bld.def(s1, scc), Operand(v1.as_linear()), index_x4, data, same_half);
       }
    } else {
       /* GFX8-9 or GFX10 wave32: bpermute works normally */
@@ -3610,11 +3599,8 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
       Temp ref = get_alu_src(ctx, instr->src[0]);
       Temp src = get_alu_src(ctx, instr->src[1], 2);
       Temp accum = get_alu_src(ctx, instr->src[2], 4);
-      Builder::Result res = bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src),
-                                     as_vgpr(ctx, ref), as_vgpr(ctx, accum));
-      res.instr->operands[0].setLateKill(true);
-      res.instr->operands[1].setLateKill(true);
-      res.instr->operands[2].setLateKill(true);
+      bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src), as_vgpr(ctx, ref),
+               as_vgpr(ctx, accum));
       emit_split_vector(ctx, dst, 4);
       break;
    }
@@ -5613,13 +5599,9 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
    Builder bld(ctx->program, ctx->block);
 
    if (in_exec_divergent_or_in_loop(ctx)) {
-      Operand prim_mask_op = bld.m0(prim_mask);
-      prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
-      Operand coord2_op(coord2);
-      coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
       bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
                  Operand::c32(idx), Operand::c32(component), Operand::c32(high_16bits), coord1,
-                 coord2_op, prim_mask_op);
+                 coord2, bld.m0(prim_mask));
      return;
    }
 
@@ -5676,11 +5658,8 @@ emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src,
       }
    } else {
       assert(!high_16bits);
-      Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
-                                             bld.m0(prim_mask), idx, component);
-
-      if (ctx->program->dev.has_16bank_lds)
-         interp_p1->operands[0].setLateKill(true);
+      Temp interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
+                                  bld.m0(prim_mask), idx, component);
 
       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask),
                  interp_p1, idx, component);
@@ -5696,11 +5675,9 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig
    if (ctx->options->gfx_level >= GFX11) {
       uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
       if (in_exec_divergent_or_in_loop(ctx)) {
-         Operand prim_mask_op = bld.m0(prim_mask);
-         prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
         bld.pseudo(aco_opcode::p_interp_gfx11, Definition(tmp), Operand(v1.as_linear()),
                    Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
-                   prim_mask_op);
+                   bld.m0(prim_mask));
      } else {
         Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx,
                             component);
@@ -6189,11 +6166,8 @@ emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::v
    mimg->operands[0] = Operand(rsrc);
    mimg->operands[1] = samp;
    mimg->operands[2] = vdata;
-   for (unsigned i = 0; i < coords.size(); i++) {
+   for (unsigned i = 0; i < coords.size(); i++)
       mimg->operands[3 + i] = Operand(coords[i]);
-      if (coords[i].regClass().is_linear_vgpr())
-         mimg->operands[3 + i].setLateKill(true);
-   }
    mimg->mimg().strict_wqm = strict_wqm;
 
    return &bld.insert(std::move(mimg))->mimg();
@@ -8219,9 +8193,7 @@ create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt*
       create_instruction(aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
    for (unsigned i = 0; i < 4; i++) {
       exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
-      exp->operands[i].setLateKill(true);
       exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
-      exp->operands[i + 4].setLateKill(true);
    }
 
    RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
@@ -8267,9 +8239,6 @@ visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
    Operand B(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)));
    Operand C(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
 
-   A.setLateKill(true);
-   B.setLateKill(true);
-
    VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0)->valu();
    vop3p.neg_lo[0] = (signed_mask & 0x1) != 0;
    vop3p.neg_lo[1] = (signed_mask & 0x2) != 0;
@@ -10501,9 +10470,7 @@ visit_block(isel_context* ctx, nir_block* block)
    if (ctx->block->kind & block_kind_top_level) {
       Builder bld(ctx->program, ctx->block);
       for (Temp tmp : ctx->unended_linear_vgprs) {
-         Operand op(tmp);
-         op.setLateKill(true);
-         bld.pseudo(aco_opcode::p_end_linear_vgpr, op);
+         bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
      }
      ctx->unended_linear_vgprs.clear();
   }
@@ -11462,16 +11429,13 @@ add_startpgm(struct isel_context* ctx)
    } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
       /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog. */
-      Operand scratch_offset = Operand(get_arg(ctx, ctx->args->scratch_offset));
-      scratch_offset.setLateKill(true);
-
       Operand scratch_addr = ctx->args->ring_offsets.used
                                 ? Operand(get_arg(ctx, ctx->args->ring_offsets))
                                 : Operand(s2);
 
       Builder bld(ctx->program, ctx->block);
       bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
-                 scratch_offset);
+                 get_arg(ctx, ctx->args->scratch_offset));
    }
 }
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index fac93e84898..757af0e3202 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -179,8 +179,13 @@ process_live_temps_per_block(live_ctx& ctx, Block* block)
       ctx.program->needs_vcc |= instr_needs_vcc(insn);
       insn->register_demand = RegisterDemand(new_demand.vgpr, new_demand.sgpr);
 
+      bool has_vgpr_def = false;
+
       /* KILL */
       for (Definition& definition : insn->definitions) {
+         has_vgpr_def |= definition.regClass().type() == RegType::vgpr &&
+                         !definition.regClass().is_linear_vgpr();
+
          if (!definition.isTemp()) {
            continue;
         }
@@ -212,13 +217,39 @@ process_live_temps_per_block(live_ctx& ctx, Block* block)
            if (insn->operands[op_idx].isOfType(RegType::sgpr))
               insn->operands[op_idx].setLateKill(true);
         }
+      } else if (insn->opcode == aco_opcode::p_bpermute_readlane ||
+                 insn->opcode == aco_opcode::p_bpermute_permlane ||
+                 insn->opcode == aco_opcode::p_bpermute_shared_vgpr ||
+                 insn->opcode == aco_opcode::p_dual_src_export_gfx11 ||
+                 insn->opcode == aco_opcode::v_mqsad_u32_u8) {
+         for (Operand& op : insn->operands)
+            op.setLateKill(true);
+      } else if (insn->opcode == aco_opcode::p_interp_gfx11) {
+         insn->operands.back().setLateKill(true); /* we don't want the bld.lm def to use m0 */
+         if (insn->operands.size() == 7)
+            insn->operands[5].setLateKill(true); /* we re-use the destination reg in the middle */
+      } else if (insn->opcode == aco_opcode::v_interp_p1_f32 && ctx.program->dev.has_16bank_lds) {
+         insn->operands[0].setLateKill(true);
+      } else if (insn->opcode == aco_opcode::p_init_scratch) {
+         insn->operands.back().setLateKill(true);
+      } else if (instr_info.classes[(int)insn->opcode] == instr_class::wmma) {
+         insn->operands[0].setLateKill(true);
+         insn->operands[1].setLateKill(true);
       }
 
       /* we need to do this in a separate loop because the next one can
        * setKill() for several operands at once and we don't want to
        * overwrite that in a later iteration */
-      for (Operand& op : insn->operands)
+      for (Operand& op : insn->operands) {
          op.setKill(false);
 
+         /* Linear vgprs must be late kill: this is to ensure linear VGPR operands and
+          * normal VGPR definitions don't try to use the same register, which is problematic
+          * because of assignment restrictions.
+          */
+         if (op.hasRegClass() && op.regClass().is_linear_vgpr() && !op.isUndefined() &&
+             has_vgpr_def)
+            op.setLateKill(true);
+      }
       /* GEN */
       for (unsigned i = 0; i < insn->operands.size(); ++i) {
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index d280306a8e9..11c0d2022e4 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -64,11 +64,9 @@ setup_reduce_temp(Program* program)
       aco_ptr<Instruction> end{create_instruction(
          aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_inserted_at >= 0 ? 2 : 1, 0)};
       end->operands[0] = Operand(reduceTmp);
-      end->operands[0].setLateKill(true);
-      if (vtmp_inserted_at >= 0) {
+      if (vtmp_inserted_at >= 0)
          end->operands[1] = Operand(vtmp);
-         end->operands[1].setLateKill(true);
-      }
+
       /* insert after the phis of the block */
       std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
       while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
@@ -164,16 +162,12 @@ setup_reduce_temp(Program* program)
 
          if (instr->isReduction()) {
             instr->operands[1] = Operand(reduceTmp);
-            instr->operands[1].setLateKill(true);
-            if (need_vtmp) {
+            if (need_vtmp)
                instr->operands[2] = Operand(vtmp);
-               instr->operands[2].setLateKill(true);
-            }
         } else {
            assert(instr->opcode == aco_opcode::p_interp_gfx11 ||
                   instr->opcode == aco_opcode::p_bpermute_permlane);
            instr->operands[0] = Operand(reduceTmp);
-           instr->operands[0].setLateKill(true);
         }
      }
   }
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index d252f1bba6c..86bfbb39119 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1419,10 +1419,8 @@ end_unused_spill_vgprs(spill_ctx& ctx, Block& block, std::vector<Temp>& vgpr_spi
    aco_ptr<Instruction> destr{
       create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, temps.size(), 0)};
-   for (unsigned i = 0; i < temps.size(); i++) {
+   for (unsigned i = 0; i < temps.size(); i++)
       destr->operands[i] = Operand(temps[i]);
-      destr->operands[i].setLateKill(true);
-   }
 
    std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
    while (is_phi(*it))
@@ -1540,7 +1538,6 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
             /* spill sgpr: just add the vgpr temp to operands */
             Instruction* spill = create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
             spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
-            spill->operands[0].setLateKill(true);
             spill->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
             spill->operands[2] = (*it)->operands[0];
             instructions.emplace_back(aco_ptr<Instruction>(spill));
@@ -1586,7 +1583,6 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
             /* reload sgpr: just add the vgpr temp to operands */
             Instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1);
             reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
-            reload->operands[0].setLateKill(true);
             reload->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
             reload->definitions[0] = (*it)->definitions[0];
             instructions.emplace_back(aco_ptr<Instruction>(reload));
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index d7816ecc708..5733b14b382 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -372,11 +372,6 @@ validate_ir(Program* program)
                 op.isUndefined())
                continue;
 
-            /* Check that linear vgprs are late kill: this is to ensure linear VGPR operands and
-             * normal VGPR definitions don't try to use the same register, which is problematic
-             * because of assignment restrictions. */
-            check(op.isLateKill(), "Linear VGPR operands must be late kill", instr.get());
-
            /* Only kill linear VGPRs in top-level blocks. Otherwise, we might have to move linear
             * VGPRs to make space for normal ones and that isn't possible inside control flow. */
            if (op.isKill()) {
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index 5a1ce39378e..b14b135c376 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -39,7 +39,7 @@ BEGIN_TEST(d3d11_derivs.simple)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -80,7 +80,7 @@ BEGIN_TEST(d3d11_derivs.constant)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -151,7 +151,7 @@ BEGIN_TEST(d3d11_derivs.bias)
    //>> v4: %_ = image_sample_b (kill)%_, (kill)%_, v1: undef, (latekill)%wqm, (kill)%bias 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -194,7 +194,7 @@ BEGIN_TEST(d3d11_derivs.offset)
    //>> v4: %_ = image_sample_o (kill)%_, (kill)%_, v1: undef, (latekill)%wqm, (kill)%offset 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -239,7 +239,7 @@ BEGIN_TEST(d3d11_derivs.array)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2darray da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.z      ; $_
@@ -286,7 +286,7 @@ BEGIN_TEST(d3d11_derivs.bias_array)
    //>> v4: %_ = image_sample_b (kill)%_, (kill)%_, v1: undef, (latekill)%wqm, (kill)%bias 2darray da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.z      ; $_
@@ -331,7 +331,7 @@ BEGIN_TEST(d3d11_derivs._1d_gfx9)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -373,7 +373,7 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2darray da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y      ; $_
@@ -420,7 +420,7 @@ BEGIN_TEST(d3d11_derivs.cube)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm cube da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_cubeid_f32 v#rf_tmp, v#_, v#_, v#_      ; $_ $_
@@ -467,7 +467,7 @@ BEGIN_TEST(d3d11_derivs.cube_array)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm cube da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_cubeid_f32 v#rf, v#_, v#_, v#_      ; $_ $_
@@ -548,7 +548,7 @@ BEGIN_TEST(d3d11_derivs.bc_optimize)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
 END_TEST
@@ -589,7 +589,7 @@ BEGIN_TEST(d3d11_derivs.get_lod)
    //>> v2: %_ = image_get_lod (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
 END_TEST
diff --git a/src/amd/compiler/tests/test_reduce_assign.cpp b/src/amd/compiler/tests/test_reduce_assign.cpp
index b211ca8ff5b..651ebb1c2aa 100644
--- a/src/amd/compiler/tests/test_reduce_assign.cpp
+++ b/src/amd/compiler/tests/test_reduce_assign.cpp
@@ -33,7 +33,7 @@ BEGIN_TEST(setup_reduce_temp.divergent_if_phi)
       program.get(), bld, Operand(inputs[0]),
      [&]() -> void
      {
-         //>> s1: %_, s2: %_, s1: %_:scc = p_reduce %a, (latekill)%lv, lv1: undef op:umin32 cluster_size:64
+         //>> s1: %_, s2: %_, s1: %_:scc = p_reduce %a, %lv, lv1: undef op:umin32 cluster_size:64
         Instruction* reduce =
            bld.reduction(aco_opcode::p_reduce, bld.def(s1), bld.def(bld.lm), bld.def(s1, scc),
                          inputs[1], Operand(v1.as_linear()), Operand(v1.as_linear()), umin32);
@@ -45,7 +45,7 @@ BEGIN_TEST(setup_reduce_temp.divergent_if_phi)
      });
   bld.pseudo(aco_opcode::p_phi, bld.def(v1), Operand::c32(1), Operand::zero());
   //>> /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
-  //! p_end_linear_vgpr (latekill)%lv
+  //! p_end_linear_vgpr %lv
 
   finish_setup_reduce_temp_test();
 END_TEST
diff --git a/src/amd/compiler/tests/test_regalloc.cpp b/src/amd/compiler/tests/test_regalloc.cpp
index 90265a7156e..1b878c2f46d 100644
--- a/src/amd/compiler/tests/test_regalloc.cpp
+++ b/src/amd/compiler/tests/test_regalloc.cpp
@@ -297,9 +297,7 @@ END_TEST
 static void
 end_linear_vgpr(Temp tmp)
 {
-   Operand op(tmp);
-   op.setLateKill(true);
-   bld.pseudo(aco_opcode::p_end_linear_vgpr, op);
+   bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
 }
 
 BEGIN_TEST(regalloc.linear_vgpr.alloc.basic)
@@ -308,10 +306,10 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.basic)
 
    //>> lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    //! lv1: %ltmp2:v[31] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp1:v[30]
-   //! p_end_linear_vgpr (latekill)%ltmp2:v[31]
+   //! p_end_linear_vgpr %ltmp1:v[30]
+   //! p_end_linear_vgpr %ltmp2:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    end_linear_vgpr(ltmp0);
@@ -331,7 +329,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_grow)
 
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    end_linear_vgpr(ltmp0);
@@ -347,8 +345,8 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_grow)
    //! lv2: %ltmp2:v[29-30] = p_start_linear_vgpr
    Temp ltmp2 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v2.as_linear()));
 
-   //! p_end_linear_vgpr (latekill)%ltmp1_2:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp2:v[29-30]
+   //! p_end_linear_vgpr %ltmp1_2:v[31]
+   //! p_end_linear_vgpr %ltmp2:v[29-30]
    end_linear_vgpr(ltmp1);
    end_linear_vgpr(ltmp2);
 
@@ -371,9 +369,9 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_shrink)
    //! lv1: %ltmp2:v[29] = p_start_linear_vgpr
    //! lv1: %ltmp3:v[28] = p_start_linear_vgpr
    //! lv1: %ltmp4:v[27] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp2:v[29]
-   //! p_end_linear_vgpr (latekill)%ltmp4:v[27]
+   //! p_end_linear_vgpr %ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp2:v[29]
+   //! p_end_linear_vgpr %ltmp4:v[27]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp2 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
@@ -392,9 +390,9 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_shrink)
    //! v28: %_:v[0-27] = p_unit_test
    bld.pseudo(aco_opcode::p_unit_test, bld.def(RegClass::get(RegType::vgpr, 28 * 4)));
 
-   //! p_end_linear_vgpr (latekill)%ltmp1_2:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp3_2:v[30]
-   //! p_end_linear_vgpr (latekill)%ltmp5:v[28-29]
+   //! p_end_linear_vgpr %ltmp1_2:v[31]
+   //! p_end_linear_vgpr %ltmp3_2:v[30]
+   //! p_end_linear_vgpr %ltmp5:v[28-29]
    end_linear_vgpr(ltmp1);
    end_linear_vgpr(ltmp3);
    end_linear_vgpr(ltmp5);
@@ -412,7 +410,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_for_normal)
 
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    end_linear_vgpr(ltmp0);
@@ -421,7 +419,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_for_normal)
    //! v31: %_:v[0-30] = p_unit_test
    bld.pseudo(aco_opcode::p_unit_test, bld.def(RegClass::get(RegType::vgpr, 31 * 4)));
 
-   //! p_end_linear_vgpr (latekill)%ltmp1_2:v[31]
+   //! p_end_linear_vgpr %ltmp1_2:v[31]
    end_linear_vgpr(ltmp1);
 
    finish_ra_test(ra_test_policy{pessimistic});
@@ -437,7 +435,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_for_vec)
 
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    end_linear_vgpr(ltmp0);
@@ -447,7 +445,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_for_vec)
    RegClass v31 = RegClass::get(RegType::vgpr, 31 * 4);
    bld.pseudo(aco_opcode::p_create_vector, bld.def(v31), Operand(v31));
 
-   //! p_end_linear_vgpr (latekill)%ltmp1_2:v[31]
+   //! p_end_linear_vgpr %ltmp1_2:v[31]
    end_linear_vgpr(ltmp1);
 
    finish_ra_test(ra_test_policy{pessimistic});
@@ -467,7 +465,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.killed_op)
    Temp tmp1 = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1));
 
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr %tmp1:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()), tmp1);
    end_linear_vgpr(ltmp0);
 
@@ -494,7 +492,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.move_killed_op)
    //~gfx8_optimistic! v1: %tmp1_2:v[31], v1: %tmp2_2:v[30] = p_parallelcopy %tmp1:v[30], %tmp2:v[31]
    //~gfx8_pessimistic! v1: %tmp2_2:v[30], v1: %tmp1_2:v[31] = p_parallelcopy %tmp2:v[31], %tmp1:v[30]
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr %tmp1_2:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()), tmp1);
    end_linear_vgpr(ltmp0);
 
@@ -514,7 +512,7 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_def)
    //>> lv2: %ltmp0:v[30-31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[29] = p_start_linear_vgpr
    //! lv1: %ltmp2:v[28] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp1:v[29]
+   //! p_end_linear_vgpr %ltmp1:v[29]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v2.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp2 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
@@ -549,8 +547,8 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_def)
    program->blocks[2].logical_preds.push_back(1);
    program->blocks[2].kind |= block_kind_top_level;
 
-   //! p_end_linear_vgpr (latekill)%ltmp0_2:v[30-31]
-   //! p_end_linear_vgpr (latekill)%ltmp2_2:v[29]
+   //! p_end_linear_vgpr %ltmp0_2:v[30-31]
+   //! p_end_linear_vgpr %ltmp2_2:v[29]
    end_linear_vgpr(ltmp0);
    end_linear_vgpr(ltmp2);
 
@@ -578,7 +576,7 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_phis)
    //>> lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
    //! lv1: %ltmp2:v[29] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp1:v[30]
+   //! p_end_linear_vgpr %ltmp1:v[30]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp2 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
@@ -614,8 +612,8 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_phis)
    Temp tmp = bld.pseudo(aco_opcode::p_phi, bld.def(v30), Operand(v30));
    bld.pseudo(aco_opcode::p_unit_test, tmp);
 
-   //! p_end_linear_vgpr (latekill)%ltmp0_2:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp2_2:v[30]
+   //! p_end_linear_vgpr %ltmp0_2:v[31]
+   //! p_end_linear_vgpr %ltmp2_2:v[30]
    end_linear_vgpr(ltmp0);
    end_linear_vgpr(ltmp2);