From f99443a68b743d866cd6fc79d29d2f549f200a23 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Fri, 16 Feb 2024 11:09:22 +0000
Subject: [PATCH] aco: don't combine linear and normal VGPR copies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Cc: mesa-stable
Part-of:
---
 src/amd/compiler/aco_lower_to_hw_instr.cpp  |  3 ++
 src/amd/compiler/tests/test_to_hw_instr.cpp | 50 +++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 27787c7322f..03282ea2857 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1767,6 +1767,9 @@ try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_
        copy.op.isConstant() != other->second.op.isConstant())
       return;
 
+   if (other->second.def.regClass().is_linear_vgpr() != copy.def.regClass().is_linear_vgpr())
+      return;
+
    /* don't create 64-bit copies before GFX10 */
    if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
        ctx->program->gfx_level < GFX10)
diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp
index e14ff7e56f4..4feb610b3e4 100644
--- a/src/amd/compiler/tests/test_to_hw_instr.cpp
+++ b/src/amd/compiler/tests/test_to_hw_instr.cpp
@@ -839,6 +839,56 @@ BEGIN_TEST(to_hw_instr.copy_linear_vgpr_v3)
    finish_to_hw_instr_test();
 END_TEST
 
+BEGIN_TEST(to_hw_instr.copy_linear_vgpr_coalesce)
+   if (!setup_cs(NULL, GFX10))
+      return;
+
+   PhysReg reg_v0{256};
+   PhysReg reg_v1{256 + 1};
+   PhysReg reg_v4{256 + 4};
+   PhysReg reg_v5{256 + 5};
+   RegClass v1_linear = v1.as_linear();
+
+   //>> p_unit_test 0
+   //! lv2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
+   //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
+   //! lv2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
+   //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
+   bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
+
+   Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear),
+                                   Definition(reg_v1, v1_linear), Operand(reg_v4, v1_linear),
+                                   Operand(reg_v5, v1_linear));
+   instr->pseudo().scratch_sgpr = m0;
+
+   //! p_unit_test 1
+   //! lv1: %0:v[0] = v_mov_b32 %0:v[4]
+   //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
+   //! lv1: %0:v[0] = v_mov_b32 %0:v[4]
+   //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
+   //! v1: %0:v[1] = v_mov_b32 %0:v[5]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+
+   instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear),
+                      Definition(reg_v1, v1), Operand(reg_v4, v1_linear), Operand(reg_v5, v1));
+   instr->pseudo().scratch_sgpr = m0;
+
+   //! p_unit_test 2
+   //! v1: %0:v[0] = v_mov_b32 %0:v[4]
+   //! lv1: %0:v[1] = v_mov_b32 %0:v[5]
+   //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
+   //! lv1: %0:v[1] = v_mov_b32 %0:v[5]
+   //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+
+   instr =
+      bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1), Definition(reg_v1, v1_linear),
+                 Operand(reg_v4, v1), Operand(reg_v5, v1_linear));
+   instr->pseudo().scratch_sgpr = m0;
+
+   finish_to_hw_instr_test();
+END_TEST
+
 BEGIN_TEST(to_hw_instr.pack2x16_constant)
    PhysReg v0_lo{256};
    PhysReg v0_hi{256