From 2d38da94d4829b339f580e3170744936c3d8a0dd Mon Sep 17 00:00:00 2001
From: Georg Lehmann <dadschoorse@gmail.com>
Date: Sun, 25 Jan 2026 15:06:28 +0100
Subject: [PATCH] aco: allow v_cmpx with DPP

The wording in the RDNA3 ISA doc has since been clarified: v_cmpx with
DPP behaves exactly as one would expect. FI controls whether the source
value can be read from inactive lanes, but inactive lanes always write a
0 bit. The same applies to v_cmp with DPP.

Foz-DB Navi48:
Totals from 987 (1.20% of 82405) affected shaders:
Instrs: 517003 -> 516445 (-0.11%); split: -0.11%, +0.00%
CodeSize: 2782688 -> 2780508 (-0.08%); split: -0.08%, +0.00%
Latency: 2059169 -> 2056327 (-0.14%); split: -0.14%, +0.00%
InvThroughput: 365374 -> 365328 (-0.01%); split: -0.03%, +0.01%
Copies: 64669 -> 65616 (+1.46%)
SALU: 70693 -> 70652 (-0.06%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39516>
---
 src/amd/compiler/aco_ir.cpp                      | 4 ----
 src/amd/compiler/aco_optimizer_postRA.cpp        | 6 +-----
 src/amd/compiler/tests/test_optimizer_postRA.cpp | 3 +--
 3 files changed, 2 insertions(+), 11 deletions(-)
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 3885f1bb315..88e5a37188e 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -489,10 +489,6 @@ can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp
          return false;
    }
 
-   /* According to LLVM, it's unsafe to combine DPP into v_cmpx. */
-   if (instr->writes_exec())
-      return false;
-
    return opcode_supports_dpp(gfx_level, instr->opcode, instr->isVOP3P());
 }
 
diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp
index 77d04c6ccd4..690f3a19a2f 100644
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@@ -1078,10 +1078,6 @@ try_optimize_branching_sequence(pr_opt_ctx& ctx, aco_ptr<Instruction>& exec_copy
          : aco_opcode::num_opcodes;
    const bool vopc = v_cmpx_op != aco_opcode::num_opcodes;
 
-   /* V_CMPX+DPP returns 0 with reads from disabled lanes, unlike V_CMP+DPP (RDNA3 ISA doc, 7.7) */
-   if (vopc && exec_val->isDPP())
-      return false;
-
    /* If s_and_saveexec is used, we'll need to insert a new instruction to save the old exec. */
    bool save_original_exec =
       exec_copy->opcode == and_saveexec && !exec_copy->definitions[0].isKill();
@@ -1157,7 +1153,7 @@ try_optimize_branching_sequence(pr_opt_ctx& ctx, aco_ptr<Instruction>& exec_copy
    if (vopc) {
       /* Add one extra definition for exec and copy the VOP3-specific fields if present. */
       if (!vcmpx_exec_only) {
-         if (exec_val->isSDWA()) {
+         if (exec_val->isSDWA() || exec_val->isDPP()) {
             /* This might work but it needs testing and more code to copy the instruction. */
             return false;
          } else {
diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp
index f605a72a769..f924a7e2421 100644
--- a/src/amd/compiler/tests/test_optimizer_postRA.cpp
+++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp
@@ -492,8 +492,7 @@ BEGIN_TEST(optimizer_postRA.dpp_vcmpx)
    Operand a(inputs[0], PhysReg(256));
    Operand b(inputs[1], PhysReg(257));
 
-   //! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
-   //! s2: %res0:exec = v_cmpx_lt_f32 %tmp0:v[2], %b:v[1]
+   //! s2: %res0:exec = v_cmpx_lt_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
    //! p_unit_test 0, %res0:exec
    Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    Temp res0 = bld.vopc(aco_opcode::v_cmpx_lt_f32, bld.def(bld.lm, exec), Operand(tmp0, reg_v2), b);