From 2d38da94d4829b339f580e3170744936c3d8a0dd Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Sun, 25 Jan 2026 15:06:28 +0100 Subject: [PATCH] aco: allow v_cmpx with DPP The wording in the RDNA3 ISA doc has since been clarified: v_cmpx with DPP behaves exactly as one would expect. FI controls whether the source value can be read from inactive lanes, but inactive lanes always write a 0 bit. The same applies to v_cmp with DPP. Foz-DB Navi48: Totals from 987 (1.20% of 82405) affected shaders: Instrs: 517003 -> 516445 (-0.11%); split: -0.11%, +0.00% CodeSize: 2782688 -> 2780508 (-0.08%); split: -0.08%, +0.00% Latency: 2059169 -> 2056327 (-0.14%); split: -0.14%, +0.00% InvThroughput: 365374 -> 365328 (-0.01%); split: -0.03%, +0.01% Copies: 64669 -> 65616 (+1.46%) SALU: 70693 -> 70652 (-0.06%) Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_ir.cpp | 4 ---- src/amd/compiler/aco_optimizer_postRA.cpp | 6 +----- src/amd/compiler/tests/test_optimizer_postRA.cpp | 3 +-- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 3885f1bb315..88e5a37188e 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -489,10 +489,6 @@ can_use_DPP(amd_gfx_level gfx_level, const aco_ptr& instr, bool dpp return false; } - /* According to LLVM, it's unsafe to combine DPP into v_cmpx. 
*/ - if (instr->writes_exec()) - return false; - return opcode_supports_dpp(gfx_level, instr->opcode, instr->isVOP3P()); } diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index 77d04c6ccd4..690f3a19a2f 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -1078,10 +1078,6 @@ try_optimize_branching_sequence(pr_opt_ctx& ctx, aco_ptr& exec_copy : aco_opcode::num_opcodes; const bool vopc = v_cmpx_op != aco_opcode::num_opcodes; - /* V_CMPX+DPP returns 0 with reads from disabled lanes, unlike V_CMP+DPP (RDNA3 ISA doc, 7.7) */ - if (vopc && exec_val->isDPP()) - return false; - /* If s_and_saveexec is used, we'll need to insert a new instruction to save the old exec. */ bool save_original_exec = exec_copy->opcode == and_saveexec && !exec_copy->definitions[0].isKill(); @@ -1157,7 +1153,7 @@ try_optimize_branching_sequence(pr_opt_ctx& ctx, aco_ptr& exec_copy if (vopc) { /* Add one extra definition for exec and copy the VOP3-specific fields if present. */ if (!vcmpx_exec_only) { - if (exec_val->isSDWA()) { + if (exec_val->isSDWA() || exec_val->isDPP()) { /* This might work but it needs testing and more code to copy the instruction. */ return false; } else { diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp index f605a72a769..f924a7e2421 100644 --- a/src/amd/compiler/tests/test_optimizer_postRA.cpp +++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp @@ -492,8 +492,7 @@ BEGIN_TEST(optimizer_postRA.dpp_vcmpx) Operand a(inputs[0], PhysReg(256)); Operand b(inputs[1], PhysReg(257)); - //! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi - //! s2: %res0:exec = v_cmpx_lt_f32 %tmp0:v[2], %b:v[1] + //! s2: %res0:exec = v_cmpx_lt_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi //! 
p_unit_test 0, %res0:exec Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); Temp res0 = bld.vopc(aco_opcode::v_cmpx_lt_f32, bld.def(bld.lm, exec), Operand(tmp0, reg_v2), b);