aco: Use s_cbranch_vccz/nz in post-RA optimization.

Add a simple post-RA optimization that takes advantage of the
s_cbranch_vccz and s_cbranch_vccnz instructions.

It works on the following pattern:

vcc = v_cmp ...
scc = s_and vcc, exec
p_cbranch scc

The result looks like this:

vcc = v_cmp ...
p_cbranch vcc

Fossil DB results on Sienna Cichlid:

Totals from 4814 (3.21% of 149839) affected shaders:
CodeSize: 15371176 -> 15345964 (-0.16%)
Instrs: 3028557 -> 3022254 (-0.21%)
Latency: 21872753 -> 21823476 (-0.23%); split: -0.23%, +0.00%
InvThroughput: 4470282 -> 4468691 (-0.04%); split: -0.04%, +0.00%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7779>
This commit is contained in:
Timur Kristóf 2021-03-20 17:47:05 +01:00 committed by Marge Bot
parent 0e4747d3fb
commit a93092d0ed
3 changed files with 154 additions and 2 deletions

View file

@ -2910,7 +2910,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
if (tmp.regClass() == s1) {
// TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
bool_to_scalar_condition(ctx, src, tmp);
} else if (tmp.type() == RegType::vgpr) {
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src);
@ -10168,7 +10167,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
* merge block.
**/
// TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
assert(cond.regClass() == ctx->program->lane_mask);
cond = bool_to_scalar_condition(ctx, cond);

View file

@ -107,10 +107,67 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
return instr_idx;
}
void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
{
   /* Looks for this pattern:
    *
    * vcc = ...                      ; last_vcc_wr
    * sX, scc = s_and_bXX vcc, exec  ; op0_instr
    * (...vcc and exec must not be clobbered in between...)
    * s_cbranch_XX scc               ; instr
    *
    * and, when legal, rewrites it into:
    *
    * vcc = ...                      ; last_vcc_wr
    * s_cbranch_XX vcc               ; instr modified to use vcc
    */

   /* SMEM may corrupt the vccz bit on GFX6-7, so don't attempt this there. */
   if (ctx.program->chip_class < GFX8)
      return;

   /* Only interested in pseudo branches whose first operand is SCC. */
   const bool branches_on_scc = instr->format == Format::PSEUDO_BRANCH &&
                                instr->operands.size() != 0 &&
                                instr->operands[0].physReg() == scc;
   if (!branches_on_scc)
      return;

   const int scc_wr_idx = last_writer_idx(ctx, instr->operands[0]);
   const int vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);
   const int exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask);

   /* Legality conditions:
    * - the branch operand and VCC were both written in the current block
    * - VCC was NOT written after the branch operand
    * - EXEC is sane (trackable) and was NOT written after the branch operand
    */
   if (scc_wr_idx < 0 || vcc_wr_idx < 0 || vcc_wr_idx > scc_wr_idx ||
       exec_wr_idx > vcc_wr_idx || exec_wr_idx < not_written_in_block)
      return;

   aco_ptr<Instruction> &scc_wr = ctx.current_block->instructions[scc_wr_idx];
   aco_ptr<Instruction> &vcc_wr = ctx.current_block->instructions[vcc_wr_idx];

   /* The SCC writer must be exactly "s_and_bXX vcc, exec",
    * and VCC must come straight from a VOPC instruction. */
   const bool is_and_vcc_exec =
      (scc_wr->opcode == aco_opcode::s_and_b64 /* wave64 */ ||
       scc_wr->opcode == aco_opcode::s_and_b32 /* wave32 */) &&
      scc_wr->operands[0].physReg() == vcc &&
      scc_wr->operands[1].physReg() == exec;
   if (!is_and_vcc_exec || !vcc_wr->isVOPC())
      return;

   assert(vcc_wr->definitions[0].tempId() == scc_wr->operands[0].tempId());

   /* The branch no longer reads the s_and's SCC def. */
   ctx.uses[instr->operands[0].tempId()]--;

   /* Branch directly on VCC instead of SCC. */
   instr->operands[0] = scc_wr->operands[0];
}
/* Run all post-RA peephole optimizations on a single instruction and then
 * record its register writes for later pattern matching. */
void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
{
   /* Advance the running index of the instruction within the current block. */
   ctx.current_instr_idx++;
   try_apply_branch_vcc(ctx, instr);
   /* NOTE(review): guarding on instr suggests an optimization may presumably
    * reset the pointer (delete the instruction) — confirm; only track register
    * writes when the instruction still exists. */
   if (instr)
      save_reg_writes(ctx, instr);
}

View file

@ -25,3 +25,100 @@
#include "helpers.h"
using namespace aco;
BEGIN_TEST(optimizer_postRA.vcmp)
   /* Unit tests for the post-RA s_cbranch_vccz/vccnz optimization.
    * The "//!" / "//>>" / "//;" comments are directives consumed by the
    * test checker and must match the optimized output exactly.
    * Fix: the fourth case previously reused unit-test index 2; renumber it
    * to 3 (consistently in both the pattern and the writeout) so that every
    * case has a unique, identifiable index. */
   PhysReg reg_v0(256);
   PhysReg reg_s0(0);
   PhysReg reg_s2(2);
   PhysReg reg_s4(4);

   //>> v1: %a:v[0] = p_startpgm
   ASSERTED bool setup_ok = setup_cs("v1", GFX8);
   assert(setup_ok);

   auto &startpgm = bld.instructions->at(0);
   assert(startpgm->opcode == aco_opcode::p_startpgm);
   startpgm->definitions[0].setFixed(reg_v0);

   Temp v_in = inputs[0];

   {
      /* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */
      //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
      //! s2: %e:s[2-3] = p_cbranch_z %b:vcc
      //! p_unit_test 0, %e:s[2-3]
      auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(0, Operand(br, reg_s2));
   }
   //; del b, e

   {
      /* When VCC is overwritten inbetween, don't optimize. */
      //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
      //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
      //! s2: %f:vcc = s_mov_b64 0
      //! s2: %e:s[2-3] = p_cbranch_z %d:scc
      //! p_unit_test 1, %e:s[2-3], %f:vcc
      auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
      auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand(0u));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc));
   }
   //; del b, c, d, e, f

   {
      /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */
      //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0]
      //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec
      //! s2: %e:s[2-3] = p_cbranch_z %d:scc
      //! p_unit_test 2, %e:s[2-3]
      auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand(0u), Operand(v_in, reg_v0));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(vcmp, reg_s4), Operand(exec, bld.lm));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(2, Operand(br, reg_s2));
   }
   //; del b, c, d, e

   {
      /* When the VCC isn't written by VOPC, don't optimize */
      //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5]
      //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
      //! s2: %e:s[2-3] = p_cbranch_z %d:scc
      //! p_unit_test 3, %e:s[2-3]
      auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand(1u), Operand(reg_s4, bld.lm));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(salu, vcc), Operand(exec, bld.lm));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(3, Operand(br, reg_s2));
   }
   //; del b, c, d, e, f, x

   {
      /* When EXEC is overwritten inbetween, don't optimize. */
      //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
      //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
      //! s2: %f:exec = s_mov_b64 42
      //! s2: %e:s[2-3] = p_cbranch_z %d:scc
      //! p_unit_test 4, %e:s[2-3], %f:exec
      auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
      auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
      auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(42u));
      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
      writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec));
   }
   //; del b, c, d, e, f, x

   finish_optimizer_postRA_test();
END_TEST