diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 1b081b3d7c1..8631189b7bf 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -41,14 +41,15 @@ struct constaddr_info { }; struct asm_context { - Program *program; + Program* program; enum chip_class chip_class; std::vector> branches; std::map constaddrs; const int16_t* opcode; // TODO: keep track of branch instructions referring blocks // and, when emitting the block, correct the offset in instr - asm_context(Program* program_) : program(program_), chip_class(program->chip_class) { + asm_context(Program* program_) : program(program_), chip_class(program->chip_class) + { if (chip_class <= GFX7) opcode = &instr_info.opcode_gfx7[0]; else if (chip_class <= GFX9) @@ -60,7 +61,8 @@ struct asm_context { int subvector_begin_pos = -1; }; -static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg) +static uint32_t +get_sdwa_sel(unsigned sel, PhysReg reg) { if (sel & sdwa_isra) { unsigned size = sdwa_rasize & sel; @@ -72,7 +74,9 @@ static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg) return sel & sdwa_asuint; } -unsigned get_mimg_nsa_dwords(const Instruction *instr) { +unsigned +get_mimg_nsa_dwords(const Instruction* instr) +{ unsigned addr_dwords = instr->operands.size() - 3; for (unsigned i = 1; i < addr_dwords; i++) { if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4)) @@ -81,7 +85,8 @@ unsigned get_mimg_nsa_dwords(const Instruction *instr) { return 0; } -void emit_instruction(asm_context& ctx, std::vector& out, Instruction* instr) +void +emit_instruction(asm_context& ctx, std::vector& out, Instruction* instr) { /* lower remaining pseudo-instructions */ if (instr->opcode == aco_opcode::p_constaddr_getpc) { @@ -99,11 +104,11 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* uint32_t opcode = ctx.opcode[(int)instr->opcode]; if (opcode == (uint32_t)-1) { - char *outmem; + char* outmem; size_t outsize; struct u_memstream mem; u_memstream_open(&mem, &outmem, &outsize); - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); fprintf(memf, "Unsupported opcode: "); aco_print_instr(instr, memf); @@ -144,11 +149,11 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* uint32_t encoding = (0b1011 << 28); encoding |= opcode << 23; - encoding |= - !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ? - instr->definitions[0].physReg() << 16 : - !instr->operands.empty() && instr->operands[0].physReg() <= 127 ? - instr->operands[0].physReg() << 16 : 0; + encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) + ? instr->definitions[0].physReg() << 16 + : !instr->operands.empty() && instr->operands[0].physReg() <= 127 + ? 
instr->operands[0].physReg() << 16 + : 0; encoding |= sopk.imm; out.push_back(encoding); break; @@ -177,7 +182,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* SOPP_instruction& sopp = instr->sopp(); uint32_t encoding = (0b101111111 << 23); encoding |= opcode << 16; - encoding |= (uint16_t) sopp.imm; + encoding |= (uint16_t)sopp.imm; if (sopp.block != -1) { sopp.pass_flags = 0; ctx.branches.emplace_back(out.size(), &sopp); @@ -208,7 +213,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } out.push_back(encoding); /* SMRD instructions can take a literal on GFX7 */ - if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && instr->operands[1].constantValue() >= 1024) + if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && + instr->operands[1].constantValue() >= 1024) out.push_back(instr->operands[1].constantValue() >> 2); return; } @@ -235,7 +241,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } if (is_load || instr->operands.size() >= 3) { /* SDATA */ - encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) << 6; + encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) + << 6; } if (instr->operands.size() >= 1) { /* SBASE */ encoding |= instr->operands[0].physReg() >> 1; @@ -246,14 +253,16 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* int32_t offset = 0; uint32_t soffset = ctx.chip_class >= GFX10 - ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */ - : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on GFX8 and below) */ + ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */ + : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on + GFX8 and below) */ if (instr->operands.size() >= 2) { - const Operand &op_off1 = instr->operands[1]; + const Operand& op_off1 = instr->operands[1]; if (ctx.chip_class <= GFX9) { offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg(); } else { - /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an SGPR */ + /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an + * SGPR */ if (op_off1.isConstant()) { offset = op_off1.constantValue(); } else { @@ -263,8 +272,9 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } if (soe) { - const Operand &op_off2 = instr->operands.back(); - assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant and an SGPR at the same time */ + const Operand& op_off2 = instr->operands.back(); + assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant + and an SGPR at the same time */ assert(!op_off2.isConstant()); soffset = op_off2.physReg(); } @@ -368,9 +378,13 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding = 0; unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0; encoding |= (0xFF & reg) << 24; - reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) ? instr->operands[2].physReg() : 0; + reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) + ? instr->operands[2].physReg() + : 0; encoding |= (0xFF & reg) << 16; - reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0; + reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) + ? 
instr->operands[1].physReg() + : 0; encoding |= (0xFF & reg) << 8; encoding |= (0xFF & instr->operands[0].physReg()); out.push_back(encoding); @@ -402,7 +416,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= instr->operands[2].physReg() << 24; encoding |= (mubuf.tfe ? 1 : 0) << 23; encoding |= (instr->operands[0].physReg() >> 2) << 16; - unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg(); + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() + : instr->definitions[0].physReg(); encoding |= (0xFF & reg) << 8; encoding |= (0xFF & instr->operands[1].physReg()); out.push_back(encoding); @@ -435,7 +450,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= (mtbuf.tfe ? 1 : 0) << 23; encoding |= (mtbuf.slc ? 1 : 0) << 22; encoding |= (instr->operands[0].physReg() >> 2) << 16; - unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg(); + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() + : instr->definitions[0].physReg(); encoding |= (0xFF & reg) << 8; encoding |= (0xFF & instr->operands[1].physReg()); @@ -465,7 +481,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= mimg.a16 ? 1 << 15 : 0; encoding |= mimg.da ? 1 << 14 : 0; } else { - encoding |= mimg.r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */ + encoding |= mimg.r128 ? 1 << 15 + : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */ encoding |= nsa_dwords << 1; encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */ encoding |= mimg.dlc ? 1 << 7 : 0; @@ -485,7 +502,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* assert(!mimg.d16 || ctx.chip_class >= GFX9); encoding |= mimg.d16 ? 1 << 31 : 0; if (ctx.chip_class >= GFX10) { - encoding |= mimg.a16 ? 1 << 30 : 0; /* GFX10: A16 still exists, but is in a different place */ + /* GFX10: A16 still exists, but is in a different place */ + encoding |= mimg.a16 ? 1 << 30 : 0; } out.push_back(encoding); @@ -539,7 +557,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F); assert(instr->format != Format::FLAT); encoding |= instr->operands[1].physReg() << 16; - } else if (instr->format != Format::FLAT || ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */ + } else if (instr->format != Format::FLAT || + ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */ if (ctx.chip_class <= GFX9) encoding |= 0x7F << 16; else @@ -611,7 +630,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } encoding |= vop3.opsel << 11; for (unsigned i = 0; i < 3; i++) - encoding |= vop3.abs[i] << (8+i); + encoding |= vop3.abs[i] << (8 + i); if (instr->definitions.size() == 2) encoding |= instr->definitions[1].physReg() << 8; encoding |= (0xFF & instr->definitions[0].physReg()); @@ -625,7 +644,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } encoding |= vop3.omod << 27; for (unsigned i = 0; i < 3; i++) - encoding |= vop3.neg[i] << (29+i); + encoding |= vop3.neg[i] << (29 + i); out.push_back(encoding); } else if (instr->isVOP3P()) { @@ -645,7 +664,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= vop3.opsel_lo << 11; encoding |= ((vop3.opsel_hi & 0x4) ? 
1 : 0) << 14; for (unsigned i = 0; i < 3; i++) - encoding |= vop3.neg_hi[i] << (8+i); + encoding |= vop3.neg_hi[i] << (8 + i); encoding |= (0xFF & instr->definitions[0].physReg()); out.push_back(encoding); encoding = 0; @@ -653,17 +672,17 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= instr->operands[i].physReg() << (i * 9); encoding |= (vop3.opsel_hi & 0x3) << 27; for (unsigned i = 0; i < 3; i++) - encoding |= vop3.neg_lo[i] << (29+i); + encoding |= vop3.neg_lo[i] << (29 + i); out.push_back(encoding); - } else if (instr->isDPP()){ + } else if (instr->isDPP()) { assert(ctx.chip_class >= GFX8); DPP_instruction& dpp = instr->dpp(); /* first emit the instruction without the DPP operand */ Operand dpp_op = instr->operands[0]; instr->operands[0] = Operand(PhysReg{250}, v1); - instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::DPP); + instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP); emit_instruction(ctx, out, instr); uint32_t encoding = (0xF & dpp.row_mask) << 28; encoding |= (0xF & dpp.bank_mask) << 24; @@ -684,7 +703,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* /* first emit the instruction without the SDWA operand */ Operand sdwa_op = instr->operands[0]; instr->operands[0] = Operand(PhysReg{249}, v1); - instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::SDWA); + instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA); emit_instruction(ctx, out, instr); uint32_t encoding = 0; @@ -737,7 +756,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } } -void emit_block(asm_context& ctx, std::vector& out, Block& block) +void +emit_block(asm_context& ctx, std::vector& out, Block& block) { for (aco_ptr& instr : block.instructions) { #if 0 @@ -754,15 +774,15 @@ void emit_block(asm_context& ctx, std::vector& out, Block& block) } } -void fix_exports(asm_context& ctx, std::vector& out, Program* program) +void +fix_exports(asm_context& ctx, std::vector& out, Program* program) { bool exported = false; for (Block& block : program->blocks) { if (!(block.kind & block_kind_export_end)) continue; std::vector>::reverse_iterator it = block.instructions.rbegin(); - while ( it != block.instructions.rend()) - { + while (it != block.instructions.rend()) { if ((*it)->isEXP()) { Export_instruction& exp = (*it)->exp(); if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) { @@ -785,15 +805,18 @@ void fix_exports(asm_context& ctx, std::vector& out, Program* program) if (!exported) { /* Abort in order to avoid a GPU hang. */ - bool is_vertex_or_ngg = (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG); - aco_err(program, "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment"); + bool is_vertex_or_ngg = + (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG); + aco_err(program, + "Missing export in %s shader:", is_vertex_or_ngg ? 
"vertex or NGG" : "fragment"); aco_print_program(program, stderr); abort(); } } -static void insert_code(asm_context& ctx, std::vector& out, unsigned insert_before, - unsigned insert_count, const uint32_t *insert_data) +static void +insert_code(asm_context& ctx, std::vector& out, unsigned insert_before, + unsigned insert_count, const uint32_t* insert_data) { out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count); @@ -804,9 +827,9 @@ static void insert_code(asm_context& ctx, std::vector& out, unsigned i } /* Find first branch after the inserted code */ - auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [insert_before](const auto &branch) -> bool { - return (unsigned)branch.first >= insert_before; - }); + auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), + [insert_before](const auto& branch) -> bool + { return (unsigned)branch.first >= insert_before; }); /* Update the locations of branches */ for (; branch_it != ctx.branches.end(); ++branch_it) @@ -822,15 +845,21 @@ static void insert_code(asm_context& ctx, std::vector& out, unsigned i } } -static void fix_branches_gfx10(asm_context& ctx, std::vector& out) +static void +fix_branches_gfx10(asm_context& ctx, std::vector& out) { - /* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */ + /* Branches with an offset of 0x3f are buggy on GFX10, + * we workaround by inserting NOPs if needed. + */ bool gfx10_3f_bug = false; do { - auto buggy_branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [&ctx](const auto &branch) -> bool { - return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 0x3f; - }); + auto buggy_branch_it = std::find_if( + ctx.branches.begin(), ctx.branches.end(), + [&ctx](const auto& branch) -> bool { + return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == + 0x3f; + }); gfx10_3f_bug = buggy_branch_it != ctx.branches.end(); @@ -842,7 +871,9 @@ static void fix_branches_gfx10(asm_context& ctx, std::vector& out) } while (gfx10_3f_bug); } -void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, std::vector& out) +void +emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards, + std::vector& out) { Builder bld(ctx.program); @@ -857,26 +888,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, /* for conditional branches, skip the long jump if the condition is false */ aco_opcode inv; switch (branch->opcode) { - case aco_opcode::s_cbranch_scc0: - inv = aco_opcode::s_cbranch_scc1; - break; - case aco_opcode::s_cbranch_scc1: - inv = aco_opcode::s_cbranch_scc0; - break; - case aco_opcode::s_cbranch_vccz: - inv = aco_opcode::s_cbranch_vccnz; - break; - case aco_opcode::s_cbranch_vccnz: - inv = aco_opcode::s_cbranch_vccz; - break; - case aco_opcode::s_cbranch_execz: - inv = aco_opcode::s_cbranch_execnz; - break; - case aco_opcode::s_cbranch_execnz: - inv = aco_opcode::s_cbranch_execz; - break; - default: - unreachable("Unhandled long jump."); + case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break; + case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break; + case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break; + case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break; + case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break; + case aco_opcode::s_cbranch_execnz: inv = 
aco_opcode::s_cbranch_execz; break; + default: unreachable("Unhandled long jump."); } instr.reset(bld.sopp(inv, -1, 7)); emit_instruction(ctx, out, instr.get()); @@ -891,7 +909,9 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, emit_instruction(ctx, out, instr.get()); branch->pass_flags = out.size(); - instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u)).instr); + instr.reset( + bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u)) + .instr); emit_instruction(ctx, out, instr.get()); /* restore SCC and clear the LSB of the new PC */ @@ -901,11 +921,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, emit_instruction(ctx, out, instr.get()); /* create the s_setpc_b64 to jump */ - instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr); + instr.reset( + bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr); emit_instruction(ctx, out, instr.get()); } -void fix_branches(asm_context& ctx, std::vector& out) +void +fix_branches(asm_context& ctx, std::vector& out) { bool repeat = false; do { @@ -914,11 +936,12 @@ void fix_branches(asm_context& ctx, std::vector& out) if (ctx.chip_class == GFX10) fix_branches_gfx10(ctx, out); - for (std::pair &branch : ctx.branches) { + for (std::pair& branch : ctx.branches) { int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1; if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) { std::vector long_jump; - bool backwards = ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first; + bool backwards = + ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first; emit_long_jump(ctx, branch.second, backwards, long_jump); out[branch.first] = long_jump[0]; @@ -934,13 +957,14 @@ void fix_branches(asm_context& ctx, std::vector& out) out[branch.first + branch.second->pass_flags - 1] = offset * 4; } else { out[branch.first] &= 0xffff0000u; - out[branch.first] |= (uint16_t) offset; + out[branch.first] |= (uint16_t)offset; } } } while (repeat); } -void fix_constaddrs(asm_context& ctx, std::vector& out) +void +fix_constaddrs(asm_context& ctx, std::vector& out) { for (auto& constaddr : ctx.constaddrs) { constaddr_info& info = constaddr.second; @@ -948,13 +972,12 @@ void fix_constaddrs(asm_context& ctx, std::vector& out) } } -unsigned emit_program(Program* program, - std::vector& code) +unsigned +emit_program(Program* program, std::vector& code) { asm_context ctx(program); - if (program->stage.hw == HWStage::VS || - program->stage.hw == HWStage::FS || + if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS || program->stage.hw == HWStage::NGG) fix_exports(ctx, code, program); @@ -986,4 +1009,4 @@ unsigned emit_program(Program* program, return exec_size; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_dead_code_analysis.cpp b/src/amd/compiler/aco_dead_code_analysis.cpp index 5b32b4f468a..3d565f0c141 100644 --- a/src/amd/compiler/aco_dead_code_analysis.cpp +++ b/src/amd/compiler/aco_dead_code_analysis.cpp @@ -40,7 +40,8 @@ struct dce_ctx { std::vector uses; std::vector> live; - dce_ctx(Program* program) : current_block(program->blocks.size() - 1), uses(program->peekAllocationId()) + dce_ctx(Program* program) + : current_block(program->blocks.size() - 1), uses(program->peekAllocationId()) { live.reserve(program->blocks.size()); for 
(Block& block : program->blocks) @@ -48,7 +49,8 @@ struct dce_ctx { } }; -void process_block(dce_ctx& ctx, Block& block) +void +process_block(dce_ctx& ctx, Block& block) { std::vector& live = ctx.live[block.index]; assert(live.size() == block.instructions.size()); @@ -72,23 +74,26 @@ void process_block(dce_ctx& ctx, Block& block) if (process_predecessors) { for (unsigned pred_idx : block.linear_preds) - ctx.current_block = std::max(ctx.current_block, (int) pred_idx); + ctx.current_block = std::max(ctx.current_block, (int)pred_idx); } } } /* end namespace */ -bool is_dead(const std::vector& uses, Instruction *instr) +bool +is_dead(const std::vector& uses, Instruction* instr) { if (instr->definitions.empty() || instr->isBranch()) return false; if (std::any_of(instr->definitions.begin(), instr->definitions.end(), - [&uses] (const Definition& def) { return !def.isTemp() || uses[def.tempId()];})) + [&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; })) return false; return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel)); } -std::vector dead_code_analysis(Program *program) { +std::vector +dead_code_analysis(Program* program) +{ dce_ctx ctx(program); @@ -105,5 +110,4 @@ std::vector dead_code_analysis(Program *program) { return ctx.uses; } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_dominance.cpp b/src/amd/compiler/aco_dominance.cpp index 45013b59688..c3dda2be957 100644 --- a/src/amd/compiler/aco_dominance.cpp +++ b/src/amd/compiler/aco_dominance.cpp @@ -38,7 +38,8 @@ namespace aco { -void dominator_tree(Program* program) +void +dominator_tree(Program* program) { program->blocks[0].logical_idom = 0; program->blocks[0].linear_idom = 0; @@ -48,7 +49,7 @@ void dominator_tree(Program* program) int new_logical_idom = -1; int new_linear_idom = -1; for (unsigned pred_idx : block.logical_preds) { - if ((int) program->blocks[pred_idx].logical_idom == -1) + if ((int)program->blocks[pred_idx].logical_idom == -1) continue; if (new_logical_idom == -1) { @@ -56,16 +57,16 @@ void dominator_tree(Program* program) continue; } - while ((int) pred_idx != new_logical_idom) { - if ((int) pred_idx > new_logical_idom) + while ((int)pred_idx != new_logical_idom) { + if ((int)pred_idx > new_logical_idom) pred_idx = program->blocks[pred_idx].logical_idom; - if ((int) pred_idx < new_logical_idom) + if ((int)pred_idx < new_logical_idom) new_logical_idom = program->blocks[new_logical_idom].logical_idom; } } for (unsigned pred_idx : block.linear_preds) { - if ((int) program->blocks[pred_idx].linear_idom == -1) + if ((int)program->blocks[pred_idx].linear_idom == -1) continue; if (new_linear_idom == -1) { @@ -73,10 +74,10 @@ void dominator_tree(Program* program) continue; } - while ((int) pred_idx != new_linear_idom) { - if ((int) pred_idx > new_linear_idom) + while ((int)pred_idx != new_linear_idom) { + if ((int)pred_idx > new_linear_idom) pred_idx = program->blocks[pred_idx].linear_idom; - if ((int) pred_idx < new_linear_idom) + if ((int)pred_idx < new_linear_idom) new_linear_idom = program->blocks[new_linear_idom].linear_idom; } } @@ -86,5 +87,5 @@ void dominator_tree(Program* program) } } -} +} // namespace aco #endif diff --git a/src/amd/compiler/aco_form_hard_clauses.cpp b/src/amd/compiler/aco_form_hard_clauses.cpp index 8fbedc32fe5..fe806f55c79 100644 --- a/src/amd/compiler/aco_form_hard_clauses.cpp +++ b/src/amd/compiler/aco_form_hard_clauses.cpp @@ -31,15 +31,15 @@ namespace aco { namespace { /* there can also be LDS and VALU clauses, but I don't see how 
those are interesting */ -enum clause_type -{ +enum clause_type { clause_vmem, clause_flat, clause_smem, clause_other, }; -void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr *instrs) +void +emit_clause(Builder& bld, unsigned num_instrs, aco_ptr* instrs) { unsigned start = 0; @@ -61,7 +61,8 @@ void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr *instrs } /* end namespace */ -void form_hard_clauses(Program *program) +void +form_hard_clauses(Program* program) { for (Block& block : program->blocks) { unsigned num_instrs = 0; @@ -77,7 +78,8 @@ void form_hard_clauses(Program *program) clause_type type = clause_other; if (instr->isVMEM() && !instr->operands.empty()) { - if (program->chip_class == GFX10 && instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0) + if (program->chip_class == GFX10 && instr->isMIMG() && + get_mimg_nsa_dwords(instr.get()) > 0) type = clause_other; else type = clause_vmem; @@ -109,4 +111,4 @@ void form_hard_clauses(Program *program) block.instructions = std::move(new_instructions); } } -} +} // namespace aco diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index ddd4037f6b3..3ef70854c0a 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -34,12 +34,15 @@ namespace aco { namespace { struct NOP_ctx_gfx6 { - void join(const NOP_ctx_gfx6 &other) { - set_vskip_mode_then_vector = MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector); + void join(const NOP_ctx_gfx6& other) + { + set_vskip_mode_then_vector = + MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector); valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz); valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz); valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas); - salu_wr_m0_then_gds_msg_ttrace = MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace); + salu_wr_m0_then_gds_msg_ttrace = + MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace); valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp); salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds); salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel); @@ -53,23 +56,21 @@ struct NOP_ctx_gfx6 { } } - bool operator==(const NOP_ctx_gfx6 &other) + bool operator==(const NOP_ctx_gfx6& other) { - return - set_vskip_mode_then_vector == other.set_vskip_mode_then_vector && - valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz && - valu_wr_exec_then_execz == other.valu_wr_exec_then_execz && - valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas && - vmem_store_then_wr_data == other.vmem_store_then_wr_data && - salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace && - valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp && - salu_wr_m0_then_lds == other.salu_wr_m0_then_lds && - salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel && - setreg_then_getsetreg == other.setreg_then_getsetreg && - smem_clause == other.smem_clause && - smem_write == other.smem_write && - BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) && - BITSET_EQUAL(smem_clause_write, other.smem_clause_write); + return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector && + valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz && + valu_wr_exec_then_execz == other.valu_wr_exec_then_execz && + 
valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas && + vmem_store_then_wr_data == other.vmem_store_then_wr_data && + salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace && + valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp && + salu_wr_m0_then_lds == other.salu_wr_m0_then_lds && + salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel && + setreg_then_getsetreg == other.setreg_then_getsetreg && + smem_clause == other.smem_clause && smem_write == other.smem_write && + BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) && + BITSET_EQUAL(smem_clause_write, other.smem_clause_write); } void add_wait_states(unsigned amount) @@ -154,7 +155,8 @@ struct NOP_ctx_gfx10 { std::bitset<128> sgprs_read_by_VMEM; std::bitset<128> sgprs_read_by_SMEM; - void join(const NOP_ctx_gfx10 &other) { + void join(const NOP_ctx_gfx10& other) + { has_VOPC |= other.has_VOPC; has_nonVALU_exec_read |= other.has_nonVALU_exec_read; has_VMEM |= other.has_VMEM; @@ -167,23 +169,19 @@ struct NOP_ctx_gfx10 { sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM; } - bool operator==(const NOP_ctx_gfx10 &other) + bool operator==(const NOP_ctx_gfx10& other) { - return - has_VOPC == other.has_VOPC && - has_nonVALU_exec_read == other.has_nonVALU_exec_read && - has_VMEM == other.has_VMEM && - has_branch_after_VMEM == other.has_branch_after_VMEM && - has_DS == other.has_DS && - has_branch_after_DS == other.has_branch_after_DS && - has_NSA_MIMG == other.has_NSA_MIMG && - has_writelane == other.has_writelane && - sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && - sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; + return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read && + has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM && + has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS && + has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane && + sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && + sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; } }; -int get_wait_states(aco_ptr& instr) +int +get_wait_states(aco_ptr& instr) { if (instr->opcode == aco_opcode::s_nop) return instr->sopp().imm + 1; @@ -193,16 +191,16 @@ int get_wait_states(aco_ptr& instr) return 1; } -bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) +bool +regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) { - return a_reg > b_reg ? - (a_reg - b_reg < b_size) : - (b_reg - a_reg < a_size); + return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size); } template -int handle_raw_hazard_internal(Program *program, Block *block, - int nops_needed, PhysReg reg, uint32_t mask) +int +handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, PhysReg reg, + uint32_t mask) { unsigned mask_size = util_last_bit(mask); for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) { @@ -217,10 +215,8 @@ int handle_raw_hazard_internal(Program *program, Block *block, } } - bool is_hazard = writemask != 0 && - ((pred->isVALU() && Valu) || - (pred->isVINTRP() && Vintrp) || - (pred->isSALU() && Salu)); + bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || + (pred->isVINTRP() && Vintrp) || (pred->isSALU() && Salu)); if (is_hazard) return nops_needed; @@ -238,17 +234,19 @@ int handle_raw_hazard_internal(Program *program, Block *block, * huge value. 
*/ for (unsigned lin_pred : block->linear_preds) { res = std::max(res, handle_raw_hazard_internal( - program, &program->blocks[lin_pred], nops_needed, reg, mask)); + program, &program->blocks[lin_pred], nops_needed, reg, mask)); } return res; } template -void handle_raw_hazard(Program *program, Block *cur_block, int *NOPs, int min_states, Operand op) +void +handle_raw_hazard(Program* program, Block* cur_block, int* NOPs, int min_states, Operand op) { if (*NOPs >= min_states) return; - int res = handle_raw_hazard_internal(program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size())); + int res = handle_raw_hazard_internal( + program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size())); *NOPs = MAX2(*NOPs, res); } @@ -256,7 +254,9 @@ static auto handle_valu_then_read_hazard = handle_raw_hazard; static auto handle_vintrp_then_read_hazard = handle_raw_hazard; static auto handle_valu_salu_then_read_hazard = handle_raw_hazard; -void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) { +void +set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) +{ unsigned end = start + size - 1; unsigned start_mod = start % BITSET_WORDBITS; if (start_mod + size <= BITSET_WORDBITS) { @@ -268,7 +268,9 @@ void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) { } } -bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) { +bool +test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) +{ unsigned end = start + size - 1; unsigned start_mod = start % BITSET_WORDBITS; if (start_mod + size <= BITSET_WORDBITS) { @@ -291,18 +293,21 @@ bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) { * * SMEM clauses are only present on GFX8+, and only matter when XNACK is set. */ -void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx, - aco_ptr& instr, int *NOPs) +void +handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr& instr, + int* NOPs) { /* break off from previous SMEM clause if needed */ if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) { /* Don't allow clauses with store instructions since the clause's * instructions may use the same address. 
*/ - if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) { + if (ctx.smem_write || instr->definitions.empty() || + instr_info.is_atomic[(unsigned)instr->opcode]) { *NOPs = 1; } else if (program->dev.xnack_enabled) { for (Operand op : instr->operands) { - if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) { + if (!op.isConstant() && + test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) { *NOPs = 1; break; } @@ -316,8 +321,10 @@ void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx, } /* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */ -void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx, - aco_ptr& instr, std::vector>& new_instructions) +void +handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, + aco_ptr& instr, + std::vector>& new_instructions) { /* check hazards */ int NOPs = 0; @@ -343,14 +350,17 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c handle_smem_clause_hazards(program, ctx, instr, &NOPs); } else if (instr->isSALU()) { - if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 || + if (instr->opcode == aco_opcode::s_setreg_b32 || + instr->opcode == aco_opcode::s_setreg_imm32_b32 || instr->opcode == aco_opcode::s_getreg_b32) { NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg); } if (program->chip_class == GFX9) { - if (instr->opcode == aco_opcode::s_movrels_b32 || instr->opcode == aco_opcode::s_movrels_b64 || - instr->opcode == aco_opcode::s_movreld_b32 || instr->opcode == aco_opcode::s_movreld_b64) { + if (instr->opcode == aco_opcode::s_movrels_b32 || + instr->opcode == aco_opcode::s_movrels_b64 || + instr->opcode == aco_opcode::s_movreld_b32 || + instr->opcode == aco_opcode::s_movreld_b64) { NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel); } } @@ -398,7 +408,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]); } - if (instr->opcode == aco_opcode::v_div_fmas_f32 || instr->opcode == aco_opcode::v_div_fmas_f64) + if (instr->opcode == aco_opcode::v_div_fmas_f32 || + instr->opcode == aco_opcode::v_div_fmas_f64) NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas); } else if (instr->isVMEM() || instr->isFlatLike()) { /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. 
*/ @@ -412,13 +423,11 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector); if (program->chip_class == GFX9) { - bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && - instr->flatlike().lds; - if (instr->isVINTRP() || + bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds; + if (instr->isVINTRP() || lds_scratch_global || instr->opcode == aco_opcode::ds_read_addtid_b32 || instr->opcode == aco_opcode::ds_write_addtid_b32 || - instr->opcode == aco_opcode::buffer_store_lds_dword || - lds_scratch_global) { + instr->opcode == aco_opcode::buffer_store_lds_dword) { NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds); } } @@ -428,7 +437,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles if (NOPs) { /* create NOP */ - aco_ptr nop{create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)}; + aco_ptr nop{ + create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)}; nop->imm = NOPs - 1; nop->block = -1; new_instructions.emplace_back(std::move(nop)); @@ -485,7 +495,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c ctx.salu_wr_m0_then_lds = 1; ctx.salu_wr_m0_then_moverel = 1; } - } else if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32) { + } else if (instr->opcode == aco_opcode::s_setreg_b32 || + instr->opcode == aco_opcode::s_setreg_imm32_b32) { SOPK_instruction& sopk = instr->sopk(); unsigned offset = (sopk.imm >> 6) & 0x1f; unsigned size = ((sopk.imm >> 11) & 0x1f) + 1; @@ -497,19 +508,16 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c } } else if (instr->isVMEM() || instr->isFlatLike()) { /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */ - bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && - instr->operands.size() == 4 && - instr->operands[3].size() > 2 && - instr->operands[2].physReg() >= 128; - /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */ + bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 && + instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128; + /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit + * store) */ bool consider_mimg = instr->isMIMG() && instr->operands[1].regClass().type() == RegType::vgpr && - instr->operands[1].size() > 2 && - instr->operands[0].size() == 4; + instr->operands[1].size() > 2 && instr->operands[0].size() == 4; /* FLAT/GLOBAL/SCRATCH store with >64-bit data */ - bool consider_flat = instr->isFlatLike() && - instr->operands.size() == 3 && - instr->operands[2].size() > 2; + bool consider_flat = + instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2; if (consider_buf || consider_mimg || consider_flat) { PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg(); unsigned size = instr->operands[consider_flat ? 
2 : 3].size(); @@ -520,22 +528,26 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c } template -bool check_written_regs(const aco_ptr &instr, const std::bitset &check_regs) +bool +check_written_regs(const aco_ptr& instr, const std::bitset& check_regs) { - return std::any_of(instr->definitions.begin(), instr->definitions.end(), [&check_regs](const Definition &def) -> bool { - bool writes_any = false; - for (unsigned i = 0; i < def.size(); i++) { - unsigned def_reg = def.physReg() + i; - writes_any |= def_reg < check_regs.size() && check_regs[def_reg]; - } - return writes_any; - }); + return std::any_of(instr->definitions.begin(), instr->definitions.end(), + [&check_regs](const Definition& def) -> bool + { + bool writes_any = false; + for (unsigned i = 0; i < def.size(); i++) { + unsigned def_reg = def.physReg() + i; + writes_any |= def_reg < check_regs.size() && check_regs[def_reg]; + } + return writes_any; + }); } template -void mark_read_regs(const aco_ptr &instr, std::bitset ®_reads) +void +mark_read_regs(const aco_ptr& instr, std::bitset& reg_reads) { - for (const Operand &op : instr->operands) { + for (const Operand& op : instr->operands) { for (unsigned i = 0; i < op.size(); i++) { unsigned reg = op.physReg() + i; if (reg < reg_reads.size()) @@ -544,7 +556,8 @@ void mark_read_regs(const aco_ptr &instr, std::bitset ®_reads } } -bool VALU_writes_sgpr(aco_ptr& instr) +bool +VALU_writes_sgpr(aco_ptr& instr) { if (instr->isVOPC()) return true; @@ -557,24 +570,26 @@ bool VALU_writes_sgpr(aco_ptr& instr) return false; } -bool instr_writes_exec(const aco_ptr& instr) +bool +instr_writes_exec(const aco_ptr& instr) { - return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool { - return def.physReg() == exec_lo || def.physReg() == exec_hi; - }); + return std::any_of(instr->definitions.begin(), instr->definitions.end(), + [](const Definition& def) -> bool + { return def.physReg() == exec_lo || def.physReg() == exec_hi; }); } -bool instr_writes_sgpr(const aco_ptr& instr) +bool +instr_writes_sgpr(const aco_ptr& instr) { - return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool { - return def.getTemp().type() == RegType::sgpr; - }); + return std::any_of(instr->definitions.begin(), instr->definitions.end(), + [](const Definition& def) -> bool + { return def.getTemp().type() == RegType::sgpr; }); } -inline bool instr_is_branch(const aco_ptr& instr) +inline bool +instr_is_branch(const aco_ptr& instr) { - return instr->opcode == aco_opcode::s_branch || - instr->opcode == aco_opcode::s_cbranch_scc0 || + return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 || instr->opcode == aco_opcode::s_cbranch_scc1 || instr->opcode == aco_opcode::s_cbranch_vccz || instr->opcode == aco_opcode::s_cbranch_vccnz || @@ -586,19 +601,20 @@ inline bool instr_is_branch(const aco_ptr& instr) instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user || instr->opcode == aco_opcode::s_subvector_loop_begin || instr->opcode == aco_opcode::s_subvector_loop_end || - instr->opcode == aco_opcode::s_setpc_b64 || - instr->opcode == aco_opcode::s_swappc_b64 || - instr->opcode == aco_opcode::s_getpc_b64 || - instr->opcode == aco_opcode::s_call_b64; + instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 || + instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64; } -void handle_instruction_gfx10(Program *program, 
Block *cur_block, NOP_ctx_gfx10 &ctx, - aco_ptr& instr, std::vector>& new_instructions) +void +handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, + aco_ptr& instr, + std::vector>& new_instructions) { - //TODO: s_dcache_inv needs to be in it's own group on GFX10 + // TODO: s_dcache_inv needs to be in it's own group on GFX10 /* VMEMtoScalarWriteHazard - * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between. + * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" + * in-between. */ if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) { /* Remember all SGPRs that are read by the VMEM instruction */ @@ -624,7 +640,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 ctx.sgprs_read_by_VMEM.reset(); /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ - aco_ptr depctr{create_instruction(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; + aco_ptr depctr{ + create_instruction(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; depctr->imm = 0xffe3; depctr->block = -1; new_instructions.emplace_back(std::move(depctr)); @@ -639,13 +656,13 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 */ if (instr->isVOPC()) { ctx.has_VOPC = true; - } else if (ctx.has_VOPC && - (instr->opcode == aco_opcode::v_permlane16_b32 || - instr->opcode == aco_opcode::v_permlanex16_b32)) { + } else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 || + instr->opcode == aco_opcode::v_permlanex16_b32)) { ctx.has_VOPC = false; /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */ - aco_ptr v_mov{create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)}; + aco_ptr v_mov{ + create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)}; v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1); v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1); new_instructions.emplace_back(std::move(v_mov)); @@ -663,7 +680,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 ctx.has_nonVALU_exec_read = false; /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ - aco_ptr depctr{create_instruction(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; + aco_ptr depctr{ + create_instruction(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; depctr->imm = 0xfffe; depctr->block = -1; new_instructions.emplace_back(std::move(depctr)); @@ -689,7 +707,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 ctx.sgprs_read_by_SMEM.reset(); /* Insert s_mov to mitigate the problem */ - aco_ptr s_mov{create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)}; + aco_ptr s_mov{ + create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)}; s_mov->definitions[0] = Definition(sgpr_null, s1); s_mov->operands[0] = Operand(0u); new_instructions.emplace_back(std::move(s_mov)); @@ -738,14 +757,16 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; /* Insert s_waitcnt_vscnt to mitigate the problem */ - aco_ptr wait{create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)}; + aco_ptr wait{ + create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)}; wait->definitions[0] = Definition(sgpr_null, s1); wait->imm = 0; 
new_instructions.emplace_back(std::move(wait)); } /* NSAToVMEMBug - * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != 0). + * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != + * 0). */ if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) { ctx.has_NSA_MIMG = true; @@ -772,11 +793,12 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 } template -using HandleInstr = void (*)(Program *, Block *block, Ctx&, aco_ptr&, +using HandleInstr = void (*)(Program*, Block* block, Ctx&, aco_ptr&, std::vector>&); template Handle> -void handle_block(Program *program, Ctx& ctx, Block& block) +void +handle_block(Program* program, Ctx& ctx, Block& block) { if (block.instructions.empty()) return; @@ -793,14 +815,15 @@ void handle_block(Program *program, Ctx& ctx, Block& block) } template Handle> -void mitigate_hazards(Program *program) +void +mitigate_hazards(Program* program) { std::vector all_ctx(program->blocks.size()); std::stack loop_header_indices; for (unsigned i = 0; i < program->blocks.size(); i++) { Block& block = program->blocks[i]; - Ctx &ctx = all_ctx[i]; + Ctx& ctx = all_ctx[i]; if (block.kind & block_kind_loop_header) { loop_header_indices.push(i); @@ -832,7 +855,8 @@ void mitigate_hazards(Program *program) } /* end namespace */ -void insert_NOPs(Program* program) +void +insert_NOPs(Program* program) { if (program->chip_class >= GFX10_3) ; /* no hazards/bugs to mitigate */ @@ -842,4 +866,4 @@ void insert_NOPs(Program* program) mitigate_hazards(program); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index 29a74e15843..288ade88764 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -24,6 +24,7 @@ #include "aco_builder.h" #include "aco_ir.h" + #include "util/u_math.h" #include @@ -55,10 +56,9 @@ struct wqm_ctx { std::vector defined_in; std::vector needs_wqm; std::vector branch_wqm; /* true if the branch condition in this block should be in wqm */ - wqm_ctx(Program* program_) : program(program_), - defined_in(program->peekAllocationId(), 0xFFFF), - needs_wqm(program->peekAllocationId()), - branch_wqm(program->blocks.size()) + wqm_ctx(Program* program_) + : program(program_), defined_in(program->peekAllocationId(), 0xFFFF), + needs_wqm(program->peekAllocationId()), branch_wqm(program->blocks.size()) { for (unsigned i = 0; i < program->blocks.size(); i++) worklist.insert(i); @@ -72,13 +72,15 @@ struct loop_info { bool has_divergent_break; bool has_divergent_continue; bool has_discard; /* has a discard or demote */ - loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard) : - loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks), - has_divergent_continue(cont), has_discard(discard) {} + loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard) + : loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks), + has_divergent_continue(cont), has_discard(discard) + {} }; struct block_info { - std::vector> exec; /* Vector of exec masks. Either a temporary or const -1. */ + std::vector> + exec; /* Vector of exec masks. Either a temporary or const -1. 
*/ std::vector instr_needs; uint8_t block_needs; uint8_t ever_again_needs; @@ -87,14 +89,16 @@ struct block_info { }; struct exec_ctx { - Program *program; + Program* program; std::vector info; std::vector loop; bool handle_wqm = false; - exec_ctx(Program *program_) : program(program_), info(program->blocks.size()) {} + exec_ctx(Program* program_) : program(program_), info(program->blocks.size()) {} }; -bool needs_exact(aco_ptr& instr) { +bool +needs_exact(aco_ptr& instr) +{ if (instr->isMUBUF()) { return instr->mubuf().disable_wqm; } else if (instr->isMTBUF()) { @@ -108,7 +112,8 @@ bool needs_exact(aco_ptr& instr) { } } -void set_needs_wqm(wqm_ctx &ctx, Temp tmp) +void +set_needs_wqm(wqm_ctx& ctx, Temp tmp) { if (!ctx.needs_wqm[tmp.id()]) { ctx.needs_wqm[tmp.id()] = true; @@ -117,7 +122,8 @@ void set_needs_wqm(wqm_ctx &ctx, Temp tmp) } } -void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx) +void +mark_block_wqm(wqm_ctx& ctx, unsigned block_idx) { if (ctx.branch_wqm[block_idx]) return; @@ -136,7 +142,8 @@ void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx) mark_block_wqm(ctx, pred_idx); } -void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block) +void +get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block) { block_info& info = exec_ctx.info[block->index]; @@ -146,8 +153,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block) aco_ptr& instr = block->instructions[i]; WQMState needs = needs_exact(instr) ? Exact : Unspecified; - bool propagate_wqm = instr->opcode == aco_opcode::p_wqm || - instr->opcode == aco_opcode::p_as_uniform; + bool propagate_wqm = + instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_as_uniform; bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if; bool pred_by_exec = needs_exec_mask(instr.get()); for (const Definition& definition : instr->definitions) { @@ -214,7 +221,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block) * breaks, which might benefit from being in exact) by adding Exact_Branch to a * divergent branch surrounding the nested loop, if such a branch exists. */ -void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) +void +handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) { for (unsigned idx = preheader + 1; idx < exec_ctx.program->blocks.size(); idx++) { Block& block = exec_ctx.program->blocks[idx]; @@ -231,7 +239,8 @@ void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) * ensure that the exact exec mask is not empty by adding Exact_Branch to * the outer divergent branch. 
*/ -void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) +void +handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) { assert(exec_ctx.program->blocks[preheader + 1].kind & block_kind_loop_header); @@ -265,7 +274,8 @@ void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) } } -void calculate_wqm_needs(exec_ctx& exec_ctx) +void +calculate_wqm_needs(exec_ctx& exec_ctx) { wqm_ctx ctx(exec_ctx.program); @@ -307,14 +317,12 @@ void calculate_wqm_needs(exec_ctx& exec_ctx) exec_ctx.info[i].block_needs |= Exact; /* if discard is used somewhere in nested CF, we need to preserve the WQM mask */ - if ((block.kind & block_kind_discard || - block.kind & block_kind_uses_discard_if) && + if ((block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if) && ever_again_needs & WQM) exec_ctx.info[i].block_needs |= Preserve_WQM; ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch; - if (block.kind & block_kind_discard || - block.kind & block_kind_uses_discard_if || + if (block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if || block.kind & block_kind_uses_demote) ever_again_needs |= Exact; @@ -327,7 +335,8 @@ void calculate_wqm_needs(exec_ctx& exec_ctx) exec_ctx.handle_wqm = true; } -Operand get_exec_op(Operand t) +Operand +get_exec_op(Operand t) { if (t.isUndefined()) return Operand(exec, t.regClass()); @@ -335,7 +344,8 @@ Operand get_exec_op(Operand t) return t; } -void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) +void +transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) { if (ctx.info[idx].exec.back().second & mask_type_wqm) return; @@ -346,7 +356,8 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) ctx.info[idx].exec.back().first = exec_mask; } - exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), get_exec_op(exec_mask)); + exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), + get_exec_op(exec_mask)); ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm); return; } @@ -355,11 +366,12 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) assert(ctx.info[idx].exec.back().second & mask_type_wqm); assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); assert(ctx.info[idx].exec.back().first.isTemp()); - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), - ctx.info[idx].exec.back().first); + ctx.info[idx].exec.back().first = bld.pseudo( + aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); } -void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) +void +transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) { if (ctx.info[idx].exec.back().second & mask_type_exact) return; @@ -372,8 +384,8 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) assert(ctx.info[idx].exec.back().second & mask_type_exact); assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); assert(ctx.info[idx].exec.back().first.isTemp()); - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), - ctx.info[idx].exec.back().first); + ctx.info[idx].exec.back().first = bld.pseudo( + aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); return; } /* otherwise, we create an exact mask and push to the stack */ @@ -382,14 +394,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned 
idx) wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), Definition(exec, bld.lm), ctx.info[idx].exec[0].first, Operand(exec, bld.lm)); } else { - bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), ctx.info[idx].exec[0].first, wqm); + bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), + ctx.info[idx].exec[0].first, wqm); } ctx.info[idx].exec.back().first = Operand(wqm); ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type_exact); } -unsigned add_coupling_code(exec_ctx& ctx, Block* block, - std::vector>& instructions) +unsigned +add_coupling_code(exec_ctx& ctx, Block* block, std::vector>& instructions) { unsigned idx = block->index; Builder bld(ctx.program, &instructions); @@ -417,7 +430,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, } else { uint8_t mask = mask_type_global; if (ctx.program->needs_wqm) { - bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), Operand(exec, bld.lm)); + bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), + Operand(exec, bld.lm)); mask |= mask_type_wqm; } else { mask |= mask_type_exact; @@ -440,7 +454,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, if (info.has_discard) { aco_ptr phi; for (int i = 0; i < info.num_exec_masks - 1; i++) { - phi.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)); + phi.reset(create_instruction(aco_opcode::p_linear_phi, + Format::PSEUDO, preds.size(), 1)); phi->definitions[0] = bld.def(bld.lm); phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[i].first); ctx.info[idx].exec[i].first = bld.insert(std::move(phi)); @@ -450,14 +465,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, /* create ssa name for restore mask */ if (info.has_divergent_break) { /* this phi might be trivial but ensures a parallelcopy on the loop header */ - aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{create_instruction( + aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; phi->definitions[0] = bld.def(bld.lm); phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first); ctx.info[idx].exec.back().first = bld.insert(std::move(phi)); } /* create ssa name for loop active mask */ - aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{create_instruction( + aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; if (info.has_divergent_continue) phi->definitions[0] = bld.def(bld.lm); else @@ -466,7 +483,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, Temp loop_active = bld.insert(std::move(phi)); if (info.has_divergent_break) { - uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop; + uint8_t mask_type = + (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop; ctx.info[idx].exec.emplace_back(loop_active, mask_type); } else { ctx.info[idx].exec.back().first = Operand(loop_active); @@ -482,8 +500,10 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, } uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); - ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), - ctx.info[idx].exec.back().first), mask_type); + ctx.info[idx].exec.emplace_back( + bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, 
bld.lm), + ctx.info[idx].exec.back().first), + mask_type); } return i; @@ -514,14 +534,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, aco_ptr& phi = header->instructions[instr_idx++]; assert(phi->opcode == aco_opcode::p_linear_phi); for (unsigned i = 1; i < phi->operands.size(); i++) - phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first); + phi->operands[i] = + get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first); } if (info.has_divergent_break) { aco_ptr& phi = header->instructions[instr_idx]; assert(phi->opcode == aco_opcode::p_linear_phi); for (unsigned i = 1; i < phi->operands.size(); i++) - phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first); + phi->operands[i] = + get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first); } assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2); @@ -541,7 +563,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, ctx.info[idx].exec.emplace_back(same, type); } else { /* create phi for loop footer */ - aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{create_instruction( + aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; phi->definitions[0] = bld.def(bld.lm); if (exec_idx == info.num_exec_masks - 1u) { phi->definitions[0] = Definition(exec, bld.lm); @@ -578,8 +601,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); if (get_exec_op(ctx.info[idx].exec.back().first).isTemp()) { /* move current exec mask into exec register */ - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), - ctx.info[idx].exec.back().first); + ctx.info[idx].exec.back().first = bld.pseudo( + aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); } ctx.loop.pop_back(); @@ -591,8 +614,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, } else { assert(preds.size() == 2); /* if one of the predecessors ends in exact mask, we pop it from stack */ - unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(), - ctx.info[preds[1]].exec.size()); + unsigned num_exec_masks = + std::min(ctx.info[preds[0]].exec.size(), ctx.info[preds[1]].exec.size()); if (block->kind & block_kind_merge) num_exec_masks--; @@ -605,14 +628,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, if (ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) { Operand t = ctx.info[preds[0]].exec[i].first; /* discard/demote can change the state of the current exec mask */ - assert(!t.isTemp() || ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second); + assert(!t.isTemp() || + ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second); uint8_t mask = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second; ctx.info[idx].exec.emplace_back(t, mask); continue; } bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge); - Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm), + Temp phi = bld.pseudo(aco_opcode::p_linear_phi, + in_exec ? 
Definition(exec, bld.lm) : bld.def(bld.lm), get_exec_op(ctx.info[preds[0]].exec[i].first), get_exec_op(ctx.info[preds[1]].exec[i].first)); uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second; @@ -654,9 +679,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, return i; } -void process_instructions(exec_ctx& ctx, Block* block, - std::vector>& instructions, - unsigned idx) +void +process_instructions(exec_ctx& ctx, Block* block, std::vector>& instructions, + unsigned idx) { WQMState state; if (ctx.info[block->index].exec.back().second & mask_type_wqm) @@ -667,17 +692,16 @@ void process_instructions(exec_ctx& ctx, Block* block, } /* if the block doesn't need both, WQM and Exact, we can skip processing the instructions */ - bool process = (ctx.handle_wqm && - (ctx.info[block->index].block_needs & state) != - (ctx.info[block->index].block_needs & (WQM | Exact))) || + bool process = (ctx.handle_wqm && (ctx.info[block->index].block_needs & state) != + (ctx.info[block->index].block_needs & (WQM | Exact))) || block->kind & block_kind_uses_discard_if || - block->kind & block_kind_uses_demote || - block->kind & block_kind_needs_lowering; + block->kind & block_kind_uses_demote || block->kind & block_kind_needs_lowering; if (!process) { std::vector>::iterator it = std::next(block->instructions.begin(), idx); instructions.insert(instructions.end(), std::move_iterator>::iterator>(it), - std::move_iterator>::iterator>(block->instructions.end())); + std::move_iterator>::iterator>( + block->instructions.end())); return; } @@ -700,11 +724,13 @@ void process_instructions(exec_ctx& ctx, Block* block, /* discard from current exec */ const Operand cond = instr->operands[0]; Temp exit_cond = bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), - Operand(exec, bld.lm), cond).def(1).getTemp(); + Operand(exec, bld.lm), cond) + .def(1) + .getTemp(); /* discard from inner to outer exec mask on stack */ for (int i = num - 2; i >= 0; i--) { - Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), + Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), ctx.info[block->index].exec[i].first, cond); ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp()); exit_cond = andn2->definitions[1].getTemp(); @@ -726,14 +752,16 @@ void process_instructions(exec_ctx& ctx, Block* block, Definition dst = instr->definitions[0]; assert(dst.size() == bld.lm.size()); if (state == Exact) { - instr.reset(create_instruction(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1)); + instr.reset(create_instruction(bld.w64or32(Builder::s_mov), + Format::SOP1, 1, 1)); instr->operands[0] = Operand(0u); instr->definitions[0] = dst; } else { std::pair& exact_mask = ctx.info[block->index].exec[0]; assert(exact_mask.second & mask_type_exact); - instr.reset(create_instruction(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2)); + instr.reset(create_instruction(bld.w64or32(Builder::s_andn2), + Format::SOP2, 2, 2)); instr->operands[0] = Operand(exec, bld.lm); /* current exec */ instr->operands[1] = Operand(exact_mask.first); instr->definitions[0] = dst; @@ -741,7 +769,8 @@ void process_instructions(exec_ctx& ctx, Block* block, } } else if (instr->opcode == aco_opcode::p_demote_to_helper) { /* turn demote into discard_if with only exact masks */ - assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global)); + assert((ctx.info[block->index].exec[0].second 
& (mask_type_exact | mask_type_global)) == + (mask_type_exact | mask_type_global)); int num; Temp cond, exit_cond; @@ -749,8 +778,9 @@ void process_instructions(exec_ctx& ctx, Block* block, assert(instr->operands[0].constantValue() == -1u); /* transition to exact and set exec to zero */ exit_cond = bld.tmp(s1); - cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)), - Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm)); + cond = + bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)), + Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm)); num = ctx.info[block->index].exec.size() - 2; if (!(ctx.info[block->index].exec.back().second & mask_type_exact)) { @@ -767,7 +797,7 @@ void process_instructions(exec_ctx& ctx, Block* block, for (int i = num; i >= 0; i--) { if (ctx.info[block->index].exec[i].second & mask_type_exact) { - Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), + Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), ctx.info[block->index].exec[i].first, cond); if (i == (int)ctx.info[block->index].exec.size() - 1) { andn2->operands[0] = Operand(exec, bld.lm); @@ -783,14 +813,14 @@ void process_instructions(exec_ctx& ctx, Block* block, instr->opcode = aco_opcode::p_exit_early_if; instr->operands[0] = bld.scc(exit_cond); state = Exact; - } bld.insert(std::move(instr)); } } -void add_branch_code(exec_ctx& ctx, Block* block) +void +add_branch_code(exec_ctx& ctx, Block* block) { unsigned idx = block->index; Builder bld(ctx.program, block); @@ -806,8 +836,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) } assert(ctx.info[idx].exec.size() <= 2); - if (ctx.info[idx].ever_again_needs == 0 || - ctx.info[idx].ever_again_needs == Exact) { + if (ctx.info[idx].ever_again_needs == 0 || ctx.info[idx].ever_again_needs == Exact) { /* transition to Exact */ aco_ptr branch = std::move(block->instructions.back()); block->instructions.pop_back(); @@ -838,8 +867,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) Block& loop_block = ctx.program->blocks[i]; needs |= ctx.info[i].block_needs; - if (loop_block.kind & block_kind_uses_discard_if || - loop_block.kind & block_kind_discard || + if (loop_block.kind & block_kind_uses_discard_if || loop_block.kind & block_kind_discard || loop_block.kind & block_kind_uses_demote) has_discard = true; if (loop_block.loop_nest_depth != loop_nest_depth) @@ -871,12 +899,8 @@ void add_branch_code(exec_ctx& ctx, Block* block) if (block->kind & block_kind_top_level) num_exec_masks = std::min(num_exec_masks, 2u); - ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], - num_exec_masks, - needs, - has_divergent_break, - has_divergent_continue, - has_discard); + ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, needs, + has_divergent_break, has_divergent_continue, has_discard); } /* For normal breaks, this is the exec mask. 
For discard+break, it's the @@ -903,7 +927,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm)); for (int i = num - 1; i >= 0; i--) { - Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), + Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), get_exec_op(ctx.info[block->index].exec[i].first), cond); if (i == (int)ctx.info[idx].exec.size() - 1) andn2->definitions[0] = Definition(exec, bld.lm); @@ -919,8 +943,10 @@ void add_branch_code(exec_ctx& ctx, Block* block) } if (block->kind & block_kind_continue_or_break) { - assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & block_kind_loop_header); - assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit); + assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & + block_kind_loop_header); + assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & + block_kind_loop_exit); assert(block->instructions.back()->opcode == aco_opcode::p_branch); block->instructions.pop_back(); @@ -931,8 +957,10 @@ void add_branch_code(exec_ctx& ctx, Block* block) } if (need_parallelcopy) - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); - bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]); + ctx.info[idx].exec.back().first = bld.pseudo( + aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); + bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); return; } @@ -949,8 +977,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) if (block->kind & block_kind_branch) { - if (ctx.handle_wqm && - ctx.info[idx].exec.size() >= 2 && + if (ctx.handle_wqm && ctx.info[idx].exec.size() >= 2 && ctx.info[idx].exec.back().second == mask_type_exact && !(ctx.info[idx].block_needs & Exact_Branch) && ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) { @@ -972,7 +999,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), cond); } else { Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), - Definition(exec, bld.lm), cond, Operand(exec, bld.lm)); + Definition(exec, bld.lm), cond, Operand(exec, bld.lm)); ctx.info[idx].exec.back().first = Operand(old_exec); } @@ -980,7 +1007,8 @@ void add_branch_code(exec_ctx& ctx, Block* block) /* add next current exec to the stack */ ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type); - bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]); + bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); return; } @@ -990,9 +1018,11 @@ void add_branch_code(exec_ctx& ctx, Block* block) block->instructions.pop_back(); assert(ctx.info[idx].exec.size() >= 2); Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first; - bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec, Operand(exec, bld.lm)); + bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), 
bld.def(s1, scc), orig_exec, + Operand(exec, bld.lm)); - bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]); + bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); return; } @@ -1020,7 +1050,8 @@ void add_branch_code(exec_ctx& ctx, Block* block) bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2)); } - bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); + bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), + block->linear_succs[1], block->linear_succs[0]); return; } @@ -1048,12 +1079,14 @@ void add_branch_code(exec_ctx& ctx, Block* block) bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2)); } - bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); + bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), + block->linear_succs[1], block->linear_succs[0]); return; } } -void process_block(exec_ctx& ctx, Block* block) +void +process_block(exec_ctx& ctx, Block* block) { std::vector> instructions; instructions.reserve(block->instructions.size()); @@ -1072,8 +1105,8 @@ void process_block(exec_ctx& ctx, Block* block) } /* end namespace */ - -void insert_exec_mask(Program *program) +void +insert_exec_mask(Program* program) { exec_ctx ctx(program); @@ -1082,8 +1115,6 @@ void insert_exec_mask(Program *program) for (Block& block : program->blocks) process_block(ctx, &block); - -} - } +} // namespace aco diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 83c6dac0263..e4788270c98 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -23,6 +23,7 @@ */ #include "aco_ir.h" + #include "common/sid.h" #include @@ -49,7 +50,8 @@ namespace { * - or erase gprs with counters higher than to be waited for. */ -// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load +// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) +// when there is a load followed by a use of a previous load /* Instructions of the same event will finish in-order except for smem * and maybe flat. Instructions of different events may not finish in-order. 
*/ @@ -77,54 +79,50 @@ enum counter_type : uint8_t { num_counters = 4, }; -static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock; +static const uint16_t exp_events = + event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock; static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg; static const uint16_t vm_events = event_vmem | event_flat; static const uint16_t vs_events = event_vmem_store; -uint8_t get_counters_for_event(wait_event ev) +uint8_t +get_counters_for_event(wait_event ev) { switch (ev) { case event_smem: case event_lds: case event_gds: - case event_sendmsg: - return counter_lgkm; - case event_vmem: - return counter_vm; - case event_vmem_store: - return counter_vs; - case event_flat: - return counter_vm | counter_lgkm; + case event_sendmsg: return counter_lgkm; + case event_vmem: return counter_vm; + case event_vmem_store: return counter_vs; + case event_flat: return counter_vm | counter_lgkm; case event_exp_pos: case event_exp_param: case event_exp_mrt_null: case event_gds_gpr_lock: - case event_vmem_gpr_lock: - return counter_exp; - default: - return 0; + case event_vmem_gpr_lock: return counter_exp; + default: return 0; } } struct wait_entry { wait_imm imm; - uint16_t events; /* use wait_event notion */ + uint16_t events; /* use wait_event notion */ uint8_t counters; /* use counter_type notion */ - bool wait_on_read:1; - bool logical:1; - bool has_vmem_nosampler:1; - bool has_vmem_sampler:1; + bool wait_on_read : 1; + bool logical : 1; + bool has_vmem_nosampler : 1; + bool has_vmem_sampler : 1; wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_) - : imm(imm_), events(event_), counters(get_counters_for_event(event_)), - wait_on_read(wait_on_read_), logical(logical_), - has_vmem_nosampler(false), has_vmem_sampler(false) {} + : imm(imm_), events(event_), counters(get_counters_for_event(event_)), + wait_on_read(wait_on_read_), logical(logical_), has_vmem_nosampler(false), + has_vmem_sampler(false) + {} bool join(const wait_entry& other) { - bool changed = (other.events & ~events) || - (other.counters & ~counters) || + bool changed = (other.events & ~events) || (other.counters & ~counters) || (other.wait_on_read && !wait_on_read) || (other.has_vmem_nosampler && !has_vmem_nosampler) || (other.has_vmem_sampler && !has_vmem_sampler); @@ -156,7 +154,8 @@ struct wait_entry { if (counter == counter_exp) { imm.exp = wait_imm::unset_counter; - events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock); + events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | + event_vmem_gpr_lock); } if (counter == counter_vs) { @@ -170,7 +169,7 @@ struct wait_entry { }; struct wait_ctx { - Program *program; + Program* program; enum chip_class chip_class; uint16_t max_vm_cnt; uint16_t max_exp_cnt; @@ -189,24 +188,21 @@ struct wait_ctx { wait_imm barrier_imm[storage_count]; uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */ - std::map gpr_map; + std::map gpr_map; wait_ctx() {} - wait_ctx(Program *program_) - : program(program_), - chip_class(program_->chip_class), - max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), - max_exp_cnt(6), - max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14), - max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), - unordered_events(event_smem | (program_->chip_class < GFX10 ? 
event_flat : 0)) {} + wait_ctx(Program* program_) + : program(program_), chip_class(program_->chip_class), + max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), max_exp_cnt(6), + max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14), + max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), + unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) + {} bool join(const wait_ctx* other, bool logical) { - bool changed = other->exp_cnt > exp_cnt || - other->vm_cnt > vm_cnt || - other->lgkm_cnt > lgkm_cnt || - other->vs_cnt > vs_cnt || + bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt || + other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt || (other->pending_flat_lgkm && !pending_flat_lgkm) || (other->pending_flat_vm && !pending_flat_vm); @@ -218,12 +214,11 @@ struct wait_ctx { pending_flat_vm |= other->pending_flat_vm; pending_s_buffer_store |= other->pending_s_buffer_store; - for (const auto& entry : other->gpr_map) - { + for (const auto& entry : other->gpr_map) { if (entry.second.logical != logical) continue; - using iterator = std::map::iterator; + using iterator = std::map::iterator; const std::pair insert_pair = gpr_map.insert(entry); if (insert_pair.second) { changed = true; @@ -241,12 +236,14 @@ struct wait_ctx { return changed; } - void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) { + void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) + { entry.remove_counter(counter); } }; -wait_imm check_instr(Instruction* instr, wait_ctx& ctx) +wait_imm +check_instr(Instruction* instr, wait_ctx& ctx) { wait_imm wait; @@ -257,7 +254,7 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) /* check consecutively read gprs */ for (unsigned j = 0; j < op.size(); j++) { PhysReg reg{op.physReg() + j}; - std::map::iterator it = ctx.gpr_map.find(reg); + std::map::iterator it = ctx.gpr_map.find(reg); if (it == ctx.gpr_map.end() || !it->second.wait_on_read) continue; @@ -267,22 +264,24 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) for (const Definition& def : instr->definitions) { /* check consecutively written gprs */ - for (unsigned j = 0; j < def.getTemp().size(); j++) - { + for (unsigned j = 0; j < def.getTemp().size(); j++) { PhysReg reg{def.physReg() + j}; - std::map::iterator it = ctx.gpr_map.find(reg); + std::map::iterator it = ctx.gpr_map.find(reg); if (it == ctx.gpr_map.end()) continue; /* Vector Memory reads and writes return in the order they were issued */ - bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4; + bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && + instr->operands[1].regClass() == s4; if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) && - it->second.has_vmem_nosampler == !has_sampler && it->second.has_vmem_sampler == has_sampler) + it->second.has_vmem_nosampler == !has_sampler && + it->second.has_vmem_sampler == has_sampler) continue; /* LDS reads and writes return in the order they were issued. same for GDS */ - if (instr->isDS() && (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds)) + if (instr->isDS() && + (it->second.events & lgkm_events) == (instr->ds().gds ? 
event_gds : event_lds)) continue; wait.combine(it->second.imm); @@ -292,7 +291,8 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) return wait; } -wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) +wait_imm +parse_wait_instr(wait_ctx& ctx, Instruction* instr) { if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->definitions[0].physReg() == sgpr_null) { @@ -305,10 +305,12 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) return wait_imm(); } -wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics) +wait_imm +perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics) { wait_imm imm; - sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup; + sync_scope subgroup_scope = + ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup; if ((sync.semantics & semantics) && sync.scope > subgroup_scope) { unsigned storage = sync.storage; while (storage) { @@ -321,7 +323,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic if (bar_scope_lds <= subgroup_scope) events &= ~event_lds; - /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations in-order for the same workgroup */ + /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations + * in-order for the same workgroup */ if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup) events &= ~(event_vmem | event_vmem_store | event_smem); @@ -333,7 +336,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic return imm; } -void force_waitcnt(wait_ctx& ctx, wait_imm& imm) +void +force_waitcnt(wait_ctx& ctx, wait_imm& imm) { if (ctx.vm_cnt) imm.vm = 0; @@ -348,7 +352,8 @@ void force_waitcnt(wait_ctx& ctx, wait_imm& imm) } } -wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) +wait_imm +kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) { wait_imm imm; @@ -364,7 +369,6 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) imm.combine(parse_wait_instr(ctx, instr)); - /* It's required to wait for scalar stores before "writing back" data. * It shouldn't cost anything anyways since we're about to do s_endpgm. */ @@ -380,20 +384,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) * * TODO: Refine this when we have proper alias analysis. */ - if (ctx.pending_s_buffer_store && - !instr->smem().definitions.empty() && + if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() && !instr->smem().sync.can_reorder()) { imm.lgkm = 0; } } if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) { - if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && - instr->exp().dest < V_008DFC_SQ_EXP_PRIM) { + if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) { - /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos export. - * Wait for all stores (and atomics) to complete, so PS can read them. - * TODO: This only really applies to DONE pos exports. Consider setting the DONE bit earlier. + /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos + * export. Wait for all stores (and atomics) to complete, so PS can read them. + * TODO: This only really applies to DONE pos exports. + * Consider setting the DONE bit earlier. 
*/ if (ctx.vs_cnt > 0) imm.vs = 0; @@ -444,9 +447,8 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) } /* remove all gprs with higher counter from map */ - std::map::iterator it = ctx.gpr_map.begin(); - while (it != ctx.gpr_map.end()) - { + std::map::iterator it = ctx.gpr_map.begin(); + while (it != ctx.gpr_map.end()) { if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp) ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp); if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm) @@ -472,13 +474,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) return imm; } -void update_barrier_counter(uint8_t *ctr, unsigned max) +void +update_barrier_counter(uint8_t* ctr, unsigned max) { if (*ctr != wait_imm::unset_counter && *ctr < max) (*ctr)++; } -void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync) +void +update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync) { for (unsigned i = 0; i < storage_count; i++) { wait_imm& bar = ctx.barrier_imm[i]; @@ -506,7 +510,8 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memor } } -void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info()) +void +update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info()) { uint8_t counters = get_counters_for_event(event); @@ -529,7 +534,7 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo if (ctx.pending_flat_vm) counters &= ~counter_vm; - for (std::pair& e : ctx.gpr_map) { + for (std::pair& e : ctx.gpr_map) { wait_entry& entry = e.second; if (entry.events & ctx.unordered_events) @@ -537,18 +542,23 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo assert(entry.events); - if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt) + if ((counters & counter_exp) && (entry.events & exp_events) == event && + entry.imm.exp < ctx.max_exp_cnt) entry.imm.exp++; - if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt) + if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && + entry.imm.lgkm < ctx.max_lgkm_cnt) entry.imm.lgkm++; - if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt) + if ((counters & counter_vm) && (entry.events & vm_events) == event && + entry.imm.vm < ctx.max_vm_cnt) entry.imm.vm++; - if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt) + if ((counters & counter_vs) && (entry.events & vs_events) == event && + entry.imm.vs < ctx.max_vs_cnt) entry.imm.vs++; } } -void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info()) +void +update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info()) { assert(ctx.chip_class < GFX10); @@ -559,8 +569,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync); - for (std::pair e : ctx.gpr_map) - { + for (std::pair e : ctx.gpr_map) { if (e.second.counters & counter_vm) e.second.imm.vm = 0; if (e.second.counters & counter_lgkm) @@ -570,8 +579,9 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s ctx.pending_flat_vm = true; } -void 
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read, - bool has_sampler=false) +void +insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read, + bool has_sampler = false) { uint16_t counters = get_counters_for_event(event); wait_imm imm; @@ -589,24 +599,27 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler; for (unsigned i = 0; i < rc.size(); i++) { - auto it = ctx.gpr_map.emplace(PhysReg{reg.reg()+i}, new_entry); + auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry); if (!it.second) it.first->second.join(new_entry); } } -void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler=false) +void +insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler = false) { if (!op.isConstant() && !op.isUndefined()) insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler); } -void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler=false) +void +insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler = false) { insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler); } -void gen(Instruction* instr, wait_ctx& ctx) +void +gen(Instruction* instr, wait_ctx& ctx) { switch (instr->format) { case Format::EXP: { @@ -622,13 +635,11 @@ void gen(Instruction* instr, wait_ctx& ctx) update_counters(ctx, ev); /* insert new entries for exported vgprs */ - for (unsigned i = 0; i < 4; i++) - { + for (unsigned i = 0; i < 4; i++) { if (exp_instr.enabled_mask & (1 << i)) { unsigned idx = exp_instr.compressed ? i >> 1 : i; assert(idx < exp_instr.operands.size()); insert_wait_entry(ctx, exp_instr.operands[idx], ev); - } } insert_wait_entry(ctx, exec, s2, ev, false); @@ -651,8 +662,7 @@ void gen(Instruction* instr, wait_ctx& ctx) if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], event_smem); - else if (ctx.chip_class >= GFX10 && - !smem.sync.can_reorder()) + else if (ctx.chip_class >= GFX10 && !smem.sync.can_reorder()) ctx.pending_s_buffer_store = true; break; @@ -677,23 +687,21 @@ void gen(Instruction* instr, wait_ctx& ctx) case Format::MTBUF: case Format::MIMG: case Format::GLOBAL: { - wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store; + wait_event ev = + !instr->definitions.empty() || ctx.chip_class < GFX10 ? 
event_vmem : event_vmem_store; update_counters(ctx, ev, get_sync_info(instr)); - bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4; + bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && + instr->operands[1].regClass() == s4; if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler); - if (ctx.chip_class == GFX6 && - instr->format != Format::MIMG && - instr->operands.size() == 4) { + if (ctx.chip_class == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) { ctx.exp_cnt++; update_counters(ctx, event_vmem_gpr_lock); insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock); - } else if (ctx.chip_class == GFX6 && - instr->isMIMG() && - !instr->operands[2].isUndefined()) { + } else if (ctx.chip_class == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) { ctx.exp_cnt++; update_counters(ctx, event_vmem_gpr_lock); insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock); @@ -702,35 +710,37 @@ void gen(Instruction* instr, wait_ctx& ctx) break; } case Format::SOPP: { - if (instr->opcode == aco_opcode::s_sendmsg || - instr->opcode == aco_opcode::s_sendmsghalt) + if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt) update_counters(ctx, event_sendmsg); break; } - default: - break; + default: break; } } -void emit_waitcnt(wait_ctx& ctx, std::vector>& instructions, wait_imm imm) +void +emit_waitcnt(wait_ctx& ctx, std::vector>& instructions, wait_imm imm) { if (imm.vs != wait_imm::unset_counter) { assert(ctx.chip_class >= GFX10); - SOPK_instruction* waitcnt_vs = create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1); + SOPK_instruction* waitcnt_vs = + create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1); waitcnt_vs->definitions[0] = Definition(sgpr_null, s1); waitcnt_vs->imm = imm.vs; instructions.emplace_back(waitcnt_vs); imm.vs = wait_imm::unset_counter; } if (!imm.empty()) { - SOPP_instruction* waitcnt = create_instruction(aco_opcode::s_waitcnt, Format::SOPP, 0, 0); + SOPP_instruction* waitcnt = + create_instruction(aco_opcode::s_waitcnt, Format::SOPP, 0, 0); waitcnt->imm = imm.pack(ctx.chip_class); waitcnt->block = -1; instructions.emplace_back(waitcnt); } } -void handle_block(Program *program, Block& block, wait_ctx& ctx) +void +handle_block(Program* program, Block& block, wait_ctx& ctx) { std::vector> new_instructions; @@ -763,7 +773,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) } /* end namespace */ -void insert_wait_states(Program* program) +void +insert_wait_states(Program* program) { /* per BB ctx */ std::vector done(program->blocks.size()); @@ -818,5 +829,4 @@ void insert_wait_states(Program* program) } } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 2af31108aae..c7bdbb8b3c4 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -47,14 +47,15 @@ namespace { #define isel_err(...) 
_isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__) -static void _isel_err(isel_context *ctx, const char *file, unsigned line, - const nir_instr *instr, const char *msg) +static void +_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr, + const char* msg) { - char *out; + char* out; size_t outsize; struct u_memstream mem; u_memstream_open(&mem, &out, &outsize); - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); fprintf(memf, "%s: ", msg); nir_print_instr(instr, memf); @@ -90,43 +91,48 @@ struct loop_context { bool divergent_if_old; }; -static bool visit_cf_list(struct isel_context *ctx, - struct exec_list *list); +static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list); -static void add_logical_edge(unsigned pred_idx, Block *succ) +static void +add_logical_edge(unsigned pred_idx, Block* succ) { succ->logical_preds.emplace_back(pred_idx); } - -static void add_linear_edge(unsigned pred_idx, Block *succ) +static void +add_linear_edge(unsigned pred_idx, Block* succ) { succ->linear_preds.emplace_back(pred_idx); } -static void add_edge(unsigned pred_idx, Block *succ) +static void +add_edge(unsigned pred_idx, Block* succ) { add_logical_edge(pred_idx, succ); add_linear_edge(pred_idx, succ); } -static void append_logical_start(Block *b) +static void +append_logical_start(Block* b) { Builder(NULL, b).pseudo(aco_opcode::p_logical_start); } -static void append_logical_end(Block *b) +static void +append_logical_end(Block* b) { Builder(NULL, b).pseudo(aco_opcode::p_logical_end); } -Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def) +Temp +get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def) { uint32_t id = ctx->first_temp_id + def->index; return Temp(id, ctx->program->temp_rc[id]); } -Temp emit_mbcnt(isel_context *ctx, Temp dst, Operand mask = Operand(), Operand base = Operand(0u)) +Temp +emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand(0u)) { Builder bld(ctx->program, ctx->block); assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec)); @@ -142,7 +148,8 @@ Temp emit_mbcnt(isel_context *ctx, Temp dst, Operand mask = Operand(), Operand b if (mask.isTemp()) { RegClass rc = RegClass(mask.regClass().type(), 1); - Builder::Result mask_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask); + Builder::Result mask_split = + bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask); mask_lo = Operand(mask_split.def(0).getTemp()); mask_hi = Operand(mask_split.def(1).getTemp()); } else if (mask.physReg() == exec) { @@ -158,7 +165,8 @@ Temp emit_mbcnt(isel_context *ctx, Temp dst, Operand mask = Operand(), Operand b return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo); } -Temp emit_wqm(Builder& bld, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false) +Temp +emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false) { if (!dst.id()) dst = bld.tmp(src.regClass()); @@ -178,7 +186,8 @@ Temp emit_wqm(Builder& bld, Temp src, Temp dst=Temp(0, s1), bool program_needs_w return dst; } -static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data) +static Temp +emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data) { if (index.regClass() == s1) return bld.readlane(bld.def(s1), data, index); @@ -190,14 +199,18 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data 
index_op.setLateKill(true); input_data.setLateKill(true); - return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data); + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), + index_op, input_data); } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) { /* GFX10 wave64 mode: emulate full-wave bpermute */ Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index); - Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo); - Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp()); - Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1); + Builder::Result index_is_lo_split = + bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo); + Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), + index_is_lo_split.def(1).getTemp()); + Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), + index_is_lo_split.def(0).getTemp(), index_is_lo_n1); Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); Operand input_data(data); @@ -209,7 +222,8 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data * Note, that these have twice the allocation granularity of normal VGPRs */ ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule; - return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half); + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), + index_x4, input_data, same_half); } else { /* GFX8-9 or GFX10 wave32: bpermute works normally */ Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); @@ -217,7 +231,8 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data } } -static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask) +static Temp +emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask) { if (ctx->options->chip_class >= GFX8) { unsigned and_mask = mask & 0x1f; @@ -247,7 +262,8 @@ static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsig return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false); } -Temp as_vgpr(isel_context *ctx, Temp val) +Temp +as_vgpr(isel_context* ctx, Temp val) { if (val.type() == RegType::sgpr) { Builder bld(ctx->program, ctx->block); @@ -257,8 +273,9 @@ Temp as_vgpr(isel_context *ctx, Temp val) return val; } -//assumes a != 0xffffffff -void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b) +// assumes a != 0xffffffff +void +emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b) { assert(b != 0); Builder bld(ctx->program, ctx->block); @@ -285,13 +302,14 @@ void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b) Temp pre_shift_dst = a; if (pre_shift) { pre_shift_dst = (increment || multiply || post_shift) ? 
bld.tmp(v1) : dst; - bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a); + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), + Operand((uint32_t)info.pre_shift), a); } Temp increment_dst = pre_shift_dst; if (increment) { increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst; - bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst); + bld.vadd32(Definition(increment_dst), Operand((uint32_t)info.increment), pre_shift_dst); } Temp multiply_dst = increment_dst; @@ -302,18 +320,20 @@ void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b) } if (post_shift) { - bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst); + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), + multiply_dst); } } -void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) +void +emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) { Builder bld(ctx->program, ctx->block); bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx)); } - -Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc) +Temp +emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc) { /* no need to extract the whole vector */ if (src.regClass() == dst_rc) { @@ -347,7 +367,8 @@ Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst } } -void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) +void +emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) { if (num_components == 1) return; @@ -365,9 +386,10 @@ void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) } else { rc = RegClass(vec_src.type(), vec_src.size() / num_components); } - aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)}; + aco_ptr split{create_instruction( + aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)}; split->operands[0] = Operand(vec_src); - std::array elems; + std::array elems; for (unsigned i = 0; i < num_components; i++) { elems[i] = ctx->program->allocateTmp(rc); split->definitions[i] = Definition(elems[i]); @@ -378,7 +400,8 @@ void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) /* This vector expansion uses a mask to determine which elements in the new vector * come from the original vector. The other elements are undefined. 
*/ -void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) +void +expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) { emit_split_vector(ctx, vec_src, util_bitcount(mask)); @@ -395,14 +418,16 @@ void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_compo } unsigned component_size = dst.size() / num_components; - std::array elems; + std::array elems; - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; vec->definitions[0] = Definition(dst); unsigned k = 0; for (unsigned i = 0; i < num_components; i++) { if (mask & (1 << i)) { - Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size)); + Temp src = + emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size)); if (dst.type() == RegType::sgpr) src = bld.as_uniform(src); vec->operands[i] = Operand(src); @@ -416,7 +441,8 @@ void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_compo } /* adjust misaligned small bit size loads */ -void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) +void +byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst) { Builder bld(ctx->program, ctx->block); Operand shift; @@ -426,9 +452,11 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) shift = Operand(offset.constantValue() * 8); } else { /* bit_offset = 8 * (offset & 0x3) */ - Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u)); + Temp tmp = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u)); select = bld.tmp(s1); - shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u)); + shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, + Operand(3u)); } if (vec.size() == 1) { @@ -463,7 +491,8 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) } } -void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size) +void +byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size) { Builder bld(ctx->program, ctx->block); if (offset.isTemp()) { @@ -471,10 +500,12 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un if (vec.size() == 4) { tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), + Definition(tmp[2]), Definition(tmp[3]), vec); } else if (vec.size() == 3) { tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), + Definition(tmp[2]), vec); } else if (vec.size() == 2) { tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); @@ -506,17 +537,18 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un for (unsigned i = skip; i < 
num_components; i++) elems[i - skip] = emit_extract_vector(ctx, vec, i, rc); - /* if dst is vgpr - split the src and create a shrunk version according to the mask. */ if (dst.type() == RegType::vgpr) { + /* if dst is vgpr - split the src and create a shrunk version according to the mask. */ num_components = dst.bytes() / component_size; - aco_ptr create_vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + aco_ptr create_vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; for (unsigned i = 0; i < num_components; i++) create_vec->operands[i] = Operand(elems[i]); create_vec->definitions[0] = Definition(dst); bld.insert(std::move(create_vec)); - /* if dst is sgpr - split the src, but move the original to sgpr. */ } else if (skip) { + /* if dst is sgpr - split the src, but move the original to sgpr. */ vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec); byte_align_scalar(ctx, vec, offset, dst); } else { @@ -527,7 +559,8 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un ctx->allocated_vec.emplace(dst.id(), elems); } -Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2)) +Temp +bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2)) { Builder bld(ctx->program, ctx->block); if (!dst.id()) @@ -536,10 +569,12 @@ Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2 assert(val.regClass() == s1); assert(dst.regClass() == bld.lm); - return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val)); + return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t)-1), Operand(0u), + bld.scc(val)); } -Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1)) +Temp +bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1)) { Builder bld(ctx->program, ctx->block); if (!dst.id()) @@ -563,9 +598,12 @@ Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1 * * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined. 
*/ -Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool sign_extend, Temp dst=Temp()) +Temp +convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, + bool sign_extend, Temp dst = Temp()) { - assert(!(sign_extend && dst_bits < src_bits) && "Shrinking integers is not supported for signed inputs"); + assert(!(sign_extend && dst_bits < src_bits) && + "Shrinking integers is not supported for signed inputs"); if (!dst.id()) { if (dst_bits % 32 == 0 || src.type() == RegType::sgpr) @@ -592,14 +630,15 @@ Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, u if (tmp == src) { } else if (src.regClass() == s1) { assert(src_bits < 32); - bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), - src, Operand(0u), Operand(src_bits), Operand((unsigned)sign_extend)); + bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand(0u), + Operand(src_bits), Operand((unsigned)sign_extend)); } else if (ctx->options->chip_class >= GFX8) { assert(src_bits < 32); assert(src_bits != 8 || src.regClass() == v1b); assert(src_bits != 16 || src.regClass() == v2b); assert(dst_bits >= 16); - aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; + aco_ptr sdwa{ + create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; sdwa->operands[0] = Operand(src); sdwa->definitions[0] = Definition(tmp); if (sign_extend) @@ -617,7 +656,8 @@ Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, u if (dst_bits == 64) { if (sign_extend && dst.regClass() == s2) { - Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u)); + Temp high = + bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); } else if (sign_extend && dst.regClass() == v2) { Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp); @@ -636,7 +676,8 @@ enum sgpr_extract_mode { sgpr_extract_undef, }; -Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src, sgpr_extract_mode mode) +Temp +extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode) { Temp vec = get_ssa_temp(ctx, src->src.ssa); unsigned src_size = src->src.ssa->bit_size; @@ -655,7 +696,8 @@ Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src bld.copy(Definition(tmp), vec); else bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec), - Operand(swizzle), Operand(src_size), Operand((uint32_t)(mode == sgpr_extract_sext))); + Operand(swizzle), Operand(src_size), + Operand((uint32_t)(mode == sgpr_extract_sext))); if (dst.regClass() == s2) convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst); @@ -663,7 +705,8 @@ Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src return dst; } -Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) +Temp +get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1) { if (src.src.ssa->num_components == 1 && size == 1) return get_ssa_temp(ctx, src.src.ssa); @@ -685,17 +728,19 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) if (elem_size < 4 && vec.type() == RegType::sgpr) { assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16); assert(size == 1); - return 
extract_8_16_bit_sgpr_element( - ctx, ctx->program->allocateTmp(s1), &src, sgpr_extract_undef); + return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src, + sgpr_extract_undef); } - RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4); + RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() + : RegClass(vec.type(), elem_size / 4); if (size == 1) { return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc); } else { assert(size <= 4); - std::array elems; - aco_ptr vec_instr{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; + std::array elems; + aco_ptr vec_instr{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; for (unsigned i = 0; i < size; ++i) { elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc); vec_instr->operands[i] = Operand{elems[i]}; @@ -708,7 +753,8 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) } } -Temp get_alu_src_vop3p(struct isel_context *ctx, nir_alu_src src) +Temp +get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src) { /* returns v2b or v1 for vop3p usage. * The source expects exactly 2 16bit components @@ -735,28 +781,32 @@ Temp get_alu_src_vop3p(struct isel_context *ctx, nir_alu_src src) } } -uint32_t get_alu_src_ub(isel_context *ctx, nir_alu_instr *instr, int src_idx) +uint32_t +get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx) { - nir_ssa_scalar scalar = nir_ssa_scalar{instr->src[src_idx].src.ssa, - instr->src[src_idx].swizzle[0]}; + nir_ssa_scalar scalar = + nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]}; return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config); } -Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr, bool non_uniform=false) +Temp +convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false) { if (ptr.size() == 2) return ptr; Builder bld(ctx->program, ctx->block); if (ptr.type() == RegType::vgpr && !non_uniform) ptr = bld.as_uniform(ptr); - return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), - ptr, Operand((unsigned)ctx->options->address32_hi)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr, + Operand((unsigned)ctx->options->address32_hi)); } -void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, - Temp dst, bool writes_scc, uint8_t uses_ub = 0) +void +emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool writes_scc, uint8_t uses_ub = 0) { - aco_ptr sop2{create_instruction(op, Format::SOP2, 2, writes_scc ? 2 : 1)}; + aco_ptr sop2{ + create_instruction(op, Format::SOP2, 2, writes_scc ? 
2 : 1)}; sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); sop2->definitions[0] = Definition(dst); @@ -778,10 +828,10 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o ctx->block->instructions.emplace_back(std::move(sop2)); } -void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, - bool commutative, bool swap_srcs=false, - bool flush_denorms = false, bool nuw = false, - uint8_t uses_ub = 0) +void +emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool commutative, bool swap_srcs = false, bool flush_denorms = false, + bool nuw = false, uint8_t uses_ub = 0) { Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; @@ -824,8 +874,8 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o } } -void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr, - aco_opcode op, Temp dst) +void +emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) { Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; @@ -849,11 +899,12 @@ void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr, bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); } -void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, - bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false) +void +emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false) { assert(num_sources == 2 || num_sources == 3); - Temp src[3] = { Temp(0, v1), Temp(0, v1), Temp(0, v1) }; + Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; bool has_sgpr = false; for (unsigned i = 0; i < num_sources; i++) { src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 
1 - i : i]); @@ -874,7 +925,8 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode if (dst.size() == 1) bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp); else - bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(UINT64_C(0x3FF0000000000000)), tmp); + bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(UINT64_C(0x3FF0000000000000)), + tmp); } else if (num_sources == 3) { bld.vop3(op, Definition(dst), src[0], src[1], src[2]); } else { @@ -882,8 +934,9 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode } } -Builder::Result emit_vop3p_instruction(isel_context *ctx, nir_alu_instr *instr, - aco_opcode op, Temp dst, bool swap_srcs=false) +Builder::Result +emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool swap_srcs = false) { Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]); Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]); @@ -892,8 +945,10 @@ Builder::Result emit_vop3p_instruction(isel_context *ctx, nir_alu_instr *instr, assert(instr->dest.dest.ssa.num_components == 2); /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ - unsigned opsel_lo = (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1); - unsigned opsel_hi = (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1); + unsigned opsel_lo = + (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1); + unsigned opsel_hi = + (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1); Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; @@ -902,7 +957,8 @@ Builder::Result emit_vop3p_instruction(isel_context *ctx, nir_alu_instr *instr, return res; } -void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +void +emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) { Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; @@ -913,7 +969,8 @@ void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); } -void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +void +emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); @@ -924,62 +981,25 @@ void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o if (src0.type() == RegType::vgpr) { /* to swap the operands, we might also have to change the opcode */ switch (op) { - case aco_opcode::v_cmp_lt_f16: - op = aco_opcode::v_cmp_gt_f16; - break; - case aco_opcode::v_cmp_ge_f16: - op = aco_opcode::v_cmp_le_f16; - break; - case aco_opcode::v_cmp_lt_i16: - op = aco_opcode::v_cmp_gt_i16; - break; - case aco_opcode::v_cmp_ge_i16: - op = aco_opcode::v_cmp_le_i16; - break; - case aco_opcode::v_cmp_lt_u16: - op = aco_opcode::v_cmp_gt_u16; - break; - case aco_opcode::v_cmp_ge_u16: - op = aco_opcode::v_cmp_le_u16; - break; - case aco_opcode::v_cmp_lt_f32: - op = aco_opcode::v_cmp_gt_f32; - break; - case aco_opcode::v_cmp_ge_f32: - op = aco_opcode::v_cmp_le_f32; - break; - case aco_opcode::v_cmp_lt_i32: - op = aco_opcode::v_cmp_gt_i32; - break; - case aco_opcode::v_cmp_ge_i32: - op = aco_opcode::v_cmp_le_i32; - break; - case aco_opcode::v_cmp_lt_u32: - op = 
aco_opcode::v_cmp_gt_u32; - break; - case aco_opcode::v_cmp_ge_u32: - op = aco_opcode::v_cmp_le_u32; - break; - case aco_opcode::v_cmp_lt_f64: - op = aco_opcode::v_cmp_gt_f64; - break; - case aco_opcode::v_cmp_ge_f64: - op = aco_opcode::v_cmp_le_f64; - break; - case aco_opcode::v_cmp_lt_i64: - op = aco_opcode::v_cmp_gt_i64; - break; - case aco_opcode::v_cmp_ge_i64: - op = aco_opcode::v_cmp_le_i64; - break; - case aco_opcode::v_cmp_lt_u64: - op = aco_opcode::v_cmp_gt_u64; - break; - case aco_opcode::v_cmp_ge_u64: - op = aco_opcode::v_cmp_le_u64; - break; - default: /* eq and ne are commutative */ - break; + case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break; + case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break; + case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break; + case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break; + case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break; + case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break; + case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break; + case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break; + case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break; + case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break; + case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break; + case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break; + case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break; + case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break; + case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break; + case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break; + case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break; + case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break; + default: /* eq and ne are commutative */ break; } Temp t = src0; src0 = src1; @@ -993,7 +1013,8 @@ void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1); } -void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +void +emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); @@ -1010,13 +1031,18 @@ void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o bool_to_vector_condition(ctx, cmp, dst); } -void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, - aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes) +void +emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op, + aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, + aco_opcode s64_op = aco_opcode::num_opcodes) { - aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes; - aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op; - bool use_valu = s_op == aco_opcode::num_opcodes || - nir_dest_is_divergent(instr->dest.dest) || + aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op + : instr->src[0].src.ssa->bit_size == 32 ? 
s32_op + : aco_opcode::num_opcodes; + aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op + : instr->src[0].src.ssa->bit_size == 32 ? v32_op + : v16_op; + bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) || get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr || get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr; aco_opcode op = use_valu ? v_op : s_op; @@ -1029,7 +1055,9 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, emit_sopc_instruction(ctx, instr, op, dst); } -void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst) +void +emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op, + Temp dst) { Builder bld(ctx->program, ctx->block); Temp src0 = get_alu_src(ctx, instr->src[0]); @@ -1042,7 +1070,8 @@ void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSp bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); } -void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) +void +emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst) { Builder bld(ctx->program, ctx->block); Temp cond = get_alu_src(ctx, instr->src[0]); @@ -1082,9 +1111,11 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */ if (dst.regClass() == s1 || dst.regClass() == s2) { - assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass()); + assert((then.regClass() == s1 || then.regClass() == s2) && + els.regClass() == then.regClass()); assert(dst.size() == then.size()); - aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; + aco_opcode op = + dst.regClass() == s1 ? 
aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond))); } else { isel_err(&instr->instr, "Unimplemented uniform bcsel bit size"); @@ -1107,12 +1138,14 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond)); } -void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val, - aco_opcode op, uint32_t undo) +void +emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op, + uint32_t undo) { /* multiply by 16777216 to handle denormals */ - Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), - as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4)))); + Temp is_denormal = + bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val), + bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4)))); Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val); scaled = bld.vop1(op, bld.def(v1), scaled); scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled); @@ -1122,7 +1155,8 @@ void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val, bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal); } -void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val) +void +emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->block->fp_mode.denorm32 == 0) { bld.vop1(aco_opcode::v_rcp_f32, dst, val); @@ -1132,7 +1166,8 @@ void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val) emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u); } -void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val) +void +emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->block->fp_mode.denorm32 == 0) { bld.vop1(aco_opcode::v_rsq_f32, dst, val); @@ -1142,7 +1177,8 @@ void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val) emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u); } -void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val) +void +emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->block->fp_mode.denorm32 == 0) { bld.vop1(aco_opcode::v_sqrt_f32, dst, val); @@ -1152,7 +1188,8 @@ void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val) emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u); } -void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val) +void +emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->block->fp_mode.denorm32 == 0) { bld.vop1(aco_opcode::v_log_f32, dst, val); @@ -1162,7 +1199,8 @@ void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val) emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u); } -Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) +Temp +emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->options->chip_class >= GFX7) return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val); @@ -1181,11 +1219,13 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u)); /* Extract the fractional part. 
*/ - Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu)); + Temp fract_mask = + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu)); fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent); Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask); + bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), + fract_mask); Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1); Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo); @@ -1197,8 +1237,10 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi); /* Decide the operation to apply depending on the unbiased exponent. */ - Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u)); - Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0); + Temp exp_lt0 = + bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u)); + Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, + bld.copy(bld.def(v1), Operand(0u)), exp_lt0); Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0); Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u)); dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51); @@ -1207,7 +1249,8 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi); } -Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) +Temp +emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->options->chip_class >= GFX7) return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); @@ -1217,9 +1260,11 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) Temp src0 = as_vgpr(ctx, val); Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */ - Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu)); + Temp min_val = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu)); - Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask); + Temp isnan = + bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask); Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0); Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val); @@ -1239,11 +1284,13 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) return add->definitions[0].getTemp(); } -Temp uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) +Temp +uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) { if (bld.program->chip_class < GFX8) { Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true); - return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand((uint32_t) -1), add.def(1).getTemp()); + return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), + Operand((uint32_t)-1), add.def(1).getTemp()); } 
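   /* Editorial sketch, not part of the original diff: the behaviour the pre-GFX8 path of
    * uadd32_sat() above implements - the carry output of vadd32 drives v_cndmask_b32 so
    * that an overflowing add returns 0xffffffff instead of wrapping. Assuming plain
    * 32-bit unsigned inputs (uint32_t/uint64_t), a minimal C++ reference of that
    * saturating add is: */
   auto uadd32_sat_ref = [](uint32_t a, uint32_t b) -> uint32_t
   {
      uint64_t sum = uint64_t(a) + b;                         /* widen so the carry is observable */
      return sum > 0xffffffffu ? 0xffffffffu : uint32_t(sum); /* clamp on overflow */
   };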
Builder::Result add(NULL); @@ -1256,7 +1303,8 @@ Temp uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) return dst.getTemp(); } -void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) +void +visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) { if (!instr->dest.dest.is_ssa) { isel_err(&instr->instr, "nir alu dst not in ssa"); @@ -1265,18 +1313,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa); - switch(instr->op) { + switch (instr->op) { case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: case nir_op_vec5: { - std::array elems; + std::array elems; unsigned num = instr->dest.dest.ssa.num_components; for (unsigned i = 0; i < num; ++i) elems[i] = get_alu_src(ctx, instr->src[i]); if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u); for (unsigned i = 0; i < num; ++i) { if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword()) @@ -1291,7 +1340,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bool use_s_pack = ctx->program->chip_class >= GFX9; Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1)); - std::array packed; + std::array packed; uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {}; for (unsigned i = 0; i < num; i++) { unsigned packed_size = use_s_pack ? 16 : 32; @@ -1303,32 +1352,36 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } if (offset != packed_size - instr->dest.dest.ssa.bit_size) - elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask); + elems[i] = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask); if (offset) - elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), - elems[i], Operand(offset)); + elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i], + Operand(offset)); if (packed[idx].id()) - packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), - elems[i], packed[idx]); + packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i], + packed[idx]); else packed[idx] = elems[i]; } if (use_s_pack) { for (unsigned i = 0; i < dst.size(); i++) { - bool same = !!packed[i*2].id() == !!packed[i*2+1].id(); + bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id(); - if (packed[i*2].id() && packed[i*2+1].id()) - packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i*2], packed[i*2+1]); - else if (packed[i*2+1].id()) - packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), Operand(const_vals[i * 2]), packed[i*2+1]); - else if (packed[i*2].id()) - packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i*2], Operand(const_vals[i * 2 + 1])); + if (packed[i * 2].id() && packed[i * 2 + 1].id()) + packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], + packed[i * 2 + 1]); + else if (packed[i * 2 + 1].id()) + packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), + Operand(const_vals[i * 2]), packed[i * 2 + 1]); + else if (packed[i * 2].id()) + packed[i] = 
bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], + Operand(const_vals[i * 2 + 1])); if (same) - const_vals[i] = const_vals[i*2] | (const_vals[i*2+1] << 16); + const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16); else const_vals[i] = 0; } @@ -1347,7 +1400,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) else if (dst.size() == 2) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]); else - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1], packed[2]); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1], + packed[2]); } break; } @@ -1392,7 +1446,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s1) { bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src); } else if (dst.regClass() == v1) { - bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src)); + bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, + bld.vsub32(bld.def(v1), Operand(0u), src)); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -1401,15 +1456,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_isign: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == s1) { - Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1)); + Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, + Operand((uint32_t)-1)); bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u)); } else if (dst.regClass() == s2) { - Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u)); + Temp neg = + bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u)); Temp neqz; if (ctx->program->chip_class >= GFX8) neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u)); else - neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp(); + neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)) + .def(1) + .getTemp(); /* SCC gets zero-extended to 64 bit */ bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz)); } else if (dst.regClass() == v1) { @@ -1417,7 +1476,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v2) { Temp upper = emit_extract_vector(ctx, src, 1, v1); Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper); - Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + Temp gtz = + bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz); upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); @@ -1548,8 +1608,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true); } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { - bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), - get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), + 
get_alu_src(ctx, instr->src[0])); } else if (dst.regClass() == v2) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst); } else if (dst.regClass() == s2) { @@ -1569,10 +1629,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true); } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, false, 1); + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, + false, 1); } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { - bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), - get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), + get_alu_src(ctx, instr->src[0])); } else if (dst.regClass() == v2) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst); } else if (dst.regClass() == s1) { @@ -1594,8 +1655,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true); } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { - bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), - get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]), + get_alu_src(ctx, instr->src[0])); } else if (dst.regClass() == v2) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst); } else if (dst.regClass() == s1) { @@ -1624,9 +1685,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ifind_msb: { Temp src = get_alu_src(ctx, instr->src[0]); if (src.regClass() == s1 || src.regClass() == s2) { - aco_opcode op = src.regClass() == s2 ? - (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) : - (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32); + aco_opcode op = src.regClass() == s2 + ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 + : aco_opcode::s_flbit_i32_i64) + : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 + : aco_opcode::s_flbit_i32); Temp msb_rev = bld.sop1(op, bld.def(s1), src); Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), @@ -1634,30 +1697,38 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp msb = sub.def(0).getTemp(); Temp carry = sub.def(1).getTemp(); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry)); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, + bld.scc(carry)); } else if (src.regClass() == v1) { - aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; + aco_opcode op = + instr->op == nir_op_ufind_msb ? 
aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; Temp msb_rev = bld.tmp(v1); emit_vop1_instruction(ctx, instr, op, msb_rev); Temp msb = bld.tmp(v1); - Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp(); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry); + Temp carry = + bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), + carry); } else if (src.regClass() == v2) { - aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; + aco_opcode op = + instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; Temp lo = bld.tmp(v1), hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand(32u)), - bld.vop1(op, bld.def(v1), lo)); + bld.vop1(op, bld.def(v1), lo)); hi = bld.vop1(op, bld.def(v1), hi); - Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand((uint32_t)-1), hi); + Temp found_hi = + bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand((uint32_t)-1), hi); Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi); Temp msb = bld.tmp(v1); - Temp carry = bld.vsub32(Definition(msb), Operand(63u), Operand(msb_rev), true).def(1).getTemp(); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry); + Temp carry = + bld.vsub32(Definition(msb), Operand(63u), Operand(msb_rev), true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), + carry); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -1705,8 +1776,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s2) { Temp carry = bld.tmp(s1); - Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); - Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry)); + Temp dst0 = + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, + bld.scc(carry)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); } else if (dst.regClass() == v2) { Temp dst0 = bld.tmp(v1); @@ -1723,17 +1796,18 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src1 = get_alu_src(ctx, instr->src[1]); if (dst.regClass() == s1) { Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); - bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), - src0, src1); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry)); + bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), tmp, + bld.scc(carry)); } else if (dst.regClass() == v2b) { - Instruction *add_instr; + Instruction* add_instr; if (ctx->program->chip_class >= GFX10) { add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr; } else { if (src1.type() == RegType::sgpr) std::swap(src0, src1); - add_instr = bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr; + add_instr = + 
bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr; } add_instr->vop3().clamp = 1; } else if (dst.regClass() == v1) { @@ -1765,12 +1839,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s2) { Temp carry = bld.tmp(s1); bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); - carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp(); + carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, + bld.scc(carry)) + .def(1) + .getTemp(); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u)); } else if (dst.regClass() == v2) { Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp(); carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp(); - carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry); + carry = + bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u)); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -1811,8 +1889,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); if (dst.regClass() == s2) { Temp borrow = bld.tmp(s1); - Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); - Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(borrow)); + Temp dst0 = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, + bld.scc(borrow)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); } else if (dst.regClass() == v2) { Temp lower = bld.tmp(v1); @@ -1845,12 +1925,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s2) { Temp borrow = bld.tmp(s1); bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); - borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp(); + borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, + bld.scc(borrow)) + .def(1) + .getTemp(); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u)); } else if (dst.regClass() == v2) { Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp(); borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp(); - borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow); + borrow = + bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u)); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -1870,25 +1954,22 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); - if (src0_ub <= 0xffff && src1_ub <= 0xffff && - src0_ub * src1_ub <= 0xffff && - (ctx->options->chip_class == GFX8 || - ctx->options->chip_class == GFX9)) { + if (src0_ub <= 0xffff && src1_ub <= 0xffff && 
src0_ub * src1_ub <= 0xffff && + (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9)) { /* If the 16-bit multiplication can't overflow, emit v_mul_lo_u16 * but only on GFX8-9 because GFX10 doesn't zero the upper 16 * bits. */ - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, - true /* commutative */, false, false, - true /* nuw */); - } else if (src0_ub <= 0xffff && src1_ub <= 0xffff && - ctx->options->chip_class >= GFX9) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true /* commutative */, + false, false, true /* nuw */); + } else if (src0_ub <= 0xffff && src1_ub <= 0xffff && ctx->options->chip_class >= GFX9) { /* Initialize the accumulator to 0 to allow further combinations * in the optimizer. */ Operand op0(src0); Operand op1(src1); - bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0), bld.set16bit(op1), Operand(0u)); + bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0), + bld.set16bit(op1), Operand(0u)); } else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true); } else if (nir_src_is_const(instr->src[0].src)) { @@ -1992,8 +2073,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) else emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true); } else if (dst.regClass() == v2) { - Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), - as_vgpr(ctx, src0), as_vgpr(ctx, src1)); + Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0), + as_vgpr(ctx, src1)); add->vop3().neg[1] = true; } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -2007,9 +2088,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst); } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, + ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst, ctx->block->fp_mode.must_flush_denorms16_64); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst, + ctx->block->fp_mode.must_flush_denorms16_64); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -2022,9 +2105,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true); } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, + ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst, ctx->block->fp_mode.must_flush_denorms16_64); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst, + ctx->block->fp_mode.must_flush_denorms16_64); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -2032,27 +2117,23 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_cube_face_coord_amd: { Temp in = get_alu_src(ctx, 
instr->src[0], 3); - Temp src[3] = { emit_extract_vector(ctx, in, 0, v1), - emit_extract_vector(ctx, in, 1, v1), - emit_extract_vector(ctx, in, 2, v1) }; + Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), + emit_extract_vector(ctx, in, 2, v1)}; Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]); ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma); Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]); Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]); - sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), - Operand(0x3f000000u/*0.5*/), + sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3f000000u /*0.5*/), bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma)); - tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), - Operand(0x3f000000u/*0.5*/), + tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3f000000u /*0.5*/), bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc); break; } case nir_op_cube_face_index_amd: { Temp in = get_alu_src(ctx, instr->src[0], 3); - Temp src[3] = { emit_extract_vector(ctx, in, 0, v1), - emit_extract_vector(ctx, in, 1, v1), - emit_extract_vector(ctx, in, 2, v1) }; + Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), + emit_extract_vector(ctx, in, 2, v1)}; bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]); break; } @@ -2084,12 +2165,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand((uint16_t)0xbc00u), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand((uint16_t)0xbc00u), + as_vgpr(ctx, src)); } else if (dst.regClass() == v1) { bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0xbf800000u), as_vgpr(ctx, src)); } else if (dst.regClass() == v2) { if (ctx->block->fp_mode.must_flush_denorms16_64) - src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src)); + src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), + Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src)); Temp upper = bld.tmp(v1), lower = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper); @@ -2102,14 +2185,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fabs: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Instruction *mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand((uint16_t)0x3c00), as_vgpr(ctx, src)).instr; + Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), + Operand((uint16_t)0x3c00), as_vgpr(ctx, src)) + .instr; mul->vop3().abs[1] = true; } else if (dst.regClass() == v1) { - Instruction *mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), as_vgpr(ctx, src)).instr; + Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst), + Operand(0x3f800000u), as_vgpr(ctx, src)) + .instr; mul->vop3().abs[1] = true; } else if (dst.regClass() == v2) { if (ctx->block->fp_mode.must_flush_denorms16_64) - src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src)); + src = 
bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), + Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src)); Temp upper = bld.tmp(v1), lower = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper); @@ -2122,18 +2210,21 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsat: { if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { Temp src = get_alu_src_vop3p(ctx, instr->src[0]); - Instruction* vop3p = bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand(uint16_t(0x3C00)), - instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); + Instruction* vop3p = + bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand(uint16_t(0x3C00)), + instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); vop3p->vop3p().clamp = true; emit_split_vector(ctx, dst, 2); break; } Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src); + bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), + Operand((uint16_t)0x3c00), src); } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); - /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */ + /* apparently, it is not necessary to flush denorms if this instruction is used with these + * operands */ // TODO: confirm that this holds under any circumstances } else if (dst.regClass() == v2) { Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u)); @@ -2234,10 +2325,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0); Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u)); - Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc); - Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1); - Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond); - add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add); + Temp tmp1 = + bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc); + Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), + tmp0, tmp1); + Temp add = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), + bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond); + add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), + bld.copy(bld.def(v1), Operand(0u)), add); bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add); } } else { @@ -2272,22 +2368,32 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0); - Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u))); - Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi)); - Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, 
bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); - Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); + Temp bitmask = + bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u))); + Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, + bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi)); + Temp tmp = + bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); + Instruction* sub = + bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); sub->vop3().neg[1] = true; tmp = sub->definitions[0].getTemp(); - Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu)); - Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v); + Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), + Operand(0x432fffffu)); + Instruction* vop3 = + bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v); vop3->vop3().abs[0] = true; Temp cond = vop3->definitions[0].getTemp(); Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp); - Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond); - Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond); + Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, + as_vgpr(ctx, src0_lo), cond); + Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, + as_vgpr(ctx, src0_hi), cond); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); } @@ -2303,7 +2409,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == v2b) { Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u)); Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src); - aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; + aco_opcode opcode = + instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; bld.vop1(opcode, Definition(dst), tmp); } else if (dst.regClass() == v1) { Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); @@ -2313,7 +2420,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (ctx->options->chip_class < GFX9) tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp); - aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; + aco_opcode opcode = + instr->op == nir_op_fsin ? 
aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; bld.vop1(opcode, Definition(dst), tmp); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -2365,16 +2473,20 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) assert(ctx->program->chip_class >= GFX9); /* replace negative zero with positive zero */ src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand(0u), src); - src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand((uint16_t)-1), src, Operand((uint16_t)1u)); + src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand((uint16_t)-1), src, + Operand((uint16_t)1u)); bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); } else if (dst.regClass() == v1) { src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0u), src); - src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand((uint32_t)-1), src, Operand(1u)); + src = + bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand((uint32_t)-1), src, Operand(1u)); bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); } else if (dst.regClass() == v2) { - Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + Temp cond = + bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); Temp tmp = bld.copy(bld.def(v1), Operand(0x3FF00000u)); - Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond); + Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, + emit_extract_vector(ctx, src, 1, v1), cond); cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); tmp = bld.copy(bld.def(v1), Operand(0xBFF00000u)); @@ -2673,14 +2785,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); - exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u)); + exponent = + bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u)); Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa); mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa); Temp new_exponent = bld.tmp(v1); - Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp(); + Temp borrow = + bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp(); if (ctx->program->chip_class >= GFX8) mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa); else @@ -2688,7 +2802,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu)); Temp lower = bld.tmp(v1), upper = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); - lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow); + lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), + borrow); upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow); lower = 
bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower); upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper); @@ -2700,18 +2815,29 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { if (src.type() == RegType::vgpr) src = bld.as_uniform(src); - Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); - exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); - exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); - exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent); - Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); - Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); - mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); - mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u)); + Temp exponent = + bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); + exponent = + bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = + bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + exponent = + bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent); + Temp mantissa = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); + Temp sign = + bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); + mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), + Operand(0x800000u), mantissa); + mantissa = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u)); mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa); - exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent); - mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent); - Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64 + exponent = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent); + mantissa = + bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent); + Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, + Operand(0xffffffffu)); // exp >= 64 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu)); mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond); Temp lower = bld.tmp(s1), upper = bld.tmp(s1); @@ -2719,15 +2845,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower); upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper); Temp borrow = bld.tmp(s1); - lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign); - upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, bld.scc(borrow)); + lower = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign); + upper = 
bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, + bld.scc(borrow)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else if (instr->src[0].src.ssa->bit_size == 64) { - Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); + Temp vec = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); - vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); + vec = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); @@ -2750,7 +2880,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); - Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent); + Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), + Operand(64u), exponent); exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent); Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); @@ -2758,7 +2889,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa); mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa); Temp new_exponent = bld.tmp(v1); - Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp(); + Temp cond_small = + bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp(); if (ctx->program->chip_class >= GFX8) mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa); else @@ -2766,38 +2898,54 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp lower = bld.tmp(v1), upper = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small); - upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small); - lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range); - upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range); + upper = + bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small); + lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, + exponent_in_range); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, + exponent_in_range); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { if (src.type() == RegType::vgpr) src = bld.as_uniform(src); - Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, 
Operand(0x80017u)); - exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); - exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); - Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); - mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); - Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent); - Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small); + Temp exponent = + bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); + exponent = + bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = + bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + Temp mantissa = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); + mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), + Operand(0x800000u), mantissa); + Temp exponent_small = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent); + Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, + exponent_small); mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa); - Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u)); - mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large); + Temp exponent_large = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u)); + mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, + exponent_large); Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent); - mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond); + mantissa = + bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond); Temp lower = bld.tmp(s1), upper = bld.tmp(s1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); - Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u)); + Temp cond_small = + bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u)); lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small); upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else if (instr->src[0].src.ssa->bit_size == 64) { - Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); + Temp vec = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); - vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); + vec = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); Temp lower = 
bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); @@ -2836,7 +2984,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) src = bool_to_scalar_condition(ctx, src); bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src); } else if (dst.regClass() == v1) { - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), + src); } else { unreachable("Wrong destination register class for nir_op_b2f32."); } @@ -2848,7 +2997,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s2) { src = bool_to_scalar_condition(ctx, src); - bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src)); + bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), + bld.scc(src)); } else if (dst.regClass() == v2) { Temp one = bld.copy(bld.def(v2), Operand(0x3FF00000u)); Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src); @@ -2864,14 +3014,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_i2i64: { if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { /* no need to do the extract in get_alu_src() */ - sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ? - sgpr_extract_sext : sgpr_extract_undef; + sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size + ? sgpr_extract_sext + : sgpr_extract_undef; extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); } else { const unsigned input_bitsize = instr->src[0].src.ssa->bit_size; const unsigned output_bitsize = instr->dest.dest.ssa.bit_size; - convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), - input_bitsize, output_bitsize, output_bitsize > input_bitsize, dst); + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize, + output_bitsize > input_bitsize, dst); } break; } @@ -2881,12 +3032,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_u2u64: { if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { /* no need to do the extract in get_alu_src() */ - sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ? - sgpr_extract_zext : sgpr_extract_undef; + sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size + ? sgpr_extract_zext + : sgpr_extract_undef; extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); } else { - convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), - instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst); + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, + instr->dest.dest.ssa.bit_size, false, dst); } break; } @@ -2920,12 +3072,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) assert(src.regClass() == v1 || src.regClass() == v2); assert(dst.regClass() == bld.lm); bld.vopc(src.size() == 2 ? 
aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32, - Definition(dst), Operand(0u), src).def(0).setHint(vcc); + Definition(dst), Operand(0u), src) + .def(0) + .setHint(vcc); } else { assert(src.regClass() == s1 || src.regClass() == s2); Temp tmp; if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) { - tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp(); + tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src) + .def(1) + .getTemp(); } else { tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32, bld.scc(bld.def(s1)), Operand(0u), src); @@ -2948,21 +3104,25 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_unpack_64_2x32_split_x: - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0])); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), + get_alu_src(ctx, instr->src[0])); break; case nir_op_unpack_64_2x32_split_y: - bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0])); + bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), + get_alu_src(ctx, instr->src[0])); break; case nir_op_unpack_32_2x16_split_x: if (dst.type() == RegType::vgpr) { - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0])); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), + get_alu_src(ctx, instr->src[0])); } else { bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); } break; case nir_op_unpack_32_2x16_split_y: if (dst.type() == RegType::vgpr) { - bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0])); + bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), + get_alu_src(ctx, instr->src[0])); } else { bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]), Operand(1u), Operand(16u), Operand(0u)); @@ -2976,7 +3136,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) src1 = emit_extract_vector(ctx, src1, 0, v2b); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); } else { - src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu)); + src0 = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu)); src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u)); bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1); } @@ -2988,14 +3149,17 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (val && val->u32 == 0 && ctx->program->chip_class <= GFX9) { /* upper bits zero on GFX6-GFX9 */ bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (!ctx->block->fp_mode.care_about_round16_64 || ctx->block->fp_mode.round16_64 == fp_round_tz) { + } else if (!ctx->block->fp_mode.care_about_round16_64 || + ctx->block->fp_mode.round16_64 == fp_round_tz) { if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9) emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst); else emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false); } else { - Temp src0 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), 
get_alu_src(ctx, instr->src[0])); - Temp src1 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1])); + Temp src0 = + bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0])); + Temp src1 = + bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1])); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); } } else { @@ -3009,7 +3173,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (src.regClass() == v1) src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src); if (dst.regClass() == v1) { - assert(ctx->block->fp_mode.must_flush_denorms16_64 == (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero)); + assert(ctx->block->fp_mode.must_flush_denorms16_64 == + (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero)); bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -3022,9 +3187,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (src.regClass() == s1) src = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand(16u)); else - src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp(); + src = + bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp(); if (dst.regClass() == v1) { - assert(ctx->block->fp_mode.must_flush_denorms16_64 == (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero)); + assert(ctx->block->fp_mode.must_flush_denorms16_64 == + (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero)); bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -3042,8 +3209,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp f32, cmp_res; if (ctx->program->chip_class >= GFX8) { - Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */ - cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask); + Temp mask = bld.copy(bld.def(s1), + Operand(0x36Fu)); /* value is NOT negative/positive denormal value */ + cmp_res = + bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask); f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); } else { /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, @@ -3053,12 +3222,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u)); Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest); tmp0->vop3().abs[0] = true; - Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), f32); - cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc), tmp0->definitions[0].getTemp(), tmp1); + Temp tmp1 = + bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), f32); + cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc), + tmp0->definitions[0].getTemp(), tmp1); } if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) { - Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src)); + Temp copysign_0 = + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src)); bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res); } else { 
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res); @@ -3092,7 +3264,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (const_insert && const_bitmask) { lhs = Operand(const_insert->u32 & const_bitmask->u32); } else { - insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); + insert = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); lhs = Operand(insert); } @@ -3126,7 +3299,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src); if (const_offset && const_bits) { uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f); - aco_opcode opcode = instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32; + aco_opcode opcode = + instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32; bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand(extract)); break; } @@ -3135,20 +3309,25 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp bits = get_alu_src(ctx, instr->src[2]); if (instr->op == nir_op_ubfe) { Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset); - Temp masked = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask); + Temp masked = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask); bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset); } else { - Operand bits_op = const_bits ? Operand(const_bits->u32 << 16) : - bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u)); - Operand offset_op = const_offset ? Operand(const_offset->u32 & 0x1fu) : - bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(0x1fu)); + Operand bits_op = const_bits ? Operand(const_bits->u32 << 16) + : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), + bld.def(s1, scc), bits, Operand(16u)); + Operand offset_op = const_offset ? Operand(const_offset->u32 & 0x1fu) + : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), + bld.def(s1, scc), offset, Operand(0x1fu)); - Temp extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op); + Temp extract = + bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op); bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract); } } else { - aco_opcode opcode = instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32; + aco_opcode opcode = + instr->op == nir_op_ubfe ? 
aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32; emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3); } break; @@ -3184,12 +3363,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } assert(def.bytes() <= 4); if (def.regClass() == s1) { - bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src), - Operand(index), Operand(bits), Operand((uint32_t)is_signed)); + bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src), Operand(index), + Operand(bits), Operand((uint32_t)is_signed)); } else { src = emit_extract_vector(ctx, src, 0, def.regClass()); - bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand(index), - Operand(bits), Operand((uint32_t)is_signed)); + bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand(index), Operand(bits), + Operand((uint32_t)is_signed)); } if (dst.size() == 2) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(), Operand(0u)); @@ -3215,7 +3394,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) def = bld.def(src.type(), 1); } if (def.regClass() == s1) { - bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src), Operand(index), Operand(bits)); + bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src), Operand(index), + Operand(bits)); } else { src = emit_extract_vector(ctx, src, 0, def.regClass()); bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand(index), Operand(bits)); @@ -3234,8 +3414,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (src.regClass() == v1) { bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u)); } else if (src.regClass() == v2) { - bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), - emit_extract_vector(ctx, src, 1, v1), + bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1), bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), emit_extract_vector(ctx, src, 0, v1), Operand(0u))); } else if (src.regClass() == s2) { @@ -3246,51 +3425,63 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_flt: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, + aco_opcode::v_cmp_lt_f64); break; } case nir_op_fge: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, + aco_opcode::v_cmp_ge_f64); break; } case nir_op_feq: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, + aco_opcode::v_cmp_eq_f64); break; } case nir_op_fneu: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, + aco_opcode::v_cmp_neq_f64); break; } case nir_op_ilt: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, + aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32); break; } case nir_op_ige: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, 
aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, + aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32); break; } case nir_op_ieq: { if (instr->src[0].src.ssa->bit_size == 1) emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); else - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, - ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, + aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, + ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); break; } case nir_op_ine: { if (instr->src[0].src.ssa->bit_size == 1) emit_boolean_logic(ctx, instr, Builder::s_xor, dst); else - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, - ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, + aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, + ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); break; } case nir_op_ult: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, + aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32); break; } case nir_op_uge: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, + aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32); break; } case nir_op_fddx: @@ -3327,12 +3518,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) emit_wqm(bld, tmp, dst, true); break; } - default: - isel_err(&instr->instr, "Unknown NIR ALU instr"); + default: isel_err(&instr->instr, "Unknown NIR ALU instr"); } } -void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) +void +visit_load_const(isel_context* ctx, nir_load_const_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->def); @@ -3347,7 +3538,7 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) if (instr->def.bit_size == 1) { assert(dst.regClass() == bld.lm); int val = instr->value[0].b ? -1 : 0; - Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val); + Operand op = bld.lm.size() == 1 ? 
Operand((uint32_t)val) : Operand((uint64_t)val); bld.copy(Definition(dst), op); } else if (instr->def.bit_size == 8) { bld.copy(Definition(dst), Operand((uint32_t)instr->value[0].u8)); @@ -3358,7 +3549,8 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) bld.copy(Definition(dst), Operand(instr->value[0].u32)); } else { assert(dst.size() != 1); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; if (instr->def.bit_size == 64) for (unsigned i = 0; i < dst.size(); i++) vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)}; @@ -3371,10 +3563,11 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) } } -uint32_t widen_mask(uint32_t mask, unsigned multiplier) +uint32_t +widen_mask(uint32_t mask, unsigned multiplier) { uint32_t new_mask = 0; - for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) + for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) if (mask & (1u << i)) new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); return new_mask; @@ -3399,9 +3592,8 @@ struct LoadEmitInfo { }; struct EmitLoadParameters { - using Callback = Temp (*)(Builder &bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align, unsigned const_offset, + using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset, + unsigned bytes_needed, unsigned align, unsigned const_offset, Temp dst_hint); Callback callback; @@ -3410,14 +3602,15 @@ struct EmitLoadParameters { unsigned max_const_offset_plus_one; }; -void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, - const EmitLoadParameters ¶ms) +void +emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, + const EmitLoadParameters& params) { unsigned load_size = info.num_components * info.component_size; unsigned component_size = info.component_size; unsigned num_vals = 0; - Temp *const vals = (Temp *)alloca(info.dst.bytes() * sizeof(Temp)); + Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp)); unsigned const_offset = info.const_offset; @@ -3435,8 +3628,7 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, } if (byte_align) { - if (bytes_needed > 2 || - (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) || + if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) || !params.supports_8bit_16bit_loads) { if (info.component_stride) { assert(params.supports_8bit_16bit_loads && "unimplemented"); @@ -3463,22 +3655,21 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, unsigned reduced_const_offset = const_offset; bool remove_const_offset_completely = need_to_align_offset; if (const_offset && - (remove_const_offset_completely || - const_offset >= params.max_const_offset_plus_one)) { + (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) { unsigned to_add = const_offset; if (remove_const_offset_completely) { reduced_const_offset = 0; } else { - to_add = const_offset / params.max_const_offset_plus_one * - params.max_const_offset_plus_one; + to_add = + const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one; reduced_const_offset %= params.max_const_offset_plus_one; } Temp offset_tmp = offset.isTemp() ? 
offset.getTemp() : Temp(); if (offset.isConstant()) { offset = Operand(offset.constantValue() + to_add); } else if (offset_tmp.regClass() == s1) { - offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), - offset_tmp, Operand(to_add)); + offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp, + Operand(to_add)); } else if (offset_tmp.regClass() == v1) { offset = bld.vadd32(bld.def(v1), offset_tmp, Operand(to_add)); } else { @@ -3488,12 +3679,14 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (offset_tmp.regClass() == s2) { Temp carry = bld.tmp(s1); - lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, Operand(to_add)); + lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, + Operand(to_add)); hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry); offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi); } else { Temp new_lo = bld.tmp(v1); - Temp carry = bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp(); + Temp carry = + bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp(); hi = bld.vadd32(bld.def(v1), hi, Operand(0u), false, carry); offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi); } @@ -3509,11 +3702,14 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (offset.isConstant()) { aligned_offset = Operand(offset.constantValue() & 0xfffffffcu); } else if (offset_tmp.regClass() == s1) { - aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfffffffcu), offset_tmp); + aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), + Operand(0xfffffffcu), offset_tmp); } else if (offset_tmp.regClass() == s2) { - aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp); + aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), + Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp); } else if (offset_tmp.regClass() == v1) { - aligned_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp); + aligned_offset = + bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp); } else if (offset_tmp.regClass() == v2) { Temp hi = bld.tmp(v1), lo = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp); @@ -3521,13 +3717,11 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi); } } - Temp aligned_offset_tmp = aligned_offset.isTemp() ? - aligned_offset.getTemp() : - bld.copy(bld.def(s1), aligned_offset); + Temp aligned_offset_tmp = + aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset); - Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, - align, reduced_const_offset, - byte_align ? Temp() : info.dst); + Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align, + reduced_const_offset, byte_align ? 
Temp() : info.dst); /* the callback wrote directly to dst */ if (val == info.dst) { @@ -3543,7 +3737,8 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (offset.isConstant()) byte_align_off = Operand(offset.constantValue() % 4u); else if (offset.size() == 2) - byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, RegClass(offset.getTemp().type(), 1))); + byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, + RegClass(offset.getTemp().type(), 1))); else byte_align_off = offset; } @@ -3574,7 +3769,7 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, std::array allocated_vec; bool has_vgprs = false; for (unsigned i = 0; i < num_vals;) { - Temp *const tmp = (Temp *)alloca(num_vals * sizeof(Temp)); + Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp)); unsigned num_tmps = 0; unsigned tmp_size = 0; RegType reg_type = RegType::sgpr; @@ -3597,7 +3792,8 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (tmp[0].bytes() % component_size) { /* trim tmp[0] */ assert(i == num_vals); - RegClass new_rc = RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size); + RegClass new_rc = + RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size); tmp[0] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand(0u)); } @@ -3633,7 +3829,8 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (info.dst.type() == RegType::vgpr || !has_vgprs) ctx->allocated_vec.emplace(info.dst.id(), allocated_vec); - int padding_bytes = MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0); + int padding_bytes = + MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0); aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)}; @@ -3652,16 +3849,16 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, } } -Operand load_lds_size_m0(Builder& bld) +Operand +load_lds_size_m0(Builder& bld) { /* TODO: m0 does not need to be initialized on GFX9+ */ return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0xffffffffu))); } -Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align, unsigned const_offset, - Temp dst_hint) +Temp +lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, + unsigned align, unsigned const_offset, Temp dst_hint) { offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset; @@ -3714,7 +3911,7 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info, RegClass rc = RegClass::get(RegType::vgpr, size); Temp val = rc == info.dst.regClass() && dst_hint.id() ? 
dst_hint : bld.tmp(rc); - Instruction *instr; + Instruction* instr; if (read2) instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); else @@ -3724,12 +3921,11 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info, return val; } -const EmitLoadParameters lds_load_params { lds_load_callback, false, true, UINT32_MAX }; +const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX}; -Temp smem_load_callback(Builder& bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align, unsigned const_offset, - Temp dst_hint) +Temp +smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, + unsigned align, unsigned const_offset, Temp dst_hint) { unsigned size = 0; aco_opcode op; @@ -3767,15 +3963,14 @@ Temp smem_load_callback(Builder& bld, const LoadEmitInfo &info, return val; } -const EmitLoadParameters smem_load_params { smem_load_callback, true, false, 1024 }; +const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024}; -Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align_, unsigned const_offset, - Temp dst_hint) +Temp +mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, + unsigned align_, unsigned const_offset, Temp dst_hint) { Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t)0); if (info.soffset.id()) { if (soffset.isTemp()) @@ -3823,23 +4018,25 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo &info, return val; } -const EmitLoadParameters mubuf_load_params { mubuf_load_callback, true, true, 4096 }; -const EmitLoadParameters scratch_load_params { mubuf_load_callback, false, true, 4096 }; +const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096}; +const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096}; -Temp get_gfx6_global_rsrc(Builder& bld, Temp addr) +Temp +get_gfx6_global_rsrc(Builder& bld, Temp addr) { uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); if (addr.type() == RegType::vgpr) - return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf)); - return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), + Operand(-1u), Operand(rsrc_conf)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), + Operand(rsrc_conf)); } -Temp global_load_callback(Builder& bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align_, unsigned const_offset, - Temp dst_hint) +Temp +global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, + unsigned align_, unsigned const_offset, Temp dst_hint) { unsigned bytes_size = 0; bool use_mubuf = bld.program->chip_class == GFX6; @@ -3847,27 +4044,38 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo &info, aco_opcode op; if (bytes_needed == 1) { bytes_size = 1; - op = use_mubuf ? aco_opcode::buffer_load_ubyte : global ? 
aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte; + op = use_mubuf ? aco_opcode::buffer_load_ubyte + : global ? aco_opcode::global_load_ubyte + : aco_opcode::flat_load_ubyte; } else if (bytes_needed == 2) { bytes_size = 2; - op = use_mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort; + op = use_mubuf ? aco_opcode::buffer_load_ushort + : global ? aco_opcode::global_load_ushort + : aco_opcode::flat_load_ushort; } else if (bytes_needed <= 4) { bytes_size = 4; - op = use_mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword; + op = use_mubuf ? aco_opcode::buffer_load_dword + : global ? aco_opcode::global_load_dword + : aco_opcode::flat_load_dword; } else if (bytes_needed <= 8) { bytes_size = 8; - op = use_mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2; + op = use_mubuf ? aco_opcode::buffer_load_dwordx2 + : global ? aco_opcode::global_load_dwordx2 + : aco_opcode::flat_load_dwordx2; } else if (bytes_needed <= 12 && !use_mubuf) { bytes_size = 12; op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; } else { bytes_size = 16; - op = use_mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4; + op = use_mubuf ? aco_opcode::buffer_load_dwordx4 + : global ? aco_opcode::global_load_dwordx4 + : aco_opcode::flat_load_dwordx4; } RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc); if (use_mubuf) { - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 3, 1)}; mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset)); mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); mubuf->operands[2] = Operand(0u); @@ -3882,7 +4090,8 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo &info, } else { offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset; - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)}; + aco_ptr flat{ + create_instruction(op, global ? 
Format::GLOBAL : Format::FLAT, 2, 1)}; flat->operands[0] = Operand(offset); flat->operands[1] = Operand(s1); flat->glc = info.glc; @@ -3896,10 +4105,11 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo &info, return val; } -const EmitLoadParameters global_load_params { global_load_callback, true, true, 1 }; +const EmitLoadParameters global_load_params{global_load_callback, true, true, 1}; -Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst, - Temp address, unsigned base_offset, unsigned align) +Temp +load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst, + Temp address, unsigned base_offset, unsigned align) { assert(util_is_power_of_two_nonzero(align)); @@ -3915,7 +4125,9 @@ Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, unsigned num_componen return dst; } -void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *bytes, Temp src) +void +split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes, + Temp src) { if (!count) return; @@ -3932,7 +4144,8 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp } /* elem_size_bytes is the greatest common divisor which is a power of 2 */ - unsigned elem_size_bytes = 1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1); + unsigned elem_size_bytes = + 1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1); ASSERTED bool is_subdword = elem_size_bytes < 4; assert(!is_subdword || dst_type == RegType::vgpr); @@ -3956,12 +4169,11 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp if (elem_size_bytes % elem_size) goto split; - temps.insert(temps.end(), it->second.begin(), - it->second.begin() + src.bytes() / elem_size); + temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size); elem_size_bytes = elem_size; } - split: +split: /* split src if necessary */ if (temps.empty()) { if (is_subdword && src.type() == RegType::sgpr) @@ -3970,7 +4182,8 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp src = bld.as_uniform(src); unsigned num_elems = src.bytes() / elem_size_bytes; - aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)}; + aco_ptr split{create_instruction( + aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)}; split->operands[0] = Operand(src); for (unsigned i = 0; i < num_elems; i++) { temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes))); @@ -3990,7 +4203,8 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp continue; } - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)}; + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, + Format::PSEUDO, op_count, 1)}; for (unsigned j = 0; j < op_count; j++) { Temp tmp = temps[idx++]; if (dst_type == RegType::sgpr) @@ -4003,8 +4217,8 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp return; } -bool scan_write_mask(uint32_t mask, uint32_t todo_mask, - int *start, int *count) +bool +scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count) { unsigned start_elem = ffs(todo_mask) - 1; bool skip = !(mask & (1 << start_elem)); @@ -4018,13 +4232,15 @@ bool scan_write_mask(uint32_t mask, uint32_t todo_mask, return !skip; } -void advance_write_mask(uint32_t *todo_mask, int start, int count) +void 
+advance_write_mask(uint32_t* todo_mask, int start, int count) { *todo_mask &= ~u_bit_consecutive(0, count) << start; } -void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, - Temp address, unsigned base_offset, unsigned align) +void +store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address, + unsigned base_offset, unsigned align) { assert(util_is_power_of_two_nonzero(align)); assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8); @@ -4058,7 +4274,7 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t bool aligned8 = offset % 8 == 0 && align % 8 == 0; bool aligned16 = offset % 16 == 0 && align % 16 == 0; - //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial + // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial aco_opcode op = aco_opcode::num_opcodes; if (byte >= 16 && aligned16 && large_ds_write) { op = aco_opcode::ds_write_b128; @@ -4121,13 +4337,16 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset); inline_offset = offsets[i]; } - assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */ - Instruction *instr; + /* offsets[i] shouldn't be large enough for this to happen */ + assert(inline_offset <= max_offset); + + Instruction* instr; if (write2) { Temp second_data = write_datas[second]; inline_offset /= split_data.bytes(); - instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset, inline_offset + write2_off); + instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset, + inline_offset + write2_off); } else { instr = bld.ds(op, address_offset, split_data, m, inline_offset); } @@ -4135,29 +4354,25 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t } } -aco_opcode get_buffer_store_op(unsigned bytes) +aco_opcode +get_buffer_store_op(unsigned bytes) { switch (bytes) { - case 1: - return aco_opcode::buffer_store_byte; - case 2: - return aco_opcode::buffer_store_short; - case 4: - return aco_opcode::buffer_store_dword; - case 8: - return aco_opcode::buffer_store_dwordx2; - case 12: - return aco_opcode::buffer_store_dwordx3; - case 16: - return aco_opcode::buffer_store_dwordx4; + case 1: return aco_opcode::buffer_store_byte; + case 2: return aco_opcode::buffer_store_short; + case 4: return aco_opcode::buffer_store_dword; + case 8: return aco_opcode::buffer_store_dwordx2; + case 12: return aco_opcode::buffer_store_dwordx3; + case 16: return aco_opcode::buffer_store_dwordx4; } unreachable("Unexpected store size"); return aco_opcode::num_opcodes; } -void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type, - Temp data, unsigned writemask, int swizzle_element_size, - unsigned *write_count, Temp *write_datas, unsigned *offsets) +void +split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type, + Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count, + Temp* write_datas, unsigned* offsets) { unsigned write_count_with_skips = 0; bool skips[16]; @@ -4211,8 +4426,9 @@ void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem } } -Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes, - unsigned split_cnt = 0u, Temp dst = Temp()) +Temp 
+create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type, + unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp()) { Builder bld(ctx->program, ctx->block); unsigned dword_size = elem_size_bytes / 4; @@ -4221,7 +4437,8 @@ Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType dst = bld.tmp(RegClass(reg_type, cnt * dword_size)); std::array allocated_vec; - aco_ptr instr {create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)}; + aco_ptr instr{ + create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)}; instr->definitions[0] = Definition(dst); for (unsigned i = 0; i < cnt; ++i) { @@ -4230,7 +4447,8 @@ Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType allocated_vec[i] = arr[i]; instr->operands[i] = Operand(arr[i]); } else { - Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2)); + Temp zero = + bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2)); allocated_vec[i] = zero; instr->operands[i] = Operand(zero); } @@ -4246,7 +4464,8 @@ Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType return dst; } -inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset) +inline unsigned +resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset) { if (const_offset >= 4096) { unsigned excess_const_offset = const_offset / 4096u * 4096u; @@ -4255,7 +4474,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un if (!voffset.id()) voffset = bld.copy(bld.def(v1), Operand(excess_const_offset)); else if (unlikely(voffset.regClass() == s1)) - voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset)); + voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), + Operand(excess_const_offset), Operand(voffset)); else if (likely(voffset.regClass() == v1)) voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset)); else @@ -4265,9 +4485,10 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un return const_offset; } -void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, - unsigned const_offset = 0u, memory_sync_info sync=memory_sync_info(), - bool slc = false, bool swizzled = false) +void +emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, + unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(), + bool slc = false, bool swizzled = false) { assert(vdata.id()); assert(vdata.size() != 3 || ctx->program->chip_class != GFX6); @@ -4279,17 +4500,20 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); Operand soffset_op = soffset.id() ? 
Operand(soffset) : Operand(0u); - Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, - /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, - /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, - /* dlc*/ false, /* slc */ slc); + Builder::Result r = + bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, + /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, + /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, + /* dlc*/ false, /* slc */ slc); r.instr->mubuf().sync = sync; } -void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, - unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask, - bool allow_combining = true, memory_sync_info sync=memory_sync_info(), bool slc = false) +void +store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, + unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask, + bool allow_combining = true, memory_sync_info sync = memory_sync_info(), + bool slc = false) { Builder bld(ctx->program, ctx->block); assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); @@ -4299,19 +4523,21 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset unsigned write_count = 0; Temp write_datas[32]; unsigned offsets[32]; - split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, - allow_combining ? 16 : 4, &write_count, write_datas, offsets); + split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4, + &write_count, write_datas, offsets); for (unsigned i = 0; i < write_count; i++) { unsigned const_offset = offsets[i] + base_const_offset; - emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, slc, !allow_combining); + emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, + slc, !allow_combining); } } -void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset, - unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, - unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true, - bool slc = false) +void +load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset, + unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, + unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true, + bool slc = false) { assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); assert((num_components * elem_size_bytes) == dst.bytes()); @@ -4331,14 +4557,16 @@ void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, emit_load(ctx, bld, info, mubuf_load_params); } -Temp wave_id_in_threadgroup(isel_context *ctx) +Temp +wave_id_in_threadgroup(isel_context* ctx) { Builder bld(ctx->program, ctx->block); return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(24u | (4u << 16))); } -Temp thread_id_in_threadgroup(isel_context *ctx) +Temp +thread_id_in_threadgroup(isel_context* ctx) { /* tid_in_tg = wave_id * wave_size + tid_in_wave */ @@ -4349,28 +4577,27 @@ Temp thread_id_in_threadgroup(isel_context *ctx) return tid_in_wave; Temp wave_id_in_tg = wave_id_in_threadgroup(ctx); - Temp num_pre_threads = 
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg, - Operand(ctx->program->wave_size == 64 ? 6u : 5u)); + Temp num_pre_threads = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), + wave_id_in_tg, Operand(ctx->program->wave_size == 64 ? 6u : 5u)); return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave)); } -Temp get_tess_rel_patch_id(isel_context *ctx) +Temp +get_tess_rel_patch_id(isel_context* ctx) { Builder bld(ctx->program, ctx->block); switch (ctx->shader->info.stage) { case MESA_SHADER_TESS_CTRL: - return bld.pseudo(aco_opcode::p_extract, bld.def(v1), - get_arg(ctx, ctx->args->ac.tcs_rel_ids), + return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(0u), Operand(8u), Operand(0u)); - case MESA_SHADER_TESS_EVAL: - return get_arg(ctx, ctx->args->ac.tes_rel_patch_id); - default: - unreachable("Unsupported stage in get_tess_rel_patch_id"); + case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id); + default: unreachable("Unsupported stage in get_tess_rel_patch_id"); } } -bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) +bool +store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr) { unsigned write_mask = nir_intrinsic_write_mask(instr); unsigned component = nir_intrinsic_component(instr); @@ -4398,40 +4625,41 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) return true; } -bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst) +bool +load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst) { /* Only TCS per-vertex inputs are supported by this function. - * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations is the same. + * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations + * is the same. 
*/ if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq) return false; - nir_src *off_src = nir_get_io_offset_src(instr); - nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); - nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr; - bool can_use_temps = nir_src_is_const(*off_src) && - vertex_index_instr->type == nir_instr_type_intrinsic && - nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id; + nir_src* off_src = nir_get_io_offset_src(instr); + nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr); + nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr; + bool can_use_temps = + nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id; if (!can_use_temps) return false; - unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src); - Temp *src = &ctx->inputs.temps[idx]; + unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) + + 4 * nir_src_as_uint(*off_src); + Temp* src = &ctx->inputs.temps[idx]; create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst); return true; } -static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos); +static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos); -void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr) { - if (ctx->stage == vertex_vs || - ctx->stage == tess_eval_vs || - ctx->stage == fragment_fs || - ctx->stage == vertex_ngg || - ctx->stage == tess_eval_ngg || + if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs || + ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg || (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) || ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { bool stored_to_temps = store_output_to_temps(ctx, instr); @@ -4443,13 +4671,17 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr) unreachable("Shader stage not implemented"); } - /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we have to emit an exp here manually */ - if (ctx->stage.hw == HWStage::NGG && (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) && + /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we + * have to emit an exp here manually */ + if (ctx->stage.hw == HWStage::NGG && + (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) && nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID) export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL); } -void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask) +void +emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst, + Temp prim_mask) { Temp coord1 = emit_extract_vector(ctx, src, 0, v1); Temp coord2 = emit_extract_vector(ctx, src, 1, v1); @@ -4460,47 +4692,48 @@ void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp if (ctx->program->dev.has_16bank_lds) { assert(ctx->options->chip_class <= GFX8); Builder::Result interp_p1 = - bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), - Operand(2u) /* P0 */, 
bld.m0(prim_mask), idx, component); - interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), - coord1, bld.m0(prim_mask), interp_p1, idx, component); - bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, - bld.m0(prim_mask), interp_p1, idx, component); + bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u) /* P0 */, + bld.m0(prim_mask), idx, component); + interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1, + bld.m0(prim_mask), interp_p1, idx, component); + bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask), + interp_p1, idx, component); } else { aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16; if (ctx->options->chip_class == GFX8) interp_p2_op = aco_opcode::v_interp_p2_legacy_f16; - Builder::Result interp_p1 = - bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), - coord1, bld.m0(prim_mask), idx, component); - bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), - interp_p1, idx, component); + Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1, + bld.m0(prim_mask), idx, component); + bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, + component); } } else { - Builder::Result interp_p1 = - bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, - bld.m0(prim_mask), idx, component); + Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, + bld.m0(prim_mask), idx, component); if (ctx->program->dev.has_16bank_lds) interp_p1.instr->operands[0].setLateKill(true); - bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, - bld.m0(prim_mask), interp_p1, idx, component); + bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, + idx, component); } } -void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) +void +emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components) { Builder bld(ctx->program, ctx->block); - aco_ptr vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); + aco_ptr vec(create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); for (unsigned i = 0; i < num_components; i++) vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i])); if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) { assert(num_components == 4); - vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3])); + vec->operands[3] = + bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3])); } if (ctx->options->adjust_frag_coord_z && @@ -4525,7 +4758,8 @@ void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) /* xRate = xRate == 0x1 ? adjusted_frag_z : frag_z. 
*/ Temp cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand(1u), Operand(x_rate)); - vec->operands[2] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond); + vec->operands[2] = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond); } for (Operand& op : vec->operands) @@ -4537,7 +4771,8 @@ void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) return; } -void emit_load_frag_shading_rate(isel_context *ctx, Temp dst) +void +emit_load_frag_shading_rate(isel_context* ctx, Temp dst) { Builder bld(ctx->program, ctx->block); Temp cond; @@ -4545,27 +4780,26 @@ void emit_load_frag_shading_rate(isel_context *ctx, Temp dst) /* VRS Rate X = Ancillary[2:3] * VRS Rate Y = Ancillary[4:5] */ - Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), - get_arg(ctx, ctx->args->ac.ancillary), Operand(2u), Operand(2u)); - Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), - get_arg(ctx, ctx->args->ac.ancillary), Operand(4u), Operand(2u)); + Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), + Operand(2u), Operand(2u)); + Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), + Operand(4u), Operand(2u)); /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */ cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand(1u), Operand(x_rate)); - x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - bld.copy(bld.def(v1), Operand(0u)), + x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(4u)), cond); /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */ cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand(1u), Operand(y_rate)); - y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - bld.copy(bld.def(v1), Operand(0u)), + y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(1u)), cond); bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate)); } -void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp coords = get_ssa_temp(ctx, instr->src[0].ssa); @@ -4578,11 +4812,11 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr if (instr->dest.ssa.num_components == 1) { emit_interp_instr(ctx, idx, component, coords, dst, prim_mask); } else { - aco_ptr vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1)); - for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) - { + aco_ptr vec(create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1)); + for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) { Temp tmp = ctx->program->allocateTmp(v1); - emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask); + emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask); vec->operands[i] = Operand(tmp); } vec->definitions[0] = Definition(dst); @@ -4590,8 +4824,9 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr } } -bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info, - unsigned offset, unsigned binding_align, unsigned channels) +bool +check_vertex_fetch_size(isel_context* 
ctx, const ac_data_format_info* vtx_info, unsigned offset, + unsigned binding_align, unsigned channels) { unsigned vertex_byte_size = vtx_info->chan_byte_size * channels; if (vtx_info->chan_byte_size != 4 && channels == 3) @@ -4607,9 +4842,9 @@ bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_i (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0); } -uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info, - unsigned offset, unsigned *channels, unsigned max_channels, - unsigned binding_align) +uint8_t +get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset, + unsigned* channels, unsigned max_channels, unsigned binding_align) { if (!vtx_info->chan_byte_size) { *channels = vtx_info->num_channels; @@ -4640,18 +4875,15 @@ uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_ switch (vtx_info->chan_format) { case V_008F0C_BUF_DATA_FORMAT_8: - return std::array{V_008F0C_BUF_DATA_FORMAT_8, - V_008F0C_BUF_DATA_FORMAT_8_8, + return std::array{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8, V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1]; case V_008F0C_BUF_DATA_FORMAT_16: - return std::array{V_008F0C_BUF_DATA_FORMAT_16, - V_008F0C_BUF_DATA_FORMAT_16_16, + return std::array{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16, V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1]; case V_008F0C_BUF_DATA_FORMAT_32: - return std::array{V_008F0C_BUF_DATA_FORMAT_32, - V_008F0C_BUF_DATA_FORMAT_32_32, + return std::array{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1]; } @@ -4661,7 +4893,8 @@ uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_ /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. * so we may need to fix it up. 
*/ -Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha) +Temp +adjust_vertex_fetch_alpha(isel_context* ctx, unsigned adjustment, Temp alpha) { Builder bld(ctx->program, ctx->block); @@ -4688,7 +4921,8 @@ Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alph return alpha; } -void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -4697,9 +4931,11 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) - isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset"); + isel_err(offset.ssa->parent_instr, + "Unimplemented non-zero nir_intrinsic_load_input offset"); - Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers)); + Temp vertex_buffers = + convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers)); unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; unsigned component = nir_intrinsic_component(instr); @@ -4713,7 +4949,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) unsigned dfmt = attrib_format & 0xf; unsigned nfmt = (attrib_format >> 4) & 0x7; - const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt); + const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt); unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component; unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels); @@ -4721,8 +4957,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) if (post_shuffle) num_channels = MAX2(num_channels, 3); - unsigned desc_index = ctx->program->info->vs.use_per_attribute_vb_descs ? - location : attrib_binding; + unsigned desc_index = + ctx->program->info->vs.use_per_attribute_vb_descs ? 
location : attrib_binding; desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask & u_bit_consecutive(0, desc_index)); Operand off = bld.copy(bld.def(s1), Operand(desc_index * 16u)); @@ -4745,12 +4981,11 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) index = bld.copy(bld.def(v1), start_instance); } } else { - index = bld.vadd32(bld.def(v1), - get_arg(ctx, ctx->args->ac.base_vertex), + index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex), get_arg(ctx, ctx->args->ac.vertex_id)); } - Temp *const channels = (Temp *)alloca(num_channels * sizeof(Temp)); + Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp)); unsigned channel_start = 0; bool direct_fetch = false; @@ -4771,14 +5006,15 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) /* use MUBUF when possible to avoid possible alignment issues */ /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */ - bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || - nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || - nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) && - vtx_info->chan_byte_size == 4; + bool use_mubuf = + (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || + nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) && + vtx_info->chan_byte_size == 4; unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; if (!use_mubuf) { - fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component, - vtx_info->num_channels - channel_start, binding_align); + fetch_dfmt = + get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component, + vtx_info->num_channels - channel_start, binding_align); } else { if (fetch_component == 3 && ctx->options->chip_class == GFX6) { /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */ @@ -4791,7 +5027,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) Temp fetch_index = index; if (attrib_stride != 0 && fetch_offset > attrib_stride) { - fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index); + fetch_index = + bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index); fetch_offset = fetch_offset % attrib_stride; } @@ -4812,7 +5049,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) assert(!use_mubuf); opcode = aco_opcode::tbuffer_load_format_d16_xy; } else { - opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; + opcode = + use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; } break; case 6: @@ -4824,25 +5062,26 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) assert(!use_mubuf); opcode = aco_opcode::tbuffer_load_format_d16_xyzw; } else { - opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + opcode = + use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; } break; case 12: assert(ctx->options->chip_class >= GFX7 || (!use_mubuf && ctx->options->chip_class == GFX6)); - opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; + opcode = + use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; break; case 16: - opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; + opcode = + use_mubuf ? 
aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; break; - default: - unreachable("Unimplemented load_input vector size"); + default: unreachable("Unimplemented load_input vector size"); } Temp fetch_dst; - if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && - !expanded && (alpha_adjust == AC_FETCH_FORMAT_NONE || - num_channels <= 3)) { + if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded && + (alpha_adjust == AC_FETCH_FORMAT_NONE || num_channels <= 3)) { direct_fetch = true; fetch_dst = dst; } else { @@ -4850,14 +5089,14 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } if (use_mubuf) { - Instruction *mubuf = bld.mubuf( - opcode, Definition(fetch_dst), list, fetch_index, soffset, - fetch_offset, false, false, true).instr; + Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index, + soffset, fetch_offset, false, false, true) + .instr; mubuf->mubuf().vtx_binding = attrib_binding + 1; } else { - Instruction *mtbuf = bld.mtbuf( - opcode, Definition(fetch_dst), list, fetch_index, soffset, - fetch_dfmt, nfmt, fetch_offset, false, true).instr; + Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index, + soffset, fetch_dfmt, nfmt, fetch_offset, false, true) + .instr; mtbuf->mtbuf().vtx_binding = attrib_binding + 1; } @@ -4867,24 +5106,25 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) channels[channel_start] = fetch_dst; } else { for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++) - channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, - bitsize == 16 ? v2b : v1); + channels[channel_start + i] = + emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1); } channel_start += fetch_component; } if (!direct_fetch) { - bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && - nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; + bool is_float = + nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; static const unsigned swizzle_normal[4] = {0, 1, 2, 3}; static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3}; - const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal; + const unsigned* swizzle = post_shuffle ? 
swizzle_post_shuffle : swizzle_normal; unsigned num_components = instr->dest.ssa.num_components; - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; - std::array elems; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + std::array elems; unsigned num_temp = 0; for (unsigned i = 0; i < num_components; i++) { unsigned idx = i + component; @@ -4913,7 +5153,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) { if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) - isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset"); + isel_err(offset.ssa->parent_instr, + "Unimplemented non-zero nir_intrinsic_load_input offset"); Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); @@ -4933,17 +5174,20 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) case 2: vertex_id = 1; /* P20 */ break; - default: - unreachable("invalid vertex index"); + default: unreachable("invalid vertex index"); } } if (dst.size() == 1) { - bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component); + bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), + bld.m0(prim_mask), idx, component); } else { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; for (unsigned i = 0; i < dst.size(); i++) - vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i); + vec->operands[i] = + bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), + bld.m0(prim_mask), idx, component + i); vec->definitions[0] = Definition(dst); bld.insert(std::move(vec)); } @@ -4952,7 +5196,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr) { assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); @@ -4965,18 +5210,17 @@ void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *ins unreachable("LDS-based TCS input should have been lowered in NIR."); } -void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr) { switch (ctx->shader->info.stage) { - case MESA_SHADER_TESS_CTRL: - visit_load_tcs_per_vertex_input(ctx, instr); - break; - default: - unreachable("Unimplemented shader stage"); + case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break; + default: unreachable("Unimplemented shader stage"); } } -void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr) { assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL); @@ -4997,20 +5241,21 @@ void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, tess_coord, 3); } -Temp load_desc_ptr(isel_context *ctx, unsigned desc_set) +Temp +load_desc_ptr(isel_context* ctx, unsigned desc_set) { if (ctx->program->info->need_indirect_descriptor_sets) { Builder bld(ctx->program, ctx->block); Temp ptr64 = 
convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0])); Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2)); - return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);//, false, false, false); + return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off); //, false, false, false); } return get_arg(ctx, ctx->args->descriptor_sets[desc_set]); } - -void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp index = get_ssa_temp(ctx, instr->src[0].ssa); @@ -5020,13 +5265,14 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) unsigned binding = nir_intrinsic_binding(instr); Temp desc_ptr; - radv_pipeline_layout *pipeline_layout = ctx->options->layout; - radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; + radv_pipeline_layout* pipeline_layout = ctx->options->layout; + radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout; unsigned offset = layout->binding[binding].offset; unsigned stride; if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { - unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; + unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + + layout->binding[binding].dynamic_offset_offset; desc_ptr = get_arg(ctx, ctx->args->ac.push_constants); offset = pipeline_layout->push_constant_size + 16 * idx; stride = 16; @@ -5036,7 +5282,8 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) } if (nir_src_is_const(instr->src[0])) { - index = bld.copy(bld.def(s1), Operand((uint32_t)(offset + nir_src_as_uint(instr->src[0]) * stride))); + index = bld.copy(bld.def(s1), + Operand((uint32_t)(offset + nir_src_as_uint(instr->src[0]) * stride))); } else if (index.type() == RegType::vgpr) { if (stride != 1) { bool index24bit = layout->binding[binding].array_size <= 0x1000000; @@ -5048,25 +5295,27 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) if (stride != 1) index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index); if (offset) - index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); + index = + bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - std::array elems; + std::array elems; elems[0] = desc_ptr; elems[1] = index; ctx->allocated_vec.emplace(dst.id(), elems); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, - Operand(0u)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand(0u)); } -void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size, - Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, - bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info()) +void +load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst, + Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false, + bool allow_smem = true, memory_sync_info sync = memory_sync_info()) { Builder bld(ctx->program, ctx->block); - bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && 
allow_smem; + bool use_smem = + dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem; if (use_smem) offset = bld.as_uniform(offset); else { @@ -5088,7 +5337,8 @@ void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_ emit_load(ctx, bld, info, mubuf_load_params); } -Temp load_buffer_rsrc(isel_context *ctx, Temp rsrc) +Temp +load_buffer_rsrc(isel_context* ctx, Temp rsrc) { Builder bld(ctx->program, ctx->block); Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)); @@ -5097,17 +5347,19 @@ Temp load_buffer_rsrc(isel_context *ctx, Temp rsrc) return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding); } -bool is_inline_ubo(isel_context *ctx, nir_src rsrc) +bool +is_inline_ubo(isel_context* ctx, nir_src rsrc) { nir_binding binding = nir_chase_binding(rsrc); if (!binding.success) return false; - radv_descriptor_set_layout *layout = ctx->options->layout->set[binding.desc_set].layout; + radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout; return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT; } -void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa); @@ -5116,17 +5368,16 @@ void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) if (is_inline_ubo(ctx, instr->src[0])) { Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1))); - Temp binding_off = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1))); + Temp binding_off = + bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1))); rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off); - uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + uint32_t desc_type = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); if (ctx->options->chip_class >= GFX10) { desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); @@ -5143,7 +5394,7 @@ void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) } void -visit_load_sbt_amd(isel_context *ctx, nir_intrinsic_instr *instr) +visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp index = get_ssa_temp(ctx, instr->src[0].ssa); @@ -5165,20 +5416,22 @@ visit_load_sbt_amd(isel_context *ctx, nir_intrinsic_instr *instr) false, true); } -void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); unsigned offset = nir_intrinsic_base(instr); unsigned count = instr->dest.ssa.num_components; - nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]); + nir_const_value* 
index_cv = nir_src_as_const_value(instr->src[0]); if (index_cv && instr->dest.ssa.bit_size == 32) { unsigned start = (offset + index_cv->u32) / 4u; start -= ctx->args->ac.base_inline_push_consts; if (start + count <= ctx->args->ac.num_inline_push_consts) { - std::array elems; - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + std::array elems; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; for (unsigned i = 0; i < count; ++i) { elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]); vec->operands[i] = Operand{elems[i]}; @@ -5192,7 +5445,8 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); if (offset != 0) // TODO check if index != 0 as well - index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); + index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), + index); Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); Temp vec = dst; bool trim = false; @@ -5212,28 +5466,19 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode op; switch (vec.size()) { - case 1: - op = aco_opcode::s_load_dword; - break; - case 2: - op = aco_opcode::s_load_dwordx2; - break; + case 1: op = aco_opcode::s_load_dword; break; + case 2: op = aco_opcode::s_load_dwordx2; break; case 3: vec = bld.tmp(s4); trim = true; FALLTHROUGH; - case 4: - op = aco_opcode::s_load_dwordx4; - break; + case 4: op = aco_opcode::s_load_dwordx4; break; case 6: vec = bld.tmp(s8); trim = true; FALLTHROUGH; - case 8: - op = aco_opcode::s_load_dwordx8; - break; - default: - unreachable("unimplemented or forbidden load_push_constant."); + case 8: op = aco_opcode::s_load_dwordx8; break; + default: unreachable("unimplemented or forbidden load_push_constant."); } bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true; @@ -5247,29 +5492,25 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) if (trim) { emit_split_vector(ctx, vec, 4); RegClass rc = dst.size() == 3 ? 
s1 : s2; - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - emit_extract_vector(ctx, vec, 0, rc), - emit_extract_vector(ctx, vec, 1, rc), - emit_extract_vector(ctx, vec, 2, rc)); - + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc), + emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc)); } emit_split_vector(ctx, dst, instr->dest.ssa.num_components); } -void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Builder bld(ctx->program, ctx->block); - uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + uint32_t desc_type = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); if (ctx->options->chip_class >= GFX10) { desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); @@ -5280,20 +5521,23 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); if (base && offset.type() == RegType::sgpr) - offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); + offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, + Operand(base)); else if (base && offset.type() == RegType::vgpr) offset = bld.vadd32(bld.def(v1), Operand(base), offset); - Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)), - Operand(MIN2(base + range, ctx->shader->constant_data_size)), - Operand(desc_type)); + Temp rsrc = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), + Operand(ctx->constant_data_offset)), + Operand(MIN2(base + range, ctx->shader->constant_data_size)), Operand(desc_type)); unsigned size = instr->dest.ssa.bit_size / 8; // TODO: get alignment information for subdword constants load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0); } -void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr) { if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) ctx->cf_info.exec_potentially_empty_discard = true; @@ -5310,22 +5554,23 @@ void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) return; } -void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) +void +visit_discard(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) ctx->cf_info.exec_potentially_empty_discard = true; - bool divergent = ctx->cf_info.parent_if.is_divergent || - ctx->cf_info.parent_loop.has_divergent_continue; + bool divergent = + ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue; if 
(ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) { /* we handle discards the same way as jump instructions */ append_logical_end(ctx->block); /* in loops, discard behaves like break */ - Block *linear_target = ctx->cf_info.parent_loop.exit; + Block* linear_target = ctx->cf_info.parent_loop.exit; ctx->block->kind |= block_kind_discard; /* uniform discard - loop ends here */ @@ -5342,7 +5587,8 @@ void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) ctx->program->needs_exact = true; /* save exec somewhere temporarily so that it doesn't get * overwritten before the discard from outer exec masks */ - Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm)); + Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), + Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_discard_if, cond); ctx->block->kind |= block_kind_uses_discard_if; return; @@ -5382,25 +5628,23 @@ enum aco_descriptor_type { }; static bool -should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) { +should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array) +{ if (sampler_dim == GLSL_SAMPLER_DIM_BUF) return false; ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array); - return dim == ac_image_cube || - dim == ac_image_1darray || - dim == ac_image_2darray || + return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray || dim == ac_image_2darraymsaa; } -Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, - enum aco_descriptor_type desc_type, - const nir_tex_instr *tex_instr, bool write) +Temp +get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr, + enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write) { -/* FIXME: we should lower the deref with some new nir_intrinsic_load_desc - std::unordered_map::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index); - if (it != ctx->tex_desc.end()) - return it->second; -*/ + /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc + std::unordered_map::iterator it = ctx->tex_desc.find((uint64_t) desc_type << + 32 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second; + */ Temp index = Temp(); bool index_set = false; unsigned constant_index = 0; @@ -5413,13 +5657,13 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, descriptor_set = 0; base_index = tex_instr->sampler_index; } else { - while(deref_instr->deref_type != nir_deref_type_var) { + while (deref_instr->deref_type != nir_deref_type_var) { unsigned array_size = glsl_get_aoa_size(deref_instr->type); if (!array_size) array_size = 1; assert(deref_instr->deref_type == nir_deref_type_array); - nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index); + nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index); if (const_value) { constant_index += array_size * const_value->u32; } else { @@ -5428,13 +5672,15 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, indirect = bld.as_uniform(indirect); if (array_size != 1) - indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect); + indirect = + bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect); if (!index_set) { index = indirect; index_set = true; } else { - index = 
bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect); + index = + bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect); } } @@ -5447,8 +5693,8 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, Temp list = load_desc_ptr(ctx, descriptor_set); list = convert_pointer_to_64_bit(ctx, list); - struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout; - struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index; + struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout; + struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index; unsigned offset = binding->offset; unsigned stride = binding->size; aco_opcode opcode; @@ -5487,20 +5733,18 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, opcode = aco_opcode::s_load_dwordx4; offset += 64; break; - default: - unreachable("invalid desc_type\n"); + default: unreachable("invalid desc_type\n"); } offset += constant_index * stride; if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset && - (!index_set || binding->immutable_samplers_equal)) { + (!index_set || binding->immutable_samplers_equal)) { if (binding->immutable_samplers_equal) constant_index = 0; - const uint32_t *samplers = radv_immutable_samplers(layout, binding); - uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? - C_008F30_TRUNC_COORD : 0xffffffffu; + const uint32_t* samplers = radv_immutable_samplers(layout, binding); + uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu; return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(samplers[constant_index * 4 + 0] & dword0_mask), Operand(samplers[constant_index * 4 + 1]), @@ -5512,8 +5756,9 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, if (!index_set) { off = bld.copy(bld.def(s1), Operand(offset)); } else { - off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), - bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index))); + off = Operand( + (Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), + bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index))); } Temp res = bld.smem(opcode, bld.def(type), list, off); @@ -5522,103 +5767,80 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, Temp components[8]; for (unsigned i = 0; i < 8; i++) components[i] = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, - Definition(components[0]), - Definition(components[1]), - Definition(components[2]), - Definition(components[3]), - res); + bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]), + Definition(components[2]), Definition(components[3]), res); Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write); - bld.pseudo(aco_opcode::p_split_vector, - bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1), - Definition(components[4]), - Definition(components[5]), - Definition(components[6]), - Definition(components[7]), - desc2); + bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1), + Definition(components[4]), Definition(components[5]), Definition(components[6]), + Definition(components[7]), desc2); - res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), - components[0], components[1], components[2], components[3], 
- components[4], components[5], components[6], components[7]); - } else if (desc_type == ACO_DESC_IMAGE && - ctx->options->has_image_load_dcc_bug && - !tex_instr && !write) { + res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1], + components[2], components[3], components[4], components[5], components[6], + components[7]); + } else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr && + !write) { Temp components[8]; for (unsigned i = 0; i < 8; i++) components[i] = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, - Definition(components[0]), Definition(components[1]), - Definition(components[2]), Definition(components[3]), - Definition(components[4]), Definition(components[5]), - Definition(components[6]), Definition(components[7]), res); + bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]), + Definition(components[2]), Definition(components[3]), Definition(components[4]), + Definition(components[5]), Definition(components[6]), Definition(components[7]), + res); /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a * hardware bug. */ - components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), - components[6], - bld.copy(bld.def(s1), Operand((uint32_t)C_00A018_WRITE_COMPRESS_ENABLE))); + components[6] = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6], + bld.copy(bld.def(s1), Operand((uint32_t)C_00A018_WRITE_COMPRESS_ENABLE))); - res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), - components[0], components[1], components[2], components[3], - components[4], components[5], components[6], components[7]); + res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1], + components[2], components[3], components[4], components[5], components[6], + components[7]); } else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) { Temp components[4]; for (unsigned i = 0; i < 4; i++) components[i] = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, - Definition(components[0]), Definition(components[1]), + bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]), Definition(components[2]), Definition(components[3]), res); /* We want to always use the linear filtering truncation behaviour for * nir_texop_tg4, even if the sampler uses nearest/point filtering. */ - components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), - components[0], Operand((uint32_t)C_008F30_TRUNC_COORD)); + components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0], + Operand((uint32_t)C_008F30_TRUNC_COORD)); - res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - components[0], components[1], components[2], components[3]); + res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1], + components[2], components[3]); } return res; } -static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) +static int +image_type_to_components_count(enum glsl_sampler_dim dim, bool array) { switch (dim) { - case GLSL_SAMPLER_DIM_BUF: - return 1; - case GLSL_SAMPLER_DIM_1D: - return array ? 2 : 1; - case GLSL_SAMPLER_DIM_2D: - return array ? 3 : 2; - case GLSL_SAMPLER_DIM_MS: - return array ? 4 : 3; + case GLSL_SAMPLER_DIM_BUF: return 1; + case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1; + case GLSL_SAMPLER_DIM_2D: return array ? 
3 : 2; + case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3; case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - return 3; + case GLSL_SAMPLER_DIM_CUBE: return 3; case GLSL_SAMPLER_DIM_RECT: - case GLSL_SAMPLER_DIM_SUBPASS: - return 2; - case GLSL_SAMPLER_DIM_SUBPASS_MS: - return 3; - default: - break; + case GLSL_SAMPLER_DIM_SUBPASS: return 2; + case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3; + default: break; } return 0; } - -static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op, - Definition dst, - Temp rsrc, - Operand samp, - std::vector coords, - unsigned wqm_mask=0, - Operand vdata=Operand(v1)) +static MIMG_instruction* +emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp, + std::vector coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1)) { /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */ unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5; @@ -5629,7 +5851,8 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op, if (coords.size() > 1) { coord = bld.tmp(RegType::vgpr, coords.size()); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; for (unsigned i = 0; i < coords.size(); i++) vec->operands[i] = Operand(coords[i]); vec->definitions[0] = Definition(coord); @@ -5659,8 +5882,8 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op, } } - aco_ptr mimg{create_instruction( - op, Format::MIMG, 3 + coords.size(), dst.isTemp())}; + aco_ptr mimg{ + create_instruction(op, Format::MIMG, 3 + coords.size(), dst.isTemp())}; if (dst.isTemp()) mimg->definitions[0] = dst; mimg->operands[0] = Operand(rsrc); @@ -5669,12 +5892,13 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op, for (unsigned i = 0; i < coords.size(); i++) mimg->operands[3 + i] = Operand(coords[i]); - MIMG_instruction *res = mimg.get(); + MIMG_instruction* res = mimg.get(); bld.insert(std::move(mimg)); return res; } -void visit_bvh64_intersect_ray_amd(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -5699,8 +5923,8 @@ void visit_bvh64_intersect_ray_amd(isel_context *ctx, nir_intrinsic_instr *instr args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1)); args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1)); - MIMG_instruction *mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, - Definition(dst), resource, Operand(s4), args); + MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst), + resource, Operand(s4), args); mimg->dim = ac_image_1d; mimg->dmask = 0xf; mimg->unrm = true; @@ -5721,17 +5945,18 @@ void visit_bvh64_intersect_ray_amd(isel_context *ctx, nir_intrinsic_instr *instr * The sample index should be adjusted as follows: * sample_index = (fmask >> (sample_index * 4)) & 0xF; */ -static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector& coords, Operand sample_index, Temp fmask_desc_ptr) +static Temp +adjust_sample_index_using_fmask(isel_context* ctx, bool da, std::vector& coords, + Operand sample_index, Temp fmask_desc_ptr) { Builder bld(ctx->program, ctx->block); Temp fmask = bld.tmp(v1); unsigned dim = ctx->options->chip_class >= GFX10 - ? 
ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da) - : 0; + ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da) + : 0; - MIMG_instruction *load = emit_mimg(bld, aco_opcode::image_load, - Definition(fmask), fmask_desc_ptr, - Operand(s4), coords); + MIMG_instruction* load = emit_mimg(bld, aco_opcode::image_load, Definition(fmask), + fmask_desc_ptr, Operand(s4), coords); load->glc = false; load->dlc = false; load->dmask = 0x1; @@ -5747,7 +5972,8 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec sample_index4 = Operand(0u); } } else if (sample_index.regClass() == s1) { - sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u)); + sample_index4 = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u)); } else { assert(sample_index.regClass() == v1); sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index); @@ -5759,14 +5985,17 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec else if (sample_index4.isConstant() && sample_index4.constantValue() == 28) final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask); else - final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u)); + final_sample = + bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u)); /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK * resource descriptor is 0 (invalid), */ Temp compare = bld.tmp(bld.lm); - bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), - Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc); + bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), Operand(0u), + emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)) + .def(0) + .setHint(vcc); Temp sample_index_v = bld.copy(bld.def(v1), sample_index); @@ -5774,13 +6003,15 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare); } -static std::vector get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type) +static std::vector +get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr, const struct glsl_type* type) { Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa); enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); - ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + ASSERTED bool add_frag_pos = + (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); assert(!add_frag_pos && "Input attachments should be lowered."); bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; @@ -5794,14 +6025,18 @@ static std::vector get_image_coords(isel_context *ctx, const nir_intrinsic /* get sample index */ if (instr->intrinsic == nir_intrinsic_image_deref_load || instr->intrinsic == nir_intrinsic_image_deref_sparse_load) { - nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]); - Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1)); + nir_const_value* sample_cv = nir_src_as_const_value(instr->src[2]); + Operand sample_index = + sample_cv ? 
Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1)); std::vector fmask_load_address; for (unsigned i = 0; i < (is_array ? 3 : 2); i++) fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1)); - Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false); - coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr); + Temp fmask_desc_ptr = + get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_FMASK, nullptr, false); + coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, + sample_index, fmask_desc_ptr); } else { coords[count] = emit_extract_vector(ctx, src2, 0, v1); } @@ -5822,7 +6057,8 @@ static std::vector get_image_coords(isel_context *ctx, const nir_intrinsic instr->intrinsic == nir_intrinsic_image_deref_sparse_load || instr->intrinsic == nir_intrinsic_image_deref_store) { int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3; - bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0; + bool level_zero = + nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0; if (!level_zero) coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa)); @@ -5831,8 +6067,8 @@ static std::vector get_image_coords(isel_context *ctx, const nir_intrinsic return coords; } - -memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics) +memory_sync_info +get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics) { /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */ if (semantics & semantic_atomicrmw) @@ -5848,7 +6084,8 @@ memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class return memory_sync_info(storage, semantics); } -Operand emit_tfe_init(Builder& bld, Temp dst) +Operand +emit_tfe_init(Builder& bld, Temp dst) { Temp tmp = bld.tmp(dst.regClass()); @@ -5867,11 +6104,13 @@ Operand emit_tfe_init(Builder& bld, Temp dst) return Operand(tmp); } -void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); - const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - const struct glsl_type *type = glsl_without_array(var->type); + const nir_variable* var = + nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type* type = glsl_without_array(var->type); const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load; @@ -5881,8 +6120,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) unsigned access = var->data.access | nir_intrinsic_access(instr); unsigned result_size = instr->dest.ssa.num_components - is_sparse; - unsigned expand_mask = nir_ssa_def_components_read(&instr->dest.ssa) & - u_bit_consecutive(0, result_size); + unsigned expand_mask = + nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size); expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */ if (dim == GLSL_SAMPLER_DIM_BUF) expand_mask = (1u << 
util_last_bit(expand_mask)) - 1u; @@ -5911,25 +6150,17 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode opcode; switch (util_bitcount(dmask)) { - case 1: - opcode = aco_opcode::buffer_load_format_x; - break; - case 2: - opcode = aco_opcode::buffer_load_format_xy; - break; - case 3: - opcode = aco_opcode::buffer_load_format_xyz; - break; - case 4: - opcode = aco_opcode::buffer_load_format_xyzw; - break; - default: - unreachable(">4 channel buffer image load"); + case 1: opcode = aco_opcode::buffer_load_format_x; break; + case 2: opcode = aco_opcode::buffer_load_format_xy; break; + case 3: opcode = aco_opcode::buffer_load_format_xyz; break; + case 4: opcode = aco_opcode::buffer_load_format_xyzw; break; + default: unreachable(">4 channel buffer image load"); } - aco_ptr load{create_instruction(opcode, Format::MUBUF, 3 + is_sparse, 1)}; + aco_ptr load{ + create_instruction(opcode, Format::MUBUF, 3 + is_sparse, 1)}; load->operands[0] = Operand(resource); load->operands[1] = Operand(vindex); - load->operands[2] = Operand((uint32_t) 0); + load->operands[2] = Operand((uint32_t)0); load->definitions[0] = Definition(tmp); load->idxen = true; load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT); @@ -5946,8 +6177,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip; Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1); - MIMG_instruction *load = emit_mimg(bld, opcode, Definition(tmp), resource, - Operand(s4), coords, 0, vdata); + MIMG_instruction* load = + emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata); load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0; load->dlc = load->glc && ctx->options->chip_class >= GFX10; load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); @@ -5962,16 +6193,19 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) /* The result components are 64-bit but the sparse residency code is * 32-bit. So add a zero to the end so expand_vector() works correctly. */ - tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size()+1), tmp, Operand(0u)); + tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp, + Operand(0u)); } expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask); } -void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) { - const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - const struct glsl_type *type = glsl_without_array(var->type); + const nir_variable* var = + nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type* type = glsl_without_array(var->type); const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); Temp data = get_ssa_temp(ctx, instr->src[3].ssa); @@ -5983,32 +6217,28 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); unsigned access = var->data.access | nir_intrinsic_access(instr); - bool glc = ctx->options->chip_class == GFX6 || access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 
1 : 0; + bool glc = ctx->options->chip_class == GFX6 || + access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) + ? 1 + : 0; if (dim == GLSL_SAMPLER_DIM_BUF) { - Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true); + Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_BUFFER, nullptr, true); Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); aco_opcode opcode; switch (data.size()) { - case 1: - opcode = aco_opcode::buffer_store_format_x; - break; - case 2: - opcode = aco_opcode::buffer_store_format_xy; - break; - case 3: - opcode = aco_opcode::buffer_store_format_xyz; - break; - case 4: - opcode = aco_opcode::buffer_store_format_xyzw; - break; - default: - unreachable(">4 channel buffer image store"); + case 1: opcode = aco_opcode::buffer_store_format_x; break; + case 2: opcode = aco_opcode::buffer_store_format_xy; break; + case 3: opcode = aco_opcode::buffer_store_format_xyz; break; + case 4: opcode = aco_opcode::buffer_store_format_xyzw; break; + default: unreachable(">4 channel buffer image store"); } - aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; + aco_ptr store{ + create_instruction(opcode, Format::MUBUF, 4, 0)}; store->operands[0] = Operand(rsrc); store->operands[1] = Operand(vindex); - store->operands[2] = Operand((uint32_t) 0); + store->operands[2] = Operand((uint32_t)0); store->operands[3] = Operand(data); store->idxen = true; store->glc = glc; @@ -6022,14 +6252,15 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) assert(data.type() == RegType::vgpr); std::vector coords = get_image_coords(ctx, instr, type); - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_IMAGE, nullptr, true); bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; aco_opcode opcode = level_zero ? 
aco_opcode::image_store : aco_opcode::image_store_mip; Builder bld(ctx->program, ctx->block); - MIMG_instruction *store = emit_mimg(bld, opcode, Definition(), resource, - Operand(s4), coords, 0, Operand(data)); + MIMG_instruction* store = + emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data)); store->glc = glc; store->dlc = false; store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); @@ -6042,11 +6273,13 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) return; } -void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr) { bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); - const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - const struct glsl_type *type = glsl_without_array(var->type); + const nir_variable* var = + nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type* type = glsl_without_array(var->type); const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); Builder bld(ctx->program, ctx->block); @@ -6056,62 +6289,64 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented."); if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) - data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2), get_ssa_temp(ctx, instr->src[4].ssa), data); + data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2), + get_ssa_temp(ctx, instr->src[4].ssa), data); aco_opcode buf_op, buf_op64, image_op; switch (instr->intrinsic) { - case nir_intrinsic_image_deref_atomic_add: - buf_op = aco_opcode::buffer_atomic_add; - buf_op64 = aco_opcode::buffer_atomic_add_x2; - image_op = aco_opcode::image_atomic_add; - break; - case nir_intrinsic_image_deref_atomic_umin: - buf_op = aco_opcode::buffer_atomic_umin; - buf_op64 = aco_opcode::buffer_atomic_umin_x2; - image_op = aco_opcode::image_atomic_umin; - break; - case nir_intrinsic_image_deref_atomic_imin: - buf_op = aco_opcode::buffer_atomic_smin; - buf_op64 = aco_opcode::buffer_atomic_smin_x2; - image_op = aco_opcode::image_atomic_smin; - break; - case nir_intrinsic_image_deref_atomic_umax: - buf_op = aco_opcode::buffer_atomic_umax; - buf_op64 = aco_opcode::buffer_atomic_umax_x2; - image_op = aco_opcode::image_atomic_umax; - break; - case nir_intrinsic_image_deref_atomic_imax: - buf_op = aco_opcode::buffer_atomic_smax; - buf_op64 = aco_opcode::buffer_atomic_smax_x2; - image_op = aco_opcode::image_atomic_smax; - break; - case nir_intrinsic_image_deref_atomic_and: - buf_op = aco_opcode::buffer_atomic_and; - buf_op64 = aco_opcode::buffer_atomic_and_x2; - image_op = aco_opcode::image_atomic_and; - break; - case nir_intrinsic_image_deref_atomic_or: - buf_op = aco_opcode::buffer_atomic_or; - buf_op64 = aco_opcode::buffer_atomic_or_x2; - image_op = aco_opcode::image_atomic_or; - break; - case nir_intrinsic_image_deref_atomic_xor: - buf_op = aco_opcode::buffer_atomic_xor; - buf_op64 = aco_opcode::buffer_atomic_xor_x2; - image_op = aco_opcode::image_atomic_xor; - break; - case nir_intrinsic_image_deref_atomic_exchange: - buf_op = aco_opcode::buffer_atomic_swap; - buf_op64 = aco_opcode::buffer_atomic_swap_x2; - image_op = aco_opcode::image_atomic_swap; - break; - case 
nir_intrinsic_image_deref_atomic_comp_swap: - buf_op = aco_opcode::buffer_atomic_cmpswap; - buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2; - image_op = aco_opcode::image_atomic_cmpswap; - break; - default: - unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions."); + case nir_intrinsic_image_deref_atomic_add: + buf_op = aco_opcode::buffer_atomic_add; + buf_op64 = aco_opcode::buffer_atomic_add_x2; + image_op = aco_opcode::image_atomic_add; + break; + case nir_intrinsic_image_deref_atomic_umin: + buf_op = aco_opcode::buffer_atomic_umin; + buf_op64 = aco_opcode::buffer_atomic_umin_x2; + image_op = aco_opcode::image_atomic_umin; + break; + case nir_intrinsic_image_deref_atomic_imin: + buf_op = aco_opcode::buffer_atomic_smin; + buf_op64 = aco_opcode::buffer_atomic_smin_x2; + image_op = aco_opcode::image_atomic_smin; + break; + case nir_intrinsic_image_deref_atomic_umax: + buf_op = aco_opcode::buffer_atomic_umax; + buf_op64 = aco_opcode::buffer_atomic_umax_x2; + image_op = aco_opcode::image_atomic_umax; + break; + case nir_intrinsic_image_deref_atomic_imax: + buf_op = aco_opcode::buffer_atomic_smax; + buf_op64 = aco_opcode::buffer_atomic_smax_x2; + image_op = aco_opcode::image_atomic_smax; + break; + case nir_intrinsic_image_deref_atomic_and: + buf_op = aco_opcode::buffer_atomic_and; + buf_op64 = aco_opcode::buffer_atomic_and_x2; + image_op = aco_opcode::image_atomic_and; + break; + case nir_intrinsic_image_deref_atomic_or: + buf_op = aco_opcode::buffer_atomic_or; + buf_op64 = aco_opcode::buffer_atomic_or_x2; + image_op = aco_opcode::image_atomic_or; + break; + case nir_intrinsic_image_deref_atomic_xor: + buf_op = aco_opcode::buffer_atomic_xor; + buf_op64 = aco_opcode::buffer_atomic_xor_x2; + image_op = aco_opcode::image_atomic_xor; + break; + case nir_intrinsic_image_deref_atomic_exchange: + buf_op = aco_opcode::buffer_atomic_swap; + buf_op64 = aco_opcode::buffer_atomic_swap_x2; + image_op = aco_opcode::image_atomic_swap; + break; + case nir_intrinsic_image_deref_atomic_comp_swap: + buf_op = aco_opcode::buffer_atomic_cmpswap; + buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2; + image_op = aco_opcode::image_atomic_cmpswap; + break; + default: + unreachable("visit_image_atomic should only be called with " + "nir_intrinsic_image_deref_atomic_* instructions."); } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -6119,8 +6354,10 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) if (dim == GLSL_SAMPLER_DIM_BUF) { Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true); - //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented."); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_BUFFER, nullptr, true); + // assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet + // implemented."); aco_ptr mubuf{create_instruction( is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 
1 : 0)}; mubuf->operands[0] = Operand(resource); @@ -6141,10 +6378,11 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } std::vector coords = get_image_coords(ctx, instr, type); - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_IMAGE, nullptr, true); Definition def = return_previous ? Definition(dst) : Definition(); - MIMG_instruction *mimg = emit_mimg(bld, image_op, def, resource, - Operand(s4), coords, 0, Operand(data)); + MIMG_instruction* mimg = + emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data)); mimg->glc = return_previous; mimg->dlc = false; /* Not needed for atomics */ mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); @@ -6157,7 +6395,8 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) return; } -void get_buffer_size(isel_context *ctx, Temp desc, Temp dst) +void +get_buffer_size(isel_context* ctx, Temp desc, Temp dst) { if (ctx->options->chip_class == GFX8) { /* we only have to divide by 1, 2, 4, 8, 12 or 16 */ @@ -6165,18 +6404,21 @@ void get_buffer_size(isel_context *ctx, Temp desc, Temp dst) Temp size = emit_extract_vector(ctx, desc, 2, s1); - Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size); - size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), bld.as_uniform(size_div3), Operand(1u)); + Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), + bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size); + size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), + bld.as_uniform(size_div3), Operand(1u)); Temp stride = emit_extract_vector(ctx, desc, 1, s1); - stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u)); + stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, + Operand((5u << 16) | 16u)); Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u)); size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12)); Temp shr_dst = dst.type() == RegType::vgpr ? 
bld.tmp(s1) : dst; - bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), - size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride)); + bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size, + bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride)); if (dst.type() == RegType::vgpr) bld.copy(Definition(dst), shr_dst); @@ -6186,16 +6428,19 @@ void get_buffer_size(isel_context *ctx, Temp desc, Temp dst) } } -void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr) { - const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - const struct glsl_type *type = glsl_without_array(var->type); + const nir_variable* var = + nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type* type = glsl_without_array(var->type); const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); Builder bld(ctx->program, ctx->block); if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { - Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, false); + Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_BUFFER, NULL, false); return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa)); } @@ -6204,19 +6449,19 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) std::vector lod{bld.copy(bld.def(v1), Operand(0u))}; /* Resource */ - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, false); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_IMAGE, NULL, false); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - MIMG_instruction *mimg = emit_mimg(bld, aco_opcode::image_get_resinfo, - Definition(dst), resource, Operand(s4), lod); + MIMG_instruction* mimg = + emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod); uint8_t& dmask = mimg->dmask; mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; mimg->da = glsl_sampler_type_is_array(type); - if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && - glsl_sampler_type_is_array(type)) { + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && glsl_sampler_type_is_array(type)) { assert(instr->dest.ssa.num_components == 3); Temp tmp = ctx->program->allocateTmp(v3); @@ -6224,13 +6469,12 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, tmp, 3); /* divide 3rd value by 6 by multiplying with magic number */ - Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB)); - Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c); + Temp c = bld.copy(bld.def(s1), Operand((uint32_t)0x2AAAAAAB)); + Temp by_6 = + bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - emit_extract_vector(ctx, tmp, 0, v1), - emit_extract_vector(ctx, tmp, 1, v1), - by_6); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, tmp, 0, v1), + emit_extract_vector(ctx, tmp, 1, v1), by_6); } else if (ctx->options->chip_class == GFX9 && 
glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D && @@ -6242,14 +6486,18 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, dst, instr->dest.ssa.num_components); } -void get_image_samples(isel_context *ctx, Definition dst, Temp resource) +void +get_image_samples(isel_context* ctx, Definition dst, Temp resource) { Builder bld(ctx->program, ctx->block); Temp dword3 = emit_extract_vector(ctx, resource, 3, s1); - Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16)); - Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2); - Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */)); + Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, + Operand(16u | 4u << 16)); + Temp samples = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2); + Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, + Operand(28u | 4u << 16 /* offset=28, width=4 */)); Operand default_sample = Operand(1u); if (ctx->options->robust_buffer_access) { @@ -6257,7 +6505,8 @@ void get_image_samples(isel_context *ctx, Definition dst, Temp resource) * all zero, then it's a null descriptor. */ Temp dword1 = emit_extract_vector(ctx, resource, 1, s1); - Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u)); + Temp is_non_null_descriptor = + bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u)); default_sample = Operand(is_non_null_descriptor); } @@ -6265,15 +6514,18 @@ void get_image_samples(isel_context *ctx, Definition dst, Temp resource) bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa)); } -void visit_image_samples(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, false); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_IMAGE, NULL, false); get_image_samples(ctx, Definition(dst), resource); } -void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); unsigned num_components = instr->num_components; @@ -6292,7 +6544,8 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) get_memory_sync_info(instr, storage_buffer, 0)); } -void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp data = get_ssa_temp(ctx, instr->src[0].ssa); @@ -6303,13 +6556,14 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); - bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + bool glc = + nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); unsigned write_count = 0; Temp write_datas[32]; unsigned 
offsets[32]; - split_buffer_store(ctx, instr, false, RegType::vgpr, - data, writemask, 16, &write_count, write_datas, offsets); + split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count, + write_datas, offsets); /* GFX6-7 are affected by a hw bug that prevents address clamping to work * correctly when the SGPR offset is used. @@ -6320,10 +6574,11 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); - aco_ptr store{create_instruction(op, Format::MUBUF, 4, 0)}; + aco_ptr store{ + create_instruction(op, Format::MUBUF, 4, 0)}; store->operands[0] = Operand(rsrc); store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t)0); store->operands[3] = Operand(write_datas[i]); store->offset = offsets[i]; store->offen = (offset.type() == RegType::vgpr); @@ -6336,7 +6591,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); @@ -6353,54 +6609,56 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode op32, op64; switch (instr->intrinsic) { - case nir_intrinsic_ssbo_atomic_add: - op32 = aco_opcode::buffer_atomic_add; - op64 = aco_opcode::buffer_atomic_add_x2; - break; - case nir_intrinsic_ssbo_atomic_imin: - op32 = aco_opcode::buffer_atomic_smin; - op64 = aco_opcode::buffer_atomic_smin_x2; - break; - case nir_intrinsic_ssbo_atomic_umin: - op32 = aco_opcode::buffer_atomic_umin; - op64 = aco_opcode::buffer_atomic_umin_x2; - break; - case nir_intrinsic_ssbo_atomic_imax: - op32 = aco_opcode::buffer_atomic_smax; - op64 = aco_opcode::buffer_atomic_smax_x2; - break; - case nir_intrinsic_ssbo_atomic_umax: - op32 = aco_opcode::buffer_atomic_umax; - op64 = aco_opcode::buffer_atomic_umax_x2; - break; - case nir_intrinsic_ssbo_atomic_and: - op32 = aco_opcode::buffer_atomic_and; - op64 = aco_opcode::buffer_atomic_and_x2; - break; - case nir_intrinsic_ssbo_atomic_or: - op32 = aco_opcode::buffer_atomic_or; - op64 = aco_opcode::buffer_atomic_or_x2; - break; - case nir_intrinsic_ssbo_atomic_xor: - op32 = aco_opcode::buffer_atomic_xor; - op64 = aco_opcode::buffer_atomic_xor_x2; - break; - case nir_intrinsic_ssbo_atomic_exchange: - op32 = aco_opcode::buffer_atomic_swap; - op64 = aco_opcode::buffer_atomic_swap_x2; - break; - case nir_intrinsic_ssbo_atomic_comp_swap: - op32 = aco_opcode::buffer_atomic_cmpswap; - op64 = aco_opcode::buffer_atomic_cmpswap_x2; - break; - default: - unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions."); + case nir_intrinsic_ssbo_atomic_add: + op32 = aco_opcode::buffer_atomic_add; + op64 = aco_opcode::buffer_atomic_add_x2; + break; + case nir_intrinsic_ssbo_atomic_imin: + op32 = aco_opcode::buffer_atomic_smin; + op64 = aco_opcode::buffer_atomic_smin_x2; + break; + case nir_intrinsic_ssbo_atomic_umin: + op32 = aco_opcode::buffer_atomic_umin; + op64 = aco_opcode::buffer_atomic_umin_x2; + break; + case nir_intrinsic_ssbo_atomic_imax: + op32 = aco_opcode::buffer_atomic_smax; + op64 = 
aco_opcode::buffer_atomic_smax_x2; + break; + case nir_intrinsic_ssbo_atomic_umax: + op32 = aco_opcode::buffer_atomic_umax; + op64 = aco_opcode::buffer_atomic_umax_x2; + break; + case nir_intrinsic_ssbo_atomic_and: + op32 = aco_opcode::buffer_atomic_and; + op64 = aco_opcode::buffer_atomic_and_x2; + break; + case nir_intrinsic_ssbo_atomic_or: + op32 = aco_opcode::buffer_atomic_or; + op64 = aco_opcode::buffer_atomic_or_x2; + break; + case nir_intrinsic_ssbo_atomic_xor: + op32 = aco_opcode::buffer_atomic_xor; + op64 = aco_opcode::buffer_atomic_xor_x2; + break; + case nir_intrinsic_ssbo_atomic_exchange: + op32 = aco_opcode::buffer_atomic_swap; + op64 = aco_opcode::buffer_atomic_swap_x2; + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + op32 = aco_opcode::buffer_atomic_cmpswap; + op64 = aco_opcode::buffer_atomic_cmpswap_x2; + break; + default: + unreachable( + "visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions."); } aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; mubuf->operands[0] = Operand(rsrc); mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t)0); mubuf->operands[3] = Operand(data); if (return_previous) mubuf->definitions[0] = Definition(dst); @@ -6414,7 +6672,9 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) ctx->block->instructions.emplace_back(std::move(mubuf)); } -void visit_get_ssbo_size(isel_context *ctx, nir_intrinsic_instr *instr) { +void +visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr) +{ Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -6436,15 +6696,15 @@ void visit_get_ssbo_size(isel_context *ctx, nir_intrinsic_instr *instr) { } } -void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); unsigned num_components = instr->num_components; unsigned component_size = instr->dest.ssa.bit_size / 8; LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)), - get_ssa_temp(ctx, &instr->dest.ssa), - num_components, component_size}; + get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size}; info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); @@ -6452,7 +6712,8 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) /* VMEM stores don't update the SMEM cache and it's difficult to prove that * it's safe to use SMEM */ bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE; - if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || !can_use_smem) { + if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || + !can_use_smem) { emit_load(ctx, bld, info, global_load_params); } else { info.offset = Operand(bld.as_uniform(info.offset)); @@ -6460,7 +6721,8 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_store_global(isel_context *ctx, nir_intrinsic_instr 
*instr) +void +visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; @@ -6469,7 +6731,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); Temp addr = get_ssa_temp(ctx, instr->src[1].ssa); memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); - bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + bool glc = + nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); if (ctx->options->chip_class >= GFX7) addr = as_vgpr(ctx, addr); @@ -6477,8 +6740,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) unsigned write_count = 0; Temp write_datas[32]; unsigned offsets[32]; - split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, - 16, &write_count, write_datas, offsets); + split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count, + write_datas, offsets); for (unsigned i = 0; i < write_count; i++) { if (ctx->options->chip_class >= GFX7) { @@ -6490,11 +6753,12 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp carry = bld.tmp(bld.lm); bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr); - bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)), - Operand(offset), addr0); - bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), - Operand(0u), addr1, - carry).def(1).setHint(vcc); + bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), + bld.hint_vcc(Definition(carry)), Operand(offset), addr0); + bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), Operand(0u), + addr1, carry) + .def(1) + .setHint(vcc); store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1); @@ -6504,15 +6768,9 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) bool global = ctx->options->chip_class >= GFX9; aco_opcode op; switch (write_datas[i].bytes()) { - case 1: - op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; - break; - case 2: - op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; - break; - case 4: - op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; - break; + case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break; + case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break; + case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break; case 8: op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2; break; @@ -6522,11 +6780,11 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) case 16: op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4; break; - default: - unreachable("store_global not implemented for this size."); + default: unreachable("store_global not implemented for this size."); } - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)}; + aco_ptr flat{ + create_instruction(op, global ? 
Format::GLOBAL : Format::FLAT, 3, 0)}; flat->operands[0] = Operand(store_addr); flat->operands[1] = Operand(s1); flat->operands[2] = Operand(write_datas[i]); @@ -6544,7 +6802,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = get_gfx6_global_rsrc(bld, addr); - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, 0)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 4, 0)}; mubuf->operands[0] = Operand(rsrc); mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); mubuf->operands[2] = Operand(0u); @@ -6561,7 +6820,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); @@ -6582,52 +6842,54 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) if (ctx->options->chip_class >= GFX7) { bool global = ctx->options->chip_class >= GFX9; switch (instr->intrinsic) { - case nir_intrinsic_global_atomic_add: - op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; - op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; - break; - case nir_intrinsic_global_atomic_imin: - op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; - op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; - break; - case nir_intrinsic_global_atomic_umin: - op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; - op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; - break; - case nir_intrinsic_global_atomic_imax: - op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; - op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; - break; - case nir_intrinsic_global_atomic_umax: - op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; - op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; - break; - case nir_intrinsic_global_atomic_and: - op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; - op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; - break; - case nir_intrinsic_global_atomic_or: - op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; - op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; - break; - case nir_intrinsic_global_atomic_xor: - op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; - op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; - break; - case nir_intrinsic_global_atomic_exchange: - op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; - op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; - break; - case nir_intrinsic_global_atomic_comp_swap: - op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; - op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; - break; - default: - unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + case nir_intrinsic_global_atomic_add: + op32 = global ? 
aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; + op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; + op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; + op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; + op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; + op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; + op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; + op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; + op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; + op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; + op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* " + "instructions."); } aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)}; + aco_ptr flat{create_instruction( + op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 
1 : 0)}; flat->operands[0] = Operand(addr); flat->operands[1] = Operand(s1); flat->operands[2] = Operand(data); @@ -6644,55 +6906,57 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) assert(ctx->options->chip_class == GFX6); switch (instr->intrinsic) { - case nir_intrinsic_global_atomic_add: - op32 = aco_opcode::buffer_atomic_add; - op64 = aco_opcode::buffer_atomic_add_x2; - break; - case nir_intrinsic_global_atomic_imin: - op32 = aco_opcode::buffer_atomic_smin; - op64 = aco_opcode::buffer_atomic_smin_x2; - break; - case nir_intrinsic_global_atomic_umin: - op32 = aco_opcode::buffer_atomic_umin; - op64 = aco_opcode::buffer_atomic_umin_x2; - break; - case nir_intrinsic_global_atomic_imax: - op32 = aco_opcode::buffer_atomic_smax; - op64 = aco_opcode::buffer_atomic_smax_x2; - break; - case nir_intrinsic_global_atomic_umax: - op32 = aco_opcode::buffer_atomic_umax; - op64 = aco_opcode::buffer_atomic_umax_x2; - break; - case nir_intrinsic_global_atomic_and: - op32 = aco_opcode::buffer_atomic_and; - op64 = aco_opcode::buffer_atomic_and_x2; - break; - case nir_intrinsic_global_atomic_or: - op32 = aco_opcode::buffer_atomic_or; - op64 = aco_opcode::buffer_atomic_or_x2; - break; - case nir_intrinsic_global_atomic_xor: - op32 = aco_opcode::buffer_atomic_xor; - op64 = aco_opcode::buffer_atomic_xor_x2; - break; - case nir_intrinsic_global_atomic_exchange: - op32 = aco_opcode::buffer_atomic_swap; - op64 = aco_opcode::buffer_atomic_swap_x2; - break; - case nir_intrinsic_global_atomic_comp_swap: - op32 = aco_opcode::buffer_atomic_cmpswap; - op64 = aco_opcode::buffer_atomic_cmpswap_x2; - break; - default: - unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + case nir_intrinsic_global_atomic_add: + op32 = aco_opcode::buffer_atomic_add; + op64 = aco_opcode::buffer_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = aco_opcode::buffer_atomic_smin; + op64 = aco_opcode::buffer_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = aco_opcode::buffer_atomic_umin; + op64 = aco_opcode::buffer_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = aco_opcode::buffer_atomic_smax; + op64 = aco_opcode::buffer_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = aco_opcode::buffer_atomic_umax; + op64 = aco_opcode::buffer_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = aco_opcode::buffer_atomic_and; + op64 = aco_opcode::buffer_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = aco_opcode::buffer_atomic_or; + op64 = aco_opcode::buffer_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = aco_opcode::buffer_atomic_xor; + op64 = aco_opcode::buffer_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = aco_opcode::buffer_atomic_swap; + op64 = aco_opcode::buffer_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = aco_opcode::buffer_atomic_cmpswap; + op64 = aco_opcode::buffer_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* " + "instructions."); } Temp rsrc = get_gfx6_global_rsrc(bld, addr); aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 4, return_previous ? 
1 : 0)}; mubuf->operands[0] = Operand(rsrc); mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); mubuf->operands[2] = Operand(0u); @@ -6710,7 +6974,8 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_load_buffer(isel_context *ctx, nir_intrinsic_instr *intrin) +void +visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin) { Builder bld(ctx->program, ctx->block); @@ -6728,11 +6993,12 @@ void visit_load_buffer(isel_context *ctx, nir_intrinsic_instr *intrin) unsigned num_components = intrin->dest.ssa.num_components; unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0; - load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, - elem_size_bytes, num_components, swizzle_element_size, !swizzled, reorder, slc); + load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes, + num_components, swizzle_element_size, !swizzled, reorder, slc); } -void visit_store_buffer(isel_context *ctx, nir_intrinsic_instr *intrin) +void +visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin) { Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa); Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa); @@ -6749,31 +7015,28 @@ void visit_store_buffer(isel_context *ctx, nir_intrinsic_instr *intrin) nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin); memory_sync_info sync(mem_mode == nir_var_shader_out ? storage_vmem_output : storage_none); - store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, - elem_size_bytes, write_mask, !swizzled, sync, slc); + store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes, + write_mask, !swizzled, sync, slc); } -sync_scope translate_nir_scope(nir_scope scope) +sync_scope +translate_nir_scope(nir_scope scope) { switch (scope) { case NIR_SCOPE_NONE: - case NIR_SCOPE_INVOCATION: - return scope_invocation; - case NIR_SCOPE_SUBGROUP: - return scope_subgroup; - case NIR_SCOPE_WORKGROUP: - return scope_workgroup; - case NIR_SCOPE_QUEUE_FAMILY: - return scope_queuefamily; - case NIR_SCOPE_DEVICE: - return scope_device; - case NIR_SCOPE_SHADER_CALL: - unreachable("unsupported scope"); + case NIR_SCOPE_INVOCATION: return scope_invocation; + case NIR_SCOPE_SUBGROUP: return scope_subgroup; + case NIR_SCOPE_WORKGROUP: return scope_workgroup; + case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily; + case NIR_SCOPE_DEVICE: return scope_device; + case NIR_SCOPE_SHADER_CALL: unreachable("unsupported scope"); } unreachable("invalid scope"); } -void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { +void +emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr) +{ Builder bld(ctx->program, ctx->block); unsigned semantics = 0; @@ -6787,11 +7050,10 @@ void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory * - additionally, when NGG is used on GFX10+, shared memory is used for certain features */ - bool shared_storage_used = - ctx->stage.hw == HWStage::CS || - ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS || - (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) || - ctx->stage.hw == HWStage::NGG; + bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS || + ctx->stage.hw == HWStage::HS || + (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) || + 
ctx->stage.hw == HWStage::NGG; /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half. * They are allowed in CS, TCS, and in any NGG shader. @@ -6801,7 +7063,7 @@ void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { unsigned nir_storage = nir_intrinsic_memory_modes(instr); if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global)) - storage |= storage_buffer | storage_image; //TODO: split this when NIR gets nir_var_mem_image + storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image if (shared_storage_used && (nir_storage & nir_var_mem_shared)) storage |= storage_shared; @@ -6819,7 +7081,8 @@ void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { exec_scope); } -void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr) { // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read() Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -6832,7 +7095,8 @@ void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr) load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align); } -void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr) { unsigned writemask = nir_intrinsic_write_mask(instr); Temp data = get_ssa_temp(ctx, instr->src[0].ssa); @@ -6843,7 +7107,8 @@ void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr) store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align); } -void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr) { unsigned offset = nir_intrinsic_base(instr); Builder bld(ctx->program, ctx->block); @@ -6853,76 +7118,75 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) unsigned num_operands = 3; aco_opcode op32, op64, op32_rtn, op64_rtn; - switch(instr->intrinsic) { - case nir_intrinsic_shared_atomic_add: - op32 = aco_opcode::ds_add_u32; - op64 = aco_opcode::ds_add_u64; - op32_rtn = aco_opcode::ds_add_rtn_u32; - op64_rtn = aco_opcode::ds_add_rtn_u64; - break; - case nir_intrinsic_shared_atomic_imin: - op32 = aco_opcode::ds_min_i32; - op64 = aco_opcode::ds_min_i64; - op32_rtn = aco_opcode::ds_min_rtn_i32; - op64_rtn = aco_opcode::ds_min_rtn_i64; - break; - case nir_intrinsic_shared_atomic_umin: - op32 = aco_opcode::ds_min_u32; - op64 = aco_opcode::ds_min_u64; - op32_rtn = aco_opcode::ds_min_rtn_u32; - op64_rtn = aco_opcode::ds_min_rtn_u64; - break; - case nir_intrinsic_shared_atomic_imax: - op32 = aco_opcode::ds_max_i32; - op64 = aco_opcode::ds_max_i64; - op32_rtn = aco_opcode::ds_max_rtn_i32; - op64_rtn = aco_opcode::ds_max_rtn_i64; - break; - case nir_intrinsic_shared_atomic_umax: - op32 = aco_opcode::ds_max_u32; - op64 = aco_opcode::ds_max_u64; - op32_rtn = aco_opcode::ds_max_rtn_u32; - op64_rtn = aco_opcode::ds_max_rtn_u64; - break; - case nir_intrinsic_shared_atomic_and: - op32 = aco_opcode::ds_and_b32; - op64 = aco_opcode::ds_and_b64; - op32_rtn = aco_opcode::ds_and_rtn_b32; - op64_rtn = aco_opcode::ds_and_rtn_b64; - break; - case nir_intrinsic_shared_atomic_or: - op32 = aco_opcode::ds_or_b32; - op64 = aco_opcode::ds_or_b64; - op32_rtn = aco_opcode::ds_or_rtn_b32; - op64_rtn = aco_opcode::ds_or_rtn_b64; - break; - case nir_intrinsic_shared_atomic_xor: - op32 = 
aco_opcode::ds_xor_b32; - op64 = aco_opcode::ds_xor_b64; - op32_rtn = aco_opcode::ds_xor_rtn_b32; - op64_rtn = aco_opcode::ds_xor_rtn_b64; - break; - case nir_intrinsic_shared_atomic_exchange: - op32 = aco_opcode::ds_write_b32; - op64 = aco_opcode::ds_write_b64; - op32_rtn = aco_opcode::ds_wrxchg_rtn_b32; - op64_rtn = aco_opcode::ds_wrxchg_rtn_b64; - break; - case nir_intrinsic_shared_atomic_comp_swap: - op32 = aco_opcode::ds_cmpst_b32; - op64 = aco_opcode::ds_cmpst_b64; - op32_rtn = aco_opcode::ds_cmpst_rtn_b32; - op64_rtn = aco_opcode::ds_cmpst_rtn_b64; - num_operands = 4; - break; - case nir_intrinsic_shared_atomic_fadd: - op32 = aco_opcode::ds_add_f32; - op32_rtn = aco_opcode::ds_add_rtn_f32; - op64 = aco_opcode::num_opcodes; - op64_rtn = aco_opcode::num_opcodes; - break; - default: - unreachable("Unhandled shared atomic intrinsic"); + switch (instr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + op32 = aco_opcode::ds_add_u32; + op64 = aco_opcode::ds_add_u64; + op32_rtn = aco_opcode::ds_add_rtn_u32; + op64_rtn = aco_opcode::ds_add_rtn_u64; + break; + case nir_intrinsic_shared_atomic_imin: + op32 = aco_opcode::ds_min_i32; + op64 = aco_opcode::ds_min_i64; + op32_rtn = aco_opcode::ds_min_rtn_i32; + op64_rtn = aco_opcode::ds_min_rtn_i64; + break; + case nir_intrinsic_shared_atomic_umin: + op32 = aco_opcode::ds_min_u32; + op64 = aco_opcode::ds_min_u64; + op32_rtn = aco_opcode::ds_min_rtn_u32; + op64_rtn = aco_opcode::ds_min_rtn_u64; + break; + case nir_intrinsic_shared_atomic_imax: + op32 = aco_opcode::ds_max_i32; + op64 = aco_opcode::ds_max_i64; + op32_rtn = aco_opcode::ds_max_rtn_i32; + op64_rtn = aco_opcode::ds_max_rtn_i64; + break; + case nir_intrinsic_shared_atomic_umax: + op32 = aco_opcode::ds_max_u32; + op64 = aco_opcode::ds_max_u64; + op32_rtn = aco_opcode::ds_max_rtn_u32; + op64_rtn = aco_opcode::ds_max_rtn_u64; + break; + case nir_intrinsic_shared_atomic_and: + op32 = aco_opcode::ds_and_b32; + op64 = aco_opcode::ds_and_b64; + op32_rtn = aco_opcode::ds_and_rtn_b32; + op64_rtn = aco_opcode::ds_and_rtn_b64; + break; + case nir_intrinsic_shared_atomic_or: + op32 = aco_opcode::ds_or_b32; + op64 = aco_opcode::ds_or_b64; + op32_rtn = aco_opcode::ds_or_rtn_b32; + op64_rtn = aco_opcode::ds_or_rtn_b64; + break; + case nir_intrinsic_shared_atomic_xor: + op32 = aco_opcode::ds_xor_b32; + op64 = aco_opcode::ds_xor_b64; + op32_rtn = aco_opcode::ds_xor_rtn_b32; + op64_rtn = aco_opcode::ds_xor_rtn_b64; + break; + case nir_intrinsic_shared_atomic_exchange: + op32 = aco_opcode::ds_write_b32; + op64 = aco_opcode::ds_write_b64; + op32_rtn = aco_opcode::ds_wrxchg_rtn_b32; + op64_rtn = aco_opcode::ds_wrxchg_rtn_b64; + break; + case nir_intrinsic_shared_atomic_comp_swap: + op32 = aco_opcode::ds_cmpst_b32; + op64 = aco_opcode::ds_cmpst_b64; + op32_rtn = aco_opcode::ds_cmpst_rtn_b32; + op64_rtn = aco_opcode::ds_cmpst_rtn_b64; + num_operands = 4; + break; + case nir_intrinsic_shared_atomic_fadd: + op32 = aco_opcode::ds_add_f32; + op32_rtn = aco_opcode::ds_add_rtn_f32; + op64 = aco_opcode::num_opcodes; + op64_rtn = aco_opcode::num_opcodes; + break; + default: unreachable("Unhandled shared atomic intrinsic"); } bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); @@ -6942,7 +7206,8 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } aco_ptr ds; - ds.reset(create_instruction(op, Format::DS, num_operands, return_previous ? 1 : 0)); + ds.reset( + create_instruction(op, Format::DS, num_operands, return_previous ? 
1 : 0)); ds->operands[0] = Operand(address); ds->operands[1] = Operand(data); if (num_operands == 4) { @@ -6957,21 +7222,22 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) ctx->block->instructions.emplace_back(std::move(ds)); } -Temp get_scratch_resource(isel_context *ctx) +Temp +get_scratch_resource(isel_context* ctx) { Builder bld(ctx->program, ctx->block); Temp scratch_addr = ctx->program->private_segment_buffer; if (ctx->stage != compute_cs) scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u)); - uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) | - S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2); + uint32_t rsrc_conf = + S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2); if (ctx->program->chip_class >= GFX10) { rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else if (ctx->program->chip_class <= + GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); } @@ -6980,10 +7246,13 @@ Temp get_scratch_resource(isel_context *ctx) if (ctx->program->chip_class <= GFX8) rsrc_conf |= S_008F0C_ELEMENT_SIZE(1); - return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), + Operand(rsrc_conf)); } -void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { +void +visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) +{ Builder bld(ctx->program, ctx->block); Temp rsrc = get_scratch_resource(ctx); Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); @@ -6999,7 +7268,9 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { emit_load(ctx, bld, info, scratch_load_params); } -void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { +void +visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) +{ Builder bld(ctx->program, ctx->block); Temp rsrc = get_scratch_resource(ctx); Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); @@ -7012,21 +7283,23 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { Temp write_datas[32]; unsigned offsets[32]; unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 
4 : 16; - split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, - swizzle_component_size, &write_count, write_datas, offsets); + split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size, + &write_count, write_datas, offsets); for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); - Instruction *mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); + Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], + offsets[i], true, true); mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private); } } -void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) { +void +visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr) +{ uint8_t log2_ps_iter_samples; if (ctx->program->info->ps.uses_sample_shading) { - log2_ps_iter_samples = - util_logbase2(ctx->options->key.fs.num_samples); + log2_ps_iter_samples = util_logbase2(ctx->options->key.fs.num_samples); } else { log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples; } @@ -7041,31 +7314,34 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) { get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u)); Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, bld.copy(bld.def(v1), Operand(1u))); - bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage)); + bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, + get_arg(ctx, ctx->args->ac.sample_coverage)); } else { bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage)); } } -void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); unsigned stream = nir_intrinsic_stream_id(instr); Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u); - nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]); + nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]); /* get GSVS ring */ - Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u)); + Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), + ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u)); - unsigned num_components = - ctx->program->info->gs.num_stream_output_components[stream]; + unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream]; unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out; unsigned stream_offset = 0; for (unsigned i = 0; i < stream; i++) { - unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out; + unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * + ctx->shader->info.gs.vertices_out; stream_offset += prev_stride * ctx->program->wave_size; } @@ -7075,26 +7351,25 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst Temp gsvs_dwords[4]; for (unsigned i = 0; i < 4; i++) gsvs_dwords[i] = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, - Definition(gsvs_dwords[0]), - Definition(gsvs_dwords[1]), - Definition(gsvs_dwords[2]), - 
Definition(gsvs_dwords[3]), - gsvs_ring); + bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]), + Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring); if (stream_offset) { Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset)); Temp carry = bld.tmp(s1); - gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp); - gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry)); + gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), + gsvs_dwords[0], stream_offset_tmp); + gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), + gsvs_dwords[1], Operand(0u), bld.scc(carry)); } - gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride))); + gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], + Operand(S_008F04_STRIDE(stride))); gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size)); - gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]); + gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1], + gsvs_dwords[2], gsvs_dwords[3]); unsigned offset = 0; for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) { @@ -7112,11 +7387,13 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst if (vaddr_offset.isUndefined()) vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u)); else - vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset); + vaddr_offset = + bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset); const_offset %= 4096u; } - aco_ptr mtbuf{create_instruction(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; + aco_ptr mtbuf{create_instruction( + aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; mtbuf->operands[0] = Operand(gsvs_ring); mtbuf->operands[1] = vaddr_offset; mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset)); @@ -7142,55 +7419,71 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream)); } -Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src) +Temp +emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src) { Builder bld(ctx->program, ctx->block); if (cluster_size == 1) { return src; - } if (op == nir_op_iand && cluster_size == 4) { - //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) - Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); + } + if (op == nir_op_iand && cluster_size == 4) { + /* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */ + Temp tmp = + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp)); } else if (op == nir_op_ior && cluster_size == 4) { - //subgroupClusteredOr(val, 4) -> wqm(val & exec) - return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), - bld.sop2(Builder::s_and, bld.def(bld.lm), 
bld.def(s1, scc), src, Operand(exec, bld.lm))); + /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */ + return bld.sop1( + Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))); } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) { - //subgroupAnd(val) -> (exec & ~val) == 0 - Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + /* subgroupAnd(val) -> (exec & ~val) == 0 */ + Temp tmp = + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) + .def(1) + .getTemp(); Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond); } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) { - //subgroupOr(val) -> (val & exec) != 0 - Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp(); + /* subgroupOr(val) -> (val & exec) != 0 */ + Temp tmp = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)) + .def(1) + .getTemp(); return bool_to_vector_condition(ctx, tmp); } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) { - //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 - Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */ + Temp tmp = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp); - tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp(); + tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)) + .def(1) + .getTemp(); return bool_to_vector_condition(ctx, tmp); } else { - //subgroupClustered{And,Or,Xor}(val, n) -> - //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ; just v_mbcnt_lo_u32_b32 on wave32 - //cluster_offset = ~(n - 1) & lane_id - //cluster_mask = ((1 << n) - 1) - //subgroupClusteredAnd(): - // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask - //subgroupClusteredOr(): - // return ((val & exec) >> cluster_offset) & cluster_mask != 0 - //subgroupClusteredXor(): - // return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0 + /* subgroupClustered{And,Or,Xor}(val, n): + * lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32) + * cluster_offset = ~(n - 1) & lane_id cluster_mask = ((1 << n) - 1) + * subgroupClusteredAnd(): + * return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask + * subgroupClusteredOr(): + * return ((val & exec) >> cluster_offset) & cluster_mask != 0 + * subgroupClusteredXor(): + * return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0 + */ Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1)); - Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id); + Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), + Operand(~uint32_t(cluster_size - 1)), lane_id); Temp tmp; if (op == nir_op_iand) - tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, + 
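The comment above fully specifies the clustered boolean reduction; a scalar model over wave64 ballots makes the bit manipulation easier to follow. Assumptions: val and exec are 64-bit ballots, n is a power-of-two cluster size, and the helper name is hypothetical.

#include <bit>
#include <cstdint>

enum class bool_red { iand, ior, ixor };

/* One lane's result of subgroupClustered{And,Or,Xor}(val, n), following the
 * formula in the comment above (cluster_offset = ~(n - 1) & lane_id). */
static bool
clustered_bool_reduce(uint64_t val, uint64_t exec, unsigned lane_id, unsigned n, bool_red op)
{
   unsigned cluster_offset = ~(n - 1) & lane_id;
   uint64_t cluster_mask = n == 64 ? ~0ull : (1ull << n) - 1ull;
   switch (op) {
   case bool_red::iand: /* inactive lanes must not break the AND, hence "| ~exec" */
      return (((val | ~exec) >> cluster_offset) & cluster_mask) == cluster_mask;
   case bool_red::ior:
      return (((val & exec) >> cluster_offset) & cluster_mask) != 0;
   case bool_red::ixor:
      return (std::popcount(((val & exec) >> cluster_offset) & cluster_mask) & 1) != 0;
   }
   return false;
}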
Operand(exec, bld.lm)); else - tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + tmp = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u; @@ -7205,7 +7498,8 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp); if (op == nir_op_iand) { - return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand(cluster_mask), tmp); + return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand(cluster_mask), + tmp); } else if (op == nir_op_ior) { return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand(0u), tmp); } else if (op == nir_op_ixor) { @@ -7218,17 +7512,20 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te } } -Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src) +Temp +emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src) { Builder bld(ctx->program, ctx->block); assert(src.regClass() == bld.lm); - //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0 - //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0 - //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 + /* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0 + * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0 + * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 + */ Temp tmp; if (op == nir_op_iand) - tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); + tmp = + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); else tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); @@ -7246,13 +7543,15 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src) return Temp(); } -Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src) +Temp +emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src) { Builder bld(ctx->program, ctx->block); - //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val - //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val - //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val + /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val + * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val + * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val + */ Temp tmp = emit_boolean_exclusive_scan(ctx, op, src); if (op == nir_op_iand) return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src); @@ -7265,32 +7564,39 @@ Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src) return Temp(); } -ReduceOp get_reduce_op(nir_op op, unsigned bit_size) +ReduceOp +get_reduce_op(nir_op op, unsigned bit_size) { switch (op) { - #define CASEI(name) case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; - #define CASEF(name) case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? 
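The exclusive boolean scans earlier in this hunk rest on the same mbcnt idea: count the relevant bits strictly below the current lane. A scalar sketch under the same wave64-ballot assumption, with hypothetical helper names:

#include <bit>
#include <cstdint>

/* Number of set bits of 'mask' strictly below 'lane_id' (what v_mbcnt computes). */
static unsigned
mbcnt(uint64_t mask, unsigned lane_id)
{
   return std::popcount(mask & ((1ull << lane_id) - 1ull));
}

/* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0 */
static bool
exclusive_scan_and(uint64_t val, uint64_t exec, unsigned lane_id)
{
   return mbcnt(exec & ~val, lane_id) == 0;
}

/* subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 */
static bool
exclusive_scan_xor(uint64_t val, uint64_t exec, unsigned lane_id)
{
   return (mbcnt(val & exec, lane_id) & 1) != 0;
}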
name##16 : name##64; - CASEI(iadd) - CASEI(imul) - CASEI(imin) - CASEI(umin) - CASEI(imax) - CASEI(umax) - CASEI(iand) - CASEI(ior) - CASEI(ixor) - CASEF(fadd) - CASEF(fmul) - CASEF(fmin) - CASEF(fmax) - default: - unreachable("unknown reduction op"); - #undef CASEI - #undef CASEF +#define CASEI(name) \ + case nir_op_##name: \ + return (bit_size == 32) ? name##32 \ + : (bit_size == 16) ? name##16 \ + : (bit_size == 8) ? name##8 \ + : name##64; +#define CASEF(name) \ + case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; + CASEI(iadd) + CASEI(imul) + CASEI(imin) + CASEI(umin) + CASEI(imax) + CASEI(umax) + CASEI(iand) + CASEI(ior) + CASEI(ixor) + CASEF(fadd) + CASEF(fmul) + CASEF(fmin) + CASEF(fmax) + default: unreachable("unknown reduction op"); +#undef CASEI +#undef CASEF } } -void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src) +void +emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src) { Builder bld(ctx->program, ctx->block); Definition dst(get_ssa_temp(ctx, &instr->dest.ssa)); @@ -7301,7 +7607,8 @@ void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp s bld.copy(dst, src); } -void emit_addition_uniform_reduce(isel_context *ctx, nir_op op, Definition dst, nir_src src, Temp count) +void +emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count) { Builder bld(ctx->program, ctx->block); Temp src_tmp = get_ssa_temp(ctx, src.ssa); @@ -7329,8 +7636,7 @@ void emit_addition_uniform_reduce(isel_context *ctx, nir_op op, Definition dst, src_tmp = bld.as_uniform(src_tmp); if (op == nir_op_ixor && count.type() == RegType::sgpr) - count = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), - count, Operand(1u)); + count = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand(1u)); else if (op == nir_op_ixor) count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), count); @@ -7360,7 +7666,8 @@ void emit_addition_uniform_reduce(isel_context *ctx, nir_op op, Definition dst, } } -bool emit_uniform_reduce(isel_context *ctx, nir_intrinsic_instr *instr) +bool +emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr) { nir_op op = (nir_op)nir_intrinsic_reduction_op(instr); if (op == nir_op_imul || op == nir_op_fmul) @@ -7373,8 +7680,8 @@ bool emit_uniform_reduce(isel_context *ctx, nir_intrinsic_instr *instr) if (bit_size > 32) return false; - Temp thread_count = bld.sop1( - Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm)); + Temp thread_count = + bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm)); emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count); } else { @@ -7384,7 +7691,8 @@ bool emit_uniform_reduce(isel_context *ctx, nir_intrinsic_instr *instr) return true; } -bool emit_uniform_scan(isel_context *ctx, nir_intrinsic_instr *instr) +bool +emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Definition dst(get_ssa_temp(ctx, &instr->dest.ssa)); @@ -7408,18 +7716,15 @@ bool emit_uniform_scan(isel_context *ctx, nir_intrinsic_instr *instr) return true; } - assert(op == nir_op_imin || op == nir_op_umin || - op == nir_op_imax || op == nir_op_umax || - op == nir_op_iand || op == nir_op_ior || - op == nir_op_fmin || op == nir_op_fmax); + assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax || + op == nir_op_iand || op == 
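emit_addition_uniform_reduce above exploits that a wave-uniform operand makes the cross-lane work trivial: an iadd reduction is just the value scaled by the number of active lanes, and an ixor reduction keeps only the parity of that count (the s_and_b32 with 1 above). A scalar sketch with hypothetical helpers:

#include <bit>
#include <cstdint>

static uint32_t
uniform_iadd_reduce(uint32_t value, uint64_t exec)
{
   return value * (uint32_t)std::popcount(exec); /* thread_count = s_bcnt1(exec) */
}

static uint32_t
uniform_ixor_reduce(uint32_t value, uint64_t exec)
{
   return value * (uint32_t)(std::popcount(exec) & 1); /* only the parity survives */
}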
nir_op_ior || op == nir_op_fmin || op == nir_op_fmax); if (inc) { emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa)); return true; } - /* Copy the source and write the reduction operation identity to the first - * lane. */ + /* Copy the source and write the reduction operation identity to the first lane. */ Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)); Temp src = get_ssa_temp(ctx, instr->src[0].ssa); ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size); @@ -7440,8 +7745,9 @@ bool emit_uniform_scan(isel_context *ctx, nir_intrinsic_instr *instr) return true; } -Temp emit_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, - unsigned cluster_size, Definition dst, Temp src) +Temp +emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size, + Definition dst, Temp src) { assert(src.bytes() <= 8); assert(src.type() == RegType::vgpr); @@ -7454,14 +7760,13 @@ Temp emit_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */ /* scalar identity temporary */ - bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) && aco_op != aco_opcode::p_reduce; + bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) && + aco_op != aco_opcode::p_reduce; if (aco_op == aco_opcode::p_exclusive_scan) { - need_sitmp |= - (op == imin8 || op == imin16 || op == imin32 || op == imin64 || - op == imax8 || op == imax16 || op == imax32 || op == imax64 || - op == fmin16 || op == fmin32 || op == fmin64 || - op == fmax16 || op == fmax32 || op == fmax64 || - op == fmul16 || op == fmul64); + need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 || + op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 || + op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 || + op == fmul64); } if (need_sitmp) defs[num_defs++] = bld.def(RegType::sgpr, dst.size()); @@ -7481,7 +7786,8 @@ Temp emit_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, if (clobber_vcc) defs[num_defs++] = bld.def(bld.lm, vcc); - Pseudo_reduction_instruction *reduce = create_instruction(aco_op, Format::PSEUDO_REDUCTION, 3, num_defs); + Pseudo_reduction_instruction* reduce = create_instruction( + aco_op, Format::PSEUDO_REDUCTION, 3, num_defs); reduce->operands[0] = Operand(src); /* setup_reduce_temp will update these undef operands if needed */ reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear()); @@ -7495,7 +7801,8 @@ Temp emit_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, return dst.getTemp(); } -void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2) +void +emit_interp_center(isel_context* ctx, Temp dst, Temp pos1, Temp pos2) { Builder bld(ctx->program, ctx->block); Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center); @@ -7529,7 +7836,8 @@ void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2) } /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */ - aco_opcode mad = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; + aco_opcode mad = + ctx->program->chip_class >= GFX10_3 ? 
aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1); Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2); tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1); @@ -7542,14 +7850,15 @@ void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2) return; } -Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i); -void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt); -static void create_vs_exports(isel_context *ctx); +Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i); +void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt); +static void create_vs_exports(isel_context* ctx); -void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); - switch(instr->intrinsic) { + switch (instr->intrinsic) { case nir_intrinsic_load_barycentric_sample: case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: { @@ -7573,14 +7882,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) bary = get_arg(ctx, ctx->args->ac.linear_sample); break; - default: - break; + default: break; } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp p1 = emit_extract_vector(ctx, bary, 0, v1); Temp p2 = emit_extract_vector(ctx, bary, 1, v1); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - Operand(p1), Operand(p2)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2)); emit_split_vector(ctx, dst, 2); break; } @@ -7591,55 +7898,64 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp p1 = emit_extract_vector(ctx, model, 0, v1); Temp p2 = emit_extract_vector(ctx, model, 1, v1); Temp p3 = emit_extract_vector(ctx, model, 2, v1); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - Operand(p1), Operand(p2), Operand(p3)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2), + Operand(p3)); emit_split_vector(ctx, dst, 3); break; } case nir_intrinsic_load_barycentric_at_sample: { uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16; switch (ctx->options->key.fs.num_samples) { - case 2: sample_pos_offset += 1 << 3; break; - case 4: sample_pos_offset += 3 << 3; break; - case 8: sample_pos_offset += 7 << 3; break; - default: break; + case 2: sample_pos_offset += 1 << 3; break; + case 4: sample_pos_offset += 3 << 3; break; + case 8: sample_pos_offset += 7 << 3; break; + default: break; } Temp sample_pos; Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]); Temp private_segment_buffer = ctx->program->private_segment_buffer; - //TODO: bounds checking? + // TODO: bounds checking? 
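The sample-position fetch around here is an offset computation into the PS sample-positions ring: each position is 8 bytes, and the 2x/4x/8x tables are stacked so they start 1, 3 and 7 entries in (the switch above). A sketch of the byte-offset math, hypothetical helper:

#include <cstdint>

static uint32_t
sample_pos_byte_offset(uint32_t ring_base /* RING_PS_SAMPLE_POSITIONS * 16 */,
                       uint32_t num_samples, uint32_t sample_index)
{
   uint32_t offset = ring_base;
   switch (num_samples) {
   case 2: offset += 1u << 3; break;
   case 4: offset += 3u << 3; break;
   case 8: offset += 7u << 3; break;
   default: break;
   }
   return offset + (sample_index << 3); /* the s_lshl3_add_u32 / v_lshlrev path below */
}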
if (addr.type() == RegType::sgpr) { Operand offset; if (const_addr) { sample_pos_offset += const_addr->u32 << 3; offset = Operand(sample_pos_offset); } else if (ctx->options->chip_class >= GFX9) { - offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset)); + offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, + Operand(sample_pos_offset)); } else { - offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u)); - offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset)); + offset = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u)); + offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, + Operand(sample_pos_offset)); } Operand off = bld.copy(bld.def(s1), Operand(offset)); - sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off); + sample_pos = + bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off); } else if (ctx->options->chip_class >= GFX9) { addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); - sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset); + sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, + private_segment_buffer, sample_pos_offset); } else if (ctx->options->chip_class >= GFX7) { /* addr += private_segment_buffer + sample_pos_offset */ Temp tmp0 = bld.tmp(s1); Temp tmp1 = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), + private_segment_buffer); Definition scc_tmp = bld.def(s1, scc); - tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset)); - tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp())); + tmp0 = + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset)); + tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), + bld.scc(scc_tmp.getTemp())); addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); Temp pck0 = bld.tmp(v1); Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp(); tmp1 = as_vgpr(ctx, tmp1); - Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry); + Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), + bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry); addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1); /* sample_pos = flat_load_dwordx2 addr */ @@ -7649,14 +7965,16 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf)); + Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, + Operand(0u), Operand(rsrc_conf)); addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u)); sample_pos = bld.tmp(v2); - aco_ptr 
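The pre-GFX9 path above has to assemble a 64-bit address out of 32-bit halves: add with carry-out on the low dword, then add-with-carry on the high dword, first on the SALU and again on the VALU. A scalar model of that split addition, hypothetical helper:

#include <cstdint>

static void
add64_split(uint32_t lo_a, uint32_t hi_a, uint64_t b, uint32_t* lo_out, uint32_t* hi_out)
{
   uint64_t lo_sum = (uint64_t)lo_a + (uint32_t)b;
   *lo_out = (uint32_t)lo_sum;
   uint32_t carry = (uint32_t)(lo_sum >> 32);    /* s_add_u32 leaves this in SCC */
   *hi_out = hi_a + (uint32_t)(b >> 32) + carry; /* s_addc_u32 consumes it */
}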
load{create_instruction(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)}; + aco_ptr load{create_instruction( + aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)}; load->definitions[0] = Definition(sample_pos); load->operands[0] = Operand(rsrc); load->operands[1] = Operand(addr); @@ -7690,14 +8008,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_front_face: { bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc); + Operand(0u), get_arg(ctx, ctx->args->ac.front_face)) + .def(0) + .setHint(vcc); break; } case nir_intrinsic_load_view_index: { - if (ctx->stage.has(SWStage::VS) || - ctx->stage.has(SWStage::GS) || - ctx->stage.has(SWStage::TCS) || - ctx->stage.has(SWStage::TES)) { + if (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::GS) || + ctx->stage.has(SWStage::TCS) || ctx->stage.has(SWStage::TES)) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index))); break; @@ -7725,48 +8043,22 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u)); break; } - case nir_intrinsic_load_tess_coord: - visit_load_tess_coord(ctx, instr); - break; - case nir_intrinsic_load_interpolated_input: - visit_load_interpolated_input(ctx, instr); - break; - case nir_intrinsic_store_output: - visit_store_output(ctx, instr); - break; + case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break; + case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break; + case nir_intrinsic_store_output: visit_store_output(ctx, instr); break; case nir_intrinsic_load_input: - case nir_intrinsic_load_input_vertex: - visit_load_input(ctx, instr); - break; - case nir_intrinsic_load_per_vertex_input: - visit_load_per_vertex_input(ctx, instr); - break; - case nir_intrinsic_load_ubo: - visit_load_ubo(ctx, instr); - break; - case nir_intrinsic_load_push_constant: - visit_load_push_constant(ctx, instr); - break; - case nir_intrinsic_load_constant: - visit_load_constant(ctx, instr); - break; - case nir_intrinsic_vulkan_resource_index: - visit_load_resource(ctx, instr); - break; + case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break; + case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break; + case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break; + case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break; + case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break; + case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break; case nir_intrinsic_terminate: - case nir_intrinsic_discard: - visit_discard(ctx, instr); - break; + case nir_intrinsic_discard: visit_discard(ctx, instr); break; case nir_intrinsic_terminate_if: - case nir_intrinsic_discard_if: - visit_discard_if(ctx, instr); - break; - case nir_intrinsic_load_shared: - visit_load_shared(ctx, instr); - break; - case nir_intrinsic_store_shared: - visit_store_shared(ctx, instr); - break; + case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break; + case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break; + case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break; case nir_intrinsic_shared_atomic_add: case nir_intrinsic_shared_atomic_imin: case 
nir_intrinsic_shared_atomic_umin: @@ -7777,16 +8069,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_shared_atomic_xor: case nir_intrinsic_shared_atomic_exchange: case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_shared_atomic_fadd: - visit_shared_atomic(ctx, instr); - break; + case nir_intrinsic_shared_atomic_fadd: visit_shared_atomic(ctx, instr); break; case nir_intrinsic_image_deref_load: - case nir_intrinsic_image_deref_sparse_load: - visit_image_load(ctx, instr); - break; - case nir_intrinsic_image_deref_store: - visit_image_store(ctx, instr); - break; + case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break; + case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break; case nir_intrinsic_image_deref_atomic_add: case nir_intrinsic_image_deref_atomic_umin: case nir_intrinsic_image_deref_atomic_imin: @@ -7796,33 +8082,15 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - visit_image_atomic(ctx, instr); - break; - case nir_intrinsic_image_deref_size: - visit_image_size(ctx, instr); - break; - case nir_intrinsic_image_deref_samples: - visit_image_samples(ctx, instr); - break; - case nir_intrinsic_load_ssbo: - visit_load_ssbo(ctx, instr); - break; - case nir_intrinsic_store_ssbo: - visit_store_ssbo(ctx, instr); - break; - case nir_intrinsic_load_global: - visit_load_global(ctx, instr); - break; - case nir_intrinsic_load_buffer_amd: - visit_load_buffer(ctx, instr); - break; - case nir_intrinsic_store_buffer_amd: - visit_store_buffer(ctx, instr); - break; - case nir_intrinsic_store_global: - visit_store_global(ctx, instr); - break; + case nir_intrinsic_image_deref_atomic_comp_swap: visit_image_atomic(ctx, instr); break; + case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break; + case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break; + case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break; + case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break; + case nir_intrinsic_load_global: visit_load_global(ctx, instr); break; + case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break; + case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break; + case nir_intrinsic_store_global: visit_store_global(ctx, instr); break; case nir_intrinsic_global_atomic_add: case nir_intrinsic_global_atomic_imin: case nir_intrinsic_global_atomic_umin: @@ -7832,9 +8100,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_global_atomic_or: case nir_intrinsic_global_atomic_xor: case nir_intrinsic_global_atomic_exchange: - case nir_intrinsic_global_atomic_comp_swap: - visit_global_atomic(ctx, instr); - break; + case nir_intrinsic_global_atomic_comp_swap: visit_global_atomic(ctx, instr); break; case nir_intrinsic_ssbo_atomic_add: case nir_intrinsic_ssbo_atomic_imin: case nir_intrinsic_ssbo_atomic_umin: @@ -7844,21 +8110,11 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_ssbo_atomic_or: case nir_intrinsic_ssbo_atomic_xor: case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - visit_atomic_ssbo(ctx, instr); - break; - case nir_intrinsic_load_scratch: - visit_load_scratch(ctx, instr); - break; - case 
nir_intrinsic_store_scratch: - visit_store_scratch(ctx, instr); - break; - case nir_intrinsic_get_ssbo_size: - visit_get_ssbo_size(ctx, instr); - break; - case nir_intrinsic_scoped_barrier: - emit_scoped_barrier(ctx, instr); - break; + case nir_intrinsic_ssbo_atomic_comp_swap: visit_atomic_ssbo(ctx, instr); break; + case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break; + case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break; + case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break; + case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break; case nir_intrinsic_load_num_workgroups: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups))); @@ -7873,7 +8129,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_workgroup_id: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - struct ac_arg *args = ctx->args->ac.workgroup_ids; + struct ac_arg* args = ctx->args->ac.workgroup_ids; bld.pseudo(aco_opcode::p_create_vector, Definition(dst), args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u), args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u), @@ -7883,7 +8139,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_local_invocation_index: { if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.vs_rel_patch_id)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.vs_rel_patch_id)); break; } else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) { bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx)); @@ -7896,26 +8153,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) * we need this multiplied by the wave size, and then OR the thread id to it. 
*/ if (ctx->program->wave_size == 64) { - /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */ - Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), - get_arg(ctx, ctx->args->ac.tg_size)); - bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id); + /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just + * feed that to v_or */ + Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), + Operand(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size)); + bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, + id); } else { - /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */ + /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */ Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16))); - bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id); + bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + tg_num, Operand(0x5u), id); } break; } case nir_intrinsic_load_subgroup_id: { if (ctx->stage == compute_cs) { - bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16))); + bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size), + Operand(0x6u | (0x6u << 16))); } else if (ctx->stage.hw == HWStage::NGG) { /* Get the id of the current wave within the threadgroup (workgroup) */ - bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(24u | (4u << 16))); + bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), + Operand(24u | (4u << 16))); } else { bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u)); } @@ -7927,11 +8189,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_num_subgroups: { if (ctx->stage == compute_cs) - bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), - get_arg(ctx, ctx->args->ac.tg_size)); + bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), Operand(0x3fu), get_arg(ctx, ctx->args->ac.tg_size)); else if (ctx->stage.hw == HWStage::NGG) - bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(28u | (4u << 16))); + bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), + Operand(28u | (4u << 16))); else bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u)); break; @@ -7968,7 +8231,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) emit_uniform_subgroup(ctx, instr, src); } else { Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); - if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1])) + if 
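Both branches above decode the same thing: the tg_size argument keeps the wave's index within the workgroup in bits [6..11], so the flat local invocation index is wave_id * wave_size | lane_id. A scalar sketch of that decoding, field layout as used above:

#include <cstdint>

static uint32_t
local_invocation_index(uint32_t tg_size, uint32_t lane_id, uint32_t wave_size)
{
   uint32_t wave_id = (tg_size >> 6) & 0x3f; /* s_bfe_u32 tg_size, 6 | (6 << 16) */
   if (wave_size == 64)
      return (tg_size & 0xfc0u) | lane_id;   /* wave_id is already shifted by 6, i.e. *64 */
   return (wave_id << 5) | lane_id;          /* wave32: v_lshl_or_b32 by 5 */
}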
(instr->intrinsic == nir_intrinsic_read_invocation || + !nir_src_is_divergent(instr->src[1])) tid = bld.as_uniform(tid); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -7979,7 +8243,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp tmp = bld.tmp(v1); tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp); if (dst.type() == RegType::vgpr) - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? v3b : v2b), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), + bld.def(src.regClass() == v1b ? v3b : v2b), tmp); else bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); } else if (src.regClass() == v1) { @@ -8006,7 +8271,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src); tmp = emit_extract_vector(ctx, tmp, 0, v1); tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp); - emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst); + emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), + dst); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -8026,9 +8292,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) { - emit_wqm(bld, - bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), - dst); + emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst); } else if (src.regClass() == v2) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); @@ -8052,7 +8316,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) assert(src.regClass() == bld.lm); assert(dst.regClass() == bld.lm); - Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + Temp tmp = + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) + .def(1) + .getTemp(); Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond); break; @@ -8072,13 +8339,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_exclusive_scan: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - nir_op op = (nir_op) nir_intrinsic_reduction_op(instr); - unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ? - nir_intrinsic_cluster_size(instr) : 0; - cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size)); + nir_op op = (nir_op)nir_intrinsic_reduction_op(instr); + unsigned cluster_size = + instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0; + cluster_size = util_next_power_of_two( + MIN2(cluster_size ? 
cluster_size : ctx->program->wave_size, ctx->program->wave_size)); - if (!nir_src_is_divergent(instr->src[0]) && - cluster_size == ctx->program->wave_size && instr->dest.ssa.bit_size != 1) { + if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size && + instr->dest.ssa.bit_size != 1) { /* We use divergence analysis to assign the regclass, so check if it's * working as expected */ ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan; @@ -8113,8 +8381,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_inclusive_scan: emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst); break; - default: - assert(false); + default: assert(false); } } else if (cluster_size == 1) { bld.copy(Definition(dst), src); @@ -8127,14 +8394,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode aco_op; switch (instr->intrinsic) { - case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break; - case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break; - case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break; - default: - unreachable("unknown reduce intrinsic"); + case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break; + case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break; + case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break; + default: unreachable("unknown reduce intrinsic"); } - Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, bld.def(dst.regClass()), src); + Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, + bld.def(dst.regClass()), src); emit_wqm(bld, tmp_dst, dst); } break; @@ -8155,31 +8422,39 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) assert(src.regClass() == bld.lm); assert(dst.regClass() == bld.lm); uint32_t half_mask = 0x11111111u << lane; - Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask)); + Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), + Operand(half_mask)); Temp tmp = bld.tmp(bld.lm); bld.sop1(Builder::s_wqm, Definition(tmp), bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, - bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)))); + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, + Operand(exec, bld.lm)))); emit_wqm(bld, tmp, dst); } else if (instr->dest.ssa.bit_size == 8) { Temp tmp = bld.tmp(v1); if (ctx->program->chip_class >= GFX8) emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); else - emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp); + emit_wqm(bld, + bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), + tmp); bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp); } else if (instr->dest.ssa.bit_size == 16) { Temp tmp = bld.tmp(v1); if (ctx->program->chip_class >= GFX8) emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); else - emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp); + emit_wqm(bld, + bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), + tmp); bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); } else if (instr->dest.ssa.bit_size == 32) { if 
(ctx->program->chip_class >= GFX8) emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst); else - emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst); + emit_wqm(bld, + bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), + dst); } else if (instr->dest.ssa.bit_size == 64) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); @@ -8187,8 +8462,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) lo = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl)); hi = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl)); } else { - lo = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl)); - hi = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl)); + lo = emit_wqm( + bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl)); + hi = emit_wqm( + bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl)); } bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); emit_split_vector(ctx, dst, 2); @@ -8209,20 +8486,11 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } uint16_t dpp_ctrl = 0; switch (instr->intrinsic) { - case nir_intrinsic_quad_swap_horizontal: - dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); - break; - case nir_intrinsic_quad_swap_vertical: - dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); - break; - case nir_intrinsic_quad_swap_diagonal: - dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); - break; - case nir_intrinsic_quad_swizzle_amd: - dpp_ctrl = nir_intrinsic_swizzle_mask(instr); - break; - default: - break; + case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break; + case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break; + case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break; + case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break; + default: break; } if (ctx->program->chip_class < GFX8) dpp_ctrl |= (1 << 15); @@ -8234,7 +8502,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) if (instr->dest.ssa.bit_size == 1) { assert(src.regClass() == bld.lm); - src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); + src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), + Operand((uint32_t)-1), src); if (ctx->program->chip_class >= GFX8) src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); else @@ -8293,7 +8562,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) if (instr->dest.ssa.bit_size == 1) { assert(src.regClass() == bld.lm); - src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); + src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), + Operand((uint32_t)-1), src); src = emit_masked_swizzle(ctx, bld, src, mask); Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); emit_wqm(bld, tmp, dst); @@ -8353,8 +8623,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); assert(dst.regClass() == v1); assert(ctx->program->chip_class >= GFX8); - bld.vop3(aco_opcode::v_perm_b32, Definition(dst), - get_ssa_temp(ctx, instr->src[0].ssa), + 
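The dpp_quad_perm controls selected above are per-quad source permutations: within each aligned group of four lanes, lane (base + i) reads from lane (base + perm[i]), so {1,0,3,2} swaps horizontally, {2,3,0,1} vertically and {3,2,1,0} diagonally. A scalar model of the lane selection, hypothetical helper:

#include <cstdint>

static unsigned
quad_perm_source_lane(unsigned lane, const unsigned perm[4])
{
   unsigned base = lane & ~3u; /* first lane of this quad */
   return base + perm[lane & 3u];
}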
bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa), as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)), as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa))); break; @@ -8368,7 +8637,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.copy(Definition(dst), src); } else if (dst.regClass() == v1 && src.regClass() == v1) { bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src, - bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)), bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa))); + bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)), + bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa))); } else { isel_err(&instr->instr, "Unimplemented lane_permute_16_amd"); } @@ -8395,7 +8665,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_demote_if: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); assert(src.regClass() == bld.lm); - Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + Temp cond = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_demote_to_helper, cond); if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) @@ -8418,20 +8689,22 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_elect: { Temp first = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)); - emit_wqm(bld, bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand(1u), first), + emit_wqm(bld, + bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand(1u), first), get_ssa_temp(ctx, &instr->dest.ssa)); break; } case nir_intrinsic_shader_clock: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP && ctx->options->chip_class >= GFX10_3) { + if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP && + ctx->options->chip_class >= GFX10_3) { /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */ Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand(0u)); } else { - aco_opcode opcode = - nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ? - aco_opcode::s_memrealtime : aco_opcode::s_memtime; + aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE + ? 
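The shader_clock path just above reads the SHADER_CYCLES counter with s_getreg_b32; the immediate packs the hardware register id in the low bits and the field size minus one at bit 11, with the field offset in between, which is where the ((20 - 1) << 11) | 29 constant comes from. A sketch of that packing, assuming the usual ID/OFFSET/SIZE field layout:

#include <cstdint>

static uint32_t
getreg_imm(uint32_t hwreg_id, uint32_t offset, uint32_t size)
{
   return hwreg_id | (offset << 6) | ((size - 1) << 11);
}

// getreg_imm(29 /* SHADER_CYCLES */, 0, 20) == ((20 - 1) << 11) | 29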
aco_opcode::s_memrealtime + : aco_opcode::s_memtime; bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile)); } emit_split_vector(ctx, dst, 2); @@ -8467,12 +8740,13 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { if (ctx->options->chip_class >= GFX10) - bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id)); + bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), + get_arg(ctx, ctx->args->ac.gs_invocation_id)); else bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id)); } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { - bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), - get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u)); + bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids), + Operand(8u), Operand(5u)); } else { unreachable("Unsupported stage for load_invocation_id"); } @@ -8494,7 +8768,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; default: if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) { - /* In case of NGG, the GS threads always have the primitive ID even if there is no SW GS. */ + /* In case of NGG, the GS threads always have the primitive ID + * even if there is no SW GS. */ bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id)); break; } @@ -8519,7 +8794,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_end_primitive_with_counter: { if (ctx->stage.hw != HWStage::NGG) { unsigned stream = nir_intrinsic_stream_id(instr); - bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream)); + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, + sendmsg_gs(true, false, stream)); } break; } @@ -8538,7 +8814,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_load_ring_tess_factors_offset_amd: { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.tcs_factor_offset)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.tcs_factor_offset)); break; } case nir_intrinsic_load_ring_tess_offchip_amd: { @@ -8547,7 +8824,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_load_ring_tess_offchip_offset_amd: { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.tess_offchip_offset)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.tess_offchip_offset)); break; } case nir_intrinsic_load_ring_esgs_amd: { @@ -8557,12 +8835,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_load_ring_es2gs_offset_amd: { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.es2gs_offset)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.es2gs_offset)); break; } case nir_intrinsic_load_gs_vertex_offset_amd: { unsigned b = nir_intrinsic_base(instr); - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.gs_vtx_offset[b])); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.gs_vtx_offset[b])); break; } case nir_intrinsic_has_input_vertex_amd: @@ -8575,9 +8855,11 @@ void visit_intrinsic(isel_context *ctx, 
nir_intrinsic_instr *instr) case nir_intrinsic_load_workgroup_num_input_vertices_amd: case nir_intrinsic_load_workgroup_num_input_primitives_amd: { assert(ctx->stage.hw == HWStage::NGG); - unsigned pos = instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22; - bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(pos | (9u << 16u))); + unsigned pos = + instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22; + bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info), + Operand(pos | (9u << 16u))); break; } case nir_intrinsic_load_initial_edgeflag_amd: { @@ -8586,11 +8868,13 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) unsigned i = nir_src_as_uint(instr->src[0]); Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id); - bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), gs_invocation_id, Operand(8u + i), Operand(1u)); + bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + gs_invocation_id, Operand(8u + i), Operand(1u)); break; } case nir_intrinsic_load_packed_passthrough_primitive_amd: { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.gs_vtx_offset[0])); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.gs_vtx_offset[0])); break; } case nir_intrinsic_export_vertex_amd: { @@ -8602,8 +8886,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) assert(ctx->stage.hw == HWStage::NGG); Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa); bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1), - 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, - false /* compressed */, true/* done */, false /* valid mask */); + 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, + true /* done */, false /* valid mask */); break; } case nir_intrinsic_alloc_vertices_and_primitives_amd: { @@ -8618,21 +8902,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa); Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa); Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val))); - bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u, true); + bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u, + true); break; } case nir_intrinsic_load_shader_query_enabled_amd: { unsigned cmp_bit = 0; - Temp shader_query_enabled = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), get_arg(ctx, ctx->args->ngg_gs_state), Operand(cmp_bit)); - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bool_to_vector_condition(ctx, shader_query_enabled)); + Temp shader_query_enabled = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), + get_arg(ctx, ctx->args->ngg_gs_state), Operand(cmp_bit)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bool_to_vector_condition(ctx, shader_query_enabled)); break; } - case nir_intrinsic_load_sbt_amd: - visit_load_sbt_amd(ctx, instr); - break; - case nir_intrinsic_bvh64_intersect_ray_amd: - visit_bvh64_intersect_ray_amd(ctx, instr); - break; + case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break; + case 
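Several SGPR bitfield reads in this hunk use the s_bfe_u32 immediate form "offset | (size << 16)"; for the NGG path above, gs_tg_info carries the workgroup's input vertex count in a 9-bit field at bit 12 and the input primitive count in a 9-bit field at bit 22. A scalar model of the extraction, hypothetical helper:

#include <cstdint>

static uint32_t
sgpr_bfe(uint32_t value, uint32_t offset, uint32_t size)
{
   return (value >> offset) & ((1u << size) - 1u);
}

// sgpr_bfe(gs_tg_info, 12, 9) -> input vertices; sgpr_bfe(gs_tg_info, 22, 9) -> input primitives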
nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break; default: isel_err(&instr->instr, "Unimplemented intrinsic instr"); abort(); @@ -8641,13 +8924,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } } - -void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, - Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr, - enum glsl_base_type *stype) +void +tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr, + Temp* fmask_ptr, enum glsl_base_type* stype) { - nir_deref_instr *texture_deref_instr = NULL; - nir_deref_instr *sampler_deref_instr = NULL; + nir_deref_instr* texture_deref_instr = NULL; + nir_deref_instr* sampler_deref_instr = NULL; int plane = -1; for (unsigned i = 0; i < instr->num_srcs; i++) { @@ -8658,11 +8940,8 @@ void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, case nir_tex_src_sampler_deref: sampler_deref_instr = nir_src_as_deref(instr->src[i].src); break; - case nir_tex_src_plane: - plane = nir_src_as_int(instr->src[i].src); - break; - default: - break; + case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break; + default: break; } } @@ -8672,11 +8951,11 @@ void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, sampler_deref_instr = texture_deref_instr; if (plane >= 0) { - assert(instr->op != nir_texop_txf_ms && - instr->op != nir_texop_samples_identical); - assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); - *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false); - } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical); + assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, + (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false); + } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false); } else if (instr->op == nir_texop_fragment_mask_fetch) { *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false); @@ -8695,26 +8974,25 @@ void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)}; Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)}; bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]), - Definition(img[2]), Definition(img[3]), Definition(img[4]), - Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr); + Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]), + Definition(img[6]), Definition(img[7]), *res_ptr); bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]), Definition(samp[2]), Definition(samp[3]), *samp_ptr); samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]); - *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), - img[0], img[1], img[2], img[3], - img[4], img[5], img[6], img[7]); - *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - samp[0], samp[1], samp[2], samp[3]); + *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2], + img[3], img[4], img[5], img[6], img[7]); + *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2], + samp[3]); } } - if (fmask_ptr && (instr->op == nir_texop_txf_ms || - instr->op == 
nir_texop_samples_identical)) + if (fmask_ptr && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical)) *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false); } -void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, - Temp *out_ma, Temp *out_sc, Temp *out_tc) +void +build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc, + Temp* out_tc) { Builder bld(ctx->program, ctx->block); @@ -8727,28 +9005,30 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, Operand two(0x40000000u); Operand four(0x40800000u); - Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma); + Temp is_ma_positive = + bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma); Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive); Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma); Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id); Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id); is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z); - Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y); + Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), + bld.def(s1, scc), is_ma_z, is_ma_y); - // select sc + /* select sc */ Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x); - Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), - bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), - one, is_ma_y); + Temp sgn = bld.vop2_e64( + aco_opcode::v_cndmask_b32, bld.def(v1), + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y); *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); - // select tc + /* select tc */ tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y); sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y); *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); - // select ma + /* select ma */ tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y), deriv_z, is_ma_z); @@ -8756,24 +9036,29 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp); } -void prepare_cube_coords(isel_context *ctx, std::vector& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array) +void +prepare_cube_coords(isel_context* ctx, std::vector& coords, Temp* ddx, Temp* ddy, + bool is_deriv, bool is_array) { Builder bld(ctx->program, ctx->block); Temp ma, tc, sc, id; - aco_opcode madak = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32; - aco_opcode madmk = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32; + aco_opcode madak = + ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32; + aco_opcode madmk = + ctx->program->chip_class >= GFX10_3 ? 
aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32; if (is_array) { coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]); - // see comment in ac_prepare_cube_coords() + /* see comment in ac_prepare_cube_coords() */ if (ctx->options->chip_class <= GFX8) coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]); } ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]); - aco_ptr vop3a{create_instruction(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)}; + aco_ptr vop3a{ + create_instruction(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)}; vop3a->operands[0] = Operand(ma); vop3a->abs[0] = true; Temp invma = bld.tmp(v1); @@ -8782,11 +9067,11 @@ void prepare_cube_coords(isel_context *ctx, std::vector& coords, Temp* ddx sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]); if (!is_deriv) - sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/)); + sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand(0x3fc00000u /*1.5*/)); tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]); if (!is_deriv) - tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/)); + tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand(0x3fc00000u /*1.5*/)); id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]); @@ -8795,69 +9080,70 @@ void prepare_cube_coords(isel_context *ctx, std::vector& coords, Temp* ddx tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma); for (unsigned i = 0; i < 2; i++) { - // see comment in ac_prepare_cube_coords() + /* see comment in ac_prepare_cube_coords() */ Temp deriv_ma; Temp deriv_sc, deriv_tc; - build_cube_select(ctx, ma, id, i ? *ddy : *ddx, - &deriv_ma, &deriv_sc, &deriv_tc); + build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc); deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma); Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), - bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma), - bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc)); + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc)); Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), - bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma), - bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc)); + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc)); *(i ? 
ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y); } - sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc); - tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc); + sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u /*1.5*/), sc); + tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u /*1.5*/), tc); } if (is_array) - id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/)); + id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand(0x41000000u /*8.0*/)); coords.resize(3); coords[0] = sc; coords[1] = tc; coords[2] = id; } -void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4]) +void +get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4]) { if (vec->parent_instr->type != nir_instr_type_alu) return; - nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr); + nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr); if (vec_instr->op != nir_op_vec(vec->num_components)) return; for (unsigned i = 0; i < vec->num_components; i++) { - cv[i] = vec_instr->src[i].swizzle[0] == 0 ? - nir_src_as_const_value(vec_instr->src[i].src) : NULL; + cv[i] = + vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL; } } -void visit_tex(isel_context *ctx, nir_tex_instr *instr) +void +visit_tex(isel_context* ctx, nir_tex_instr* instr) { Builder bld(ctx->program, ctx->block); bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, - has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false, - has_clamped_lod = false; + has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, + has_sample_index = false, has_clamped_lod = false; Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(), - lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), - clamped_lod = Temp(); + lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), + clamped_lod = Temp(); std::vector coords; std::vector derivs; - nir_const_value *sample_index_cv = NULL; - nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL}; + nir_const_value* sample_index_cv = NULL; + nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL}; enum glsl_base_type stype; tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype); bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 && (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT); - bool tg4_integer_cube_workaround = tg4_integer_workarounds && - instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; + bool tg4_integer_cube_workaround = + tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; for (unsigned i = 0; i < instr->num_srcs; i++) { switch (instr->src[i].src_type) { @@ -8910,8 +9196,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) break; case nir_tex_src_texture_offset: case nir_tex_src_sampler_offset: - default: - break; + default: break; } } @@ -8940,10 +9225,12 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) continue; acc = emit_extract_vector(ctx, offset, i, s1); - acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu)); + acc = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu)); if (i) { - acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i)); + acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), 
bld.def(s1, scc), acc, + Operand(8u * i)); } if (pack == Temp()) { @@ -8954,7 +9241,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } if (pack_const && pack != Temp()) - pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack); + pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), + Operand(pack_const), pack); } else { for (unsigned i = 0; i < offset.size(); i++) { if (const_offset[i]) @@ -8986,7 +9274,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components) - prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod); + prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, + instr->is_array && instr->op != nir_texop_lod); /* pack derivatives */ if (has_ddx || has_ddy) { @@ -9003,32 +9292,26 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) has_derivs = true; } - if (instr->coord_components > 1 && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->is_array && - instr->op != nir_texop_txf) + if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array && instr->op != nir_texop_txf) coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]); if (instr->coord_components > 2 && - (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || - instr->sampler_dim == GLSL_SAMPLER_DIM_MS || - instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || - instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && - instr->is_array && - instr->op != nir_texop_txf && - instr->op != nir_texop_txf_ms && - instr->op != nir_texop_fragment_fetch && - instr->op != nir_texop_fragment_mask_fetch) + (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && + instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms && + instr->op != nir_texop_fragment_fetch && instr->op != nir_texop_fragment_mask_fetch) coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]); - if (ctx->options->chip_class == GFX9 && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->op != nir_texop_lod && instr->coord_components) { assert(coords.size() > 0 && coords.size() < 3); - coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ? - Operand((uint32_t) 0) : - Operand((uint32_t) 0x3f000000))); + coords.insert( + std::next(coords.begin()), + bld.copy(bld.def(v1), instr->op == nir_texop_txf ? 
Operand((uint32_t)0) + : Operand((uint32_t)0x3f000000))); } bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array); @@ -9038,9 +9321,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && - instr->op != nir_texop_txs && - instr->op != nir_texop_fragment_fetch && - instr->op != nir_texop_fragment_mask_fetch) { + instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch && + instr->op != nir_texop_fragment_mask_fetch) { assert(has_sample_index); Operand op(sample_index); if (sample_index_cv) @@ -9062,9 +9344,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) dmask = u_bit_consecutive(0, util_last_bit(dmask)); if (instr->is_sparse) dmask = MAX2(dmask, 1) | 0x10; - unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF - ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array) - : 0; + unsigned dim = + ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF + ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array) + : 0; Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp tmp_dst = dst; @@ -9079,7 +9362,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4); } else if (instr->op == nir_texop_samples_identical) { tmp_dst = bld.tmp(v1); - } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) { + } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || + dst.type() == RegType::sgpr) { tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask))); } @@ -9087,20 +9371,15 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (!has_lod) lod = bld.copy(bld.def(v1), Operand(0u)); - bool div_by_6 = instr->op == nir_texop_txs && - instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && - instr->is_array && - (dmask & (1 << 2)); + bool div_by_6 = instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + instr->is_array && (dmask & (1 << 2)); if (tmp_dst.id() == dst.id() && div_by_6) tmp_dst = bld.tmp(tmp_dst.regClass()); - MIMG_instruction *tex = emit_mimg(bld, aco_opcode::image_get_resinfo, - Definition(tmp_dst), resource, Operand(s4), - std::vector{lod}); - if (ctx->options->chip_class == GFX9 && - instr->op == nir_texop_txs && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->is_array) { + MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst), + resource, Operand(s4), std::vector{lod}); + if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) { tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1); } else if (instr->op == nir_texop_query_levels) { tex->dmask = 1 << 3; @@ -9113,15 +9392,14 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (div_by_6) { /* divide 3rd value by 6 by multiplying with magic number */ emit_split_vector(ctx, tmp_dst, tmp_dst.size()); - Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB)); - Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c); + Temp c = bld.copy(bld.def(s1), Operand((uint32_t)0x2AAAAAAB)); + Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), + emit_extract_vector(ctx, tmp_dst, 2, v1), c); assert(instr->dest.ssa.num_components == 3); Temp tmp = dst.type() == 
RegType::vgpr ? dst : bld.tmp(v3); tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), emit_extract_vector(ctx, tmp_dst, 0, v1), - emit_extract_vector(ctx, tmp_dst, 1, v1), - by_6); - + emit_extract_vector(ctx, tmp_dst, 1, v1), by_6); } expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); @@ -9133,9 +9411,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (tg4_integer_workarounds) { Temp tg4_lod = bld.copy(bld.def(v1), Operand(0u)); Temp size = bld.tmp(v2); - MIMG_instruction *tex = emit_mimg(bld, aco_opcode::image_get_resinfo, - Definition(size), resource, Operand(s4), - std::vector{tg4_lod}); + MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size), + resource, Operand(s4), std::vector{tg4_lod}); tex->dim = dim; tex->dmask = 0x3; tex->da = da; @@ -9146,7 +9423,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) half_texel[i] = emit_extract_vector(ctx, size, i, v1); half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]); half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]); - half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]); + half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000 /*-0.5*/), + half_texel[i]); } if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) { @@ -9158,25 +9436,24 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) * radv_init_sampler(). */ unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1; - Temp not_needed = bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand(bit_idx)); + Temp not_needed = + bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand(bit_idx)); not_needed = bool_to_vector_condition(ctx, not_needed); half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - Operand(0xbf000000/*-0.5*/), half_texel[0], not_needed); + Operand(0xbf000000 /*-0.5*/), half_texel[0], not_needed); half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - Operand(0xbf000000/*-0.5*/), half_texel[1], not_needed); + Operand(0xbf000000 /*-0.5*/), half_texel[1], not_needed); } - Temp new_coords[2] = { - bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]), - bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1]) - }; + Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]), + bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])}; if (tg4_integer_cube_workaround) { - // see comment in ac_nir_to_llvm.c's lower_gather4_integer() - Temp *const desc = (Temp *)alloca(resource.size() * sizeof(Temp)); - aco_ptr split{create_instruction(aco_opcode::p_split_vector, - Format::PSEUDO, 1, resource.size())}; + /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */ + Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp)); + aco_ptr split{create_instruction( + aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())}; split->operands[0] = Operand(resource); for (unsigned i = 0; i < resource.size(); i++) { desc[i] = bld.tmp(s1); @@ -9184,21 +9461,22 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } ctx->block->instructions.emplace_back(std::move(split)); - Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16))); + Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], + Operand(20u | (6u << 
16))); Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt, Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8)); Temp nfmt; if (stype == GLSL_TYPE_UINT) { - nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), - Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED), - Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT), - bld.scc(compare_cube_wa)); + nfmt = + bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa)); } else { - nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), - Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED), - Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT), - bld.scc(compare_cube_wa)); + nfmt = + bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa)); } tg4_compare_cube_wa64 = bld.tmp(bld.lm); bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64); @@ -9209,46 +9487,42 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Operand((uint32_t)C_008F14_NUM_FORMAT)); desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, - Format::PSEUDO, resource.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)}; for (unsigned i = 0; i < resource.size(); i++) vec->operands[i] = Operand(desc[i]); resource = bld.tmp(resource.regClass()); vec->definitions[0] = Definition(resource); ctx->block->instructions.emplace_back(std::move(vec)); - new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - new_coords[0], coords[0], tg4_compare_cube_wa64); - new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - new_coords[1], coords[1], tg4_compare_cube_wa64); + new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0], + tg4_compare_cube_wa64); + new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1], + tg4_compare_cube_wa64); } coords[0] = new_coords[0]; coords[1] = new_coords[1]; } if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { - //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe() + // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return + // ac_build_buffer_load_format_gfx9_safe() assert(coords.size() == 1); aco_opcode op; switch (util_last_bit(dmask & 0xf)) { - case 1: - op = aco_opcode::buffer_load_format_x; break; - case 2: - op = aco_opcode::buffer_load_format_xy; break; - case 3: - op = aco_opcode::buffer_load_format_xyz; break; - case 4: - op = aco_opcode::buffer_load_format_xyzw; break; - default: - unreachable("Tex instruction loads more than 4 components."); + case 1: op = aco_opcode::buffer_load_format_x; break; + case 2: op = aco_opcode::buffer_load_format_xy; break; + case 3: op = aco_opcode::buffer_load_format_xyz; break; + case 4: op = aco_opcode::buffer_load_format_xyzw; break; + default: unreachable("Tex instruction loads more than 4 components."); } - aco_ptr mubuf{create_instruction( - op, Format::MUBUF, 3 + instr->is_sparse, 1)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 3 + instr->is_sparse, 1)}; mubuf->operands[0] = Operand(resource); mubuf->operands[1] = Operand(coords[0]); - mubuf->operands[2] = Operand((uint32_t) 0); + mubuf->operands[2] = Operand((uint32_t)0); 
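The image_get_resinfo path a few lines up divides a cube array's reported depth by 6 by multiplying with 0x2AAAAAAB and keeping the high 32 bits (v_mul_hi_i32). A minimal host-side C++ sketch of that trick, with a hypothetical helper name and test loop, only to make the constant's role explicit:

#include <cassert>
#include <cstdint>

/* Host-side model of the multiply-high trick: 0x2AAAAAAB is ceil(2^32 / 6), so the
 * upper 32 bits of x * 0x2AAAAAAB equal x / 6 for the non-negative values used here. */
static uint32_t div_by_6(uint32_t x)
{
   return (uint32_t)(((int64_t)x * 0x2AAAAAABll) >> 32);
}

int main()
{
   for (uint32_t layers = 0; layers < 1000; layers++)
      assert(div_by_6(layers * 6) == layers); /* cube arrays report 6 * layer_count as depth */
   return 0;
}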
mubuf->definitions[0] = Definition(tmp_dst); mubuf->idxen = true; mubuf->tfe = instr->is_sparse; @@ -9284,16 +9558,16 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (has_clamped_lod) args.emplace_back(clamped_lod); - - if (instr->op == nir_texop_txf || - instr->op == nir_texop_txf_ms || - instr->op == nir_texop_samples_identical || - instr->op == nir_texop_fragment_fetch || + if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical || instr->op == nir_texop_fragment_fetch || instr->op == nir_texop_fragment_mask_fetch) { - aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip; + aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS + ? aco_opcode::image_load + : aco_opcode::image_load_mip; Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); - MIMG_instruction *tex = emit_mimg(bld, op, Definition(tmp_dst), resource, - Operand(s4), args, 0, vdata); + MIMG_instruction* tex = + emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata); tex->dim = dim; tex->dmask = dmask & 0xf; tex->unrm = true; @@ -9304,7 +9578,9 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) assert(dmask == 1 && dst.regClass() == bld.lm); assert(dst.id() != tmp_dst.id()); - bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand(0u), tmp_dst).def(0).setHint(vcc); + bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand(0u), tmp_dst) + .def(0) + .setHint(vcc); } else { expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); } @@ -9421,14 +9697,13 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) opcode = aco_opcode::image_get_lod; } - bool implicit_derivs = bld.program->stage == fragment_fs && - !has_derivs && !has_lod && !level_zero && - instr->sampler_dim != GLSL_SAMPLER_DIM_MS && + bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod && + !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS && instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS; Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); - MIMG_instruction *tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, - Operand(sampler), args, implicit_derivs ? wqm_mask : 0, vdata); + MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler), + args, implicit_derivs ? wqm_mask : 0, vdata); tex->dim = dim; tex->dmask = dmask & 0xf; tex->da = da; @@ -9447,30 +9722,30 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]); else cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]); - val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64); + val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, + tg4_compare_cube_wa64); } Temp tmp = dst.regClass() == tmp_dst.regClass() ? 
dst : bld.tmp(tmp_dst.regClass()); if (instr->is_sparse) - tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), - val[0], val[1], val[2], val[3], - emit_extract_vector(ctx, tmp_dst, 4, v1)); + tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2], + val[3], emit_extract_vector(ctx, tmp_dst, 4, v1)); else - tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), - val[0], val[1], val[2], val[3]); + tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2], + val[3]); } unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask; expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask); - } - -Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical) +Operand +get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical) { Temp tmp = get_ssa_temp(ctx, ssa); if (ssa->parent_instr->type == nir_instr_type_ssa_undef) { return Operand(rc); - } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) { + } else if (logical && ssa->bit_size == 1 && + ssa->parent_instr->type == nir_instr_type_load_const) { if (ctx->program->wave_size == 64) return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u); else @@ -9480,7 +9755,8 @@ Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool l } } -void visit_phi(isel_context *ctx, nir_phi_instr *instr) +void +visit_phi(isel_context* ctx, nir_phi_instr* instr) { aco_ptr phi; Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -9492,17 +9768,19 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) /* we want a sorted list of sources, since the predecessor list is also sorted */ std::map phi_src; - nir_foreach_phi_src(src, instr) + nir_foreach_phi_src (src, instr) phi_src[src->pred->index] = src->src.ssa; std::vector& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds; unsigned num_operands = 0; - Operand *const operands = (Operand *)alloca((std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand)); + Operand* const operands = (Operand*)alloca( + (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand)); unsigned num_defined = 0; unsigned cur_pred_idx = 0; - for (std::pair src : phi_src) { + for (std::pair src : phi_src) { if (cur_pred_idx < preds.size()) { - /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */ + /* handle missing preds (IF merges with discard/break) and extra preds + * (loop exit with discard) */ unsigned block = ctx->cf_info.nir_to_aco[src.first]; unsigned skipped = 0; while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block) @@ -9533,18 +9811,19 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) * this operand later in visit_loop() if it's not necessary or replace the * undef with something correct. 
*/ if (!logical && ctx->block->kind & block_kind_loop_header) { - nir_loop *loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent); - nir_block *last = nir_loop_last_block(loop); + nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent); + nir_block* last = nir_loop_last_block(loop); if (last->successors[0] != instr->instr.block) operands[num_operands++] = Operand(RegClass()); } /* we can use a linear phi in some cases if one src is undef */ if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) { - phi.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1)); + phi.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, + num_operands, 1)); - Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]]; - Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]]; + Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]]; + Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]]; assert(invert->kind & block_kind_invert); unsigned then_block = invert->linear_preds[0]; @@ -9572,8 +9851,8 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi)); } - -void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr) +void +visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->def); @@ -9582,7 +9861,8 @@ void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr) if (dst.size() == 1) { Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u)); } else { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; for (unsigned i = 0; i < dst.size(); i++) vec->operands[i] = Operand(0u); vec->definitions[0] = Definition(dst); @@ -9590,9 +9870,10 @@ void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr) } } -void begin_loop(isel_context *ctx, loop_context *lc) +void +begin_loop(isel_context* ctx, loop_context* lc) { - //TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true + // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true append_logical_end(ctx->block); ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; Builder bld(ctx->program, ctx->block); @@ -9603,7 +9884,7 @@ void begin_loop(isel_context *ctx, loop_context *lc) ctx->program->next_loop_depth++; - Block *loop_header = ctx->program->create_and_insert_block(); + Block* loop_header = ctx->program->create_and_insert_block(); loop_header->kind |= block_kind_loop_header; add_edge(loop_preheader_idx, loop_header); ctx->block = loop_header; @@ -9617,15 +9898,18 @@ void begin_loop(isel_context *ctx, loop_context *lc) lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false); } -void end_loop(isel_context *ctx, loop_context *lc) +void +end_loop(isel_context* ctx, loop_context* lc) { - //TODO: what if a loop ends with a unconditional or uniformly branched continue and this branch is never taken? + // TODO: what if a loop ends with a unconditional or uniformly branched continue + // and this branch is never taken? 
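The phi lowering above pairs a source list keyed by NIR predecessor index with the block's sorted predecessor list and fills undef operands for predecessors that have no source (edges added by discards or breaks). A simplified, self-contained C++ sketch of that alignment; the struct and function names are hypothetical and plain ints stand in for SSA values:

#include <cstdio>
#include <map>
#include <vector>

struct PhiOperand {
   bool undef;
   int value; /* stand-in for an SSA value id */
};

/* Walk the sorted predecessor list; any predecessor without a matching source gets an
 * undef operand, so the phi's operand count always equals the predecessor count. */
static std::vector<PhiOperand>
align_phi_operands(const std::vector<unsigned>& preds, const std::map<unsigned, int>& sources)
{
   std::vector<PhiOperand> ops;
   for (unsigned pred : preds) {
      auto it = sources.find(pred);
      ops.push_back(it == sources.end() ? PhiOperand{true, 0} : PhiOperand{false, it->second});
   }
   return ops;
}

int main()
{
   std::vector<unsigned> preds = {2, 5, 7};              /* sorted predecessor block indices */
   std::map<unsigned, int> sources = {{2, 10}, {7, 11}}; /* no source for block 5 */
   for (const PhiOperand& op : align_phi_operands(preds, sources))
      printf(op.undef ? "undef " : "%%%d ", op.value);
   printf("\n");
   return 0;
}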
if (!ctx->cf_info.has_branch) { unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx; Builder bld(ctx->program, ctx->block); append_logical_end(ctx->block); - if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) { + if (ctx->cf_info.exec_potentially_empty_discard || + ctx->cf_info.exec_potentially_empty_break) { /* Discards can result in code running with an empty exec mask. * This would result in divergent breaks not ever being taken. As a * workaround, break the loop when the loop mask is empty instead of @@ -9634,14 +9918,14 @@ void end_loop(isel_context *ctx, loop_context *lc) unsigned block_idx = ctx->block->index; /* create helper blocks to avoid critical edges */ - Block *break_block = ctx->program->create_and_insert_block(); + Block* break_block = ctx->program->create_and_insert_block(); break_block->kind = block_kind_uniform; bld.reset(break_block); bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); add_linear_edge(block_idx, break_block); add_linear_edge(break_block->index, &lc->loop_exit); - Block *continue_block = ctx->program->create_and_insert_block(); + Block* continue_block = ctx->program->create_and_insert_block(); continue_block->kind = block_kind_uniform; bld.reset(continue_block); bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); @@ -9671,7 +9955,7 @@ void end_loop(isel_context *ctx, loop_context *lc) ctx->block = ctx->program->insert_block(std::move(lc->loop_exit)); append_logical_start(ctx->block); - #if 0 +#if 0 // TODO: check if it is beneficial to not branch on continues /* trim linear phis in loop header */ for (auto&& instr : loop_entry->instructions) { @@ -9690,7 +9974,7 @@ void end_loop(isel_context *ctx, loop_context *lc) break; } } - #endif +#endif ctx->cf_info.parent_loop.header_idx = lc->header_idx_old; ctx->cf_info.parent_loop.exit = lc->exit_old; @@ -9701,10 +9985,11 @@ void end_loop(isel_context *ctx, loop_context *lc) ctx->cf_info.exec_potentially_empty_discard = false; } -void emit_loop_jump(isel_context *ctx, bool is_break) +void +emit_loop_jump(isel_context* ctx, bool is_break) { Builder bld(ctx->program, ctx->block); - Block *logical_target; + Block* logical_target; append_logical_end(ctx->block); unsigned idx = ctx->block->index; @@ -9766,64 +10051,45 @@ void emit_loop_jump(isel_context *ctx, bool is_break) ctx->block = continue_block; } -void emit_loop_break(isel_context *ctx) +void +emit_loop_break(isel_context* ctx) { emit_loop_jump(ctx, true); } -void emit_loop_continue(isel_context *ctx) +void +emit_loop_continue(isel_context* ctx) { emit_loop_jump(ctx, false); } -void visit_jump(isel_context *ctx, nir_jump_instr *instr) +void +visit_jump(isel_context* ctx, nir_jump_instr* instr) { /* visit_block() would usually do this but divergent jumps updates ctx->block */ ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index; switch (instr->type) { - case nir_jump_break: - emit_loop_break(ctx); - break; - case nir_jump_continue: - emit_loop_continue(ctx); - break; - default: - isel_err(&instr->instr, "Unknown NIR jump instr"); - abort(); + case nir_jump_break: emit_loop_break(ctx); break; + case nir_jump_continue: emit_loop_continue(ctx); break; + default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort(); } } -void visit_block(isel_context *ctx, nir_block *block) +void +visit_block(isel_context* ctx, nir_block* block) { - nir_foreach_instr(instr, block) { + nir_foreach_instr (instr, block) { switch (instr->type) { - case nir_instr_type_alu: - 
visit_alu_instr(ctx, nir_instr_as_alu(instr)); - break; - case nir_instr_type_load_const: - visit_load_const(ctx, nir_instr_as_load_const(instr)); - break; - case nir_instr_type_intrinsic: - visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); - break; - case nir_instr_type_tex: - visit_tex(ctx, nir_instr_as_tex(instr)); - break; - case nir_instr_type_phi: - visit_phi(ctx, nir_instr_as_phi(instr)); - break; - case nir_instr_type_ssa_undef: - visit_undef(ctx, nir_instr_as_ssa_undef(instr)); - break; - case nir_instr_type_deref: - break; - case nir_instr_type_jump: - visit_jump(ctx, nir_instr_as_jump(instr)); - break; - default: - isel_err(instr, "Unknown NIR instr type"); - //abort(); + case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break; + case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break; + case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break; + case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break; + case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break; + case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break; + case nir_instr_type_deref: break; + case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break; + default: isel_err(instr, "Unknown NIR instr type"); } } @@ -9831,10 +10097,9 @@ void visit_block(isel_context *ctx, nir_block *block) ctx->cf_info.nir_to_aco[block->index] = ctx->block->index; } - - -static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned last, - aco_ptr& header_phi, Operand *vals) +static Operand +create_continue_phis(isel_context* ctx, unsigned first, unsigned last, + aco_ptr& header_phi, Operand* vals) { vals[0] = Operand(header_phi->definitions[0].getTemp()); RegClass rc = vals[0].regClass(); @@ -9878,11 +10143,12 @@ static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned return vals[last - first]; } -static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond); -static void begin_uniform_if_else(isel_context *ctx, if_context *ic); -static void end_uniform_if(isel_context *ctx, if_context *ic); +static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond); +static void begin_uniform_if_else(isel_context* ctx, if_context* ic); +static void end_uniform_if(isel_context* ctx, if_context* ic); -static void visit_loop(isel_context *ctx, nir_loop *loop) +static void +visit_loop(isel_context* ctx, nir_loop* loop) { loop_context lc; begin_loop(ctx, &lc); @@ -9927,13 +10193,14 @@ static void visit_loop(isel_context *ctx, nir_loop *loop) * merge block would get CSE'd */ if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) { unsigned num_vals = ctx->cf_info.has_branch ? 
1 : (ctx->block->index - loop_header_idx + 1); - Operand *const vals = (Operand *)alloca(num_vals * sizeof(Operand)); + Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand)); for (aco_ptr& instr : ctx->program->blocks[loop_header_idx].instructions) { if (instr->opcode == aco_opcode::p_linear_phi) { if (ctx->cf_info.has_branch) instr->operands.pop_back(); else - instr->operands.back() = create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals); + instr->operands.back() = + create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals); } else if (!is_phi(instr)) { break; } @@ -9943,7 +10210,8 @@ static void visit_loop(isel_context *ctx, nir_loop *loop) end_loop(ctx, &lc); } -static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond) +static void +begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) { ic->cond = cond; @@ -9953,7 +10221,8 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond /* branch to linear then block */ assert(cond.regClass() == ctx->program->lane_mask); aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 1)); + branch.reset(create_instruction(aco_opcode::p_cbranch_z, + Format::PSEUDO_BRANCH, 1, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); branch->operands[0] = Operand(cond); @@ -9978,7 +10247,6 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond ctx->cf_info.exec_potentially_empty_break = false; ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; - /** emit logical then block */ ctx->program->next_divergent_if_logical_depth++; Block* BB_then_logical = ctx->program->create_and_insert_block(); @@ -9987,13 +10255,15 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond append_logical_start(BB_then_logical); } -static void begin_divergent_if_else(isel_context *ctx, if_context *ic) +static void +begin_divergent_if_else(isel_context* ctx, if_context* ic) { - Block *BB_then_logical = ctx->block; + Block* BB_then_logical = ctx->block; append_logical_end(BB_then_logical); - /* branch from logical then block to invert block */ + /* branch from logical then block to invert block */ aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_then_logical->instructions.emplace_back(std::move(branch)); @@ -10011,33 +10281,33 @@ static void begin_divergent_if_else(isel_context *ctx, if_context *ic) BB_then_linear->kind |= block_kind_uniform; add_linear_edge(ic->BB_if_idx, BB_then_linear); /* branch from linear then block to invert block */ - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_then_linear->instructions.emplace_back(std::move(branch)); add_linear_edge(BB_then_linear->index, &ic->BB_invert); - /** emit invert merge block */ ctx->block = ctx->program->insert_block(std::move(ic->BB_invert)); ic->invert_idx = ctx->block->index; /* branch to linear else block (skip else) */ - 
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); ctx->block->instructions.push_back(std::move(branch)); ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard; ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break; - ic->exec_potentially_empty_break_depth_old = - std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); + ic->exec_potentially_empty_break_depth_old = std::min( + ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); /* divergent branches use cbranch_execz */ ctx->cf_info.exec_potentially_empty_discard = false; ctx->cf_info.exec_potentially_empty_break = false; ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; - /** emit logical else block */ ctx->program->next_divergent_if_logical_depth++; Block* BB_else_logical = ctx->program->create_and_insert_block(); @@ -10047,14 +10317,16 @@ static void begin_divergent_if_else(isel_context *ctx, if_context *ic) append_logical_start(BB_else_logical); } -static void end_divergent_if(isel_context *ctx, if_context *ic) +static void +end_divergent_if(isel_context* ctx, if_context* ic) { - Block *BB_else_logical = ctx->block; + Block* BB_else_logical = ctx->block; append_logical_end(BB_else_logical); /* branch from logical else block to endif block */ aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_else_logical->instructions.emplace_back(std::move(branch)); @@ -10067,30 +10339,28 @@ static void end_divergent_if(isel_context *ctx, if_context *ic) assert(!ctx->cf_info.has_branch); ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; - /** emit linear else block */ Block* BB_else_linear = ctx->program->create_and_insert_block(); BB_else_linear->kind |= block_kind_uniform; add_linear_edge(ic->invert_idx, BB_else_linear); /* branch from linear else block to endif block */ - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_else_linear->instructions.emplace_back(std::move(branch)); add_linear_edge(BB_else_linear->index, &ic->BB_endif); - /** emit endif merge block */ ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); append_logical_start(ctx->block); - ctx->cf_info.parent_if.is_divergent = ic->divergent_old; ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old; ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old; - ctx->cf_info.exec_potentially_empty_break_depth = - std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); + ctx->cf_info.exec_potentially_empty_break_depth = std::min( + ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); if (ctx->block->loop_nest_depth == 
ctx->cf_info.exec_potentially_empty_break_depth && !ctx->cf_info.parent_if.is_divergent) { ctx->cf_info.exec_potentially_empty_break = false; @@ -10104,7 +10374,8 @@ static void end_divergent_if(isel_context *ctx, if_context *ic) } } -static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond) +static void +begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) { assert(cond.regClass() == s1); @@ -10113,7 +10384,8 @@ static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond) aco_ptr branch; aco_opcode branch_opcode = aco_opcode::p_cbranch_z; - branch.reset(create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 1)); + branch.reset( + create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); branch->operands[0] = Operand(cond); @@ -10127,7 +10399,6 @@ static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond) ctx->cf_info.has_branch = false; ctx->cf_info.parent_loop.has_divergent_branch = false; - /** emit then block */ ctx->program->next_uniform_if_depth++; Block* BB_then = ctx->program->create_and_insert_block(); @@ -10136,9 +10407,10 @@ static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond) ctx->block = BB_then; } -static void begin_uniform_if_else(isel_context *ctx, if_context *ic) +static void +begin_uniform_if_else(isel_context* ctx, if_context* ic) { - Block *BB_then = ctx->block; + Block* BB_then = ctx->block; ic->uniform_has_then_branch = ctx->cf_info.has_branch; ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; @@ -10147,7 +10419,8 @@ static void begin_uniform_if_else(isel_context *ctx, if_context *ic) append_logical_end(BB_then); /* branch from then block to endif block */ aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_then->instructions.emplace_back(std::move(branch)); @@ -10167,15 +10440,17 @@ static void begin_uniform_if_else(isel_context *ctx, if_context *ic) ctx->block = BB_else; } -static void end_uniform_if(isel_context *ctx, if_context *ic) +static void +end_uniform_if(isel_context* ctx, if_context* ic) { - Block *BB_else = ctx->block; + Block* BB_else = ctx->block; if (!ctx->cf_info.has_branch) { append_logical_end(BB_else); /* branch from then block to endif block */ aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_else->instructions.emplace_back(std::move(branch)); @@ -10188,7 +10463,6 @@ static void end_uniform_if(isel_context *ctx, if_context *ic) ctx->cf_info.has_branch &= ic->uniform_has_then_branch; ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; - /** emit endif merge block */ ctx->program->next_uniform_if_depth--; if (!ctx->cf_info.has_branch) { @@ -10197,7 +10471,8 @@ static void end_uniform_if(isel_context *ctx, if_context *ic) } } -static bool visit_if(isel_context *ctx, nir_if *if_stmt) +static bool +visit_if(isel_context* ctx, nir_if* if_stmt) { Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa); 
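The divergent-if helpers above build a then/invert/else/endif block structure so each side of the branch runs under a restricted exec mask, with the invert block handing the remaining lanes to the else side. A conceptual C++ model of that lane bookkeeping, with made-up mask values; it is a sketch of the idea, not the emitted ACO code:

#include <cstdint>
#include <cstdio>

int main()
{
   uint64_t exec = 0xffffull; /* lanes active when the divergent if is reached */
   uint64_t cond = 0x0f0full; /* per-lane condition (lane mask) */

   uint64_t exec_then = exec & cond;  /* then side runs with only the true lanes */
   /* ... logical + linear then blocks execute here ... */

   uint64_t exec_else = exec & ~cond; /* invert block: remaining lanes take the else side */
   /* ... logical + linear else blocks execute here ... */

   uint64_t exec_endif = exec_then | exec_else; /* endif restores the original mask */
   printf("then=%#llx else=%#llx endif=%#llx\n", (unsigned long long)exec_then,
          (unsigned long long)exec_else, (unsigned long long)exec_endif);
   return 0;
}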
Builder bld(ctx->program, ctx->block); @@ -10269,41 +10544,38 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty(); } -static bool visit_cf_list(isel_context *ctx, - struct exec_list *list) +static bool +visit_cf_list(isel_context* ctx, struct exec_list* list) { - foreach_list_typed(nir_cf_node, node, node, list) { + foreach_list_typed (nir_cf_node, node, node, list) { switch (node->type) { - case nir_cf_node_block: - visit_block(ctx, nir_cf_node_as_block(node)); - break; + case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break; case nir_cf_node_if: if (!visit_if(ctx, nir_cf_node_as_if(node))) return true; break; - case nir_cf_node_loop: - visit_loop(ctx, nir_cf_node_as_loop(node)); - break; - default: - unreachable("unimplemented cf list type"); + case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break; + default: unreachable("unimplemented cf list type"); } } return false; } -static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos) +static void +export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos) { assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG); int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) - ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot] - : ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; + ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot] + : ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; unsigned mask = ctx->outputs.mask[slot]; if (!is_pos && !mask) return; if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED) return; - aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; + aco_ptr exp{ + create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; exp->enabled_mask = mask; for (unsigned i = 0; i < 4; ++i) { if (mask & (1 << i)) @@ -10324,9 +10596,11 @@ static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *nex ctx->block->instructions.emplace_back(std::move(exp)); } -static void export_vs_psiz_layer_viewport_vrs(isel_context *ctx, int *next_pos) +static void +export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos) { - aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; + aco_ptr exp{ + create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; exp->enabled_mask = 0; for (unsigned i = 0; i < 4; ++i) exp->operands[i] = Operand(v1); @@ -10374,11 +10648,10 @@ static void export_vs_psiz_layer_viewport_vrs(isel_context *ctx, int *next_pos) Temp rates = bld.copy(bld.def(v1), Operand((unsigned)ctx->options->force_vrs_rates)); /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. 
*/ - Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), - Operand(0x3f800000u), + Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand(0x3f800000u), Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3])); - rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - bld.copy(bld.def(v1), Operand(0u)), rates, cond); + rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), + rates, cond); exp->operands[1] = Operand(rates); exp->enabled_mask |= 0x2; @@ -10391,27 +10664,31 @@ static void export_vs_psiz_layer_viewport_vrs(isel_context *ctx, int *next_pos) ctx->block->instructions.emplace_back(std::move(exp)); } -static void create_vs_exports(isel_context *ctx) +static void +create_vs_exports(isel_context* ctx) { assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG); - radv_vs_output_info *outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) - ? &ctx->program->info->tes.outinfo - : &ctx->program->info->vs.outinfo; + radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) + ? &ctx->program->info->tes.outinfo + : &ctx->program->info->vs.outinfo; ctx->block->kind |= block_kind_export_end; if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) { ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1; if (ctx->stage.has(SWStage::TES)) - ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->ac.tes_patch_id); + ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = + get_arg(ctx, ctx->args->ac.tes_patch_id); else - ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->ac.vs_prim_id); + ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = + get_arg(ctx, ctx->args->ac.vs_prim_id); } if (ctx->options->key.has_multiview_view_index) { ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1; - ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index)); + ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = + as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index)); } /* Hardware requires position data to always be exported, even if the @@ -10423,8 +10700,8 @@ static void create_vs_exports(isel_context *ctx) int next_pos = 0; export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); - bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate || - ctx->options->force_vrs_rates; + bool writes_primitive_shading_rate = + outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates; if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index || writes_primitive_shading_rate) { export_vs_psiz_layer_viewport_vrs(ctx, &next_pos); @@ -10442,9 +10719,7 @@ static void create_vs_exports(isel_context *ctx) } for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { - if (i < VARYING_SLOT_VAR0 && - i != VARYING_SLOT_LAYER && - i != VARYING_SLOT_PRIMITIVE_ID && + if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID && i != VARYING_SLOT_VIEWPORT) continue; @@ -10452,7 +10727,8 @@ static void create_vs_exports(isel_context *ctx) } } -static bool export_fs_mrt_z(isel_context *ctx) +static bool +export_fs_mrt_z(isel_context* ctx) { Builder bld(ctx->program, ctx->block); unsigned enabled_channels = 0; @@ -10465,8 +10741,7 @@ static bool export_fs_mrt_z(isel_context *ctx) /* Both stencil and sample mask only need 16-bits. 
*/ if (!ctx->program->info->ps.writes_z && - (ctx->program->info->ps.writes_stencil || - ctx->program->info->ps.writes_sample_mask)) { + (ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) { compr = true; /* COMPR flag */ if (ctx->program->info->ps.writes_stencil) { @@ -10480,7 +10755,7 @@ static bool export_fs_mrt_z(isel_context *ctx) /* SampleMask should be in Y[15:0]. */ values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]); enabled_channels |= 0xc; - } + } } else { if (ctx->program->info->ps.writes_z) { values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]); @@ -10501,19 +10776,19 @@ static bool export_fs_mrt_z(isel_context *ctx) /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X * writemask component. */ - if (ctx->options->chip_class == GFX6 && - ctx->options->family != CHIP_OLAND && + if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND && ctx->options->family != CHIP_HAINAN) { - enabled_channels |= 0x1; + enabled_channels |= 0x1; } - bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], - enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr); + bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, + V_008DFC_SQ_EXP_MRTZ, compr); return true; } -static bool export_fs_mrt_color(isel_context *ctx, int slot) +static bool +export_fs_mrt_color(isel_context* ctx, int slot) { Builder bld(ctx->program, ctx->block); unsigned write_mask = ctx->outputs.mask[slot]; @@ -10541,34 +10816,25 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) bool is_16bit = values[0].regClass() == v2b; /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ - if (ctx->options->enable_mrt_output_nan_fixup && - !is_16bit && - (col_format == V_028714_SPI_SHADER_32_R || - col_format == V_028714_SPI_SHADER_32_GR || - col_format == V_028714_SPI_SHADER_32_AR || - col_format == V_028714_SPI_SHADER_32_ABGR || + if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit && + (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR || + col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR || col_format == V_028714_SPI_SHADER_FP16_ABGR)) { for (int i = 0; i < 4; i++) { if (!(write_mask & (1 << i))) continue; - Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, - bld.hint_vcc(bld.def(bld.lm)), values[i], - bld.copy(bld.def(v1), Operand(3u))); + Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), + values[i], bld.copy(bld.def(v1), Operand(3u))); values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i], bld.copy(bld.def(v1), Operand(0u)), isnan); } } - switch (col_format) - { - case V_028714_SPI_SHADER_32_R: - enabled_channels = 1; - break; + switch (col_format) { + case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break; - case V_028714_SPI_SHADER_32_GR: - enabled_channels = 0x3; - break; + case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break; case V_028714_SPI_SHADER_32_AR: if (ctx->options->chip_class >= GFX10) { @@ -10583,21 +10849,24 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) case V_028714_SPI_SHADER_FP16_ABGR: for (int i = 0; i < 2; i++) { - bool enabled = (write_mask >> (i*2)) & 0x3; + bool enabled = (write_mask >> (i * 2)) & 0x3; if (enabled) { - enabled_channels |= 0x3 << (i*2); + enabled_channels |= 0x3 << (i * 2); if (is_16bit) { - values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), - 
values[i*2].isUndefined() ? Operand(v2b) : values[i*2], - values[i*2+1].isUndefined() ? Operand(v2b): values[i*2+1]); - } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9 ) { - values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), - values[i*2].isUndefined() ? Operand(0u) : values[i*2], - values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]); + values[i] = + bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), + values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2], + values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]); + } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) { + values[i] = + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), + values[i * 2].isUndefined() ? Operand(0u) : values[i * 2], + values[i * 2 + 1].isUndefined() ? Operand(0u) : values[i * 2 + 1]); } else { - values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), - values[i*2].isUndefined() ? values[i*2+1] : values[i*2], - values[i*2+1].isUndefined() ? values[i*2] : values[i*2+1]); + values[i] = + bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), + values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2], + values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]); } } else { values[i] = Operand(v1); @@ -10633,9 +10902,9 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) for (unsigned i = 0; i < 4; i++) { if ((write_mask >> i) & 1) { - values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), - i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val), - values[i]); + values[i] = + bld.vop2(aco_opcode::v_min_u32, bld.def(v1), + i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val), values[i]); } } } else if (is_16bit) { @@ -10654,18 +10923,18 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) if (is_int8 || is_int10) { /* clamp */ uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0; - uint32_t min_rgb = is_int8 ? -128 :is_int10 ? -512 : 0; + uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0; Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb)); Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb)); for (unsigned i = 0; i < 4; i++) { if ((write_mask >> i) & 1) { - values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), - i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val), - values[i]); - values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), - i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val), - values[i]); + values[i] = + bld.vop2(aco_opcode::v_min_i32, bld.def(v1), + i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val), values[i]); + values[i] = + bld.vop2(aco_opcode::v_max_i32, bld.def(v1), + i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val), values[i]); } } } else if (is_16bit) { @@ -10678,24 +10947,21 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) } break; - case V_028714_SPI_SHADER_32_ABGR: - enabled_channels = 0xF; - break; + case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break; case V_028714_SPI_SHADER_ZERO: - default: - return false; + default: return false; } - if ((bool) compr_op) { + if ((bool)compr_op) { for (int i = 0; i < 2; i++) { /* check if at least one of the values to be compressed is enabled */ - bool enabled = (write_mask >> (i*2)) & 0x3; + bool enabled = (write_mask >> (i * 2)) & 0x3; if (enabled) { - enabled_channels |= 0x3 << (i*2); + enabled_channels |= 0x3 << (i * 2); values[i] = bld.vop3(compr_op, bld.def(v1), - values[i*2].isUndefined() ? 
Operand(0u) : values[i*2], - values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]); + values[i * 2].isUndefined() ? Operand(0u) : values[i * 2], + values[i * 2 + 1].isUndefined() ? Operand(0u) : values[i * 2 + 1]); } else { values[i] = Operand(v1); } @@ -10708,12 +10974,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1); } - bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], - enabled_channels, target, compr); + bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target, + compr); return true; } -static void create_fs_null_export(isel_context *ctx) +static void +create_fs_null_export(isel_context* ctx) { /* FS must always have exports. * So when there are none, we need to add a null export. @@ -10725,13 +10992,13 @@ static void create_fs_null_export(isel_context *ctx) /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true); } -static void create_fs_exports(isel_context *ctx) +static void +create_fs_exports(isel_context* ctx) { bool exported = false; /* Export depth, stencil and sample mask. */ - if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || - ctx->outputs.mask[FRAG_RESULT_STENCIL] || + if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] || ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK]) exported |= export_fs_mrt_z(ctx); @@ -10746,17 +11013,16 @@ static void create_fs_exports(isel_context *ctx) ctx->block->kind |= block_kind_export_end; } -static void create_workgroup_barrier(Builder& bld) +static void +create_workgroup_barrier(Builder& bld) { bld.barrier(aco_opcode::p_barrier, - memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), - scope_workgroup); + memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup); } -static void emit_stream_output(isel_context *ctx, - Temp const *so_buffers, - Temp const *so_write_offset, - const struct radv_stream_output *output) +static void +emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset, + const struct radv_stream_output* output) { unsigned num_comps = util_bitcount(output->component_mask); unsigned writemask = (1 << num_comps) - 1; @@ -10791,39 +11057,34 @@ static void emit_stream_output(isel_context *ctx, unsigned offset = output->offset + start * 4; Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count)); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; for (int i = 0; i < count; ++i) - vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u); + vec->operands[i] = + (ctx->outputs.mask[loc] & 1 << (start + i)) ? 
Operand(out[start + i]) : Operand(0u); vec->definitions[0] = Definition(write_data); ctx->block->instructions.emplace_back(std::move(vec)); aco_opcode opcode; switch (count) { - case 1: - opcode = aco_opcode::buffer_store_dword; - break; - case 2: - opcode = aco_opcode::buffer_store_dwordx2; - break; - case 3: - opcode = aco_opcode::buffer_store_dwordx3; - break; - case 4: - opcode = aco_opcode::buffer_store_dwordx4; - break; - default: - unreachable("Unsupported dword count."); + case 1: opcode = aco_opcode::buffer_store_dword; break; + case 2: opcode = aco_opcode::buffer_store_dwordx2; break; + case 3: opcode = aco_opcode::buffer_store_dwordx3; break; + case 4: opcode = aco_opcode::buffer_store_dwordx4; break; + default: unreachable("Unsupported dword count."); } - aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; + aco_ptr store{ + create_instruction(opcode, Format::MUBUF, 4, 0)}; store->operands[0] = Operand(so_buffers[buf]); store->operands[1] = Operand(so_write_offset[buf]); - store->operands[2] = Operand((uint32_t) 0); + store->operands[2] = Operand((uint32_t)0); store->operands[3] = Operand(write_data); if (offset > 4095) { /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */ Builder bld(ctx->program, ctx->block); - store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf])); + store->operands[0] = + bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf])); } else { store->offset = offset; } @@ -10835,7 +11096,8 @@ static void emit_stream_output(isel_context *ctx, } } -static void emit_streamout(isel_context *ctx, unsigned stream) +static void +emit_streamout(isel_context* ctx, unsigned stream) { Builder bld(ctx->program, ctx->block); @@ -10862,7 +11124,8 @@ static void emit_streamout(isel_context *ctx, unsigned stream) bld.reset(ctx->block); - Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid); + Temp so_write_index = + bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid); Temp so_write_offset[4]; @@ -10877,7 +11140,8 @@ static void emit_streamout(isel_context *ctx, unsigned stream) get_arg(ctx, ctx->args->ac.streamout_offset[i])); Temp new_offset = bld.vadd32(bld.def(v1), offset, tid); - so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset); + so_write_offset[i] = + bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset); } else { Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u); Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), @@ -10887,8 +11151,7 @@ static void emit_streamout(isel_context *ctx, unsigned stream) } for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) { - struct radv_stream_output *output = - &ctx->program->info->so.outputs[i]; + struct radv_stream_output* output = &ctx->program->info->so.outputs[i]; if (stream != output->stream) continue; @@ -10899,7 +11162,8 @@ static void emit_streamout(isel_context *ctx, unsigned stream) end_divergent_if(ctx, &ic); } -Pseudo_instruction *add_startpgm(struct isel_context *ctx) +Pseudo_instruction* +add_startpgm(struct isel_context* ctx) { unsigned arg_count = ctx->args->ac.arg_count; if (ctx->stage == fragment_fs) { @@ -10911,7 +11175,7 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) * could run before argument setup, then this wouldn't be necessary * anymore. 
*/ - struct ac_shader_args *args = &ctx->args->ac; + struct ac_shader_args* args = &ctx->args->ac; arg_count = 0; for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) { if (args->args[i].file != AC_ARG_VGPR) { @@ -10930,7 +11194,8 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) } } - aco_ptr startpgm{create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count)}; + aco_ptr startpgm{ + create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count)}; for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) { if (ctx->args->ac.args[i].skip) continue; @@ -10945,7 +11210,7 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256}); arg++; } - Pseudo_instruction *instr = startpgm.get(); + Pseudo_instruction* instr = startpgm.get(); ctx->block->instructions.push_back(std::move(startpgm)); /* Stash these in the program so that they can be accessed later when @@ -10957,37 +11222,36 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) return instr; } -void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm) +void +fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm) { assert(ctx->shader->info.stage == MESA_SHADER_VERTEX); Builder bld(ctx->program, ctx->block); constexpr unsigned hs_idx = 1u; - Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), - Operand((8u << 16) | (hs_idx * 8u))); + Builder::Result hs_thread_count = + bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->ac.merged_wave_info), Operand((8u << 16) | (hs_idx * 8u))); Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp()); /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. 
*/ - Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - get_arg(ctx, ctx->args->ac.vertex_id), - get_arg(ctx, ctx->args->ac.instance_id), - ls_has_nonzero_hs_threads); - Temp vs_rel_patch_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - get_arg(ctx, ctx->args->ac.tcs_rel_ids), - get_arg(ctx, ctx->args->ac.vs_rel_patch_id), - ls_has_nonzero_hs_threads); - Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - get_arg(ctx, ctx->args->ac.tcs_patch_id), - get_arg(ctx, ctx->args->ac.vertex_id), - ls_has_nonzero_hs_threads); + Temp instance_id = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id), + get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads); + Temp vs_rel_patch_id = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids), + get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads); + Temp vertex_id = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id), + get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads); ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id; ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id; ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id; } -void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm) +void +split_arguments(isel_context* ctx, Pseudo_instruction* startpgm) { /* Split all arguments except for the first (ring_offsets) and the last * (exec) so that the dead channels don't stay live throughout the program. @@ -11000,13 +11264,16 @@ void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm) } } -void handle_bc_optimize(isel_context *ctx) +void +handle_bc_optimize(isel_context* ctx) { /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */ Builder bld(ctx->program, ctx->block); uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena; - bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); - bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); + bool uses_center = + G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); + bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); if (uses_center && uses_centroid) { @@ -11016,10 +11283,12 @@ void handle_bc_optimize(isel_context *ctx) if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) { Temp new_coord[2]; for (unsigned i = 0; i < 2; i++) { - Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1); - Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1); - new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - persp_centroid, persp_center, sel); + Temp persp_centroid = + emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1); + Temp persp_center = + emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1); + new_coord[i] = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel); } ctx->persp_centroid = bld.tmp(v2); bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid), @@ -11030,10 +11299,12 @@ void 
handle_bc_optimize(isel_context *ctx) if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) { Temp new_coord[2]; for (unsigned i = 0; i < 2; i++) { - Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1); - Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1); - new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - linear_centroid, linear_center, sel); + Temp linear_centroid = + emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1); + Temp linear_center = + emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1); + new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid, + linear_center, sel); } ctx->linear_centroid = bld.tmp(v2); bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid), @@ -11043,9 +11314,10 @@ void handle_bc_optimize(isel_context *ctx) } } -void setup_fp_mode(isel_context *ctx, nir_shader *shader) +void +setup_fp_mode(isel_context* ctx, nir_shader* shader) { - Program *program = ctx->program; + Program* program = ctx->program; unsigned float_controls = shader->info.float_controls_execution_mode; @@ -11058,15 +11330,17 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) program->next_fp_mode.must_flush_denorms32 = float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32; program->next_fp_mode.must_flush_denorms16_64 = - float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | - FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64); + float_controls & + (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64); program->next_fp_mode.care_about_round32 = - float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32); + float_controls & + (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32); program->next_fp_mode.care_about_round16_64 = - float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | - FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); + float_controls & + (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and * the precision seems needed for Wolfenstein: Youngblood to render correctly */ @@ -11086,7 +11360,8 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) else program->next_fp_mode.round32 = fp_round_ne; - if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) + if (float_controls & + (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) program->next_fp_mode.round16_64 = fp_round_tz; else program->next_fp_mode.round16_64 = fp_round_ne; @@ -11094,7 +11369,8 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) ctx->block->fp_mode = program->next_fp_mode; } -void cleanup_cfg(Program *program) +void +cleanup_cfg(Program* program) { /* create linear_succs/logical_succs */ for (Block& BB : program->blocks) { @@ -11105,7 +11381,8 @@ void cleanup_cfg(Program *program) } } -Temp lanecount_to_mask(isel_context *ctx, Temp count, bool allow64 = true) +Temp +lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true) { assert(count.regClass() == s1); @@ -11119,30 +11396,33 @@ Temp lanecount_to_mask(isel_context *ctx, Temp count, bool allow64 = true) 
return mask; /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */ - Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */)); + Temp active_64 = + bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */)); cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64)); } else { - /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of the register */ + /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of + * the register */ cond = emit_extract_vector(ctx, mask, 0, bld.lm); } return cond; } -Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i) +Temp +merged_wave_info_to_mask(isel_context* ctx, unsigned i) { Builder bld(ctx->program, ctx->block); /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */ - Temp count = i == 0 - ? get_arg(ctx, ctx->args->ac.merged_wave_info) - : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(i * 8u)); + Temp count = i == 0 ? get_arg(ctx, ctx->args->ac.merged_wave_info) + : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(i * 8u)); return lanecount_to_mask(ctx, count); } -void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt) +void +ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt) { assert(vtx_cnt.id() && prm_cnt.id()); @@ -11152,25 +11432,31 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS)) { /* Navi 1x workaround: make sure to always export at least 1 vertex and triangle */ prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand(0u)); - prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), prm_cnt, bld.scc(prm_cnt_0)); - vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), vtx_cnt, bld.scc(prm_cnt_0)); + prm_cnt = + bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), prm_cnt, bld.scc(prm_cnt_0)); + vtx_cnt = + bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), vtx_cnt, bld.scc(prm_cnt_0)); } /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */ - Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u)); + Temp tmp = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u)); tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt); - /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */ + /* Request the SPI to allocate space for the primitives and vertices + * that will be exported by the threadgroup. */ bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req); if (prm_cnt_0.id()) { /* Navi 1x workaround: export a triangle with NaN coordinates when GS has no output. - * It can't have all-zero positions because that would render an undesired pixel with conservative rasterization. + * It can't have all-zero positions because that would render an undesired pixel with + * conservative rasterization. 
*/ Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)); Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand(1u, ctx->program->wave_size == 64), first_lane); - cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond, Operand(0u, ctx->program->wave_size == 64), bld.scc(prm_cnt_0)); + cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond, + Operand(0u, ctx->program->wave_size == 64), bld.scc(prm_cnt_0)); if_context ic_prim_0; begin_divergent_if_then(ctx, &ic_prim_0, cond); @@ -11182,12 +11468,12 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt /* Use NaN for the coordinates, so that the rasterizer allways culls it. */ Temp nan_coord = bld.copy(bld.def(v1), Operand(-1u)); - bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), - 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, - false /* compressed */, true /* done */, false /* valid mask */); - bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, - 0xf /* enabled mask */, V_008DFC_SQ_EXP_POS /* dest */, - false /* compressed */, true /* done */, true /* valid mask */); + bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */, + V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */, + false /* valid mask */); + bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */, + V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */, + true /* valid mask */); begin_divergent_if_else(ctx, &ic_prim_0); end_divergent_if(ctx, &ic_prim_0); @@ -11197,25 +11483,23 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt } /* end namespace */ -void select_program(Program *program, - unsigned shader_count, - struct nir_shader *const *shaders, - ac_shader_config* config, - struct radv_shader_args *args) +void +select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, + ac_shader_config* config, struct radv_shader_args* args) { isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false); if_context ic_merged_wave_info; bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS); for (unsigned i = 0; i < shader_count; i++) { - nir_shader *nir = shaders[i]; + nir_shader* nir = shaders[i]; init_context(&ctx, nir); setup_fp_mode(&ctx, nir); if (!i) { /* needs to be after init_context() for FS */ - Pseudo_instruction *startpgm = add_startpgm(&ctx); + Pseudo_instruction* startpgm = add_startpgm(&ctx); append_logical_start(ctx.block); if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs)) @@ -11229,20 +11513,22 @@ void select_program(Program *program, } /* In a merged VS+TCS HS, the VS implementation can be completely empty. 
*/ - nir_function_impl *func = nir_shader_get_entrypoint(nir); - bool empty_shader = nir_cf_list_is_empty_block(&func->body) && - ((nir->info.stage == MESA_SHADER_VERTEX && - (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) || - (nir->info.stage == MESA_SHADER_TESS_EVAL && - ctx.stage == tess_eval_geometry_gs)); + nir_function_impl* func = nir_shader_get_entrypoint(nir); + bool empty_shader = + nir_cf_list_is_empty_block(&func->body) && + ((nir->info.stage == MESA_SHADER_VERTEX && + (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) || + (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs)); - bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1)); - bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1)); + bool check_merged_wave_info = + ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1)); + bool endif_merged_wave_info = + ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1)); - if (program->chip_class == GFX10 && - program->stage.hw == HWStage::NGG && + if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG && program->stage.num_sw_stages() == 1) { - /* Workaround for Navi 1x HW bug to ensure all NGG waves launch before s_sendmsg(GS_ALLOC_REQ). */ + /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before + * s_sendmsg(GS_ALLOC_REQ). */ Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u); } @@ -11263,7 +11549,8 @@ void select_program(Program *program, if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) { ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc), - get_arg(&ctx, args->ac.merged_wave_info), Operand(2u), Operand(8u), Operand(0u)); + get_arg(&ctx, args->ac.merged_wave_info), Operand(2u), + Operand(8u), Operand(0u)); } } else if (ctx.stage == geometry_gs) ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id); @@ -11282,7 +11569,8 @@ void select_program(Program *program, Builder bld(ctx.program, ctx.block); bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_vmem_output, semantic_release, scope_device)); - bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0)); + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, + sendmsg_gs_done(false, false, 0)); } if (ctx.stage == fragment_fs) { @@ -11313,9 +11601,9 @@ void select_program(Program *program, cleanup_cfg(program); } -void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, - ac_shader_config* config, - struct radv_shader_args *args) +void +select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config, + struct radv_shader_args* args) { isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true); @@ -11326,14 +11614,16 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, Builder bld(ctx.program, ctx.block); - Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u)); + Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), + program->private_segment_buffer, Operand(RING_GSVS_VS * 16u)); Operand stream_id(0u); if (args->shader_info->so.num_outputs) stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(&ctx, 
ctx.args->ac.streamout_config), Operand(0x20018u)); - Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id)); + Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), + get_arg(&ctx, ctx.args->ac.vertex_id)); std::stack if_contexts; @@ -11348,7 +11638,8 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask)); if (!stream_id.isConstant()) { - Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream)); + Temp cond = + bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream)); if_contexts.emplace(); begin_uniform_if_then(&ctx, &if_contexts.top(), cond); bld.reset(ctx.block); @@ -11367,8 +11658,8 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, Temp val = bld.tmp(v1); unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4; - load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, - 0u, true, true, true); + load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true, + true, true); ctx.outputs.mask[i] |= 1 << j; ctx.outputs.temps[i * 4u + j] = val; @@ -11407,14 +11698,14 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, cleanup_cfg(program); } -void select_trap_handler_shader(Program *program, struct nir_shader *shader, - ac_shader_config* config, - struct radv_shader_args *args) +void +select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config, + struct radv_shader_args* args) { assert(args->options->chip_class == GFX8); - init_program(program, compute_cs, args->shader_info, - args->options->chip_class, args->options->family, args->options->wgp_mode, config); + init_program(program, compute_cs, args->shader_info, args->options->chip_class, + args->options->family, args->options->wgp_mode, config); isel_context ctx = {}; ctx.program = program; @@ -11433,12 +11724,12 @@ void select_trap_handler_shader(Program *program, struct nir_shader *shader, Builder bld(ctx.program, ctx.block); /* Load the buffer descriptor from TMA. */ - bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), - Operand(PhysReg{tma}, s2), Operand(0u)); + bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2), + Operand(0u)); /* Store TTMP0-TTMP1. 
*/ - bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), - Operand(0u), Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true); + bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand(0u), + Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true); uint32_t hw_regs_idx[] = { 2, /* HW_REG_STATUS */ @@ -11453,8 +11744,8 @@ void select_trap_handler_shader(Program *program, struct nir_shader *shader, bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1), ((20 - 1) << 11) | hw_regs_idx[i]); - bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4), - Operand(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true); + bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4), Operand(8u + i * 4), + Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true); } program->config->float_mode = program->blocks[0].fp_mode.val; @@ -11465,4 +11756,4 @@ void select_trap_handler_shader(Program *program, struct nir_shader *shader, cleanup_cfg(program); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h index 50d1a76efd3..43dd76a31aa 100644 --- a/src/amd/compiler/aco_instruction_selection.h +++ b/src/amd/compiler/aco_instruction_selection.h @@ -39,21 +39,22 @@ struct shader_io_state { uint8_t mask[VARYING_SLOT_MAX]; Temp temps[VARYING_SLOT_MAX * 4u]; - shader_io_state() { + shader_io_state() + { memset(mask, 0, sizeof(mask)); std::fill_n(temps, VARYING_SLOT_MAX * 4u, Temp(0, RegClass::v1)); } }; struct isel_context { - const struct radv_nir_compiler_options *options; - struct radv_shader_args *args; - Program *program; - nir_shader *shader; + const struct radv_nir_compiler_options* options; + struct radv_shader_args* args; + Program* program; + nir_shader* shader; uint32_t constant_data_offset; - Block *block; + Block* block; uint32_t first_temp_id; - std::unordered_map> allocated_vec; + std::unordered_map> allocated_vec; Stage stage; struct { bool has_branch; @@ -66,7 +67,8 @@ struct isel_context { struct { bool is_divergent = false; } parent_if; - bool exec_potentially_empty_discard = false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */ + bool exec_potentially_empty_discard = + false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */ uint16_t exec_potentially_empty_break_depth = UINT16_MAX; /* Set to false when loop_nest_depth==exec_potentially_empty_break_depth * and parent_if.is_divergent==false. Called _break but it's also used for @@ -76,7 +78,7 @@ struct isel_context { } cf_info; /* NIR range analysis. 
*/ - struct hash_table *range_ht; + struct hash_table* range_ht; nir_unsigned_upper_bound_config ub_config; Temp arg_temps[AC_MAX_ARGS]; @@ -102,22 +104,19 @@ struct isel_context { shader_io_state outputs; }; -inline Temp get_arg(isel_context *ctx, struct ac_arg arg) +inline Temp +get_arg(isel_context* ctx, struct ac_arg arg) { assert(arg.used); return ctx->arg_temps[arg.arg_index]; } -void init_context(isel_context *ctx, nir_shader *shader); -void cleanup_context(isel_context *ctx); +void init_context(isel_context* ctx, nir_shader* shader); +void cleanup_context(isel_context* ctx); -isel_context -setup_isel_context(Program* program, - unsigned shader_count, - struct nir_shader *const *shaders, - ac_shader_config* config, - struct radv_shader_args *args, - bool is_gs_copy_shader); +isel_context setup_isel_context(Program* program, unsigned shader_count, + struct nir_shader* const* shaders, ac_shader_config* config, + struct radv_shader_args* args, bool is_gs_copy_shader); } // namespace aco diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 191c8e86cce..430f9f62530 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -36,7 +36,8 @@ namespace aco { namespace { -unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) +unsigned +get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) { switch (interp) { case INTERP_MODE_SMOOTH: @@ -58,13 +59,13 @@ unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) else if (intrin == nir_intrinsic_load_barycentric_sample) return S_0286CC_LINEAR_SAMPLE_ENA(1); break; - default: - break; + default: break; } return 0; } -bool is_loop_header_block(nir_block *block) +bool +is_loop_header_block(nir_block* block) { return block->cf_node.parent->type == nir_cf_node_loop && block == nir_loop_first_block(nir_cf_node_as_loop(block->cf_node.parent)); @@ -72,20 +73,20 @@ bool is_loop_header_block(nir_block *block) /* similar to nir_block_is_unreachable(), but does not require dominance information */ bool -is_block_reachable(nir_function_impl *impl, nir_block *known_reachable, nir_block *block) +is_block_reachable(nir_function_impl* impl, nir_block* known_reachable, nir_block* block) { if (block == nir_start_block(impl) || block == known_reachable) return true; /* skip loop back-edges */ if (is_loop_header_block(block)) { - nir_loop *loop = nir_cf_node_as_loop(block->cf_node.parent); - nir_block *preheader = nir_block_cf_tree_prev(nir_loop_first_block(loop)); + nir_loop* loop = nir_cf_node_as_loop(block->cf_node.parent); + nir_block* preheader = nir_block_cf_tree_prev(nir_loop_first_block(loop)); return is_block_reachable(impl, known_reachable, preheader); } - set_foreach(block->predecessors, entry) { - if (is_block_reachable(impl, known_reachable, (nir_block *)entry->key)) + set_foreach (block->predecessors, entry) { + if (is_block_reachable(impl, known_reachable, (nir_block*)entry->key)) return true; } @@ -94,12 +95,12 @@ is_block_reachable(nir_function_impl *impl, nir_block *known_reachable, nir_bloc /* Check whether the given SSA def is only used by cross-lane instructions. 
*/ bool -only_used_by_cross_lane_instrs(nir_ssa_def *ssa, bool follow_phis = true) +only_used_by_cross_lane_instrs(nir_ssa_def* ssa, bool follow_phis = true) { nir_foreach_use (src, ssa) { switch (src->parent_instr->type) { case nir_instr_type_alu: { - nir_alu_instr *alu = nir_instr_as_alu(src->parent_instr); + nir_alu_instr* alu = nir_instr_as_alu(src->parent_instr); if (alu->op != nir_op_unpack_64_2x32_split_x && alu->op != nir_op_unpack_64_2x32_split_y) return false; if (!only_used_by_cross_lane_instrs(&alu->dest.dest.ssa, follow_phis)) @@ -108,7 +109,7 @@ only_used_by_cross_lane_instrs(nir_ssa_def *ssa, bool follow_phis = true) continue; } case nir_instr_type_intrinsic: { - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr); + nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(src->parent_instr); if (intrin->intrinsic != nir_intrinsic_read_invocation && intrin->intrinsic != nir_intrinsic_read_first_invocation && intrin->intrinsic != nir_intrinsic_lane_permute_16_amd) @@ -121,14 +122,13 @@ only_used_by_cross_lane_instrs(nir_ssa_def *ssa, bool follow_phis = true) if (!follow_phis) return false; - nir_phi_instr *phi = nir_instr_as_phi(src->parent_instr); + nir_phi_instr* phi = nir_instr_as_phi(src->parent_instr); if (!only_used_by_cross_lane_instrs(&phi->dest.ssa, false)) return false; continue; } - default: - return false; + default: return false; } } @@ -140,12 +140,12 @@ only_used_by_cross_lane_instrs(nir_ssa_def *ssa, bool follow_phis = true) * block instead. This is so that we can use any SGPR live-out of the side * without the branch without creating a linear phi in the invert or merge block. */ bool -sanitize_if(nir_function_impl *impl, nir_if *nif) +sanitize_if(nir_function_impl* impl, nir_if* nif) { - //TODO: skip this if the condition is uniform and there are no divergent breaks/continues? + // TODO: skip this if the condition is uniform and there are no divergent breaks/continues? - nir_block *then_block = nir_if_last_then_block(nif); - nir_block *else_block = nir_if_last_else_block(nif); + nir_block* then_block = nir_if_last_then_block(nif); + nir_block* else_block = nir_if_last_else_block(nif); bool then_jump = nir_block_ends_in_jump(then_block) || !is_block_reachable(impl, nir_if_first_then_block(nif), then_block); bool else_jump = nir_block_ends_in_jump(else_block) || @@ -167,47 +167,46 @@ sanitize_if(nir_function_impl *impl, nir_if *nif) nir_opt_remove_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node))); /* Finally, move the continue from branch after the if-statement. */ - nir_block *last_continue_from_blk = else_jump ? then_block : else_block; - nir_block *first_continue_from_blk = else_jump ? - nir_if_first_then_block(nif) : nir_if_first_else_block(nif); + nir_block* last_continue_from_blk = else_jump ? then_block : else_block; + nir_block* first_continue_from_blk = + else_jump ? 
nir_if_first_then_block(nif) : nir_if_first_else_block(nif); nir_cf_list tmp; nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk), - nir_after_block(last_continue_from_blk)); + nir_after_block(last_continue_from_blk)); nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node)); return true; } bool -sanitize_cf_list(nir_function_impl *impl, struct exec_list *cf_list) +sanitize_cf_list(nir_function_impl* impl, struct exec_list* cf_list) { bool progress = false; - foreach_list_typed(nir_cf_node, cf_node, node, cf_list) { + foreach_list_typed (nir_cf_node, cf_node, node, cf_list) { switch (cf_node->type) { - case nir_cf_node_block: - break; + case nir_cf_node_block: break; case nir_cf_node_if: { - nir_if *nif = nir_cf_node_as_if(cf_node); + nir_if* nif = nir_cf_node_as_if(cf_node); progress |= sanitize_cf_list(impl, &nif->then_list); progress |= sanitize_cf_list(impl, &nif->else_list); progress |= sanitize_if(impl, nif); break; } case nir_cf_node_loop: { - nir_loop *loop = nir_cf_node_as_loop(cf_node); + nir_loop* loop = nir_cf_node_as_loop(cf_node); progress |= sanitize_cf_list(impl, &loop->body); break; } - case nir_cf_node_function: - unreachable("Invalid cf type"); + case nir_cf_node_function: unreachable("Invalid cf type"); } } return progress; } -void apply_nuw_to_ssa(isel_context *ctx, nir_ssa_def *ssa) +void +apply_nuw_to_ssa(isel_context* ctx, nir_ssa_def* ssa) { nir_ssa_scalar scalar; scalar.def = ssa; @@ -216,7 +215,7 @@ void apply_nuw_to_ssa(isel_context *ctx, nir_ssa_def *ssa) if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != nir_op_iadd) return; - nir_alu_instr *add = nir_instr_as_alu(ssa->parent_instr); + nir_alu_instr* add = nir_instr_as_alu(ssa->parent_instr); if (add->no_unsigned_wrap) return; @@ -230,20 +229,19 @@ void apply_nuw_to_ssa(isel_context *ctx, nir_ssa_def *ssa) src1 = tmp; } - uint32_t src1_ub = nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, - src1, &ctx->ub_config); + uint32_t src1_ub = nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, src1, &ctx->ub_config); add->no_unsigned_wrap = - !nir_addition_might_overflow(ctx->shader, ctx->range_ht, src0, src1_ub, - &ctx->ub_config); + !nir_addition_might_overflow(ctx->shader, ctx->range_ht, src0, src1_ub, &ctx->ub_config); } -void apply_nuw_to_offsets(isel_context *ctx, nir_function_impl *impl) +void +apply_nuw_to_offsets(isel_context* ctx, nir_function_impl* impl) { - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { + nir_foreach_block (block, impl) { + nir_foreach_instr (instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_load_constant: @@ -261,14 +259,14 @@ void apply_nuw_to_offsets(isel_context *ctx, nir_function_impl *impl) if (!nir_src_is_divergent(intrin->src[2])) apply_nuw_to_ssa(ctx, intrin->src[2].ssa); break; - default: - break; + default: break; } } } } -RegClass get_reg_class(isel_context *ctx, RegType type, unsigned components, unsigned bitsize) +RegClass +get_reg_class(isel_context* ctx, RegType type, unsigned components, unsigned bitsize) { if (bitsize == 1) return RegClass(RegType::sgpr, ctx->program->lane_mask.size() * components); @@ -277,17 +275,16 @@ RegClass get_reg_class(isel_context *ctx, RegType type, unsigned components, uns } void -setup_vs_output_info(isel_context *ctx, nir_shader *nir, - bool export_prim_id, bool export_clip_dists, - 
radv_vs_output_info *outinfo) +setup_vs_output_info(isel_context* ctx, nir_shader* nir, bool export_prim_id, + bool export_clip_dists, radv_vs_output_info* outinfo) { memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, sizeof(outinfo->vs_output_param_offset)); outinfo->param_exports = 0; int pos_written = 0x1; - bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate || - ctx->options->force_vrs_rates; + bool writes_primitive_shading_rate = + outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates; if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer || writes_primitive_shading_rate) pos_written |= 1 << 1; @@ -297,7 +294,8 @@ setup_vs_output_info(isel_context *ctx, nir_shader *nir, int idx = u_bit_scan64(&mask); if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT || - ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) { + ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && + export_clip_dists)) { if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED) outinfo->vs_output_param_offset[idx] = outinfo->param_exports++; } @@ -333,15 +331,14 @@ setup_vs_output_info(isel_context *ctx, nir_shader *nir, * as soon as it encounters a DONE pos export. When this happens, PS waves can launch * before the NGG (or VS) waves finish. */ - ctx->program->early_rast = ctx->program->chip_class >= GFX10 && - outinfo->param_exports == 0; + ctx->program->early_rast = ctx->program->chip_class >= GFX10 && outinfo->param_exports == 0; } void -setup_vs_variables(isel_context *ctx, nir_shader *nir) +setup_vs_variables(isel_context* ctx, nir_shader* nir) { if (ctx->stage == vertex_vs || ctx->stage == vertex_ngg) { - radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; + radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo; setup_vs_output_info(ctx, nir, outinfo->export_prim_id, ctx->options->key.vs_common_out.export_clip_dists, outinfo); @@ -351,21 +348,26 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir) } if (ctx->stage == vertex_ngg) { - ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); - assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024)); + ctx->program->config->lds_size = + DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); + assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < + (32 * 1024)); } } -void setup_gs_variables(isel_context *ctx, nir_shader *nir) +void +setup_gs_variables(isel_context* ctx, nir_shader* nir) { if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) { - ctx->program->config->lds_size = ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */ + ctx->program->config->lds_size = + ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */ } else if (ctx->stage == vertex_geometry_ngg || ctx->stage == tess_eval_geometry_ngg) { - radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; - setup_vs_output_info(ctx, nir, false, - ctx->options->key.vs_common_out.export_clip_dists, outinfo); + radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo; + setup_vs_output_info(ctx, nir, false, ctx->options->key.vs_common_out.export_clip_dists, + outinfo); - ctx->program->config->lds_size = 
DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); + ctx->program->config->lds_size = + DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); } if (ctx->stage.has(SWStage::VS)) @@ -375,7 +377,7 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir) } void -setup_tcs_info(isel_context *ctx, nir_shader *nir, nir_shader *vs) +setup_tcs_info(isel_context* ctx, nir_shader* nir, nir_shader* vs) { ctx->tcs_in_out_eq = ctx->args->shader_info->vs.tcs_in_out_eq; ctx->tcs_temp_only_inputs = ctx->args->shader_info->vs.tcs_temp_only_input_mask; @@ -384,12 +386,12 @@ setup_tcs_info(isel_context *ctx, nir_shader *nir, nir_shader *vs) } void -setup_tes_variables(isel_context *ctx, nir_shader *nir) +setup_tes_variables(isel_context* ctx, nir_shader* nir) { ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches; if (ctx->stage == tess_eval_vs || ctx->stage == tess_eval_ngg) { - radv_vs_output_info *outinfo = &ctx->program->info->tes.outinfo; + radv_vs_output_info* outinfo = &ctx->program->info->tes.outinfo; setup_vs_output_info(ctx, nir, outinfo->export_prim_id, ctx->options->key.vs_common_out.export_clip_dists, outinfo); @@ -399,20 +401,23 @@ setup_tes_variables(isel_context *ctx, nir_shader *nir) } if (ctx->stage == tess_eval_ngg) { - ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); - assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024)); + ctx->program->config->lds_size = + DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); + assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < + (32 * 1024)); } } void -setup_variables(isel_context *ctx, nir_shader *nir) +setup_variables(isel_context* ctx, nir_shader* nir) { switch (nir->info.stage) { case MESA_SHADER_FRAGMENT: { break; } case MESA_SHADER_COMPUTE: { - ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); + ctx->program->config->lds_size = + DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); break; } case MESA_SHADER_VERTEX: { @@ -430,16 +435,16 @@ setup_variables(isel_context *ctx, nir_shader *nir) setup_tes_variables(ctx, nir); break; } - default: - unreachable("Unhandled shader stage."); + default: unreachable("Unhandled shader stage."); } /* Make sure we fit the available LDS space. */ - assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <= ctx->program->dev.lds_limit); + assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <= + ctx->program->dev.lds_limit); } void -setup_nir(isel_context *ctx, nir_shader *nir) +setup_nir(isel_context* ctx, nir_shader* nir) { /* the variable setup has to be done before lower_io / CSE */ setup_variables(ctx, nir); @@ -447,19 +452,20 @@ setup_nir(isel_context *ctx, nir_shader *nir) nir_convert_to_lcssa(nir, true, false); nir_lower_phis_to_scalar(nir, true); - nir_function_impl *func = nir_shader_get_entrypoint(nir); + nir_function_impl* func = nir_shader_get_entrypoint(nir); nir_index_ssa_defs(func); } } /* end namespace */ -void init_context(isel_context *ctx, nir_shader *shader) +void +init_context(isel_context* ctx, nir_shader* shader) { - nir_function_impl *impl = nir_shader_get_entrypoint(shader); + nir_function_impl* impl = nir_shader_get_entrypoint(shader); ctx->shader = shader; /* Init NIR range analysis. 
*/ - ctx->range_ht =_mesa_pointer_hash_table_create(NULL); + ctx->range_ht = _mesa_pointer_hash_table_create(NULL); ctx->ub_config.min_subgroup_size = 64; ctx->ub_config.max_subgroup_size = 64; if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && ctx->options->key.cs.subgroup_size) { @@ -481,34 +487,23 @@ void init_context(isel_context *ctx, nir_shader *shader) uint32_t max = UINT32_MAX; if (nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) { max = 0x3f800000u; - } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || - nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED) { + } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED) { bool uscaled = nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED; switch (dfmt) { case V_008F0C_BUF_DATA_FORMAT_8: case V_008F0C_BUF_DATA_FORMAT_8_8: - case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: - max = uscaled ? 0x437f0000u : UINT8_MAX; - break; + case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: max = uscaled ? 0x437f0000u : UINT8_MAX; break; case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: - case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: - max = uscaled ? 0x447fc000u : 1023; - break; + case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: max = uscaled ? 0x447fc000u : 1023; break; case V_008F0C_BUF_DATA_FORMAT_10_11_11: - case V_008F0C_BUF_DATA_FORMAT_11_11_10: - max = uscaled ? 0x44ffe000u : 2047; - break; + case V_008F0C_BUF_DATA_FORMAT_11_11_10: max = uscaled ? 0x44ffe000u : 2047; break; case V_008F0C_BUF_DATA_FORMAT_16: case V_008F0C_BUF_DATA_FORMAT_16_16: - case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: - max = uscaled ? 0x477fff00u : UINT16_MAX; - break; + case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: max = uscaled ? 0x477fff00u : UINT16_MAX; break; case V_008F0C_BUF_DATA_FORMAT_32: case V_008F0C_BUF_DATA_FORMAT_32_32: case V_008F0C_BUF_DATA_FORMAT_32_32_32: - case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: - max = uscaled ? 0x4f800000u : UINT32_MAX; - break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: max = uscaled ? 0x4f800000u : UINT32_MAX; break; } } ctx->ub_config.vertex_attrib_max[i] = max; @@ -533,7 +528,7 @@ void init_context(isel_context *ctx, nir_shader *shader) ctx->first_temp_id = ctx->program->peekAllocationId(); ctx->program->allocateRange(impl->ssa_alloc); - RegClass *regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; + RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; unsigned spi_ps_inputs = 0; @@ -543,100 +538,99 @@ void init_context(isel_context *ctx, nir_shader *shader) bool done = false; while (!done) { done = true; - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { - switch(instr->type) { + nir_foreach_block (block, impl) { + nir_foreach_instr (instr, block) { + switch (instr->type) { case nir_instr_type_alu: { - nir_alu_instr *alu_instr = nir_instr_as_alu(instr); - RegType type = nir_dest_is_divergent(alu_instr->dest.dest) ? 
RegType::vgpr : RegType::sgpr; - switch(alu_instr->op) { - case nir_op_fmul: - case nir_op_fadd: - case nir_op_fsub: - case nir_op_fmax: - case nir_op_fmin: - case nir_op_fneg: - case nir_op_fabs: - case nir_op_fsat: - case nir_op_fsign: - case nir_op_frcp: - case nir_op_frsq: - case nir_op_fsqrt: - case nir_op_fexp2: - case nir_op_flog2: - case nir_op_ffract: - case nir_op_ffloor: - case nir_op_fceil: - case nir_op_ftrunc: - case nir_op_fround_even: - case nir_op_fsin: - case nir_op_fcos: - case nir_op_f2f16: - case nir_op_f2f16_rtz: - case nir_op_f2f16_rtne: - case nir_op_f2f32: - case nir_op_f2f64: - case nir_op_u2f16: - case nir_op_u2f32: - case nir_op_u2f64: - case nir_op_i2f16: - case nir_op_i2f32: - case nir_op_i2f64: - case nir_op_pack_half_2x16_split: - case nir_op_unpack_half_2x16_split_x: - case nir_op_unpack_half_2x16_split_y: - case nir_op_fddx: - case nir_op_fddy: - case nir_op_fddx_fine: - case nir_op_fddy_fine: - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - case nir_op_fquantize2f16: - case nir_op_ldexp: - case nir_op_frexp_sig: - case nir_op_frexp_exp: - case nir_op_cube_face_index_amd: - case nir_op_cube_face_coord_amd: - case nir_op_sad_u8x4: - type = RegType::vgpr; - break; - case nir_op_f2i16: - case nir_op_f2u16: - case nir_op_f2i32: - case nir_op_f2u32: - case nir_op_f2i64: - case nir_op_f2u64: - case nir_op_b2i8: - case nir_op_b2i16: - case nir_op_b2i32: - case nir_op_b2i64: - case nir_op_b2b32: - case nir_op_b2f16: - case nir_op_b2f32: - case nir_op_mov: - break; - case nir_op_iadd: - case nir_op_isub: - case nir_op_imul: - case nir_op_imin: - case nir_op_imax: - case nir_op_umin: - case nir_op_umax: - case nir_op_ishl: - case nir_op_ishr: - case nir_op_ushr: - /* packed 16bit instructions have to be VGPR */ - type = alu_instr->dest.dest.ssa.num_components == 2 ? RegType::vgpr : type; - FALLTHROUGH; - default: - for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { - if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) - type = RegType::vgpr; - } - break; + nir_alu_instr* alu_instr = nir_instr_as_alu(instr); + RegType type = + nir_dest_is_divergent(alu_instr->dest.dest) ? 
RegType::vgpr : RegType::sgpr; + switch (alu_instr->op) { + case nir_op_fmul: + case nir_op_fadd: + case nir_op_fsub: + case nir_op_fmax: + case nir_op_fmin: + case nir_op_fneg: + case nir_op_fabs: + case nir_op_fsat: + case nir_op_fsign: + case nir_op_frcp: + case nir_op_frsq: + case nir_op_fsqrt: + case nir_op_fexp2: + case nir_op_flog2: + case nir_op_ffract: + case nir_op_ffloor: + case nir_op_fceil: + case nir_op_ftrunc: + case nir_op_fround_even: + case nir_op_fsin: + case nir_op_fcos: + case nir_op_f2f16: + case nir_op_f2f16_rtz: + case nir_op_f2f16_rtne: + case nir_op_f2f32: + case nir_op_f2f64: + case nir_op_u2f16: + case nir_op_u2f32: + case nir_op_u2f64: + case nir_op_i2f16: + case nir_op_i2f32: + case nir_op_i2f64: + case nir_op_pack_half_2x16_split: + case nir_op_unpack_half_2x16_split_x: + case nir_op_unpack_half_2x16_split_y: + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + case nir_op_fquantize2f16: + case nir_op_ldexp: + case nir_op_frexp_sig: + case nir_op_frexp_exp: + case nir_op_cube_face_index_amd: + case nir_op_cube_face_coord_amd: + case nir_op_sad_u8x4: type = RegType::vgpr; break; + case nir_op_f2i16: + case nir_op_f2u16: + case nir_op_f2i32: + case nir_op_f2u32: + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: + case nir_op_b2b32: + case nir_op_b2f16: + case nir_op_b2f32: + case nir_op_mov: break; + case nir_op_iadd: + case nir_op_isub: + case nir_op_imul: + case nir_op_imin: + case nir_op_imax: + case nir_op_umin: + case nir_op_umax: + case nir_op_ishl: + case nir_op_ishr: + case nir_op_ushr: + /* packed 16bit instructions have to be VGPR */ + type = alu_instr->dest.dest.ssa.num_components == 2 ? 
RegType::vgpr : type; + FALLTHROUGH; + default: + for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { + if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) + type = RegType::vgpr; + } + break; } - RegClass rc = get_reg_class(ctx, type, alu_instr->dest.dest.ssa.num_components, alu_instr->dest.dest.ssa.bit_size); + RegClass rc = get_reg_class(ctx, type, alu_instr->dest.dest.ssa.num_components, + alu_instr->dest.dest.ssa.bit_size); regclasses[alu_instr->dest.dest.ssa.index] = rc; break; } @@ -648,207 +642,203 @@ void init_context(isel_context *ctx, nir_shader *shader) break; } case nir_instr_type_intrinsic: { - nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr); if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest) break; RegType type = RegType::sgpr; - switch(intrinsic->intrinsic) { - case nir_intrinsic_load_push_constant: - case nir_intrinsic_load_workgroup_id: - case nir_intrinsic_load_num_workgroups: - case nir_intrinsic_load_subgroup_id: - case nir_intrinsic_load_num_subgroups: - case nir_intrinsic_load_first_vertex: - case nir_intrinsic_load_base_instance: - case nir_intrinsic_vote_all: - case nir_intrinsic_vote_any: - case nir_intrinsic_read_first_invocation: - case nir_intrinsic_read_invocation: - case nir_intrinsic_first_invocation: - case nir_intrinsic_ballot: - case nir_intrinsic_load_ring_tess_factors_amd: - case nir_intrinsic_load_ring_tess_factors_offset_amd: - case nir_intrinsic_load_ring_tess_offchip_amd: - case nir_intrinsic_load_ring_tess_offchip_offset_amd: - case nir_intrinsic_load_ring_esgs_amd: - case nir_intrinsic_load_ring_es2gs_offset_amd: - case nir_intrinsic_image_deref_samples: - case nir_intrinsic_has_input_vertex_amd: - case nir_intrinsic_has_input_primitive_amd: - case nir_intrinsic_load_workgroup_num_input_vertices_amd: - case nir_intrinsic_load_workgroup_num_input_primitives_amd: - case nir_intrinsic_load_shader_query_enabled_amd: - type = RegType::sgpr; - break; - case nir_intrinsic_load_sample_id: - case nir_intrinsic_load_sample_mask_in: - case nir_intrinsic_load_input: - case nir_intrinsic_load_output: - case nir_intrinsic_load_input_vertex: - case nir_intrinsic_load_per_vertex_input: - case nir_intrinsic_load_per_vertex_output: - case nir_intrinsic_load_vertex_id: - case nir_intrinsic_load_vertex_id_zero_base: - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_model: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: - case nir_intrinsic_load_interpolated_input: - case nir_intrinsic_load_frag_coord: - case nir_intrinsic_load_frag_shading_rate: - case nir_intrinsic_load_sample_pos: - case nir_intrinsic_load_layer_id: - case nir_intrinsic_load_local_invocation_id: - case nir_intrinsic_load_local_invocation_index: - case nir_intrinsic_load_subgroup_invocation: - case nir_intrinsic_load_tess_coord: - case nir_intrinsic_write_invocation_amd: - case nir_intrinsic_mbcnt_amd: - case nir_intrinsic_byte_permute_amd: - case nir_intrinsic_lane_permute_16_amd: - case nir_intrinsic_load_instance_id: - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case 
nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_global_atomic_add: - case nir_intrinsic_global_atomic_imin: - case nir_intrinsic_global_atomic_umin: - case nir_intrinsic_global_atomic_imax: - case nir_intrinsic_global_atomic_umax: - case nir_intrinsic_global_atomic_and: - case nir_intrinsic_global_atomic_or: - case nir_intrinsic_global_atomic_xor: - case nir_intrinsic_global_atomic_exchange: - case nir_intrinsic_global_atomic_comp_swap: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - case nir_intrinsic_image_deref_size: - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_shared_atomic_fadd: - case nir_intrinsic_load_scratch: - case nir_intrinsic_load_invocation_id: - case nir_intrinsic_load_primitive_id: - case nir_intrinsic_load_buffer_amd: - case nir_intrinsic_load_tess_rel_patch_id_amd: - case nir_intrinsic_load_gs_vertex_offset_amd: - case nir_intrinsic_load_initial_edgeflag_amd: - case nir_intrinsic_load_packed_passthrough_primitive_amd: - case nir_intrinsic_gds_atomic_add_amd: - case nir_intrinsic_load_sbt_amd: - case nir_intrinsic_bvh64_intersect_ray_amd: + switch (intrinsic->intrinsic) { + case nir_intrinsic_load_push_constant: + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_num_workgroups: + case nir_intrinsic_load_subgroup_id: + case nir_intrinsic_load_num_subgroups: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_vote_all: + case nir_intrinsic_vote_any: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_read_invocation: + case nir_intrinsic_first_invocation: + case nir_intrinsic_ballot: + case nir_intrinsic_load_ring_tess_factors_amd: + case nir_intrinsic_load_ring_tess_factors_offset_amd: + case nir_intrinsic_load_ring_tess_offchip_amd: + case nir_intrinsic_load_ring_tess_offchip_offset_amd: + case nir_intrinsic_load_ring_esgs_amd: + case nir_intrinsic_load_ring_es2gs_offset_amd: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_has_input_vertex_amd: + case nir_intrinsic_has_input_primitive_amd: + case nir_intrinsic_load_workgroup_num_input_vertices_amd: + case nir_intrinsic_load_workgroup_num_input_primitives_amd: + case nir_intrinsic_load_shader_query_enabled_amd: type = RegType::sgpr; break; + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_sample_mask_in: + case nir_intrinsic_load_input: + case nir_intrinsic_load_output: + case nir_intrinsic_load_input_vertex: + case nir_intrinsic_load_per_vertex_input: + case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_barycentric_sample: + case 
nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_model: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + case nir_intrinsic_load_interpolated_input: + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_frag_shading_rate: + case nir_intrinsic_load_sample_pos: + case nir_intrinsic_load_layer_id: + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_local_invocation_index: + case nir_intrinsic_load_subgroup_invocation: + case nir_intrinsic_load_tess_coord: + case nir_intrinsic_write_invocation_amd: + case nir_intrinsic_mbcnt_amd: + case nir_intrinsic_byte_permute_amd: + case nir_intrinsic_lane_permute_16_amd: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_xor: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_shared_atomic_fadd: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_invocation_id: + case nir_intrinsic_load_primitive_id: + case nir_intrinsic_load_buffer_amd: + case nir_intrinsic_load_tess_rel_patch_id_amd: + case nir_intrinsic_load_gs_vertex_offset_amd: + case nir_intrinsic_load_initial_edgeflag_amd: + case nir_intrinsic_load_packed_passthrough_primitive_amd: + case nir_intrinsic_gds_atomic_add_amd: + case nir_intrinsic_load_sbt_amd: + case nir_intrinsic_bvh64_intersect_ray_amd: type = RegType::vgpr; break; + case nir_intrinsic_load_shared: + /* When the result of these loads is only used by cross-lane instructions, + * it is beneficial to use a VGPR destination. This is because this allows + * to put the s_waitcnt further down, which decreases latency. 
+ */ + if (only_used_by_cross_lane_instrs(&intrinsic->dest.ssa)) { type = RegType::vgpr; break; - case nir_intrinsic_load_shared: - /* When the result of these loads is only used by cross-lane instructions, - * it is beneficial to use a VGPR destination. This is because this allows - * to put the s_waitcnt further down, which decreases latency. - */ - if (only_used_by_cross_lane_instrs(&intrinsic->dest.ssa)) { + } + FALLTHROUGH; + case nir_intrinsic_shuffle: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: + case nir_intrinsic_quad_swizzle_amd: + case nir_intrinsic_masked_swizzle_amd: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_reduce: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_global: + case nir_intrinsic_vulkan_resource_index: + case nir_intrinsic_get_ssbo_size: + type = nir_dest_is_divergent(intrinsic->dest) ? RegType::vgpr : RegType::sgpr; + break; + case nir_intrinsic_load_view_index: + type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr; + break; + default: + for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; + i++) { + if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr) type = RegType::vgpr; - break; - } - FALLTHROUGH; - case nir_intrinsic_shuffle: - case nir_intrinsic_quad_broadcast: - case nir_intrinsic_quad_swap_horizontal: - case nir_intrinsic_quad_swap_vertical: - case nir_intrinsic_quad_swap_diagonal: - case nir_intrinsic_quad_swizzle_amd: - case nir_intrinsic_masked_swizzle_amd: - case nir_intrinsic_inclusive_scan: - case nir_intrinsic_exclusive_scan: - case nir_intrinsic_reduce: - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - case nir_intrinsic_load_global: - case nir_intrinsic_vulkan_resource_index: - case nir_intrinsic_get_ssbo_size: - type = nir_dest_is_divergent(intrinsic->dest) ? RegType::vgpr : RegType::sgpr; - break; - case nir_intrinsic_load_view_index: - type = ctx->stage == fragment_fs ? 
RegType::vgpr : RegType::sgpr; - break; - default: - for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; i++) { - if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr) - type = RegType::vgpr; - } - break; + } + break; } - RegClass rc = get_reg_class(ctx, type, intrinsic->dest.ssa.num_components, intrinsic->dest.ssa.bit_size); + RegClass rc = get_reg_class(ctx, type, intrinsic->dest.ssa.num_components, + intrinsic->dest.ssa.bit_size); regclasses[intrinsic->dest.ssa.index] = rc; - switch(intrinsic->intrinsic) { - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: { - glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic); - spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode); - break; + switch (intrinsic->intrinsic) { + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { + glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic); + spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode); + break; + } + case nir_intrinsic_load_barycentric_model: + spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1); + break; + case nir_intrinsic_load_front_face: + spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1); + break; + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_sample_pos: { + uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa); + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i; } - case nir_intrinsic_load_barycentric_model: - spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1); - break; - case nir_intrinsic_load_front_face: - spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1); - break; - case nir_intrinsic_load_frag_coord: - case nir_intrinsic_load_sample_pos: { - uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa); - for (unsigned i = 0; i < 4; i++) { - if (mask & (1 << i)) - spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i; - } - - if (ctx->options->adjust_frag_coord_z && - intrinsic->intrinsic == nir_intrinsic_load_frag_coord && - G_0286CC_POS_Z_FLOAT_ENA(spi_ps_inputs)) { - /* Enable ancillary for adjusting gl_FragCoord.z for - * VRS due to a hw bug on some GFX10.3 chips. - */ - spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); - } - break; + if (ctx->options->adjust_frag_coord_z && + intrinsic->intrinsic == nir_intrinsic_load_frag_coord && + G_0286CC_POS_Z_FLOAT_ENA(spi_ps_inputs)) { + /* Enable ancillary for adjusting gl_FragCoord.z for + * VRS due to a hw bug on some GFX10.3 chips. 
+ */ + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); } - case nir_intrinsic_load_sample_id: - case nir_intrinsic_load_frag_shading_rate: - spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); - break; - case nir_intrinsic_load_sample_mask_in: - spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); - spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1); - break; - default: - break; + break; + } + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_frag_shading_rate: + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); + break; + case nir_intrinsic_load_sample_mask_in: + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); + spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1); + break; + default: break; } break; } @@ -860,13 +850,13 @@ void init_context(isel_context *ctx, nir_shader *shader) assert(!tex->dest.ssa.divergent); } - RegClass rc = get_reg_class(ctx, type, tex->dest.ssa.num_components, - tex->dest.ssa.bit_size); + RegClass rc = + get_reg_class(ctx, type, tex->dest.ssa.num_components, tex->dest.ssa.bit_size); regclasses[tex->dest.ssa.index] = rc; break; } case nir_instr_type_parallel_copy: { - nir_foreach_parallel_copy_entry(entry, nir_instr_as_parallel_copy(instr)) { + nir_foreach_parallel_copy_entry (entry, nir_instr_as_parallel_copy(instr)) { regclasses[entry->dest.ssa.index] = regclasses[entry->src.ssa->index]; } break; @@ -900,8 +890,7 @@ void init_context(isel_context *ctx, nir_shader *shader) regclasses[phi->dest.ssa.index] = rc; break; } - default: - break; + default: break; } } } @@ -931,47 +920,33 @@ void init_context(isel_context *ctx, nir_shader *shader) (uint8_t*)shader->constant_data + shader->constant_data_size); } -void cleanup_context(isel_context *ctx) +void +cleanup_context(isel_context* ctx) { _mesa_hash_table_destroy(ctx->range_ht, NULL); } isel_context -setup_isel_context(Program* program, - unsigned shader_count, - struct nir_shader *const *shaders, - ac_shader_config* config, - struct radv_shader_args *args, - bool is_gs_copy_shader) +setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders, + ac_shader_config* config, struct radv_shader_args* args, bool is_gs_copy_shader) { SWStage sw_stage = SWStage::None; for (unsigned i = 0; i < shader_count; i++) { switch (shaders[i]->info.stage) { - case MESA_SHADER_VERTEX: - sw_stage = sw_stage | SWStage::VS; - break; - case MESA_SHADER_TESS_CTRL: - sw_stage = sw_stage | SWStage::TCS; - break; - case MESA_SHADER_TESS_EVAL: - sw_stage = sw_stage | SWStage::TES; - break; + case MESA_SHADER_VERTEX: sw_stage = sw_stage | SWStage::VS; break; + case MESA_SHADER_TESS_CTRL: sw_stage = sw_stage | SWStage::TCS; break; + case MESA_SHADER_TESS_EVAL: sw_stage = sw_stage | SWStage::TES; break; case MESA_SHADER_GEOMETRY: sw_stage = sw_stage | (is_gs_copy_shader ? 
SWStage::GSCopy : SWStage::GS); break; - case MESA_SHADER_FRAGMENT: - sw_stage = sw_stage | SWStage::FS; - break; - case MESA_SHADER_COMPUTE: - sw_stage = sw_stage | SWStage::CS; - break; - default: - unreachable("Shader stage not implemented"); + case MESA_SHADER_FRAGMENT: sw_stage = sw_stage | SWStage::FS; break; + case MESA_SHADER_COMPUTE: sw_stage = sw_stage | SWStage::CS; break; + default: unreachable("Shader stage not implemented"); } } bool gfx9_plus = args->options->chip_class >= GFX9; bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10; - HWStage hw_stage { }; + HWStage hw_stage{}; if (sw_stage == SWStage::VS && args->shader_info->vs.as_es && !ngg) hw_stage = HWStage::ES; else if (sw_stage == SWStage::VS && !args->shader_info->vs.as_ls && !ngg) @@ -1009,8 +984,8 @@ setup_isel_context(Program* program, else unreachable("Shader stage not implemented"); - init_program(program, Stage { hw_stage, sw_stage }, args->shader_info, - args->options->chip_class, args->options->family, args->options->wgp_mode, config); + init_program(program, Stage{hw_stage, sw_stage}, args->shader_info, args->options->chip_class, + args->options->family, args->options->wgp_mode, config); isel_context ctx = {}; ctx.program = program; @@ -1028,29 +1003,37 @@ setup_isel_context(Program* program, shaders[0]->info.workgroup_size[1] * shaders[0]->info.workgroup_size[2]; } else if (program->stage.hw == HWStage::ES || program->stage == geometry_gs) { - /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 (not implemented in Mesa) */ + /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 + * (not implemented in Mesa) */ program->workgroup_size = program->wave_size; } else if (program->stage.hw == HWStage::GS) { /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */ assert(program->chip_class >= GFX9); - uint32_t es_verts_per_subgrp = G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); - uint32_t gs_instr_prims_in_subgrp = G_028A44_GS_INST_PRIMS_IN_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); + uint32_t es_verts_per_subgrp = + G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); + uint32_t gs_instr_prims_in_subgrp = + G_028A44_GS_INST_PRIMS_IN_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); uint32_t workgroup_size = MAX2(es_verts_per_subgrp, gs_instr_prims_in_subgrp); program->workgroup_size = MAX2(MIN2(workgroup_size, 256), 1); } else if (program->stage == vertex_ls) { /* Unmerged LS operates in workgroups */ - program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but those are not plumbed to ACO for LS */ + program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but + those are not plumbed to ACO for LS */ } else if (program->stage == tess_control_hs) { /* Unmerged HS operates in workgroups, size is determined by the output vertices */ setup_tcs_info(&ctx, shaders[0], NULL); program->workgroup_size = ctx.tcs_num_patches * shaders[0]->info.tess.tcs_vertices_out; } else if (program->stage == vertex_tess_control_hs) { - /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS invocations */ + /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS + * invocations */ setup_tcs_info(&ctx, shaders[1], shaders[0]); - program->workgroup_size = ctx.tcs_num_patches * 
MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices); + program->workgroup_size = + ctx.tcs_num_patches * + MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices); } else if (program->stage.hw == HWStage::NGG) { - gfx10_ngg_info &ngg_info = args->shader_info->ngg_info; - unsigned num_gs_invocations = (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1; + gfx10_ngg_info& ngg_info = args->shader_info->ngg_info; + unsigned num_gs_invocations = + (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1; /* Max ES (SW VS/TES) threads */ uint32_t max_esverts = ngg_info.hw_max_esverts; @@ -1074,7 +1057,7 @@ setup_isel_context(Program* program, setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo); } else { for (unsigned i = 0; i < shader_count; i++) { - nir_shader *nir = shaders[i]; + nir_shader* nir = shaders[i]; setup_nir(&ctx, nir); } @@ -1090,4 +1073,4 @@ setup_isel_context(Program* program, return ctx; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index e7ea91e3c84..0c06f2a3ca8 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -23,6 +23,7 @@ */ #include "aco_interface.h" + #include "aco_ir.h" #include "vulkan/radv_shader.h" @@ -37,23 +38,33 @@ static const std::array statistic_infos = []() { std::array ret{}; - ret[aco::statistic_hash] = aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"}; - ret[aco::statistic_instructions] = aco_compiler_statistic_info{"Instructions", "Instruction count"}; - ret[aco::statistic_copies] = aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"}; + ret[aco::statistic_hash] = + aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"}; + ret[aco::statistic_instructions] = + aco_compiler_statistic_info{"Instructions", "Instruction count"}; + ret[aco::statistic_copies] = + aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"}; ret[aco::statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"}; - ret[aco::statistic_latency] = aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"}; - ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{"Inverse Throughput", "Estimated busy cycles to execute one wave"}; - ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"}; - ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"}; - ret[aco::statistic_sgpr_presched] = aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"}; - ret[aco::statistic_vgpr_presched] = aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"}; + ret[aco::statistic_latency] = + aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"}; + ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{ + "Inverse Throughput", "Estimated busy cycles to execute one wave"}; + ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{ + "VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"}; + ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{ + "SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"}; + ret[aco::statistic_sgpr_presched] = + 
aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"}; + ret[aco::statistic_vgpr_presched] = + aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"}; return ret; }(); const unsigned aco_num_statistics = aco::num_statistics; -const aco_compiler_statistic_info *aco_statistic_infos = statistic_infos.data(); +const aco_compiler_statistic_info* aco_statistic_infos = statistic_infos.data(); -static void validate(aco::Program *program) +static void +validate(aco::Program* program) { if (!(aco::debug_flags & aco::DEBUG_VALIDATE_IR)) return; @@ -62,10 +73,9 @@ static void validate(aco::Program *program) assert(is_valid); } -void aco_compile_shader(unsigned shader_count, - struct nir_shader *const *shaders, - struct radv_shader_binary **binary, - struct radv_shader_args *args) +void +aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, + struct radv_shader_binary** binary, struct radv_shader_args* args) { aco::init(); @@ -116,11 +126,11 @@ void aco_compile_shader(unsigned shader_count, std::string llvm_ir; if (args->options->record_ir) { - char *data = NULL; + char* data = NULL; size_t size = 0; u_memstream mem; if (u_memstream_open(&mem, &data, &size)) { - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); aco_print_program(program.get(), memf); fputc(0, memf); u_memstream_close(&mem); @@ -137,8 +147,7 @@ void aco_compile_shader(unsigned shader_count, aco_print_program(program.get(), stderr, live_vars, aco::print_live_vars | aco::print_kill); if (!args->is_trap_handler_shader) { - if (!args->options->disable_optimizations && - !(aco::debug_flags & aco::DEBUG_NO_SCHED)) + if (!args->options->disable_optimizations && !(aco::debug_flags & aco::DEBUG_NO_SCHED)) aco::schedule_program(program.get(), live_vars); validate(program.get()); @@ -189,11 +198,11 @@ void aco_compile_shader(unsigned shader_count, std::string disasm; if (get_disasm) { - char *data = NULL; + char* data = NULL; size_t disasm_size = 0; struct u_memstream mem; if (u_memstream_open(&mem, &data, &disasm_size)) { - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); aco::print_asm(program.get(), code, exec_size / 4u, memf); fputc(0, memf); u_memstream_close(&mem); @@ -214,10 +223,10 @@ void aco_compile_shader(unsigned shader_count, * directly for the disk cache. Uninitialized data can appear because of * padding in the struct or because legacy_binary->data can be at an offset * from the start less than sizeof(radv_shader_binary_legacy). 
*/ - radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*) calloc(size, 1); + radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*)calloc(size, 1); legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY; - legacy_binary->base.stage = shaders[shader_count-1]->info.stage; + legacy_binary->base.stage = shaders[shader_count - 1]->info.stage; legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader; legacy_binary->base.total_size = size; @@ -225,7 +234,8 @@ void aco_compile_shader(unsigned shader_count, memcpy(legacy_binary->data, program->statistics, aco::num_statistics * sizeof(uint32_t)); legacy_binary->stats_size = stats_size; - memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(), code.size() * sizeof(uint32_t)); + memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(), + code.size() * sizeof(uint32_t)); legacy_binary->exec_size = exec_size; legacy_binary->code_size = code.size() * sizeof(uint32_t); @@ -233,12 +243,15 @@ void aco_compile_shader(unsigned shader_count, legacy_binary->disasm_size = 0; legacy_binary->ir_size = llvm_ir.size(); - llvm_ir.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size, llvm_ir.size()); + llvm_ir.copy((char*)legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size, + llvm_ir.size()); if (get_disasm) { - disasm.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size + llvm_ir.size(), disasm.size()); + disasm.copy((char*)legacy_binary->data + legacy_binary->stats_size + + legacy_binary->code_size + llvm_ir.size(), + disasm.size()); legacy_binary->disasm_size = disasm.size(); } - *binary = (radv_shader_binary*) legacy_binary; + *binary = (radv_shader_binary*)legacy_binary; } diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h index a4cace90912..a0df87827ef 100644 --- a/src/amd/compiler/aco_interface.h +++ b/src/amd/compiler/aco_interface.h @@ -39,12 +39,10 @@ struct aco_compiler_statistic_info { }; extern const unsigned aco_num_statistics; -extern const struct aco_compiler_statistic_info *aco_statistic_infos; +extern const struct aco_compiler_statistic_info* aco_statistic_infos; -void aco_compile_shader(unsigned shader_count, - struct nir_shader *const *shaders, - struct radv_shader_binary** binary, - struct radv_shader_args *args); +void aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, + struct radv_shader_binary** binary, struct radv_shader_args* args); #ifdef __cplusplus } diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 4184aa1cd43..79f9d71a793 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -32,39 +32,40 @@ namespace aco { uint64_t debug_flags = 0; -static const struct debug_control aco_debug_options[] = { - {"validateir", DEBUG_VALIDATE_IR}, - {"validatera", DEBUG_VALIDATE_RA}, - {"perfwarn", DEBUG_PERFWARN}, - {"force-waitcnt", DEBUG_FORCE_WAITCNT}, - {"novn", DEBUG_NO_VN}, - {"noopt", DEBUG_NO_OPT}, - {"nosched", DEBUG_NO_SCHED}, - {"perfinfo", DEBUG_PERF_INFO}, - {"liveinfo", DEBUG_LIVE_INFO}, - {NULL, 0} -}; +static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR}, + {"validatera", DEBUG_VALIDATE_RA}, + {"perfwarn", DEBUG_PERFWARN}, + {"force-waitcnt", DEBUG_FORCE_WAITCNT}, + {"novn", DEBUG_NO_VN}, + {"noopt", DEBUG_NO_OPT}, + {"nosched", DEBUG_NO_SCHED}, + {"perfinfo", DEBUG_PERF_INFO}, + {"liveinfo", DEBUG_LIVE_INFO}, + {NULL, 0}}; static once_flag 
init_once_flag = ONCE_FLAG_INIT; -static void init_once() +static void +init_once() { debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options); - #ifndef NDEBUG +#ifndef NDEBUG /* enable some flags by default on debug builds */ debug_flags |= aco::DEBUG_VALIDATE_IR; - #endif +#endif } -void init() +void +init() { call_once(&init_once_flag, init_once); } -void init_program(Program *program, Stage stage, struct radv_shader_info *info, - enum chip_class chip_class, enum radeon_family family, - bool wgp_mode, ac_shader_config *config) +void +init_program(Program* program, Stage stage, struct radv_shader_info* info, + enum chip_class chip_class, enum radeon_family family, bool wgp_mode, + ac_shader_config* config) { program->stage = stage; program->config = config; @@ -72,24 +73,12 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->chip_class = chip_class; if (family == CHIP_UNKNOWN) { switch (chip_class) { - case GFX6: - program->family = CHIP_TAHITI; - break; - case GFX7: - program->family = CHIP_BONAIRE; - break; - case GFX8: - program->family = CHIP_POLARIS10; - break; - case GFX9: - program->family = CHIP_VEGA10; - break; - case GFX10: - program->family = CHIP_NAVI10; - break; - default: - program->family = CHIP_UNKNOWN; - break; + case GFX6: program->family = CHIP_TAHITI; break; + case GFX7: program->family = CHIP_BONAIRE; break; + case GFX8: program->family = CHIP_POLARIS10; break; + case GFX9: program->family = CHIP_VEGA10; break; + case GFX10: program->family = CHIP_NAVI10; break; + default: program->family = CHIP_UNKNOWN; break; } } else { program->family = family; @@ -98,7 +87,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->lane_mask = program->wave_size == 32 ? s1 : s2; program->dev.lds_encoding_granule = chip_class >= GFX7 ? 512 : 256; - program->dev.lds_alloc_granule = chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule; + program->dev.lds_alloc_granule = + chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule; program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768; /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */ program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY; @@ -111,7 +101,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */ program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512; program->dev.sgpr_alloc_granule = 128; - program->dev.sgpr_limit = 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */ + program->dev.sgpr_limit = + 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */ if (chip_class >= GFX10_3) program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 
16 : 8; else @@ -145,18 +136,14 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, /* GFX9 APUS */ case CHIP_RAVEN: case CHIP_RAVEN2: - case CHIP_RENOIR: - program->dev.xnack_enabled = true; - break; - default: - break; + case CHIP_RENOIR: program->dev.xnack_enabled = true; break; + default: break; } program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS; /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */ program->dev.has_fast_fma32 = program->chip_class >= GFX9; - if (program->family == CHIP_TAHITI || - program->family == CHIP_CARRIZO || + if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO || program->family == CHIP_HAWAII) program->dev.has_fast_fma32 = true; @@ -176,29 +163,24 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->next_fp_mode.round32 = fp_round_ne; } -memory_sync_info get_sync_info(const Instruction* instr) +memory_sync_info +get_sync_info(const Instruction* instr) { switch (instr->format) { - case Format::SMEM: - return instr->smem().sync; - case Format::MUBUF: - return instr->mubuf().sync; - case Format::MIMG: - return instr->mimg().sync; - case Format::MTBUF: - return instr->mtbuf().sync; + case Format::SMEM: return instr->smem().sync; + case Format::MUBUF: return instr->mubuf().sync; + case Format::MIMG: return instr->mimg().sync; + case Format::MTBUF: return instr->mtbuf().sync; case Format::FLAT: case Format::GLOBAL: - case Format::SCRATCH: - return instr->flatlike().sync; - case Format::DS: - return instr->ds().sync; - default: - return memory_sync_info(); + case Format::SCRATCH: return instr->flatlike().sync; + case Format::DS: return instr->ds().sync; + default: return memory_sync_info(); } } -bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_ra) +bool +can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_ra) { if (!instr->isVALU()) return false; @@ -218,7 +200,7 @@ bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_r if (vop3.omod && chip < GFX9) return false; - //TODO: return true if we know we will use vcc + // TODO: return true if we know we will use vcc if (!pre_ra && instr->definitions.size() >= 2) return false; @@ -244,38 +226,36 @@ bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_r return false; } - bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || - instr->opcode == aco_opcode::v_mac_f16 || - instr->opcode == aco_opcode::v_fmac_f32 || - instr->opcode == aco_opcode::v_fmac_f16; + bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 || + instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16; if (chip != GFX8 && is_mac) return false; - //TODO: return true if we know we will use vcc + // TODO: return true if we know we will use vcc if (!pre_ra && instr->isVOPC()) return false; if (!pre_ra && instr->operands.size() >= 3 && !is_mac) return false; - return instr->opcode != aco_opcode::v_madmk_f32 && - instr->opcode != aco_opcode::v_madak_f32 && - instr->opcode != aco_opcode::v_madmk_f16 && - instr->opcode != aco_opcode::v_madak_f16 && + return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && + instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && instr->opcode != aco_opcode::v_readfirstlane_b32 && - instr->opcode != aco_opcode::v_clrexcp && - instr->opcode != aco_opcode::v_swap_b32; + instr->opcode != aco_opcode::v_clrexcp && 
instr->opcode != aco_opcode::v_swap_b32; } /* updates "instr" and returns the old instruction (or NULL if no update was needed) */ -aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& instr) +aco_ptr +convert_to_SDWA(chip_class chip, aco_ptr& instr) { if (instr->isSDWA()) return NULL; aco_ptr tmp = std::move(instr); - Format format = (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA); - instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + Format format = + (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA); + instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), + tmp->definitions.size())); std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin()); @@ -295,15 +275,9 @@ aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& inst break; switch (instr->operands[i].bytes()) { - case 1: - sdwa.sel[i] = sdwa_ubyte; - break; - case 2: - sdwa.sel[i] = sdwa_uword; - break; - case 4: - sdwa.sel[i] = sdwa_udword; - break; + case 1: sdwa.sel[i] = sdwa_ubyte; break; + case 2: sdwa.sel[i] = sdwa_uword; break; + case 4: sdwa.sel[i] = sdwa_udword; break; } } switch (instr->definitions[0].bytes()) { @@ -315,9 +289,7 @@ aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& inst sdwa.dst_sel = sdwa_uword; sdwa.dst_preserve = true; break; - case 4: - sdwa.dst_sel = sdwa_udword; - break; + case 4: sdwa.dst_sel = sdwa_udword; break; } if (instr->definitions[0].getTemp().type() == RegType::sgpr && chip == GFX8) @@ -330,7 +302,8 @@ aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& inst return tmp; } -bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high) +bool +can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high) { /* opsel is only GFX9+ */ if ((high || idx == -1) && chip < GFX9) @@ -362,21 +335,18 @@ bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high) case aco_opcode::v_lshlrev_b16_e64: case aco_opcode::v_lshrrev_b16_e64: case aco_opcode::v_ashrrev_i16_e64: - case aco_opcode::v_mul_lo_u16_e64: - return true; + case aco_opcode::v_mul_lo_u16_e64: return true; case aco_opcode::v_pack_b32_f16: case aco_opcode::v_cvt_pknorm_i16_f16: - case aco_opcode::v_cvt_pknorm_u16_f16: - return idx != -1; + case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1; case aco_opcode::v_mad_u32_u16: - case aco_opcode::v_mad_i32_i16: - return idx >= 0 && idx < 2; - default: - return false; + case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2; + default: return false; } } -uint32_t get_reduction_identity(ReduceOp op, unsigned idx) +uint32_t +get_reduction_identity(ReduceOp op, unsigned idx) { switch (op) { case iadd8: @@ -397,65 +367,44 @@ uint32_t get_reduction_identity(ReduceOp op, unsigned idx) case umax8: case umax16: case umax32: - case umax64: - return 0; + case umax64: return 0; case imul8: case imul16: case imul32: - case imul64: - return idx ? 0 : 1; - case fmul16: - return 0x3c00u; /* 1.0 */ - case fmul32: - return 0x3f800000u; /* 1.0 */ - case fmul64: - return idx ? 0x3ff00000u : 0u; /* 1.0 */ - case imin8: - return INT8_MAX; - case imin16: - return INT16_MAX; - case imin32: - return INT32_MAX; - case imin64: - return idx ? 0x7fffffffu : 0xffffffffu; - case imax8: - return INT8_MIN; - case imax16: - return INT16_MIN; - case imax32: - return INT32_MIN; - case imax64: - return idx ? 0x80000000u : 0; + case imul64: return idx ? 
0 : 1; + case fmul16: return 0x3c00u; /* 1.0 */ + case fmul32: return 0x3f800000u; /* 1.0 */ + case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */ + case imin8: return INT8_MAX; + case imin16: return INT16_MAX; + case imin32: return INT32_MAX; + case imin64: return idx ? 0x7fffffffu : 0xffffffffu; + case imax8: return INT8_MIN; + case imax16: return INT16_MIN; + case imax32: return INT32_MIN; + case imax64: return idx ? 0x80000000u : 0; case umin8: case umin16: case iand8: - case iand16: - return 0xffffffffu; + case iand16: return 0xffffffffu; case umin32: case umin64: case iand32: - case iand64: - return 0xffffffffu; - case fmin16: - return 0x7c00u; /* infinity */ - case fmin32: - return 0x7f800000u; /* infinity */ - case fmin64: - return idx ? 0x7ff00000u : 0u; /* infinity */ - case fmax16: - return 0xfc00u; /* negative infinity */ - case fmax32: - return 0xff800000u; /* negative infinity */ - case fmax64: - return idx ? 0xfff00000u : 0u; /* negative infinity */ - default: - unreachable("Invalid reduction operation"); - break; + case iand64: return 0xffffffffu; + case fmin16: return 0x7c00u; /* infinity */ + case fmin32: return 0x7f800000u; /* infinity */ + case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */ + case fmax16: return 0xfc00u; /* negative infinity */ + case fmax32: return 0xff800000u; /* negative infinity */ + case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */ + default: unreachable("Invalid reduction operation"); break; } return 0; } -bool needs_exec_mask(const Instruction* instr) { +bool +needs_exec_mask(const Instruction* instr) +{ if (instr->isSALU() || instr->isBranch()) return instr->reads_exec(); if (instr->isSMEM()) @@ -479,10 +428,8 @@ bool needs_exec_mask(const Instruction* instr) { case aco_opcode::p_reload: case aco_opcode::p_logical_start: case aco_opcode::p_logical_end: - case aco_opcode::p_startpgm: - return false; - default: - break; + case aco_opcode::p_startpgm: return false; + default: break; } } @@ -495,10 +442,11 @@ bool needs_exec_mask(const Instruction* instr) { return true; } -wait_imm::wait_imm() : - vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {} -wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) : - vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} +wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) +{} +wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) + : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) +{} wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter) { @@ -513,7 +461,8 @@ wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter) lgkm |= (packed >> 8) & 0x30; } -uint16_t wait_imm::pack(enum chip_class chip) const +uint16_t +wait_imm::pack(enum chip_class chip) const { uint16_t imm = 0; assert(exp == unset_counter || exp <= 0x7); @@ -536,13 +485,16 @@ uint16_t wait_imm::pack(enum chip_class chip) const break; } if (chip < GFX9 && vm == wait_imm::unset_counter) - imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */ + imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the + architecture when interpreting the immediate */ if (chip < GFX10 && lgkm == wait_imm::unset_counter) - imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */ + imm |= 
0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the + architecture when interpreting the immediate */ return imm; } -bool wait_imm::combine(const wait_imm& other) +bool +wait_imm::combine(const wait_imm& other) { bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs; vm = std::min(vm, other.vm); @@ -552,17 +504,21 @@ bool wait_imm::combine(const wait_imm& other) return changed; } -bool wait_imm::empty() const +bool +wait_imm::empty() const { - return vm == unset_counter && exp == unset_counter && - lgkm == unset_counter && vs == unset_counter; + return vm == unset_counter && exp == unset_counter && lgkm == unset_counter && + vs == unset_counter; } -bool should_form_clause(const Instruction *a, const Instruction *b) +bool +should_form_clause(const Instruction* a, const Instruction* b) { /* Vertex attribute loads from the same binding likely load from similar addresses */ - unsigned a_vtx_binding = a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0); - unsigned b_vtx_binding = b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0); + unsigned a_vtx_binding = + a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0); + unsigned b_vtx_binding = + b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0); if (a_vtx_binding && a_vtx_binding == b_vtx_binding) return true; @@ -584,4 +540,4 @@ bool should_form_clause(const Instruction *a, const Instruction *b) return false; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 9bbbbe2cadd..2675150d126 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -27,6 +27,7 @@ #include "aco_opcodes.h" #include "aco_util.h" + #include "vulkan/radv_shader.h" #include "nir.h" @@ -129,11 +130,11 @@ enum class instr_class : uint8_t { }; enum storage_class : uint8_t { - storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */ + storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */ storage_buffer = 0x1, /* SSBOs and global memory */ storage_atomic_counter = 0x2, /* not used for Vulkan */ storage_image = 0x4, - storage_shared = 0x8, /* or TCS output */ + storage_shared = 0x8, /* or TCS output */ storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */ storage_scratch = 0x20, storage_vgpr_spill = 0x40, @@ -157,7 +158,8 @@ enum memory_semantics : uint8_t { /* does not interact with barriers and assumes this lane is the only lane * accessing this memory */ semantic_private = 0x8, - /* this operation can be reordered around operations of the same storage. says nothing about barriers */ + /* this operation can be reordered around operations of the same storage. 
+ * says nothing about barriers */ semantic_can_reorder = 0x10, /* this is a atomic instruction (may only read or write memory) */ semantic_atomic = 0x20, @@ -178,20 +180,21 @@ enum sync_scope : uint8_t { struct memory_sync_info { memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {} - memory_sync_info(int storage_, int semantics_=0, sync_scope scope_=scope_invocation) - : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_) {} + memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation) + : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_) + {} - storage_class storage:8; - memory_semantics semantics:8; - sync_scope scope:8; + storage_class storage : 8; + memory_semantics semantics : 8; + sync_scope scope : 8; - bool operator == (const memory_sync_info& rhs) const { - return storage == rhs.storage && - semantics == rhs.semantics && - scope == rhs.scope; + bool operator==(const memory_sync_info& rhs) const + { + return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope; } - bool can_reorder() const { + bool can_reorder() const + { if (semantics & semantic_acqrel) return false; /* Also check storage so that zero-initialized memory_sync_info can be @@ -221,33 +224,34 @@ struct float_mode { /* matches encoding of the MODE register */ union { struct { - fp_round round32:2; - fp_round round16_64:2; - unsigned denorm32:2; - unsigned denorm16_64:2; + fp_round round32 : 2; + fp_round round16_64 : 2; + unsigned denorm32 : 2; + unsigned denorm16_64 : 2; }; struct { - uint8_t round:4; - uint8_t denorm:4; + uint8_t round : 4; + uint8_t denorm : 4; }; uint8_t val = 0; }; /* if false, optimizations which may remove infs/nan/-0.0 can be done */ - bool preserve_signed_zero_inf_nan32:1; - bool preserve_signed_zero_inf_nan16_64:1; + bool preserve_signed_zero_inf_nan32 : 1; + bool preserve_signed_zero_inf_nan16_64 : 1; /* if false, optimizations which may remove denormal flushing can be done */ - bool must_flush_denorms32:1; - bool must_flush_denorms16_64:1; - bool care_about_round32:1; - bool care_about_round16_64:1; + bool must_flush_denorms32 : 1; + bool must_flush_denorms16_64 : 1; + bool care_about_round32 : 1; + bool care_about_round16_64 : 1; /* Returns true if instructions using the mode "other" can safely use the * current one instead. 
*/ - bool canReplace(float_mode other) const noexcept { + bool canReplace(float_mode other) const noexcept + { return val == other.val && (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) && (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) && - (must_flush_denorms32 || !other.must_flush_denorms32) && + (must_flush_denorms32 || !other.must_flush_denorms32) && (must_flush_denorms16_64 || !other.must_flush_denorms16_64) && (care_about_round32 || !other.care_about_round32) && (care_about_round16_64 || !other.care_about_round16_64); @@ -273,13 +277,17 @@ struct wait_imm { bool empty() const; }; -constexpr Format asVOP3(Format format) { - return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format); +constexpr Format +asVOP3(Format format) +{ + return (Format)((uint32_t)Format::VOP3 | (uint32_t)format); }; -constexpr Format asSDWA(Format format) { +constexpr Format +asSDWA(Format format) +{ assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC); - return (Format) ((uint32_t) Format::SDWA | (uint32_t) format); + return (Format)((uint32_t)Format::SDWA | (uint32_t)format); } enum class RegType { @@ -303,10 +311,10 @@ struct RegClass { v2 = s2 | (1 << 5), v3 = s3 | (1 << 5), v4 = s4 | (1 << 5), - v5 = 5 | (1 << 5), - v6 = 6 | (1 << 5), - v7 = 7 | (1 << 5), - v8 = 8 | (1 << 5), + v5 = 5 | (1 << 5), + v6 = 6 | (1 << 5), + v7 = 7 | (1 << 5), + v8 = 8 | (1 << 5), /* byte-sized register class */ v1b = v1 | (1 << 7), v2b = v2 | (1 << 7), @@ -320,29 +328,29 @@ struct RegClass { }; RegClass() = default; - constexpr RegClass(RC rc_) - : rc(rc_) {} + constexpr RegClass(RC rc_) : rc(rc_) {} constexpr RegClass(RegType type, unsigned size) - : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {} + : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size)) + {} constexpr operator RC() const { return rc; } explicit operator bool() = delete; constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; } constexpr bool is_subdword() const { return rc & (1 << 7); } - constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); } - //TODO: use size() less in favor of bytes() + constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); } + // TODO: use size() less in favor of bytes() constexpr unsigned size() const { return (bytes() + 3) >> 2; } constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); } - constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); } - constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); } + constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); } + constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); } - static constexpr RegClass get(RegType type, unsigned bytes) { + static constexpr RegClass get(RegType type, unsigned bytes) + { if (type == RegType::sgpr) { return RegClass(type, DIV_ROUND_UP(bytes, 4u)); } else { - return bytes % 4u ? RegClass(type, bytes).as_subdword() : - RegClass(type, bytes / 4u); + return bytes % 4u ? 
RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u); } } @@ -380,8 +388,7 @@ static constexpr RegClass v8b{RegClass::v8b}; */ struct Temp { Temp() noexcept : id_(0), reg_class(0) {} - constexpr Temp(uint32_t id, RegClass cls) noexcept - : id_(id), reg_class(uint8_t(cls)) {} + constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {} constexpr uint32_t id() const noexcept { return id_; } constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; } @@ -391,12 +398,12 @@ struct Temp { constexpr RegType type() const noexcept { return regClass().type(); } constexpr bool is_linear() const noexcept { return regClass().is_linear(); } - constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); } + constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); } constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); } constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); } private: - uint32_t id_: 24; + uint32_t id_ : 24; uint32_t reg_class : 8; }; @@ -413,8 +420,13 @@ struct PhysReg { constexpr operator unsigned() const { return reg(); } constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; } constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; } - constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; } - constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; } + constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; } + constexpr PhysReg advance(int bytes) const + { + PhysReg res = *this; + res.reg_b += bytes; + return res; + } uint16_t reg_b = 0; }; @@ -453,13 +465,13 @@ static constexpr PhysReg scc{253}; * Temporary registers get mapped to physical register during RA * Constant values are inlined into the instruction sequence. */ -class Operand final -{ +class Operand final { public: constexpr Operand() - : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), - isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0), - isLateKill_(false), is16bit_(false), is24bit_(false), signext(false) {} + : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false), + isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false), + is24bit_(false), signext(false) + {} explicit Operand(Temp r) noexcept { @@ -553,11 +565,11 @@ public: isConstant_ = true; constSize = 3; if (v <= 64) { - data_.i = (uint32_t) v; - setFixed(PhysReg{128 + (uint32_t) v}); + data_.i = (uint32_t)v; + setFixed(PhysReg{128 + (uint32_t)v}); } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. 
-1] */ - data_.i = (uint32_t) v; - setFixed(PhysReg{192 - (uint32_t) v}); + data_.i = (uint32_t)v; + setFixed(PhysReg{192 - (uint32_t)v}); } else if (v == 0x3FE0000000000000) { /* 0.5 */ data_.i = 0x3f000000; setFixed(PhysReg{240}); @@ -586,7 +598,8 @@ public: signext = v >> 63; data_.i = v & 0xffffffffu; setFixed(PhysReg{255}); - assert(constantValue64() == v && "attempt to create a unrepresentable 64-bit literal constant"); + assert(constantValue64() == v && + "attempt to create a unrepresentable 64-bit literal constant"); } }; explicit Operand(RegClass type) noexcept @@ -623,7 +636,8 @@ public: return Operand((uint8_t)val); } - static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext=false, bool sext=false) + static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false, + bool sext = false) { if (bytes <= 4) return true; @@ -634,48 +648,33 @@ public: if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0)) return true; - return val <= 64 || - val >= 0xFFFFFFFFFFFFFFF0 || /* [-16 .. -1] */ - val == 0x3FE0000000000000 || /* 0.5 */ - val == 0xBFE0000000000000 || /* -0.5 */ - val == 0x3FF0000000000000 || /* 1.0 */ - val == 0xBFF0000000000000 || /* -1.0 */ - val == 0x4000000000000000 || /* 2.0 */ - val == 0xC000000000000000 || /* -2.0 */ - val == 0x4010000000000000 || /* 4.0 */ - val == 0xC010000000000000; /* -4.0 */ + return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */ + val == 0x3FE0000000000000 || /* 0.5 */ + val == 0xBFE0000000000000 || /* -0.5 */ + val == 0x3FF0000000000000 || /* 1.0 */ + val == 0xBFF0000000000000 || /* -1.0 */ + val == 0x4000000000000000 || /* 2.0 */ + val == 0xC000000000000000 || /* -2.0 */ + val == 0x4010000000000000 || /* 4.0 */ + val == 0xC010000000000000; /* -4.0 */ } - constexpr bool isTemp() const noexcept + constexpr bool isTemp() const noexcept { return isTemp_; } + + constexpr void setTemp(Temp t) noexcept { - return isTemp_; - } - - constexpr void setTemp(Temp t) noexcept { assert(!isConstant_); isTemp_ = true; data_.temp = t; } - constexpr Temp getTemp() const noexcept - { - return data_.temp; - } + constexpr Temp getTemp() const noexcept { return data_.temp; } - constexpr uint32_t tempId() const noexcept - { - return data_.temp.id(); - } + constexpr uint32_t tempId() const noexcept { return data_.temp.id(); } - constexpr bool hasRegClass() const noexcept - { - return isTemp() || isUndefined(); - } + constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); } - constexpr RegClass regClass() const noexcept - { - return data_.temp.regClass(); - } + constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); } constexpr unsigned bytes() const noexcept { @@ -693,15 +692,9 @@ public: return data_.temp.size(); } - constexpr bool isFixed() const noexcept - { - return isFixed_; - } + constexpr bool isFixed() const noexcept { return isFixed_; } - constexpr PhysReg physReg() const noexcept - { - return reg_; - } + constexpr PhysReg physReg() const noexcept { return reg_; } constexpr void setFixed(PhysReg reg) noexcept { @@ -709,25 +702,13 @@ public: reg_ = reg; } - constexpr bool isConstant() const noexcept - { - return isConstant_; - } + constexpr bool isConstant() const noexcept { return isConstant_; } - constexpr bool isLiteral() const noexcept - { - return isConstant() && reg_ == 255; - } + constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; } - constexpr bool isUndefined() const noexcept - { - return isUndef_; - } + constexpr bool 
isUndefined() const noexcept { return isUndef_; } - constexpr uint32_t constantValue() const noexcept - { - return data_.i; - } + constexpr uint32_t constantValue() const noexcept { return data_.i; } constexpr bool constantEquals(uint32_t cmp) const noexcept { @@ -743,22 +724,14 @@ public: return 0xFFFFFFFFFFFFFFFF - (reg_ - 193); switch (reg_) { - case 240: - return 0x3FE0000000000000; - case 241: - return 0xBFE0000000000000; - case 242: - return 0x3FF0000000000000; - case 243: - return 0xBFF0000000000000; - case 244: - return 0x4000000000000000; - case 245: - return 0xC000000000000000; - case 246: - return 0x4010000000000000; - case 247: - return 0xC010000000000000; + case 240: return 0x3FE0000000000000; + case 241: return 0xBFE0000000000000; + case 242: return 0x3FF0000000000000; + case 243: return 0xBFF0000000000000; + case 244: return 0x4000000000000000; + case 245: return 0xC000000000000000; + case 246: return 0x4010000000000000; + case 247: return 0xC010000000000000; case 255: return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i; } @@ -776,15 +749,9 @@ public: /* Indicates that the killed operand's live range intersects with the * instruction's definitions. Unlike isKill() and isFirstKill(), this is * not set by liveness analysis. */ - constexpr void setLateKill(bool flag) noexcept - { - isLateKill_ = flag; - } + constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; } - constexpr bool isLateKill() const noexcept - { - return isLateKill_; - } + constexpr bool isLateKill() const noexcept { return isLateKill_; } constexpr void setKill(bool flag) noexcept { @@ -793,10 +760,7 @@ public: setFirstKill(false); } - constexpr bool isKill() const noexcept - { - return isKill_ || isFirstKill(); - } + constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); } constexpr void setFirstKill(bool flag) noexcept { @@ -807,22 +771,13 @@ public: /* When there are multiple operands killing the same temporary, * isFirstKill() is only returns true for the first one. 
*/ - constexpr bool isFirstKill() const noexcept - { - return isFirstKill_; - } + constexpr bool isFirstKill() const noexcept { return isFirstKill_; } - constexpr bool isKillBeforeDef() const noexcept - { - return isKill() && !isLateKill(); - } + constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); } - constexpr bool isFirstKillBeforeDef() const noexcept - { - return isFirstKill() && !isLateKill(); - } + constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); } - constexpr bool operator == (Operand other) const noexcept + constexpr bool operator==(Operand other) const noexcept { if (other.size() != size()) return false; @@ -840,51 +795,36 @@ public: return other.isTemp() && other.getTemp() == getTemp(); } - constexpr bool operator != (Operand other) const noexcept - { - return !operator==(other); - } + constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); } - constexpr void set16bit(bool flag) noexcept - { - is16bit_ = flag; - } + constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; } - constexpr bool is16bit() const noexcept - { - return is16bit_; - } + constexpr bool is16bit() const noexcept { return is16bit_; } - constexpr void set24bit(bool flag) noexcept - { - is24bit_ = flag; - } + constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; } - constexpr bool is24bit() const noexcept - { - return is24bit_; - } + constexpr bool is24bit() const noexcept { return is24bit_; } private: union { Temp temp; uint32_t i; float f; - } data_ = { Temp(0, s1) }; + } data_ = {Temp(0, s1)}; PhysReg reg_; union { struct { - uint8_t isTemp_:1; - uint8_t isFixed_:1; - uint8_t isConstant_:1; - uint8_t isKill_:1; - uint8_t isUndef_:1; - uint8_t isFirstKill_:1; - uint8_t constSize:2; - uint8_t isLateKill_:1; - uint8_t is16bit_:1; - uint8_t is24bit_:1; - uint8_t signext:1; + uint8_t isTemp_ : 1; + uint8_t isFixed_ : 1; + uint8_t isConstant_ : 1; + uint8_t isKill_ : 1; + uint8_t isUndef_ : 1; + uint8_t isFirstKill_ : 1; + uint8_t constSize : 2; + uint8_t isLateKill_ : 1; + uint8_t is16bit_ : 1; + uint8_t is24bit_ : 1; + uint8_t signext : 1; }; /* can't initialize bit-fields in c++11, so work around using a union */ uint16_t control_ = 0; @@ -897,73 +837,39 @@ private: * and refer to temporary virtual registers * which are later mapped to physical registers */ -class Definition final -{ +class Definition final { public: - constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), - isKill_(0), isPrecise_(0), isNUW_(0), isNoCSE_(0) {} - Definition(uint32_t index, RegClass type) noexcept - : temp(index, type) {} - explicit Definition(Temp tmp) noexcept - : temp(tmp) {} - Definition(PhysReg reg, RegClass type) noexcept - : temp(Temp(0, type)) - { - setFixed(reg); - } - Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept - : temp(Temp(tmpId, type)) + constexpr Definition() + : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0), + isNoCSE_(0) + {} + Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {} + explicit Definition(Temp tmp) noexcept : temp(tmp) {} + Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); } + Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type)) { setFixed(reg); } - constexpr bool isTemp() const noexcept - { - return tempId() > 0; - } + constexpr bool isTemp() const noexcept { return tempId() > 0; } - constexpr Temp 
getTemp() const noexcept - { - return temp; - } + constexpr Temp getTemp() const noexcept { return temp; } - constexpr uint32_t tempId() const noexcept - { - return temp.id(); - } + constexpr uint32_t tempId() const noexcept { return temp.id(); } - constexpr void setTemp(Temp t) noexcept { - temp = t; - } + constexpr void setTemp(Temp t) noexcept { temp = t; } - void swapTemp(Definition& other) noexcept { - std::swap(temp, other.temp); - } + void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); } - constexpr RegClass regClass() const noexcept - { - return temp.regClass(); - } + constexpr RegClass regClass() const noexcept { return temp.regClass(); } - constexpr unsigned bytes() const noexcept - { - return temp.bytes(); - } + constexpr unsigned bytes() const noexcept { return temp.bytes(); } - constexpr unsigned size() const noexcept - { - return temp.size(); - } + constexpr unsigned size() const noexcept { return temp.size(); } - constexpr bool isFixed() const noexcept - { - return isFixed_; - } + constexpr bool isFixed() const noexcept { return isFixed_; } - constexpr PhysReg physReg() const noexcept - { - return reg_; - } + constexpr PhysReg physReg() const noexcept { return reg_; } constexpr void setFixed(PhysReg reg) noexcept { @@ -977,63 +883,36 @@ public: reg_ = reg; } - constexpr bool hasHint() const noexcept - { - return hasHint_; - } + constexpr bool hasHint() const noexcept { return hasHint_; } - constexpr void setKill(bool flag) noexcept - { - isKill_ = flag; - } + constexpr void setKill(bool flag) noexcept { isKill_ = flag; } - constexpr bool isKill() const noexcept - { - return isKill_; - } + constexpr bool isKill() const noexcept { return isKill_; } - constexpr void setPrecise(bool precise) noexcept - { - isPrecise_ = precise; - } + constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; } - constexpr bool isPrecise() const noexcept - { - return isPrecise_; - } + constexpr bool isPrecise() const noexcept { return isPrecise_; } /* No Unsigned Wrap */ - constexpr void setNUW(bool nuw) noexcept - { - isNUW_ = nuw; - } + constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; } - constexpr bool isNUW() const noexcept - { - return isNUW_; - } + constexpr bool isNUW() const noexcept { return isNUW_; } - constexpr void setNoCSE(bool noCSE) noexcept - { - isNoCSE_ = noCSE; - } + constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; } - constexpr bool isNoCSE() const noexcept - { - return isNoCSE_; - } + constexpr bool isNoCSE() const noexcept { return isNoCSE_; } private: Temp temp = Temp(0, s1); PhysReg reg_; union { struct { - uint8_t isFixed_:1; - uint8_t hasHint_:1; - uint8_t isKill_:1; - uint8_t isPrecise_:1; - uint8_t isNUW_:1; - uint8_t isNoCSE_:1; + uint8_t isFixed_ : 1; + uint8_t hasHint_ : 1; + uint8_t isKill_ : 1; + uint8_t isPrecise_ : 1; + uint8_t isNUW_ : 1; + uint8_t isNoCSE_ : 1; }; /* can't initialize bit-fields in c++11, so work around using a union */ uint8_t control_ = 0; @@ -1086,99 +965,298 @@ struct Instruction { return false; } - Pseudo_instruction& pseudo() noexcept {assert(isPseudo()); return *(Pseudo_instruction *)this;} - const Pseudo_instruction& pseudo() const noexcept {assert(isPseudo()); return *(Pseudo_instruction *)this;} - constexpr bool isPseudo() const noexcept {return format == Format::PSEUDO;} - SOP1_instruction& sop1() noexcept {assert(isSOP1()); return *(SOP1_instruction *)this;} - const SOP1_instruction& sop1() const noexcept {assert(isSOP1()); return *(SOP1_instruction *)this;} - constexpr bool 
isSOP1() const noexcept {return format == Format::SOP1;} - SOP2_instruction& sop2() noexcept {assert(isSOP2()); return *(SOP2_instruction *)this;} - const SOP2_instruction& sop2() const noexcept {assert(isSOP2()); return *(SOP2_instruction *)this;} - constexpr bool isSOP2() const noexcept {return format == Format::SOP2;} - SOPK_instruction& sopk() noexcept {assert(isSOPK()); return *(SOPK_instruction *)this;} - const SOPK_instruction& sopk() const noexcept {assert(isSOPK()); return *(SOPK_instruction *)this;} - constexpr bool isSOPK() const noexcept {return format == Format::SOPK;} - SOPP_instruction& sopp() noexcept {assert(isSOPP()); return *(SOPP_instruction *)this;} - const SOPP_instruction& sopp() const noexcept {assert(isSOPP()); return *(SOPP_instruction *)this;} - constexpr bool isSOPP() const noexcept {return format == Format::SOPP;} - SOPC_instruction& sopc() noexcept {assert(isSOPC()); return *(SOPC_instruction *)this;} - const SOPC_instruction& sopc() const noexcept {assert(isSOPC()); return *(SOPC_instruction *)this;} - constexpr bool isSOPC() const noexcept {return format == Format::SOPC;} - SMEM_instruction& smem() noexcept {assert(isSMEM()); return *(SMEM_instruction *)this;} - const SMEM_instruction& smem() const noexcept {assert(isSMEM()); return *(SMEM_instruction *)this;} - constexpr bool isSMEM() const noexcept {return format == Format::SMEM;} - DS_instruction& ds() noexcept {assert(isDS()); return *(DS_instruction *)this;} - const DS_instruction& ds() const noexcept {assert(isDS()); return *(DS_instruction *)this;} - constexpr bool isDS() const noexcept {return format == Format::DS;} - MTBUF_instruction& mtbuf() noexcept {assert(isMTBUF()); return *(MTBUF_instruction *)this;} - const MTBUF_instruction& mtbuf() const noexcept {assert(isMTBUF()); return *(MTBUF_instruction *)this;} - constexpr bool isMTBUF() const noexcept {return format == Format::MTBUF;} - MUBUF_instruction& mubuf() noexcept {assert(isMUBUF()); return *(MUBUF_instruction *)this;} - const MUBUF_instruction& mubuf() const noexcept {assert(isMUBUF()); return *(MUBUF_instruction *)this;} - constexpr bool isMUBUF() const noexcept {return format == Format::MUBUF;} - MIMG_instruction& mimg() noexcept {assert(isMIMG()); return *(MIMG_instruction *)this;} - const MIMG_instruction& mimg() const noexcept {assert(isMIMG()); return *(MIMG_instruction *)this;} - constexpr bool isMIMG() const noexcept {return format == Format::MIMG;} - Export_instruction& exp() noexcept {assert(isEXP()); return *(Export_instruction *)this;} - const Export_instruction& exp() const noexcept {assert(isEXP()); return *(Export_instruction *)this;} - constexpr bool isEXP() const noexcept {return format == Format::EXP;} - FLAT_instruction& flat() noexcept {assert(isFlat()); return *(FLAT_instruction *)this;} - const FLAT_instruction& flat() const noexcept {assert(isFlat()); return *(FLAT_instruction *)this;} - constexpr bool isFlat() const noexcept {return format == Format::FLAT;} - FLAT_instruction& global() noexcept {assert(isGlobal()); return *(FLAT_instruction *)this;} - const FLAT_instruction& global() const noexcept {assert(isGlobal()); return *(FLAT_instruction *)this;} - constexpr bool isGlobal() const noexcept {return format == Format::GLOBAL;} - FLAT_instruction& scratch() noexcept {assert(isScratch()); return *(FLAT_instruction *)this;} - const FLAT_instruction& scratch() const noexcept {assert(isScratch()); return *(FLAT_instruction *)this;} - constexpr bool isScratch() const noexcept {return format == Format::SCRATCH;} - 
Pseudo_branch_instruction& branch() noexcept {assert(isBranch()); return *(Pseudo_branch_instruction *)this;} - const Pseudo_branch_instruction& branch() const noexcept {assert(isBranch()); return *(Pseudo_branch_instruction *)this;} - constexpr bool isBranch() const noexcept {return format == Format::PSEUDO_BRANCH;} - Pseudo_barrier_instruction& barrier() noexcept {assert(isBarrier()); return *(Pseudo_barrier_instruction *)this;} - const Pseudo_barrier_instruction& barrier() const noexcept {assert(isBarrier()); return *(Pseudo_barrier_instruction *)this;} - constexpr bool isBarrier() const noexcept {return format == Format::PSEUDO_BARRIER;} - Pseudo_reduction_instruction& reduction() noexcept {assert(isReduction()); return *(Pseudo_reduction_instruction *)this;} - const Pseudo_reduction_instruction& reduction() const noexcept {assert(isReduction()); return *(Pseudo_reduction_instruction *)this;} - constexpr bool isReduction() const noexcept {return format == Format::PSEUDO_REDUCTION;} - VOP3P_instruction& vop3p() noexcept {assert(isVOP3P()); return *(VOP3P_instruction *)this;} - const VOP3P_instruction& vop3p() const noexcept {assert(isVOP3P()); return *(VOP3P_instruction *)this;} - constexpr bool isVOP3P() const noexcept {return format == Format::VOP3P;} - VOP1_instruction& vop1() noexcept {assert(isVOP1()); return *(VOP1_instruction *)this;} - const VOP1_instruction& vop1() const noexcept {assert(isVOP1()); return *(VOP1_instruction *)this;} - constexpr bool isVOP1() const noexcept {return (uint16_t)format & (uint16_t)Format::VOP1;} - VOP2_instruction& vop2() noexcept {assert(isVOP2()); return *(VOP2_instruction *)this;} - const VOP2_instruction& vop2() const noexcept {assert(isVOP2()); return *(VOP2_instruction *)this;} - constexpr bool isVOP2() const noexcept {return (uint16_t)format & (uint16_t)Format::VOP2;} - VOPC_instruction& vopc() noexcept {assert(isVOPC()); return *(VOPC_instruction *)this;} - const VOPC_instruction& vopc() const noexcept {assert(isVOPC()); return *(VOPC_instruction *)this;} - constexpr bool isVOPC() const noexcept {return (uint16_t)format & (uint16_t)Format::VOPC;} - VOP3_instruction& vop3() noexcept {assert(isVOP3()); return *(VOP3_instruction *)this;} - const VOP3_instruction& vop3() const noexcept {assert(isVOP3()); return *(VOP3_instruction *)this;} - constexpr bool isVOP3() const noexcept {return (uint16_t)format & (uint16_t)Format::VOP3;} - Interp_instruction& vintrp() noexcept {assert(isVINTRP()); return *(Interp_instruction *)this;} - const Interp_instruction& vintrp() const noexcept {assert(isVINTRP()); return *(Interp_instruction *)this;} - constexpr bool isVINTRP() const noexcept {return (uint16_t)format & (uint16_t)Format::VINTRP;} - DPP_instruction& dpp() noexcept {assert(isDPP()); return *(DPP_instruction *)this;} - const DPP_instruction& dpp() const noexcept {assert(isDPP()); return *(DPP_instruction *)this;} - constexpr bool isDPP() const noexcept {return (uint16_t)format & (uint16_t)Format::DPP;} - SDWA_instruction& sdwa() noexcept {assert(isSDWA()); return *(SDWA_instruction *)this;} - const SDWA_instruction& sdwa() const noexcept {assert(isSDWA()); return *(SDWA_instruction *)this;} - constexpr bool isSDWA() const noexcept {return (uint16_t)format & (uint16_t)Format::SDWA;} - - FLAT_instruction& flatlike() + Pseudo_instruction& pseudo() noexcept { - return *(FLAT_instruction *)this; + assert(isPseudo()); + return *(Pseudo_instruction*)this; } - - const FLAT_instruction& flatlike() const + const Pseudo_instruction& pseudo() const noexcept { - 
return *(FLAT_instruction *)this; + assert(isPseudo()); + return *(Pseudo_instruction*)this; } - - constexpr bool isFlatLike() const noexcept + constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; } + SOP1_instruction& sop1() noexcept { - return isFlat() || isGlobal() || isScratch(); + assert(isSOP1()); + return *(SOP1_instruction*)this; } + const SOP1_instruction& sop1() const noexcept + { + assert(isSOP1()); + return *(SOP1_instruction*)this; + } + constexpr bool isSOP1() const noexcept { return format == Format::SOP1; } + SOP2_instruction& sop2() noexcept + { + assert(isSOP2()); + return *(SOP2_instruction*)this; + } + const SOP2_instruction& sop2() const noexcept + { + assert(isSOP2()); + return *(SOP2_instruction*)this; + } + constexpr bool isSOP2() const noexcept { return format == Format::SOP2; } + SOPK_instruction& sopk() noexcept + { + assert(isSOPK()); + return *(SOPK_instruction*)this; + } + const SOPK_instruction& sopk() const noexcept + { + assert(isSOPK()); + return *(SOPK_instruction*)this; + } + constexpr bool isSOPK() const noexcept { return format == Format::SOPK; } + SOPP_instruction& sopp() noexcept + { + assert(isSOPP()); + return *(SOPP_instruction*)this; + } + const SOPP_instruction& sopp() const noexcept + { + assert(isSOPP()); + return *(SOPP_instruction*)this; + } + constexpr bool isSOPP() const noexcept { return format == Format::SOPP; } + SOPC_instruction& sopc() noexcept + { + assert(isSOPC()); + return *(SOPC_instruction*)this; + } + const SOPC_instruction& sopc() const noexcept + { + assert(isSOPC()); + return *(SOPC_instruction*)this; + } + constexpr bool isSOPC() const noexcept { return format == Format::SOPC; } + SMEM_instruction& smem() noexcept + { + assert(isSMEM()); + return *(SMEM_instruction*)this; + } + const SMEM_instruction& smem() const noexcept + { + assert(isSMEM()); + return *(SMEM_instruction*)this; + } + constexpr bool isSMEM() const noexcept { return format == Format::SMEM; } + DS_instruction& ds() noexcept + { + assert(isDS()); + return *(DS_instruction*)this; + } + const DS_instruction& ds() const noexcept + { + assert(isDS()); + return *(DS_instruction*)this; + } + constexpr bool isDS() const noexcept { return format == Format::DS; } + MTBUF_instruction& mtbuf() noexcept + { + assert(isMTBUF()); + return *(MTBUF_instruction*)this; + } + const MTBUF_instruction& mtbuf() const noexcept + { + assert(isMTBUF()); + return *(MTBUF_instruction*)this; + } + constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; } + MUBUF_instruction& mubuf() noexcept + { + assert(isMUBUF()); + return *(MUBUF_instruction*)this; + } + const MUBUF_instruction& mubuf() const noexcept + { + assert(isMUBUF()); + return *(MUBUF_instruction*)this; + } + constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; } + MIMG_instruction& mimg() noexcept + { + assert(isMIMG()); + return *(MIMG_instruction*)this; + } + const MIMG_instruction& mimg() const noexcept + { + assert(isMIMG()); + return *(MIMG_instruction*)this; + } + constexpr bool isMIMG() const noexcept { return format == Format::MIMG; } + Export_instruction& exp() noexcept + { + assert(isEXP()); + return *(Export_instruction*)this; + } + const Export_instruction& exp() const noexcept + { + assert(isEXP()); + return *(Export_instruction*)this; + } + constexpr bool isEXP() const noexcept { return format == Format::EXP; } + FLAT_instruction& flat() noexcept + { + assert(isFlat()); + return *(FLAT_instruction*)this; + } + const FLAT_instruction& flat() const 
noexcept + { + assert(isFlat()); + return *(FLAT_instruction*)this; + } + constexpr bool isFlat() const noexcept { return format == Format::FLAT; } + FLAT_instruction& global() noexcept + { + assert(isGlobal()); + return *(FLAT_instruction*)this; + } + const FLAT_instruction& global() const noexcept + { + assert(isGlobal()); + return *(FLAT_instruction*)this; + } + constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; } + FLAT_instruction& scratch() noexcept + { + assert(isScratch()); + return *(FLAT_instruction*)this; + } + const FLAT_instruction& scratch() const noexcept + { + assert(isScratch()); + return *(FLAT_instruction*)this; + } + constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; } + Pseudo_branch_instruction& branch() noexcept + { + assert(isBranch()); + return *(Pseudo_branch_instruction*)this; + } + const Pseudo_branch_instruction& branch() const noexcept + { + assert(isBranch()); + return *(Pseudo_branch_instruction*)this; + } + constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; } + Pseudo_barrier_instruction& barrier() noexcept + { + assert(isBarrier()); + return *(Pseudo_barrier_instruction*)this; + } + const Pseudo_barrier_instruction& barrier() const noexcept + { + assert(isBarrier()); + return *(Pseudo_barrier_instruction*)this; + } + constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; } + Pseudo_reduction_instruction& reduction() noexcept + { + assert(isReduction()); + return *(Pseudo_reduction_instruction*)this; + } + const Pseudo_reduction_instruction& reduction() const noexcept + { + assert(isReduction()); + return *(Pseudo_reduction_instruction*)this; + } + constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; } + VOP3P_instruction& vop3p() noexcept + { + assert(isVOP3P()); + return *(VOP3P_instruction*)this; + } + const VOP3P_instruction& vop3p() const noexcept + { + assert(isVOP3P()); + return *(VOP3P_instruction*)this; + } + constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; } + VOP1_instruction& vop1() noexcept + { + assert(isVOP1()); + return *(VOP1_instruction*)this; + } + const VOP1_instruction& vop1() const noexcept + { + assert(isVOP1()); + return *(VOP1_instruction*)this; + } + constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; } + VOP2_instruction& vop2() noexcept + { + assert(isVOP2()); + return *(VOP2_instruction*)this; + } + const VOP2_instruction& vop2() const noexcept + { + assert(isVOP2()); + return *(VOP2_instruction*)this; + } + constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; } + VOPC_instruction& vopc() noexcept + { + assert(isVOPC()); + return *(VOPC_instruction*)this; + } + const VOPC_instruction& vopc() const noexcept + { + assert(isVOPC()); + return *(VOPC_instruction*)this; + } + constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; } + VOP3_instruction& vop3() noexcept + { + assert(isVOP3()); + return *(VOP3_instruction*)this; + } + const VOP3_instruction& vop3() const noexcept + { + assert(isVOP3()); + return *(VOP3_instruction*)this; + } + constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; } + Interp_instruction& vintrp() noexcept + { + assert(isVINTRP()); + return *(Interp_instruction*)this; + } + const Interp_instruction& vintrp() const noexcept + { + assert(isVINTRP()); + return *(Interp_instruction*)this; + } 
+ constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; } + DPP_instruction& dpp() noexcept + { + assert(isDPP()); + return *(DPP_instruction*)this; + } + const DPP_instruction& dpp() const noexcept + { + assert(isDPP()); + return *(DPP_instruction*)this; + } + constexpr bool isDPP() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP; } + SDWA_instruction& sdwa() noexcept + { + assert(isSDWA()); + return *(SDWA_instruction*)this; + } + const SDWA_instruction& sdwa() const noexcept + { + assert(isSDWA()); + return *(SDWA_instruction*)this; + } + constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; } + + FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; } + + const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; } + + constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); } constexpr bool isVALU() const noexcept { @@ -1190,10 +1268,7 @@ struct Instruction { return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP(); } - constexpr bool isVMEM() const noexcept - { - return isMTBUF() || isMUBUF() || isMIMG(); - } + constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); } }; static_assert(sizeof(Instruction) == 16, "Unexpected padding"); @@ -1209,16 +1284,13 @@ struct SOPP_instruction : public Instruction { }; static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); -struct SOPC_instruction : public Instruction { -}; +struct SOPC_instruction : public Instruction {}; static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); -struct SOP1_instruction : public Instruction { -}; +struct SOP1_instruction : public Instruction {}; static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); -struct SOP2_instruction : public Instruction { -}; +struct SOP2_instruction : public Instruction {}; static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); /** @@ -1236,23 +1308,20 @@ struct SMEM_instruction : public Instruction { memory_sync_info sync; bool glc : 1; /* VI+: globally coherent */ bool dlc : 1; /* NAVI: device level coherent */ - bool nv : 1; /* VEGA only: Non-volatile */ + bool nv : 1; /* VEGA only: Non-volatile */ bool disable_wqm : 1; bool prevent_overflow : 1; /* avoid overflow when combining additions */ - uint8_t padding: 3; + uint8_t padding : 3; }; static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); -struct VOP1_instruction : public Instruction { -}; +struct VOP1_instruction : public Instruction {}; static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); -struct VOP2_instruction : public Instruction { -}; +struct VOP2_instruction : public Instruction {}; static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); -struct VOPC_instruction : public Instruction { -}; +struct VOPC_instruction : public Instruction {}; static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); struct VOP3_instruction : public Instruction { @@ -1295,39 +1364,39 @@ struct DPP_instruction : public Instruction { static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); enum sdwa_sel : uint8_t { - /* masks */ - sdwa_wordnum = 0x1, - sdwa_bytenum = 0x3, - sdwa_asuint = 0x7 | 0x10, - sdwa_rasize = 0x3, + /* masks */ + sdwa_wordnum = 0x1, + 
sdwa_bytenum = 0x3, + sdwa_asuint = 0x7 | 0x10, + sdwa_rasize = 0x3, - /* flags */ - sdwa_isword = 0x4, - sdwa_sext = 0x8, - sdwa_isra = 0x10, + /* flags */ + sdwa_isword = 0x4, + sdwa_sext = 0x8, + sdwa_isra = 0x10, - /* specific values */ - sdwa_ubyte0 = 0, - sdwa_ubyte1 = 1, - sdwa_ubyte2 = 2, - sdwa_ubyte3 = 3, - sdwa_uword0 = sdwa_isword | 0, - sdwa_uword1 = sdwa_isword | 1, - sdwa_udword = 6, + /* specific values */ + sdwa_ubyte0 = 0, + sdwa_ubyte1 = 1, + sdwa_ubyte2 = 2, + sdwa_ubyte3 = 3, + sdwa_uword0 = sdwa_isword | 0, + sdwa_uword1 = sdwa_isword | 1, + sdwa_udword = 6, - sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext, - sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext, - sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext, - sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext, - sdwa_sword0 = sdwa_uword0 | sdwa_sext, - sdwa_sword1 = sdwa_uword1 | sdwa_sext, - sdwa_sdword = sdwa_udword | sdwa_sext, + sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext, + sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext, + sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext, + sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext, + sdwa_sword0 = sdwa_uword0 | sdwa_sext, + sdwa_sword1 = sdwa_uword1 | sdwa_sext, + sdwa_sdword = sdwa_udword | sdwa_sext, - /* register-allocated */ - sdwa_ubyte = 1 | sdwa_isra, - sdwa_uword = 2 | sdwa_isra, - sdwa_sbyte = sdwa_ubyte | sdwa_sext, - sdwa_sword = sdwa_uword | sdwa_sext, + /* register-allocated */ + sdwa_ubyte = 1 | sdwa_isra, + sdwa_uword = 2 | sdwa_isra, + sdwa_sbyte = sdwa_ubyte | sdwa_sext, + sdwa_sword = sdwa_uword | sdwa_sext, }; /** @@ -1387,16 +1456,16 @@ static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected pad */ struct MUBUF_instruction : public Instruction { memory_sync_info sync; - bool offen : 1; /* Supply an offset from VGPR (VADDR) */ - bool idxen : 1; /* Supply an index from VGPR (VADDR) */ - bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ - bool glc : 1; /* globally coherent */ - bool dlc : 1; /* NAVI: device level coherent */ - bool slc : 1; /* system level coherent */ - bool tfe : 1; /* texture fail enable */ - bool lds : 1; /* Return read-data to LDS instead of VGPRs */ + bool offen : 1; /* Supply an offset from VGPR (VADDR) */ + bool idxen : 1; /* Supply an index from VGPR (VADDR) */ + bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ + bool glc : 1; /* globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool lds : 1; /* Return read-data to LDS instead of VGPRs */ uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */ - uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ + uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ uint16_t swizzled : 1; uint16_t padding0 : 2; uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */ @@ -1414,14 +1483,14 @@ static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected */ struct MTBUF_instruction : public Instruction { memory_sync_info sync; - uint8_t dfmt : 4; /* Data Format of data in memory buffer */ - uint8_t nfmt : 3; /* Numeric format of data in memory */ - bool offen : 1; /* Supply an offset from VGPR (VADDR) */ - uint16_t idxen : 1; /* Supply an index from VGPR (VADDR) */ - uint16_t glc : 1; /* globally coherent */ - uint16_t dlc : 1; /* NAVI: device level coherent */ - uint16_t slc : 1; /* system level coherent */ - uint16_t tfe : 1; /* texture fail enable */ + uint8_t dfmt : 4; /* Data Format of data in memory buffer */ + uint8_t nfmt : 3; /* Numeric format of data in memory 
*/ + bool offen : 1; /* Supply an offset from VGPR (VADDR) */ + uint16_t idxen : 1; /* Supply an index from VGPR (VADDR) */ + uint16_t glc : 1; /* globally coherent */ + uint16_t dlc : 1; /* NAVI: device level coherent */ + uint16_t slc : 1; /* system level coherent */ + uint16_t tfe : 1; /* texture fail enable */ uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */ uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */ uint16_t padding : 4; @@ -1440,18 +1509,18 @@ static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected */ struct MIMG_instruction : public Instruction { memory_sync_info sync; - uint8_t dmask; /* Data VGPR enable mask */ - uint8_t dim : 3; /* NAVI: dimensionality */ - bool unrm : 1; /* Force address to be un-normalized */ - bool dlc : 1; /* NAVI: device level coherent */ - bool glc : 1; /* globally coherent */ - bool slc : 1; /* system level coherent */ - bool tfe : 1; /* texture fail enable */ - bool da : 1; /* declare an array */ - bool lwe : 1; /* LOD warning enable */ - bool r128 : 1; /* NAVI: Texture resource size */ - bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ - bool d16 : 1; /* Convert 32-bit data to 16-bit data */ + uint8_t dmask; /* Data VGPR enable mask */ + uint8_t dim : 3; /* NAVI: dimensionality */ + bool unrm : 1; /* Force address to be un-normalized */ + bool dlc : 1; /* NAVI: device level coherent */ + bool glc : 1; /* globally coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool da : 1; /* declare an array */ + bool lwe : 1; /* LOD warning enable */ + bool r128 : 1; /* NAVI: Texture resource size */ + bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ + bool d16 : 1; /* Convert 32-bit data to 16-bit data */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ uint8_t padding0 : 2; uint8_t padding1; @@ -1514,6 +1583,7 @@ struct Pseudo_barrier_instruction : public Instruction { static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); enum ReduceOp : uint16_t { + // clang-format off iadd8, iadd16, iadd32, iadd64, imul8, imul16, imul32, imul64, fadd16, fadd32, fadd64, @@ -1528,6 +1598,7 @@ enum ReduceOp : uint16_t { ior8, ior16, ior32, ior64, ixor8, ixor16, ixor32, ixor64, num_reduce_ops, + // clang-format on }; /** @@ -1547,23 +1618,24 @@ struct Pseudo_reduction_instruction : public Instruction { ReduceOp reduce_op; uint16_t cluster_size; // must be 0 for scans }; -static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, + "Unexpected padding"); struct instr_deleter_functor { - void operator()(void* p) { - free(p); - } + void operator()(void* p) { free(p); } }; -template -using aco_ptr = std::unique_ptr; +template using aco_ptr = std::unique_ptr; -template -T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions) +template +T* +create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, + uint32_t num_definitions) { - std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition); - char *data = (char*) calloc(1, size); - T* inst = (T*) data; + std::size_t size = + sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition); + char* data = (char*)calloc(1, size); + T* inst = (T*)data; inst->opcode = 
opcode; inst->format = format; @@ -1576,7 +1648,8 @@ T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, u return inst; } -constexpr bool Instruction::usesModifiers() const noexcept +constexpr bool +Instruction::usesModifiers() const noexcept { if (isDPP() || isSDWA()) return true; @@ -1603,19 +1676,21 @@ constexpr bool Instruction::usesModifiers() const noexcept return false; } -constexpr bool is_phi(Instruction* instr) +constexpr bool +is_phi(Instruction* instr) { return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi; } -static inline bool is_phi(aco_ptr& instr) +static inline bool +is_phi(aco_ptr& instr) { return is_phi(instr.get()); } memory_sync_info get_sync_info(const Instruction* instr); -bool is_dead(const std::vector& uses, Instruction *instr); +bool is_dead(const std::vector& uses, Instruction* instr); bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high); bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_ra); @@ -1625,9 +1700,9 @@ bool needs_exec_mask(const Instruction* instr); uint32_t get_reduction_identity(ReduceOp op, unsigned idx); -unsigned get_mimg_nsa_dwords(const Instruction *instr); +unsigned get_mimg_nsa_dwords(const Instruction* instr); -bool should_form_clause(const Instruction *a, const Instruction *b); +bool should_form_clause(const Instruction* a, const Instruction* b); enum block_kind { /* uniform indicates that leaving this block, @@ -1650,50 +1725,56 @@ enum block_kind { block_kind_export_end = 1 << 15, }; - struct RegisterDemand { constexpr RegisterDemand() = default; - constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept - : vgpr{v}, sgpr{s} {} + constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {} int16_t vgpr = 0; int16_t sgpr = 0; - constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept { + constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept + { return a.vgpr == b.vgpr && a.sgpr == b.sgpr; } - constexpr bool exceeds(const RegisterDemand other) const noexcept { + constexpr bool exceeds(const RegisterDemand other) const noexcept + { return vgpr > other.vgpr || sgpr > other.sgpr; } - constexpr RegisterDemand operator+(const Temp t) const noexcept { + constexpr RegisterDemand operator+(const Temp t) const noexcept + { if (t.type() == RegType::sgpr) - return RegisterDemand( vgpr, sgpr + t.size() ); + return RegisterDemand(vgpr, sgpr + t.size()); else - return RegisterDemand( vgpr + t.size(), sgpr ); + return RegisterDemand(vgpr + t.size(), sgpr); } - constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept { + constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept + { return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr); } - constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept { + constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept + { return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr); } - constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept { + constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept + { vgpr += other.vgpr; sgpr += other.sgpr; return *this; } - constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept { + constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept + { vgpr -= other.vgpr; sgpr -= other.sgpr; return *this; } - constexpr RegisterDemand& 
operator+=(const Temp t) noexcept { + constexpr RegisterDemand& operator+=(const Temp t) noexcept + { if (t.type() == RegType::sgpr) sgpr += t.size(); else @@ -1701,7 +1782,8 @@ struct RegisterDemand { return *this; } - constexpr RegisterDemand& operator-=(const Temp t) noexcept { + constexpr RegisterDemand& operator-=(const Temp t) noexcept + { if (t.type() == RegType::sgpr) sgpr -= t.size(); else @@ -1709,11 +1791,11 @@ struct RegisterDemand { return *this; } - constexpr void update(const RegisterDemand other) noexcept { + constexpr void update(const RegisterDemand other) noexcept + { vgpr = std::max(vgpr, other.vgpr); sgpr = std::max(sgpr, other.sgpr); } - }; /* CFG */ @@ -1746,23 +1828,25 @@ struct Block { * Shader stages as provided in Vulkan by the application. Contrast this to HWStage. */ enum class SWStage : uint8_t { - None = 0, - VS = 1 << 0, /* Vertex Shader */ - GS = 1 << 1, /* Geometry Shader */ - TCS = 1 << 2, /* Tessellation Control aka Hull Shader */ - TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */ - FS = 1 << 4, /* Fragment aka Pixel Shader */ - CS = 1 << 5, /* Compute Shader */ - GSCopy = 1 << 6, /* GS Copy Shader (internal) */ + None = 0, + VS = 1 << 0, /* Vertex Shader */ + GS = 1 << 1, /* Geometry Shader */ + TCS = 1 << 2, /* Tessellation Control aka Hull Shader */ + TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */ + FS = 1 << 4, /* Fragment aka Pixel Shader */ + CS = 1 << 5, /* Compute Shader */ + GSCopy = 1 << 6, /* GS Copy Shader (internal) */ - /* Stage combinations merged to run on a single HWStage */ - VS_GS = VS | GS, - VS_TCS = VS | TCS, - TES_GS = TES | GS, + /* Stage combinations merged to run on a single HWStage */ + VS_GS = VS | GS, + VS_TCS = VS | TCS, + TES_GS = TES | GS, }; -constexpr SWStage operator|(SWStage a, SWStage b) { - return static_cast(static_cast(a) | static_cast(b)); +constexpr SWStage +operator|(SWStage a, SWStage b) +{ + return static_cast(static_cast(a) | static_cast(b)); } /* @@ -1773,14 +1857,14 @@ constexpr SWStage operator|(SWStage a, SWStage b) { * See README.md for details. */ enum class HWStage : uint8_t { - VS, - ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */ - GS, /* Geometry shader on GFX10/legacy and GFX6-9. */ - NGG, /* Primitive shader, used to implement VS, TES, GS. */ - LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */ - HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */ - FS, - CS, + VS, + ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */ + GS, /* Geometry shader on GFX10/legacy and GFX6-9. */ + NGG, /* Primitive shader, used to implement VS, TES, GS. */ + LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */ + HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */ + FS, + CS, }; /* @@ -1788,32 +1872,27 @@ enum class HWStage : uint8_t { * HWStage it will run on. 
*/ struct Stage { - constexpr Stage() = default; + constexpr Stage() = default; - explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) { } + explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {} - /* Check if the given SWStage is included */ - constexpr bool has(SWStage stage) const { - return (static_cast(sw) & static_cast(stage)); - } + /* Check if the given SWStage is included */ + constexpr bool has(SWStage stage) const + { + return (static_cast(sw) & static_cast(stage)); + } - unsigned num_sw_stages() const { - return util_bitcount(static_cast(sw)); - } + unsigned num_sw_stages() const { return util_bitcount(static_cast(sw)); } - constexpr bool operator==(const Stage& other) const { - return sw == other.sw && hw == other.hw; - } + constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; } - constexpr bool operator!=(const Stage& other) const { - return sw != other.sw || hw != other.hw; - } + constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; } - /* Mask of merged software stages */ - SWStage sw = SWStage::None; + /* Mask of merged software stages */ + SWStage sw = SWStage::None; - /* Active hardware stage */ - HWStage hw {}; + /* Active hardware stage */ + HWStage hw{}; }; /* possible settings of Program::stage */ @@ -1835,7 +1914,8 @@ static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS); static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */ static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */ static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS); -static constexpr Stage tess_eval_es(HWStage::ES, SWStage::TES); /* tesselation evaluation before geometry */ +static constexpr Stage tess_eval_es(HWStage::ES, + SWStage::TES); /* tesselation evaluation before geometry */ static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS); enum statistic { @@ -1884,7 +1964,7 @@ public: uint16_t num_waves = 0; uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */ ac_shader_config* config; - struct radv_shader_info *info; + struct radv_shader_info* info; enum chip_class chip_class; enum radeon_family family; DeviceInfo dev; @@ -1892,7 +1972,7 @@ public: RegClass lane_mask; Stage stage; bool needs_exact = false; /* there exists an instruction with disable_wqm = true */ - bool needs_wqm = false; /* there exists a p_wqm instruction */ + bool needs_wqm = false; /* there exists a p_wqm instruction */ std::vector constant_data; Temp private_segment_buffer; @@ -1917,12 +1997,10 @@ public: unsigned next_uniform_if_depth = 0; struct { - FILE *output = stderr; + FILE* output = stderr; bool shorten_messages = false; - void (*func)(void *private_data, - enum radv_compiler_debug_level level, - const char *message); - void *private_data; + void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message); + void* private_data; } debug; uint32_t allocateId(RegClass rc) @@ -1939,25 +2017,21 @@ public: allocationID += amount; } - Temp allocateTmp(RegClass rc) - { - return Temp(allocateId(rc), rc); - } + Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); } - uint32_t peekAllocationId() - { - return allocationID; - } + uint32_t peekAllocationId() { return allocationID; } friend void reindex_ssa(Program* program); friend void reindex_ssa(Program* program, std::vector& live_out); - Block* create_and_insert_block() { + 
Block* create_and_insert_block() + { Block block; return insert_block(std::move(block)); } - Block* insert_block(Block&& block) { + Block* insert_block(Block&& block) + { block.index = blocks.size(); block.fp_mode = next_fp_mode; block.loop_nest_depth = next_loop_depth; @@ -1985,35 +2059,30 @@ struct ra_test_policy { void init(); -void init_program(Program *program, Stage stage, struct radv_shader_info *info, - enum chip_class chip_class, enum radeon_family family, - bool wgp_mode, ac_shader_config *config); +void init_program(Program* program, Stage stage, struct radv_shader_info* info, + enum chip_class chip_class, enum radeon_family family, bool wgp_mode, + ac_shader_config* config); -void select_program(Program *program, - unsigned shader_count, - struct nir_shader *const *shaders, - ac_shader_config* config, - struct radv_shader_args *args); -void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, - ac_shader_config* config, - struct radv_shader_args *args); -void select_trap_handler_shader(Program *program, struct nir_shader *shader, - ac_shader_config* config, - struct radv_shader_args *args); +void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, + ac_shader_config* config, struct radv_shader_args* args); +void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config, + struct radv_shader_args* args); +void select_trap_handler_shader(Program* program, struct nir_shader* shader, + ac_shader_config* config, struct radv_shader_args* args); void lower_phis(Program* program); void calc_min_waves(Program* program); void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); live live_var_analysis(Program* program); -std::vector dead_code_analysis(Program *program); +std::vector dead_code_analysis(Program* program); void dominator_tree(Program* program); -void insert_exec_mask(Program *program); +void insert_exec_mask(Program* program); void value_numbering(Program* program); void optimize(Program* program); void optimize_postRA(Program* program); void setup_reduce_temp(Program* program); void lower_to_cssa(Program* program, live& live_vars); -void register_allocation(Program *program, std::vector& live_out_per_block, +void register_allocation(Program* program, std::vector& live_out_per_block, ra_test_policy = {}); void ssa_elimination(Program* program); void lower_to_hw_instr(Program* program); @@ -2021,21 +2090,22 @@ void schedule_program(Program* program, live& live_vars); void spill(Program* program, live& live_vars); void insert_wait_states(Program* program); void insert_NOPs(Program* program); -void form_hard_clauses(Program *program); +void form_hard_clauses(Program* program); unsigned emit_program(Program* program, std::vector& code); -bool print_asm(Program *program, std::vector& binary, - unsigned exec_size, FILE *output); +bool print_asm(Program* program, std::vector& binary, unsigned exec_size, FILE* output); bool validate_ir(Program* program); bool validate_ra(Program* program); #ifndef NDEBUG -void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL); +void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL); #else -#define perfwarn(program, cond, msg, ...) do {} while(0) +#define perfwarn(program, cond, msg, ...) 
\ + do { \ + } while (0) #endif -void collect_presched_stats(Program *program); -void collect_preasm_stats(Program *program); -void collect_postasm_stats(Program *program, const std::vector& code); +void collect_presched_stats(Program* program); +void collect_preasm_stats(Program* program); +void collect_postasm_stats(Program* program, const std::vector& code); enum print_flags { print_no_ssa = 0x1, @@ -2044,34 +2114,34 @@ enum print_flags { print_live_vars = 0x8, }; -void aco_print_operand(const Operand *operand, FILE *output, unsigned flags=0); -void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags=0); -void aco_print_program(const Program *program, FILE *output, unsigned flags=0); -void aco_print_program(const Program *program, FILE *output, const live& live_vars, unsigned flags=0); +void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0); +void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0); +void aco_print_program(const Program* program, FILE* output, unsigned flags = 0); +void aco_print_program(const Program* program, FILE* output, const live& live_vars, + unsigned flags = 0); -void _aco_perfwarn(Program *program, const char *file, unsigned line, - const char *fmt, ...); -void _aco_err(Program *program, const char *file, unsigned line, - const char *fmt, ...); +void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...); +void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...); #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__) -#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__) +#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__) /* utilities for dealing with register demand */ RegisterDemand get_live_changes(aco_ptr& instr); RegisterDemand get_temp_registers(aco_ptr& instr); -RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr& instr, aco_ptr& instr_before); +RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr& instr, + aco_ptr& instr_before); /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */ -uint16_t get_extra_sgprs(Program *program); +uint16_t get_extra_sgprs(Program* program); /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */ -uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs); -uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs); +uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs); +uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs); /* return number of addressable sgprs/vgprs for max_waves */ -uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves); -uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves); +uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves); +uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves); typedef struct { const int16_t opcode_gfx7[static_cast(aco_opcode::num_opcodes)]; @@ -2080,7 +2150,7 @@ typedef struct { const std::bitset(aco_opcode::num_opcodes)> can_use_input_modifiers; const std::bitset(aco_opcode::num_opcodes)> can_use_output_modifiers; const std::bitset(aco_opcode::num_opcodes)> is_atomic; - const char *name[static_cast(aco_opcode::num_opcodes)]; + const char* name[static_cast(aco_opcode::num_opcodes)]; const aco::Format format[static_cast(aco_opcode::num_opcodes)]; /* 
sizes used for input/output modifiers and constants */ const unsigned operand_size[static_cast(aco_opcode::num_opcodes)]; @@ -2090,7 +2160,6 @@ typedef struct { extern const Info instr_info; -} +} // namespace aco #endif /* ACO_IR_H */ - diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index bcba44acd19..60876267d04 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -24,13 +24,15 @@ */ #include "aco_ir.h" + #include "util/u_math.h" #include #include namespace aco { -RegisterDemand get_live_changes(aco_ptr& instr) +RegisterDemand +get_live_changes(aco_ptr& instr) { RegisterDemand changes; for (const Definition& def : instr->definitions) { @@ -48,7 +50,8 @@ RegisterDemand get_live_changes(aco_ptr& instr) return changes; } -RegisterDemand get_temp_registers(aco_ptr& instr) +RegisterDemand +get_temp_registers(aco_ptr& instr) { RegisterDemand temp_registers; @@ -67,7 +70,9 @@ RegisterDemand get_temp_registers(aco_ptr& instr) return temp_registers; } -RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr& instr, aco_ptr& instr_before) +RegisterDemand +get_demand_before(RegisterDemand demand, aco_ptr& instr, + aco_ptr& instr_before) { demand -= get_live_changes(instr); demand -= get_temp_registers(instr); @@ -77,8 +82,9 @@ RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr& in } namespace { -void process_live_temps_per_block(Program *program, live& lives, Block* block, - std::set& worklist, std::vector& phi_sgpr_ops) +void +process_live_temps_per_block(Program* program, live& lives, Block* block, + std::set& worklist, std::vector& phi_sgpr_ops) { std::vector& register_demand = lives.register_demand[block->index]; RegisterDemand new_demand; @@ -94,8 +100,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, /* traverse the instructions backwards */ int idx; - for (idx = block->instructions.size() -1; idx >= 0; idx--) { - Instruction *insn = block->instructions[idx].get(); + for (idx = block->instructions.size() - 1; idx >= 0; idx--) { + Instruction* insn = block->instructions[idx].get(); if (is_phi(insn)) break; @@ -131,8 +137,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, for (Operand& op : insn->operands) op.setKill(false); - for (unsigned i = 0; i < insn->operands.size(); ++i) - { + for (unsigned i = 0; i < insn->operands.size(); ++i) { Operand& operand = insn->operands[i]; if (!operand.isTemp()) continue; @@ -143,7 +148,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, if (inserted) { operand.setFirstKill(true); for (unsigned j = i + 1; j < insn->operands.size(); ++j) { - if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) { + if (insn->operands[j].isTemp() && + insn->operands[j].tempId() == operand.tempId()) { insn->operands[j].setFirstKill(false); insn->operands[j].setKill(true); } @@ -167,7 +173,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, int phi_idx = idx; while (phi_idx >= 0) { register_demand[phi_idx] = new_demand; - Instruction *insn = block->instructions[phi_idx].get(); + Instruction* insn = block->instructions[phi_idx].get(); assert(is_phi(insn) && insn->definitions.size() == 1); if (!insn->definitions[0].isTemp()) { @@ -196,7 +202,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, #ifndef NDEBUG if (preds.empty()) - aco_err(program, "Temporary never 
defined or are defined after use: %%%d in BB%d", t, block->index); + aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, + block->index); #endif for (unsigned pred_idx : preds) { @@ -209,14 +216,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, /* handle phi operands */ phi_idx = idx; while (phi_idx >= 0) { - Instruction *insn = block->instructions[phi_idx].get(); + Instruction* insn = block->instructions[phi_idx].get(); assert(is_phi(insn)); /* directly insert into the predecessors live-out set */ - std::vector& preds = insn->opcode == aco_opcode::p_phi - ? block->logical_preds - : block->linear_preds; + std::vector& preds = + insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; for (unsigned i = 0; i < preds.size(); ++i) { - Operand &operand = insn->operands[i]; + Operand& operand = insn->operands[i]; if (!operand.isTemp()) continue; if (operand.isFixed() && operand.physReg() == vcc) @@ -238,18 +244,19 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty())); } -unsigned calc_waves_per_workgroup(Program *program) +unsigned +calc_waves_per_workgroup(Program* program) { /* When workgroup size is not known, just go with wave_size */ - unsigned workgroup_size = program->workgroup_size == UINT_MAX - ? program->wave_size - : program->workgroup_size; + unsigned workgroup_size = + program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size; return align(workgroup_size, program->wave_size) / program->wave_size; } } /* end namespace */ -uint16_t get_extra_sgprs(Program *program) +uint16_t +get_extra_sgprs(Program* program) { if (program->chip_class >= GFX10) { assert(!program->needs_flat_scr); @@ -275,26 +282,30 @@ uint16_t get_extra_sgprs(Program *program) } } -uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs) +uint16_t +get_sgpr_alloc(Program* program, uint16_t addressable_sgprs) { uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program); uint16_t granule = program->dev.sgpr_alloc_granule; return ALIGN_NPOT(std::max(sgprs, granule), granule); } -uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs) +uint16_t +get_vgpr_alloc(Program* program, uint16_t addressable_vgprs) { assert(addressable_vgprs <= program->dev.vgpr_limit); uint16_t granule = program->dev.vgpr_alloc_granule; return align(std::max(addressable_vgprs, granule), granule); } -unsigned round_down(unsigned a, unsigned b) +unsigned +round_down(unsigned a, unsigned b) { return a - (a % b); } -uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves) +uint16_t +get_addr_sgpr_from_waves(Program* program, uint16_t waves) { /* it's not possible to allocate more than 128 SGPRs */ uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128); @@ -303,21 +314,24 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves) return std::min(sgprs, program->dev.sgpr_limit); } -uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves) +uint16_t +get_addr_vgpr_from_waves(Program* program, uint16_t waves) { uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1); vgprs -= program->config->num_shared_vgprs / 2; return std::min(vgprs, program->dev.vgpr_limit); } -void calc_min_waves(Program* program) +void +calc_min_waves(Program* program) { unsigned waves_per_workgroup = calc_waves_per_workgroup(program); unsigned 
simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1); program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp); } -void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) +void +update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) { unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size); unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1); @@ -333,8 +347,10 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) program->max_reg_demand = new_demand; } else { program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); - uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2; - program->num_waves = std::min(program->num_waves, program->dev.physical_vgprs / vgpr_demand); + uint16_t vgpr_demand = + get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2; + program->num_waves = + std::min(program->num_waves, program->dev.physical_vgprs / vgpr_demand); program->max_waves = max_waves_per_simd; /* adjust max_waves for workgroup and LDS limits */ @@ -346,12 +362,15 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds); } if (waves_per_workgroup > 1 && program->chip_class < GFX10) - workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */ + workgroups_per_cu_wgp = std::min( + workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */ /* in cases like waves_per_workgroup=3 or lds=65536 and * waves_per_workgroup=1, we want the maximum possible number of waves per * SIMD and not the minimum. 
so DIV_ROUND_UP is used */ - program->max_waves = std::min(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp)); + program->max_waves = std::min( + program->max_waves, + DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp)); /* incorporate max_waves and calculate max_reg_demand */ program->num_waves = std::min(program->num_waves, program->max_waves); @@ -360,7 +379,8 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) } } -live live_var_analysis(Program* program) +live +live_var_analysis(Program* program) { live result; result.live_out.resize(program->blocks.size()); @@ -371,14 +391,16 @@ live live_var_analysis(Program* program) program->needs_vcc = false; - /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */ + /* this implementation assumes that the block idx corresponds to the block's position in + * program->blocks vector */ for (Block& block : program->blocks) worklist.insert(block.index); while (!worklist.empty()) { std::set::reverse_iterator b_it = worklist.rbegin(); unsigned block_idx = *b_it; worklist.erase(block_idx); - process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops); + process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, + phi_sgpr_ops); new_demand.update(program->blocks[block_idx].register_demand); } @@ -389,5 +411,4 @@ live live_var_analysis(Program* program) return result; } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_lower_phis.cpp b/src/amd/compiler/aco_lower_phis.cpp index 41d0c202eae..2b10318c9bd 100644 --- a/src/amd/compiler/aco_lower_phis.cpp +++ b/src/amd/compiler/aco_lower_phis.cpp @@ -47,7 +47,8 @@ struct ssa_state { std::vector visited; }; -Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool before_write) +Operand +get_ssa(Program* program, unsigned block_idx, ssa_state* state, bool before_write) { if (!before_write) { auto it = state->writes.find(block_idx); @@ -79,7 +80,8 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef Temp res = Temp(program->allocateTmp(program->lane_mask)); state->latest[block_idx] = Operand(res); - aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; + aco_ptr phi{ + create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; for (unsigned i = 0; i < pred; i++) phi->operands[i] = get_ssa(program, block.linear_preds[i], state, false); phi->definitions[0] = Definition(res); @@ -89,11 +91,11 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef } } -void insert_before_logical_end(Block *block, aco_ptr instr) +void +insert_before_logical_end(Block* block, aco_ptr instr) { - auto IsLogicalEnd = [] (const aco_ptr& inst) -> bool { - return inst->opcode == aco_opcode::p_logical_end; - }; + auto IsLogicalEnd = [](const aco_ptr& inst) -> bool + { return inst->opcode == aco_opcode::p_logical_end; }; auto it = std::find_if(block->instructions.crbegin(), block->instructions.crend(), IsLogicalEnd); if (it == block->instructions.crend()) { @@ -104,13 +106,13 @@ void insert_before_logical_end(Block *block, aco_ptr instr) } } -void build_merge_code(Program *program, Block *block, Definition dst, Operand prev, Operand cur) +void +build_merge_code(Program* program, Block* block, Definition dst, Operand prev, Operand cur) { Builder bld(program); - auto IsLogicalEnd = [] (const 
aco_ptr& instr) -> bool { - return instr->opcode == aco_opcode::p_logical_end; - }; + auto IsLogicalEnd = [](const aco_ptr& instr) -> bool + { return instr->opcode == aco_opcode::p_logical_end; }; auto it = std::find_if(block->instructions.rbegin(), block->instructions.rend(), IsLogicalEnd); assert(it != block->instructions.rend()); bld.reset(&block->instructions, std::prev(it.base())); @@ -126,7 +128,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr if (!prev_is_constant) { if (!cur_is_constant) { Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm); - bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev, Operand(exec, bld.lm)); + bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev, + Operand(exec, bld.lm)); bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc), cur, Operand(exec, bld.lm)); bld.sop2(Builder::s_or, dst, bld.def(s1, scc), tmp1, tmp2); } else if (cur.constantValue()) { @@ -151,7 +154,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr } } -void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco_ptr& phi) +void +init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr& phi) { std::fill(state->any_pred_defined.begin(), state->any_pred_defined.end(), false); for (unsigned i = 0; i < block->logical_preds.size(); i++) { @@ -178,7 +182,9 @@ void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco } } -void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, aco_ptr& phi) +void +lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block, + aco_ptr& phi) { Builder bld(program); @@ -186,7 +192,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, state->all_preds_uniform = !(block->kind & block_kind_merge) && block->linear_preds.size() == block->logical_preds.size(); for (unsigned pred : block->logical_preds) - state->all_preds_uniform = state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform); + state->all_preds_uniform = + state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform); state->checked_preds_for_uniform = true; } @@ -230,7 +237,7 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, bool uniform_merge = block->kind & block_kind_loop_header; for (unsigned i = 0; i < phi->operands.size(); i++) { - Block *pred = &program->blocks[block->logical_preds[i]]; + Block* pred = &program->blocks[block->logical_preds[i]]; bool need_get_ssa = !uniform_merge; if (block->kind & block_kind_loop_header && !(pred->kind & block_kind_uniform)) @@ -254,7 +261,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, unsigned num_preds = block->linear_preds.size(); if (phi->operands.size() != num_preds) { - Pseudo_instruction* new_phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)}; + Pseudo_instruction* new_phi{create_instruction( + aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)}; new_phi->definitions[0] = phi->definitions[0]; phi.reset(new_phi); } else { @@ -268,7 +276,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, return; } -void lower_subdword_phis(Program *program, Block *block, aco_ptr& phi) +void +lower_subdword_phis(Program* program, Block* block, aco_ptr& phi) { Builder bld(program); for (unsigned i = 0; i < phi->operands.size(); i++) { @@ -278,21 +287,24 @@ void 
lower_subdword_phis(Program *program, Block *block, aco_ptr& p continue; assert(phi->operands[i].isTemp()); - Block *pred = &program->blocks[block->logical_preds[i]]; + Block* pred = &program->blocks[block->logical_preds[i]]; Temp phi_src = phi->operands[i].getTemp(); assert(phi_src.regClass().type() == RegType::sgpr); Temp tmp = bld.tmp(RegClass(RegType::vgpr, phi_src.size())); insert_before_logical_end(pred, bld.copy(Definition(tmp), phi_src).get_ptr()); Temp new_phi_src = bld.tmp(phi->definitions[0].regClass()); - insert_before_logical_end(pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u)).get_ptr()); + insert_before_logical_end( + pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u)) + .get_ptr()); phi->operands[i].setTemp(new_phi_src); } return; } -void lower_phis(Program* program) +void +lower_phis(Program* program) { ssa_state state; @@ -301,7 +313,8 @@ void lower_phis(Program* program) state.needs_init = true; for (aco_ptr& phi : block.instructions) { if (phi->opcode == aco_opcode::p_phi) { - assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2); + assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 + : phi->definitions[0].regClass() != s2); if (phi->definitions[0].regClass() == program->lane_mask) lower_divergent_bool_phi(program, &state, &block, phi); else if (phi->definitions[0].regClass().is_subdword()) @@ -313,4 +326,4 @@ void lower_phis(Program* program) } } -} +} // namespace aco diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp index 15fd0f6ae62..db809867a70 100644 --- a/src/amd/compiler/aco_lower_to_cssa.cpp +++ b/src/amd/compiler/aco_lower_to_cssa.cpp @@ -53,32 +53,32 @@ struct copy { struct merge_node { Operand value = Operand(); /* original value: can be an SSA-def or constant value */ - uint32_t index = -1u; /* index into the vector of merge sets */ + uint32_t index = -1u; /* index into the vector of merge sets */ uint32_t defined_at = -1u; /* defining block */ /* we also remember two dominating defs with the same value: */ - Temp equal_anc_in = Temp(); /* within the same merge set */ + Temp equal_anc_in = Temp(); /* within the same merge set */ Temp equal_anc_out = Temp(); /* from a different set */ }; struct cssa_ctx { Program* program; - std::vector& live_out; /* live-out sets per block */ + std::vector& live_out; /* live-out sets per block */ std::vector> parallelcopies; /* copies per block */ - std::vector merge_sets; /* each vector is one (ordered) merge set */ + std::vector merge_sets; /* each vector is one (ordered) merge set */ std::unordered_map merge_node_table; /* tempid -> merge node */ }; /* create (virtual) parallelcopies for each phi instruction and * already merge copy-definitions with phi-defs into merge sets */ -void collect_parallelcopies(cssa_ctx& ctx) +void +collect_parallelcopies(cssa_ctx& ctx) { ctx.parallelcopies.resize(ctx.program->blocks.size()); Builder bld(ctx.program); for (Block& block : ctx.program->blocks) { for (aco_ptr& phi : block.instructions) { - if (phi->opcode != aco_opcode::p_phi && - phi->opcode != aco_opcode::p_linear_phi) + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) break; const Definition& def = phi->definitions[0]; @@ -89,9 +89,8 @@ void collect_parallelcopies(cssa_ctx& ctx) if (!def.isTemp()) continue; - std::vector& preds = phi->opcode == aco_opcode::p_phi ? 
- block.logical_preds : - block.linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; uint32_t index = ctx.merge_sets.size(); merge_set set; @@ -151,8 +150,8 @@ void collect_parallelcopies(cssa_ctx& ctx) } /* check whether the definition of a comes after b. */ -inline -bool defined_after(cssa_ctx& ctx, Temp a, Temp b) +inline bool +defined_after(cssa_ctx& ctx, Temp a, Temp b) { merge_node& node_a = ctx.merge_node_table[a.id()]; merge_node& node_b = ctx.merge_node_table[b.id()]; @@ -163,25 +162,24 @@ bool defined_after(cssa_ctx& ctx, Temp a, Temp b) } /* check whether a dominates b where b is defined after a */ -inline -bool dominates(cssa_ctx& ctx, Temp a, Temp b) +inline bool +dominates(cssa_ctx& ctx, Temp a, Temp b) { assert(defined_after(ctx, b, a)); merge_node& node_a = ctx.merge_node_table[a.id()]; merge_node& node_b = ctx.merge_node_table[b.id()]; unsigned idom = node_b.defined_at; while (idom > node_a.defined_at) - idom = b.regClass().type() == RegType::vgpr ? - ctx.program->blocks[idom].logical_idom : - ctx.program->blocks[idom].linear_idom; + idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom + : ctx.program->blocks[idom].linear_idom; return idom == node_a.defined_at; } /* check intersection between var and parent: * We already know that parent dominates var. */ -inline -bool intersects(cssa_ctx& ctx, Temp var, Temp parent) +inline bool +intersects(cssa_ctx& ctx, Temp var, Temp parent) { merge_node& node_var = ctx.merge_node_table[var.id()]; merge_node& node_parent = ctx.merge_node_table[parent.id()]; @@ -196,9 +194,9 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent) /* parent is defined in a different block than var */ if (node_parent.defined_at < node_var.defined_at) { /* if the parent is not live-in, they don't interfere */ - std::vector& preds = var.type() == RegType::vgpr ? - ctx.program->blocks[block_idx].logical_preds : - ctx.program->blocks[block_idx].linear_preds; + std::vector& preds = var.type() == RegType::vgpr + ? ctx.program->blocks[block_idx].logical_preds + : ctx.program->blocks[block_idx].linear_preds; for (uint32_t pred : preds) { if (!ctx.live_out[pred].count(parent.id())) return false; @@ -246,8 +244,8 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent) /* check interference between var and parent: * i.e. they have different values and intersect. * If parent and var share the same value, also updates the equal ancestor. 
*/ -inline -bool interference(cssa_ctx& ctx, Temp var, Temp parent) +inline bool +interference(cssa_ctx& ctx, Temp var, Temp parent) { assert(var != parent); merge_node& node_var = ctx.merge_node_table[var.id()]; @@ -281,13 +279,14 @@ bool interference(cssa_ctx& ctx, Temp var, Temp parent) /* tries to merge set_b into set_a of given temporary and * drops that temporary as it is being coalesced */ -bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b) +bool +try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b) { auto def_node_it = ctx.merge_node_table.find(dst.id()); uint32_t index = def_node_it->second.index; merge_set& set_a = ctx.merge_sets[index]; std::vector dom; /* stack of the traversal */ - merge_set union_set; /* the new merged merge-set */ + merge_set union_set; /* the new merged merge-set */ uint32_t i_a = 0; uint32_t i_b = 0; @@ -335,7 +334,8 @@ bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b) } /* returns true if the copy can safely be omitted */ -bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx) +bool +try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx) { /* we can only coalesce temporaries */ if (!copy.op.isTemp()) @@ -348,11 +348,9 @@ bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx) uint32_t pred = block_idx; do { block_idx = pred; - pred = copy.op.regClass().type() == RegType::vgpr ? - ctx.program->blocks[pred].logical_idom : - ctx.program->blocks[pred].linear_idom; - } while (block_idx != pred && - ctx.live_out[pred].count(copy.op.tempId())); + pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom + : ctx.program->blocks[pred].linear_idom; + } while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId())); op_node.defined_at = block_idx; op_node.value = copy.op; } @@ -385,7 +383,8 @@ struct ltg_node { /* emit the copies in an order that does not * create interferences within a merge-set */ -void emit_copies_block(Builder bld, std::map& ltg, RegType type) +void +emit_copies_block(Builder bld, std::map& ltg, RegType type) { auto&& it = ltg.begin(); while (it != ltg.end()) { @@ -410,16 +409,16 @@ void emit_copies_block(Builder bld, std::map& ltg, RegType t } /* count the number of remaining circular dependencies */ - unsigned num = std::count_if(ltg.begin(), ltg.end(), [&] (auto& n){ - return n.second.cp.def.regClass().type() == type; - }); + unsigned num = std::count_if(ltg.begin(), ltg.end(), + [&](auto& n) { return n.second.cp.def.regClass().type() == type; }); /* if there are circular dependencies, we just emit them as single parallelcopy */ if (num) { // TODO: this should be restricted to a feasible number of registers // and otherwise use a temporary to avoid having to reload more (spilled) // variables than we have registers. - aco_ptr copy{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)}; + aco_ptr copy{create_instruction( + aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)}; it = ltg.begin(); for (unsigned i = 0; i < num; i++) { while (it->second.cp.def.regClass().type() != type) @@ -435,7 +434,8 @@ void emit_copies_block(Builder bld, std::map& ltg, RegType t /* either emits or coalesces all parallelcopies and * renames the phi-operands accordingly. 
*/ -void emit_parallelcopies(cssa_ctx& ctx) +void +emit_parallelcopies(cssa_ctx& ctx) { std::unordered_map renames; @@ -476,9 +476,8 @@ void emit_parallelcopies(cssa_ctx& ctx) Block& block = ctx.program->blocks[i]; /* emit VGPR copies */ - auto IsLogicalEnd = [] (const aco_ptr& inst) -> bool { - return inst->opcode == aco_opcode::p_logical_end; - }; + auto IsLogicalEnd = [](const aco_ptr& inst) -> bool + { return inst->opcode == aco_opcode::p_logical_end; }; auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd); bld.reset(&block.instructions, std::prev(it.base())); emit_copies_block(bld, ltg, RegType::vgpr); @@ -494,8 +493,7 @@ void emit_parallelcopies(cssa_ctx& ctx) /* finally, rename coalesced phi operands */ for (Block& block : ctx.program->blocks) { for (aco_ptr& phi : block.instructions) { - if (phi->opcode != aco_opcode::p_phi && - phi->opcode != aco_opcode::p_linear_phi) + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) break; for (Operand& op : phi->operands) { @@ -514,8 +512,8 @@ void emit_parallelcopies(cssa_ctx& ctx) } /* end namespace */ - -void lower_to_cssa(Program* program, live& live_vars) +void +lower_to_cssa(Program* program, live& live_vars) { reindex_ssa(program, live_vars.live_out); cssa_ctx ctx = {program, live_vars.live_out}; @@ -525,5 +523,4 @@ void lower_to_cssa(Program* program, live& live_vars) /* update live variable information */ live_vars = live_var_analysis(program); } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 59cae568d68..4a1a2caf82d 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -24,6 +24,7 @@ #include "aco_builder.h" #include "aco_ir.h" + #include "common/sid.h" #include @@ -32,43 +33,43 @@ namespace aco { struct lower_context { - Program *program; - Block *block; + Program* program; + Block* block; std::vector> instructions; }; /* used by handle_operands() indirectly through Builder::copy */ uint8_t int8_mul_table[512] = { - 0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 10, 1, 11, - 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19, 1, 20, 1, 21, - 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29, 1, 30, 1, 31, - 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39, 1, 40, 1, 41, - 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49, 1, 50, 1, 51, - 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59, 1, 60, 1, 61, - 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23, 2, 35, 11, 53, - 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243, 2, 40, 3, 27, - 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23, 2, 45, 7, 13, - 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33, 2, 50, 7, 51, - 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27, 2, 55, 3, 37, - 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17, 2, 60, 9, 241, - 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43, 5, 26, 9, 43, - 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243, 4, 35, 3, 47, - 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45, 3, 50, 7, 241, - 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53, 4, 40, 7, 23, - 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13, 5, 34, 3, 57, - 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245, 3, 60, 5, 241, - 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63, 5, 38, 5, 243, - 3, 64, 7, 247, 9, 50, 5, 39, 4, 
241, 33, 37, 6, 33, 13, 35, 4, 242, 5, 245, - 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19, 5, 42, 3, 241, - 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43, 3, 244, - 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247, - 2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, - 9, 55, 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, - 1, 248, 1, 249, 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255 -}; + 0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9, + 1, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19, + 1, 20, 1, 21, 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29, + 1, 30, 1, 31, 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39, + 1, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49, + 1, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59, + 1, 60, 1, 61, 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23, + 2, 35, 11, 53, 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243, + 2, 40, 3, 27, 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23, + 2, 45, 7, 13, 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33, + 2, 50, 7, 51, 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27, + 2, 55, 3, 37, 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17, + 2, 60, 9, 241, 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43, + 5, 26, 9, 43, 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243, + 4, 35, 3, 47, 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45, + 3, 50, 7, 241, 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53, + 4, 40, 7, 23, 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13, + 5, 34, 3, 57, 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245, + 3, 60, 5, 241, 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63, + 5, 38, 5, 243, 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35, + 4, 242, 5, 245, 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19, + 5, 42, 3, 241, 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43, + 3, 244, 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247, + 2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, 9, 55, + 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, 1, 248, 1, 249, + 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255}; - -aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) { +aco_opcode +get_reduce_opcode(chip_class chip, ReduceOp op) +{ /* Because some 16-bit instructions are already VOP3 on GFX10, we use the * 32-bit opcodes (VOP2) which allows to remove the tempory VGPR and to use * DPP with the arithmetic instructions. This requires to sign-extend. @@ -174,7 +175,8 @@ aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) { } } -bool is_vop3_reduce_opcode(aco_opcode opcode) +bool +is_vop3_reduce_opcode(aco_opcode opcode) { /* 64-bit reductions are VOP3. 
*/ if (opcode == aco_opcode::num_opcodes) @@ -183,83 +185,75 @@ bool is_vop3_reduce_opcode(aco_opcode opcode) return instr_info.format[(int)opcode] == Format::VOP3; } -void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1) +void +emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1) { - Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true); + Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true); if (instr->definitions.size() >= 2) { assert(instr->definitions[1].regClass() == bld.lm); instr->definitions[1].setFixed(vcc); } } -void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, - PhysReg vtmp_reg, ReduceOp op, - unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl, - Operand *identity=NULL) +void +emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, + PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask, + unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL) { Builder bld(ctx->program, &ctx->instructions); - Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)}; - Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg+1}, v1)}; - Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg+1}, v1)}; - Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)}; + Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)}; + Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)}; + Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)}; + Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)}; Operand src1_64 = Operand(src1_reg, v2); - Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg+1}, v1)}; + Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)}; Operand vtmp_op64 = Operand(vtmp_reg, v2); if (op == iadd64) { if (ctx->program->chip_class >= GFX10) { if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]); } else { bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0], dpp_ctrl, row_mask, bank_mask, bound_ctrl); } - bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm), - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], + Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl); } else if (op == iand64) { - bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); - bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); + bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); } else if (op == ior64) { - bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); - 
bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); + bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); } else if (op == ixor64) { - bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); - bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); + bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) { aco_opcode cmp = aco_opcode::num_opcodes; switch (op) { - case umin64: - cmp = aco_opcode::v_cmp_gt_u64; - break; - case umax64: - cmp = aco_opcode::v_cmp_lt_u64; - break; - case imin64: - cmp = aco_opcode::v_cmp_gt_i64; - break; - case imax64: - cmp = aco_opcode::v_cmp_lt_i64; - break; - default: - break; + case umin64: cmp = aco_opcode::v_cmp_gt_u64; break; + case umax64: cmp = aco_opcode::v_cmp_lt_u64; break; + case imin64: cmp = aco_opcode::v_cmp_gt_i64; break; + case imax64: cmp = aco_opcode::v_cmp_lt_i64; break; + default: break; } if (identity) { bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]); } - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64); bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm)); @@ -278,36 +272,38 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph */ if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]); if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]); emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]); if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]); emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]); if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, 
bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]); } } -void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, ReduceOp op) +void +emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, + ReduceOp op) { Builder bld(ctx->program, &ctx->instructions); - Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)}; + Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)}; RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1; - Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg+1}, src0_rc)}; - Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)}; + Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)}; + Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)}; Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2); Operand src1_64 = Operand(src1_reg, v2); @@ -315,15 +311,15 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) { assert(vtmp.reg() != 0); bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]); src0_reg = vtmp; src0[0] = Operand(vtmp, v1); - src0[1] = Operand(PhysReg{vtmp+1}, v1); + src0[1] = Operand(PhysReg{vtmp + 1}, v1); src0_64 = Operand(vtmp, v2); } else if (src0_rc == s1 && op == iadd64) { assert(vtmp.reg() != 0); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]); - src0[1] = Operand(PhysReg{vtmp+1}, v1); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]); + src0[1] = Operand(PhysReg{vtmp + 1}, v1); } if (op == iadd64) { @@ -332,7 +328,8 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe } else { bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]); } - bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm)); + bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], + Operand(vcc, bld.lm)); } else if (op == iand64) { bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]); bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]); @@ -345,20 +342,11 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) { aco_opcode cmp = aco_opcode::num_opcodes; switch (op) { - case umin64: - cmp = aco_opcode::v_cmp_gt_u64; - break; - case umax64: - cmp = aco_opcode::v_cmp_lt_u64; - break; - case imin64: - cmp = aco_opcode::v_cmp_gt_i64; - break; - case imax64: - cmp = aco_opcode::v_cmp_lt_i64; - break; - default: - break; + case umin64: cmp = aco_opcode::v_cmp_gt_u64; break; + case umax64: cmp = aco_opcode::v_cmp_lt_u64; break; + case imin64: cmp = aco_opcode::v_cmp_gt_i64; break; + case imax64: cmp = aco_opcode::v_cmp_lt_i64; break; + default: break; } bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64); @@ -381,8 +369,8 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe * res_lo = umul_lo(x_lo, y_lo) 
* assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp */ - Definition tmp0_def(PhysReg{src0_reg+1}, v1); - Definition tmp1_def(PhysReg{src1_reg+1}, v1); + Definition tmp0_def(PhysReg{src0_reg + 1}, v1); + Definition tmp1_def(PhysReg{src1_reg + 1}, v1); Operand tmp0_op = src0[1]; Operand tmp1_op = src1[1]; bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]); @@ -394,10 +382,10 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe } } -void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, - PhysReg vtmp, ReduceOp op, unsigned size, - unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl, - Operand *identity=NULL) /* for VOP3 with sparse writes */ +void +emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, + ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */ { Builder bld(ctx->program, &ctx->instructions); RegClass rc = RegClass(RegType::vgpr, size); @@ -410,32 +398,34 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg if (!vop3) { if (opcode == aco_opcode::v_add_co_u32) - bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, + bound_ctrl); else bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl); return; } if (opcode == aco_opcode::num_opcodes) { - emit_int64_dpp_op(ctx, dst_reg ,src0_reg, src1_reg, vtmp, op, - dpp_ctrl, row_mask, bank_mask, bound_ctrl, identity); + emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask, + bound_ctrl, identity); return; } if (identity) bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]); if (identity && size >= 2) - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), identity[1]); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]); for (unsigned i = 0; i < size; i++) - bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{src0_reg+i}, v1), - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl); bld.vop3(opcode, dst, Operand(vtmp, rc), src1); } -void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, - PhysReg vtmp, ReduceOp op, unsigned size) +void +emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, + ReduceOp op, unsigned size) { Builder bld(ctx->program, &ctx->instructions); RegClass rc = RegClass(RegType::vgpr, size); @@ -460,26 +450,29 @@ void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1 } } -void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size, - unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl) +void +emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl, + unsigned row_mask, unsigned bank_mask, bool bound_ctrl) { Builder bld(ctx->program, &ctx->instructions); for (unsigned i = 0; i < size; i++) { - bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst+i}, v1), Operand(PhysReg{src0+i}, v1), - dpp_ctrl, row_mask, 
bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1), + Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl); } } -void emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern) +void +emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern) { for (unsigned i = 0; i < size; i++) { - bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst+i}, v1), - Operand(PhysReg{src+i}, v1), ds_pattern); + bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1), + Operand(PhysReg{src + i}, v1), ds_pattern); } } -void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp, - PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst) +void +emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, + PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst) { assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce); assert(cluster_size <= ctx->program->wave_size); @@ -492,20 +485,22 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig Operand vcndmask_identity[2] = {identity[0], identity[1]}; /* First, copy the source to tmp and set inactive lanes to the identity */ - bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm)); + bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), + Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm)); for (unsigned i = 0; i < src.size(); i++) { /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 * except on GFX10, where v_writelane_b32 can take a literal. 
*/ - if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) { - bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]); - identity[i] = Operand(PhysReg{sitmp+i}, s1); + if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && + ctx->program->chip_class < GFX10) { + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]); + identity[i] = Operand(PhysReg{sitmp + i}, s1); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]); - vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]); + vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1); } else if (identity[i].isLiteral()) { - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]); - vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]); + vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1); } } @@ -517,7 +512,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (src.regClass() == v1b) { if (ctx->program->chip_class >= GFX8) { - aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; + aco_ptr sdwa{create_instruction( + aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; sdwa->operands[0] = Operand(PhysReg{tmp}, v1); sdwa->definitions[0] = Definition(PhysReg{tmp}, v1); if (reduce_op == imin8 || reduce_op == imax8) @@ -534,14 +530,15 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig else opcode = aco_opcode::v_bfe_u32; - bld.vop3(opcode, Definition(PhysReg{tmp}, v1), - Operand(PhysReg{tmp}, v1), Operand(0u), Operand(8u)); + bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand(0u), + Operand(8u)); } } else if (src.regClass() == v2b) { if (ctx->program->chip_class >= GFX10 && - (reduce_op == iadd16 || reduce_op == imax16 || - reduce_op == imin16 || reduce_op == umin16 || reduce_op == umax16)) { - aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; + (reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 || + reduce_op == umin16 || reduce_op == umax16)) { + aco_ptr sdwa{create_instruction( + aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; sdwa->operands[0] = Operand(PhysReg{tmp}, v1); sdwa->definitions[0] = Definition(PhysReg{tmp}, v1); if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16) @@ -558,54 +555,69 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig else opcode = aco_opcode::v_bfe_u32; - bld.vop3(opcode, Definition(PhysReg{tmp}, v1), - Operand(PhysReg{tmp}, v1), Operand(0u), Operand(16u)); + bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand(0u), + Operand(16u)); } } bool reduction_needs_last_op = false; switch (op) { case aco_opcode::p_reduce: - if (cluster_size == 1) break; + if (cluster_size == 1) + break; if (ctx->program->chip_class <= GFX7) { reduction_needs_last_op = true; emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2)); - if (cluster_size == 2) break; + if (cluster_size == 2) + break; emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1)); - if (cluster_size == 4) break; + if (cluster_size == 4) + break; emit_op(ctx, tmp, vtmp, tmp, 
PhysReg{0}, reduce_op, src.size()); emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04)); - if (cluster_size == 8) break; + if (cluster_size == 8) + break; emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08)); - if (cluster_size == 16) break; + if (cluster_size == 16) + break; emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10)); - if (cluster_size == 32) break; + if (cluster_size == 32) + break; emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); for (unsigned i = 0; i < src.size(); i++) - bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), Operand(0u)); + bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(0u)); // TODO: it would be more effective to do the last reduction step on SALU emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size()); reduction_needs_last_op = false; break; } - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf, 0xf, false); - if (cluster_size == 2) break; - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf, 0xf, false); - if (cluster_size == 4) break; - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf, false); - if (cluster_size == 8) break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf, + 0xf, false); + if (cluster_size == 2) + break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf, + 0xf, false); + if (cluster_size == 4) + break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf, + false); + if (cluster_size == 8) + break; emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false); - if (cluster_size == 16) break; + if (cluster_size == 16) + break; if (ctx->program->chip_class >= GFX10) { /* GFX10+ doesn't support row_bcast15 and row_bcast31 */ for (unsigned i = 0; i < src.size(); i++) - bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u)); + bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1), Operand(0u), Operand(0u)); if (cluster_size == 32) { reduction_needs_last_op = true; @@ -614,7 +626,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); for (unsigned i = 0; i < src.size(); i++) - bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u)); + bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(0u)); // TODO: it would be more effective to do the last reduction step on SALU emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size()); break; @@ -626,8 +639,10 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig break; } assert(cluster_size == 64); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf, false); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, false); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), 
dpp_row_bcast15, 0xa, 0xf, + false); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, + false); break; case aco_opcode::p_exclusive_scan: if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */ @@ -638,10 +653,10 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u)); bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u)); for (unsigned i = 0; i < src.size(); i++) { - Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32, - Definition(PhysReg{vtmp+i}, v1), - Operand(PhysReg{tmp+i}, v1), - Operand(0xffffffffu), Operand(0xffffffffu)).instr; + Instruction* perm = + bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1), Operand(0xffffffffu), Operand(0xffffffffu)) + .instr; perm->vop3().opsel = 1; /* FI (Fetch Inactive) */ } bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX)); @@ -649,8 +664,10 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (ctx->program->wave_size == 64) { /* fill in the gap in row 2 */ for (unsigned i = 0; i < src.size(); i++) { - bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); - bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1)); + bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(31u)); + bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1), + Operand(32u), Operand(PhysReg{vtmp + i}, v1)); } } std::swap(tmp, vtmp); @@ -660,41 +677,53 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig // TODO: use LDS on CS with a single write and shifted read /* wavefront shift_right by 1 on SI/CI */ emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2)); - emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */ + emit_ds_swizzle(bld, tmp, tmp, src.size(), + ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10101010u)); bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); for (unsigned i = 0; i < src.size(); i++) - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1)); bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); - emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */ + emit_ds_swizzle(bld, tmp, tmp, src.size(), + ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x01000100u)); bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); for (unsigned i = 0; i < src.size(); i++) - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1)); bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); - emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */ + emit_ds_swizzle(bld, tmp, tmp, src.size(), + 
ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */ bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(1u), Operand(16u)); bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(1u), Operand(16u)); for (unsigned i = 0; i < src.size(); i++) - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1)); bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); for (unsigned i = 0; i < src.size(); i++) { - bld.writelane(Definition(PhysReg{vtmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{vtmp+i}, v1)); - bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u)); - bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1)); + bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand(0u), + Operand(PhysReg{vtmp + i}, v1)); + bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(0u)); + bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1), + Operand(32u), Operand(PhysReg{vtmp + i}, v1)); identity[i] = Operand(0u); /* prevent further uses of identity */ } std::swap(tmp, vtmp); } for (unsigned i = 0; i < src.size(); i++) { - if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */ + if (!identity[i].isConstant() || + identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */ if (ctx->program->chip_class < GFX10) - assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i}); - bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1)); + assert((identity[i].isConstant() && !identity[i].isLiteral()) || + identity[i].physReg() == PhysReg{sitmp + i}); + bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand(0u), + Operand(PhysReg{tmp + i}, v1)); } } FALLTHROUGH; @@ -731,28 +760,29 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); for (unsigned i = 0; i < src.size(); i++) - bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); + bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(31u)); bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u)); emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size()); break; } - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_sr(1), 0xf, 0xf, false, identity); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_sr(2), 0xf, 0xf, false, identity); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_sr(4), 0xf, 0xf, false, identity); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_sr(8), 0xf, 0xf, false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false, + identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false, + identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false, + identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false, + identity); if 
(ctx->program->chip_class >= GFX10) { bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u)); bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u)); for (unsigned i = 0; i < src.size(); i++) { - Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32, - Definition(PhysReg{vtmp+i}, v1), - Operand(PhysReg{tmp+i}, v1), - Operand(0xffffffffu), Operand(0xffffffffu)).instr; + Instruction* perm = + bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1), Operand(0xffffffffu), Operand(0xffffffffu)) + .instr; perm->vop3().opsel = 1; /* FI (Fetch Inactive) */ } emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); @@ -760,21 +790,20 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (ctx->program->wave_size == 64) { bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u)); for (unsigned i = 0; i < src.size(); i++) - bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); + bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(31u)); emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size()); } } else { - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_bcast15, 0xa, 0xf, false, identity); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_bcast31, 0xc, 0xf, false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf, + false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, + false, identity); } break; - default: - unreachable("Invalid reduction mode"); + default: unreachable("Invalid reduction mode"); } - if (op == aco_opcode::p_reduce) { if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) { bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm)); @@ -791,8 +820,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (dst.regClass().type() == RegType::sgpr) { for (unsigned k = 0; k < src.size(); k++) { - bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), - Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1)); + bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1), + Operand(ctx->program->wave_size - 1)); } } else if (dst.physReg() != tmp) { for (unsigned k = 0; k < src.size(); k++) { @@ -802,7 +831,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig } } -void emit_gfx10_wave64_bpermute(Program *program, aco_ptr &instr, Builder &bld) +void +emit_gfx10_wave64_bpermute(Program* program, aco_ptr& instr, Builder& bld) { /* Emulates proper bpermute on GFX10 in wave64 mode. 
* @@ -840,7 +870,8 @@ void emit_gfx10_wave64_bpermute(Program *program, aco_ptr &instr, B bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data); /* HI: Copy data from high lanes 32-63 to shared vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data, dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data, + dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); /* Save EXEC */ bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2)); /* Set EXEC to enable LO lanes only */ @@ -848,30 +879,37 @@ void emit_gfx10_wave64_bpermute(Program *program, aco_ptr &instr, B /* LO: Copy data from low lanes 0-31 to shared vgpr */ bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data); /* LO: bpermute shared vgpr (high lanes' data) */ - bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4, Operand(shared_vgpr_hi, v1)); + bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4, + Operand(shared_vgpr_hi, v1)); /* Set EXEC to enable HI lanes only */ bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u)); /* HI: bpermute shared vgpr (low lanes' data) */ - bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4, Operand(shared_vgpr_lo, v1)); + bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4, + Operand(shared_vgpr_lo, v1)); /* Only enable lanes which use the other half's data */ - bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc, Operand(tmp_exec.physReg(), s2), same_half); + bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc, + Operand(tmp_exec.physReg(), s2), same_half); /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false); + bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3), + 0x3, 0xf, false); /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); + bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3), + 0xc, 0xf, false); /* Restore saved EXEC */ bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2)); - /* RA assumes that the result is always in the low part of the register, so we have to shift, if it's not there already */ + /* RA assumes that the result is always in the low part of the register, so we have to shift, if + * it's not there already */ if (input_data.physReg().byte()) { unsigned right_shift = input_data.physReg().byte() * 8; bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand(right_shift), Operand(dst.physReg(), v1)); } } -void emit_gfx6_bpermute(Program *program, aco_ptr &instr, Builder &bld) +void +emit_gfx6_bpermute(Program* program, aco_ptr& instr, Builder& bld) { /* Emulates bpermute using readlane instructions */ @@ -920,8 +958,9 @@ struct copy_operation { }; }; -void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *op, - const copy_operation& src, bool ignore_uses, unsigned max_size) +void +split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op, + const copy_operation& src, bool ignore_uses, unsigned max_size) { PhysReg def_reg = src.def.physReg(); PhysReg op_reg = src.op.physReg(); 
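A minimal standalone sketch of the chunk-size selection idea that split_copy in this file performs: pick the largest power-of-two chunk (up to a maximum) that the current byte offsets of both registers are aligned to, then advance. The next_chunk_size helper, the offsets, and the sizes here are illustrative assumptions, not code from this patch, and it omits the subdword RegClass and pre-GFX10 64-bit VGPR handling the real function has.

#include <cstdio>

/* hypothetical helper: largest power-of-two chunk (<= max_size) that both
 * byte offsets are aligned to and that still fits in the remaining bytes */
static unsigned
next_chunk_size(unsigned def_b, unsigned op_b, unsigned left, unsigned max_size)
{
   for (unsigned bytes = max_size; bytes > 1; bytes /= 2) {
      if (left >= bytes && def_b % bytes == 0 && op_b % bytes == 0)
         return bytes;
   }
   return 1;
}

int
main()
{
   /* split a 6-byte copy that starts at byte offset 2 of both registers:
    * emits a 2-byte chunk at offset 2, then a 4-byte chunk at offset 4 */
   unsigned def_b = 2, op_b = 2, left = 6;
   while (left) {
      unsigned bytes = next_chunk_size(def_b, op_b, left, 8);
      std::printf("copy %u byte(s) at offset %u\n", bytes, def_b);
      def_b += bytes;
      op_b += bytes;
      left -= bytes;
   }
   return 0;
}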
@@ -929,8 +968,7 @@ void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *o op_reg.reg_b += offset; /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */ - if (ctx->program->chip_class < GFX10 && - src.def.regClass().type() == RegType::vgpr) + if (ctx->program->chip_class < GFX10 && src.def.regClass().type() == RegType::vgpr) max_size = MIN2(max_size, 4); unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16; @@ -948,23 +986,23 @@ void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *o break; } - RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) : - RegClass(src.def.regClass().type(), bytes).as_subdword(); + RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) + : RegClass(src.def.regClass().type(), bytes).as_subdword(); *def = Definition(src.def.tempId(), def_reg, def_cls); if (src.op.isConstant()) { assert(bytes >= 1 && bytes <= 8); uint64_t val = src.op.constantValue64() >> (offset * 8u); *op = Operand::get_const(ctx->program->chip_class, val, bytes); } else { - RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) : - RegClass(src.op.regClass().type(), bytes).as_subdword(); + RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) + : RegClass(src.op.regClass().type(), bytes).as_subdword(); *op = Operand(op_reg, op_cls); op->setTemp(Temp(src.op.tempId(), op_cls)); } } -uint32_t get_intersection_mask(int a_start, int a_size, - int b_start, int b_size) +uint32_t +get_intersection_mask(int a_start, int a_size, int b_start, int b_size) { int intersection_start = MAX2(b_start - a_start, 0); int intersection_end = MAX2(b_start + b_size - a_start, 0); @@ -975,7 +1013,8 @@ uint32_t get_intersection_mask(int a_start, int a_size, return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask; } -void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op) +void +copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) { assert(op.bytes() == dst.bytes()); @@ -1069,7 +1108,9 @@ void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op) } } -bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr) +bool +do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc, + PhysReg scratch_sgpr) { bool did_copy = false; for (unsigned offset = 0; offset < copy.bytes;) { @@ -1104,23 +1145,30 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool /* preserve the target's lower half */ uint32_t bits = def.physReg().byte() * 8; PhysReg lo_reg = PhysReg(def.physReg().reg()); - Definition lo_half = Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte())); - Definition dst = Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes())); + Definition lo_half = + Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte())); + Definition dst = + Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes())); if (def.physReg().reg() == op.physReg().reg()) { - bld.vop2(aco_opcode::v_and_b32, lo_half, Operand((1 << bits) - 1u), Operand(lo_reg, lo_half.regClass())); + bld.vop2(aco_opcode::v_and_b32, lo_half, Operand((1 << bits) - 1u), + Operand(lo_reg, lo_half.regClass())); if (def.physReg().byte() == 1) { bld.vop2(aco_opcode::v_mul_u32_u24, dst, 
Operand((1 << bits) + 1u), op); } else if (def.physReg().byte() == 2) { bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op); } else if (def.physReg().byte() == 3) { - bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u)); + bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), + Operand((1 << bits) + 1u)); bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op); } } else { lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte())); - bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand(32 - bits), Operand(lo_reg, lo_half.regClass())); - bld.vop3(aco_opcode::v_alignbyte_b32, dst, op, Operand(lo_half.physReg(), lo_half.regClass()), Operand(4 - def.physReg().byte())); + bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand(32 - bits), + Operand(lo_reg, lo_half.regClass())); + bld.vop3(aco_opcode::v_alignbyte_b32, dst, op, + Operand(lo_half.physReg(), lo_half.regClass()), + Operand(4 - def.physReg().byte())); } } else { bld.vop1(aco_opcode::v_mov_b32, def, op); @@ -1137,13 +1185,16 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool return did_copy; } -void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool preserve_scc, Pseudo_instruction *pi) +void +do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc, + Pseudo_instruction* pi) { unsigned offset = 0; if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) && (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) { - /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte swap */ + /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte + * swap */ PhysReg op = copy.op.physReg(); PhysReg def = copy.def.physReg(); op.reg_b &= ~0x3; @@ -1209,9 +1260,11 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op); bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op); if (preserve_scc) - bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1), Operand(0u)); + bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1), + Operand(0u)); } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) { - bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op, Operand(2u)); + bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op, + Operand(2u)); } else { assert(def.regClass().is_subdword()); bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op); @@ -1232,7 +1285,8 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr); } -void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, Operand hi) +void +do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi) { if (lo.isConstant() && hi.isConstant()) { copy_constant(ctx, bld, def, Operand(lo.constantValue() | (hi.constantValue() << 16))); @@ -1241,8 +1295,7 @@ void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) && (ctx->program->chip_class >= GFX10 || - (ctx->program->chip_class >= GFX9 && - !lo.isLiteral() && !hi.isLiteral())); + (ctx->program->chip_class >= GFX9 
&& !lo.isLiteral() && !hi.isLiteral())); if (can_use_pack) { Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi); @@ -1277,7 +1330,8 @@ void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand(16u), lo); else bld.vop2(aco_opcode::v_and_b32, def_lo, Operand(0xFFFFu), lo); - bld.vop2(aco_opcode::v_or_b32, def, Operand(hi.constantValue() << 16u), Operand(def.physReg(), v1)); + bld.vop2(aco_opcode::v_or_b32, def, Operand(hi.constantValue() << 16u), + Operand(def.physReg(), v1)); return; } @@ -1331,9 +1385,9 @@ void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand(2u)); } -void try_coalesce_copies(lower_context *ctx, - std::map& copy_map, - copy_operation& copy) +void +try_coalesce_copies(lower_context* ctx, std::map& copy_map, + copy_operation& copy) { // TODO try more relaxed alignment for subdword copies unsigned next_def_align = util_next_power_of_two(copy.bytes + 1); @@ -1359,8 +1413,8 @@ void try_coalesce_copies(lower_context *ctx, unsigned new_size = copy.bytes + other->second.bytes; if (copy.op.isConstant()) { - uint64_t val = copy.op.constantValue64() | - (other->second.op.constantValue64() << (copy.bytes * 8u)); + uint64_t val = + copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u)); if (!Operand::is_constant_representable(val, copy.bytes + other->second.bytes, true, copy.def.regClass().type() == RegType::vgpr)) return; @@ -1376,7 +1430,9 @@ void try_coalesce_copies(lower_context *ctx, copy_map.erase(other); } -void handle_operands(std::map& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi) +void +handle_operands(std::map& copy_map, lower_context* ctx, + chip_class chip_class, Pseudo_instruction* pi) { Builder bld(ctx->program, &ctx->instructions); unsigned num_instructions_before = ctx->instructions.size(); @@ -1408,8 +1464,10 @@ void handle_operands(std::map& copy_map, lower_context* copy_operation copy = {hi_op, hi_def, it->second.bytes - 8}; copy_map[hi_def.physReg()] = copy; assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0); - it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().type() == RegType::sgpr ? s2 : v2); - it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().type() == RegType::sgpr ? s2 : v2); + it->second.op = Operand(it->second.op.physReg(), + it->second.op.regClass().type() == RegType::sgpr ? s2 : v2); + it->second.def = Definition(it->second.def.physReg(), + it->second.def.regClass().type() == RegType::sgpr ? 
s2 : v2); it->second.bytes = 8; } @@ -1435,7 +1493,8 @@ void handle_operands(std::map& copy_map, lower_context* bool skip_partial_copies = true; for (auto it = copy_map.begin();;) { if (copy_map.empty()) { - ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before; + ctx->program->statistics[statistic_copies] += + ctx->instructions.size() - num_instructions_before; return; } if (it == copy_map.end()) { @@ -1451,12 +1510,10 @@ void handle_operands(std::map& copy_map, lower_context* std::map::iterator other = copy_map.find(reg_hi); if (other != copy_map.end() && other->second.bytes == 2) { /* check if the target register is otherwise unused */ - bool unused_lo = !it->second.is_used || - (it->second.is_used == 0x0101 && - other->second.op.physReg() == it->first); + bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 && + other->second.op.physReg() == it->first); bool unused_hi = !other->second.is_used || - (other->second.is_used == 0x0101 && - it->second.op.physReg() == reg_hi); + (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi); if (unused_lo && unused_hi) { Operand lo = it->second.op; Operand hi = other->second.op; @@ -1482,8 +1539,8 @@ void handle_operands(std::map& copy_map, lower_context* /* on GFX6/7, we need some small workarounds as there is no * SDWA instruction to do partial register writes */ if (ctx->program->chip_class < GFX8 && it->second.bytes < 4) { - if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && - !it->second.is_used && pi->opcode == aco_opcode::p_split_vector) { + if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used && + pi->opcode == aco_opcode::p_split_vector) { /* Other operations might overwrite the high bits, so change all users * of the high bits to the new target where they are still available. * This mechanism depends on also emitting dead definitions. */ @@ -1502,7 +1559,8 @@ void handle_operands(std::map& copy_map, lower_context* } else if (it->first.byte()) { assert(pi->opcode == aco_opcode::p_create_vector); /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled, - * move to the target operand's high bits. This is save to do as it cannot be an operand */ + * move to the target operand's high bits. 
This is save to do as it cannot be an operand + */ PhysReg lo = PhysReg(it->first.reg()); std::map::iterator other = copy_map.find(lo); if (other != copy_map.end()) { @@ -1511,8 +1569,10 @@ void handle_operands(std::map& copy_map, lower_context* it->second.def = Definition(new_reg_hi, it->second.def.regClass()); it->second.is_used = 0; other->second.bytes += it->second.bytes; - other->second.def.setTemp(Temp(other->second.def.tempId(), RegClass::get(RegType::vgpr, other->second.bytes))); - other->second.op.setTemp(Temp(other->second.op.tempId(), RegClass::get(RegType::vgpr, other->second.bytes))); + other->second.def.setTemp(Temp(other->second.def.tempId(), + RegClass::get(RegType::vgpr, other->second.bytes))); + other->second.op.setTemp(Temp(other->second.op.tempId(), + RegClass::get(RegType::vgpr, other->second.bytes))); /* if the new target's high bits are also a target, change uses */ std::map::iterator target = copy_map.find(new_reg_hi); if (target != copy_map.end()) { @@ -1604,7 +1664,7 @@ void handle_operands(std::map& copy_map, lower_context* * operand (for example, v[7:8] = v[8:9]) */ if (did_copy && !copy.second.op.isConstant()) { for (std::pair& other : copy_map) { - for (uint16_t i = 0; i < other.second.bytes; i++) { + for (uint16_t i = 0; i < other.second.bytes; i++) { /* distance might underflow */ unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b; if (distance < copy.second.bytes && !copy.second.uses[distance]) @@ -1690,13 +1750,15 @@ void handle_operands(std::map& copy_map, lower_context* /* change the operand reg of the target's uses and split uses if needed */ uint32_t bytes_left = u_bit_consecutive(0, swap.bytes); for (auto target = copy_map.begin(); target != copy_map.end(); ++target) { - if (target->second.op.physReg() == swap.def.physReg() && swap.bytes == target->second.bytes) { + if (target->second.op.physReg() == swap.def.physReg() && + swap.bytes == target->second.bytes) { target->second.op.setFixed(swap.op.physReg()); break; } - uint32_t imask = get_intersection_mask(swap.def.physReg().reg_b, swap.bytes, - target->second.op.physReg().reg_b, target->second.bytes); + uint32_t imask = + get_intersection_mask(swap.def.physReg().reg_b, swap.bytes, + target->second.op.physReg().reg_b, target->second.bytes); if (!imask) continue; @@ -1752,7 +1814,8 @@ void handle_operands(std::map& copy_map, lower_context* ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before; } -void emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm) +void +emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm) { if (bld.program->chip_class >= GFX10) { if (set_round) @@ -1761,13 +1824,15 @@ void emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_d bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm); } else if (set_round || set_denorm) { /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */ - Instruction *instr = bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(new_mode.val), (7 << 11) | 1).instr; + Instruction* instr = + bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(new_mode.val), (7 << 11) | 1).instr; /* has to be a literal */ instr->operands[0].setFixed(PhysReg{255}); } } -void emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set) +void +emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set) { float_mode config_mode; config_mode.val = program.config->float_mode; @@ 
-1788,13 +1853,13 @@ void emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool emit_set_mode(bld, block->fp_mode, set_round, set_denorm); } -void lower_to_hw_instr(Program* program) +void +lower_to_hw_instr(Program* program) { - Block *discard_block = NULL; + Block* discard_block = NULL; - for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) - { - Block *block = &program->blocks[block_idx]; + for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) { + Block* block = &program->blocks[block_idx]; lower_context ctx; ctx.program = program; ctx.block = block; @@ -1806,12 +1871,10 @@ void lower_to_hw_instr(Program* program) aco_ptr& instr = block->instructions[instr_idx]; aco_ptr mov; if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) { - Pseudo_instruction *pi = &instr->pseudo(); + Pseudo_instruction* pi = &instr->pseudo(); - switch (instr->opcode) - { - case aco_opcode::p_extract_vector: - { + switch (instr->opcode) { + case aco_opcode::p_extract_vector: { PhysReg reg = instr->operands[0].physReg(); Definition& def = instr->definitions[0]; reg.reg_b += instr->operands[1].constantValue() * def.bytes(); @@ -1819,21 +1882,22 @@ void lower_to_hw_instr(Program* program) if (reg == def.physReg()) break; - RegClass op_rc = def.regClass().is_subdword() ? def.regClass() : - RegClass(instr->operands[0].getTemp().type(), def.size()); + RegClass op_rc = def.regClass().is_subdword() + ? def.regClass() + : RegClass(instr->operands[0].getTemp().type(), def.size()); std::map copy_operations; copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()}; handle_operands(copy_operations, &ctx, program->chip_class, pi); break; } - case aco_opcode::p_create_vector: - { + case aco_opcode::p_create_vector: { std::map copy_operations; PhysReg reg = instr->definitions[0].physReg(); for (const Operand& op : instr->operands) { if (op.isConstant()) { - const Definition def = Definition(reg, RegClass(instr->definitions[0].getTemp().type(), op.size())); + const Definition def = Definition( + reg, RegClass(instr->definitions[0].getTemp().type(), op.size())); copy_operations[reg] = {op, def, op.bytes()}; reg.reg_b += op.bytes(); continue; @@ -1844,8 +1908,10 @@ void lower_to_hw_instr(Program* program) continue; } - RegClass rc_def = op.regClass().is_subdword() ? op.regClass() : - RegClass(instr->definitions[0].getTemp().type(), op.size()); + RegClass rc_def = + op.regClass().is_subdword() + ? op.regClass() + : RegClass(instr->definitions[0].getTemp().type(), op.size()); const Definition def = Definition(reg, rc_def); copy_operations[def.physReg()] = {op, def, op.bytes()}; reg.reg_b += op.bytes(); @@ -1853,14 +1919,14 @@ void lower_to_hw_instr(Program* program) handle_operands(copy_operations, &ctx, program->chip_class, pi); break; } - case aco_opcode::p_split_vector: - { + case aco_opcode::p_split_vector: { std::map copy_operations; PhysReg reg = instr->operands[0].physReg(); for (const Definition& def : instr->definitions) { - RegClass rc_op = def.regClass().is_subdword() ? def.regClass() : - RegClass(instr->operands[0].getTemp().type(), def.size()); + RegClass rc_op = def.regClass().is_subdword() + ? 
def.regClass() + : RegClass(instr->operands[0].getTemp().type(), def.size()); const Operand op = Operand(reg, rc_op); copy_operations[def.physReg()] = {op, def, def.bytes()}; reg.reg_b += def.bytes(); @@ -1869,26 +1935,26 @@ void lower_to_hw_instr(Program* program) break; } case aco_opcode::p_parallelcopy: - case aco_opcode::p_wqm: - { + case aco_opcode::p_wqm: { std::map copy_operations; for (unsigned j = 0; j < instr->operands.size(); j++) { assert(instr->definitions[j].bytes() == instr->operands[j].bytes()); - copy_operations[instr->definitions[j].physReg()] = {instr->operands[j], instr->definitions[j], instr->operands[j].bytes()}; + copy_operations[instr->definitions[j].physReg()] = { + instr->operands[j], instr->definitions[j], instr->operands[j].bytes()}; } handle_operands(copy_operations, &ctx, program->chip_class, pi); break; } - case aco_opcode::p_exit_early_if: - { + case aco_opcode::p_exit_early_if: { /* don't bother with an early exit near the end of the program */ if ((block->instructions.size() - 1 - instr_idx) <= 4 && - block->instructions.back()->opcode == aco_opcode::s_endpgm) { - unsigned null_exp_dest = (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS; + block->instructions.back()->opcode == aco_opcode::s_endpgm) { + unsigned null_exp_dest = + (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS; bool ignore_early_exit = true; for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) { - const aco_ptr &instr2 = block->instructions[k]; + const aco_ptr& instr2 = block->instructions[k]; if (instr2->opcode == aco_opcode::s_endpgm || instr2->opcode == aco_opcode::p_logical_end) continue; @@ -1896,8 +1962,8 @@ void lower_to_hw_instr(Program* program) instr2->exp().dest == null_exp_dest) continue; else if (instr2->opcode == aco_opcode::p_parallelcopy && - instr2->definitions[0].isFixed() && - instr2->definitions[0].physReg() == exec) + instr2->definitions[0].isFixed() && + instr2->definitions[0].physReg() == exec) continue; ignore_early_exit = false; @@ -1912,50 +1978,49 @@ void lower_to_hw_instr(Program* program) block = &program->blocks[block_idx]; bld.reset(discard_block); - bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), - 0, V_008DFC_SQ_EXP_NULL, false, true, true); + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0, + V_008DFC_SQ_EXP_NULL, false, true, true); bld.sopp(aco_opcode::s_endpgm); bld.reset(&ctx.instructions); } - //TODO: exec can be zero here with block_kind_discard + // TODO: exec can be zero here with block_kind_discard assert(instr->operands[0].physReg() == scc); - bld.sopp(aco_opcode::s_cbranch_scc0, Definition(exec, s2), instr->operands[0], discard_block->index); + bld.sopp(aco_opcode::s_cbranch_scc0, Definition(exec, s2), instr->operands[0], + discard_block->index); discard_block->linear_preds.push_back(block->index); block->linear_succs.push_back(discard_block->index); break; } - case aco_opcode::p_spill: - { + case aco_opcode::p_spill: { assert(instr->operands[0].regClass() == v1.as_linear()); for (unsigned i = 0; i < instr->operands[2].size(); i++) { - Operand src = instr->operands[2].isConstant() ? - Operand(uint32_t(instr->operands[2].constantValue64() >> (32 * i))) : - Operand(PhysReg{instr->operands[2].physReg() + i}, s1); - bld.writelane(bld.def(v1, instr->operands[0].physReg()), - src, + Operand src = + instr->operands[2].isConstant() + ? 
Operand(uint32_t(instr->operands[2].constantValue64() >> (32 * i))) + : Operand(PhysReg{instr->operands[2].physReg() + i}, s1); + bld.writelane(bld.def(v1, instr->operands[0].physReg()), src, Operand(instr->operands[1].constantValue() + i), instr->operands[0]); } break; } - case aco_opcode::p_reload: - { + case aco_opcode::p_reload: { assert(instr->operands[0].regClass() == v1.as_linear()); for (unsigned i = 0; i < instr->definitions[0].size(); i++) bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}), - instr->operands[0], - Operand(instr->operands[1].constantValue() + i)); + instr->operands[0], Operand(instr->operands[1].constantValue() + i)); break; } - case aco_opcode::p_as_uniform: - { - if (instr->operands[0].isConstant() || instr->operands[0].regClass().type() == RegType::sgpr) { + case aco_opcode::p_as_uniform: { + if (instr->operands[0].isConstant() || + instr->operands[0].regClass().type() == RegType::sgpr) { std::map copy_operations; - copy_operations[instr->definitions[0].physReg()] = {instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()}; + copy_operations[instr->definitions[0].physReg()] = { + instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()}; handle_operands(copy_operations, &ctx, program->chip_class, pi); } else { assert(instr->operands[0].regClass().type() == RegType::vgpr); @@ -1969,8 +2034,7 @@ void lower_to_hw_instr(Program* program) } break; } - case aco_opcode::p_bpermute: - { + case aco_opcode::p_bpermute: { if (ctx.program->chip_class <= GFX7) emit_gfx6_bpermute(program, instr, bld); else if (ctx.program->chip_class >= GFX10 && ctx.program->wave_size == 64) @@ -1979,8 +2043,7 @@ void lower_to_hw_instr(Program* program) unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute."); break; } - case aco_opcode::p_constaddr: - { + case aco_opcode::p_constaddr: { unsigned id = instr->definitions[0].tempId(); PhysReg reg = instr->definitions[0].physReg(); bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand(id)); @@ -1990,8 +2053,7 @@ void lower_to_hw_instr(Program* program) Operand(reg.advance(4), s1), Operand(0u), Operand(scc, s1)); break; } - case aco_opcode::p_extract: - { + case aco_opcode::p_extract: { assert(instr->operands[1].isConstant()); assert(instr->operands[2].isConstant()); assert(instr->operands[3].isConstant()); @@ -2006,26 +2068,28 @@ void lower_to_hw_instr(Program* program) if (dst.regClass() == s1) { if (offset == (32 - bits)) { - bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, - dst, bld.def(s1, scc), op, Operand(offset)); + bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst, + bld.def(s1, scc), op, Operand(offset)); } else if (offset == 0 && signext && (bits == 8 || bits == 16)) { - bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, dst, op); + bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, + dst, op); } else { - bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, - dst, bld.def(s1, scc), op, Operand((bits << 16) | offset)); + bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst, + bld.def(s1, scc), op, Operand((bits << 16) | offset)); } } else if (dst.regClass() == v1 || ctx.program->chip_class <= GFX7) { assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0); if (offset == (32 - bits) && op.regClass() != s1) { - bld.vop2(signext ? 
aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, - dst, Operand(offset), op); + bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst, + Operand(offset), op); } else { - bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, - dst, op, Operand(offset), Operand(bits)); + bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op, + Operand(offset), Operand(bits)); } } else if (dst.regClass() == v2b) { aco_ptr sdwa{create_instruction( - aco_opcode::v_mov_b32, (Format)((uint16_t)Format::VOP1|(uint16_t)Format::SDWA), 1, 1)}; + aco_opcode::v_mov_b32, + (Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)}; sdwa->operands[0] = Operand(op.physReg().advance(-op.physReg().byte()), RegClass::get(op.regClass().type(), 4)); sdwa->definitions[0] = dst; @@ -2037,8 +2101,7 @@ void lower_to_hw_instr(Program* program) } break; } - case aco_opcode::p_insert: - { + case aco_opcode::p_insert: { assert(instr->operands[1].isConstant()); assert(instr->operands[2].isConstant()); if (instr->definitions[0].regClass() == s1) @@ -2053,18 +2116,24 @@ void lower_to_hw_instr(Program* program) if (offset == (32 - bits)) { bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op, Operand(offset)); } else if (offset == 0) { - bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, Operand(bits << 16)); + bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, + Operand(bits << 16)); } else { - bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, Operand(bits << 16)); - bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), Operand(dst.physReg(), s1), Operand(offset)); + bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, + Operand(bits << 16)); + bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), + Operand(dst.physReg(), s1), Operand(offset)); } } else if (dst.regClass() == v1 || ctx.program->chip_class <= GFX7) { if (offset == (dst.bytes() * 8u - bits)) { bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand(offset), op); } else if (offset == 0) { bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand(0u), Operand(bits)); - } else if (program->chip_class >= GFX9 || (op.regClass() != s1 && program->chip_class >= GFX8)) { - aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, (Format)((uint16_t)Format::VOP1|(uint16_t)Format::SDWA), 1, 1)}; + } else if (program->chip_class >= GFX9 || + (op.regClass() != s1 && program->chip_class >= GFX8)) { + aco_ptr sdwa{create_instruction( + aco_opcode::v_mov_b32, + (Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)}; sdwa->operands[0] = op; sdwa->definitions[0] = dst; sdwa->sel[0] = sdwa_udword; @@ -2072,14 +2141,17 @@ void lower_to_hw_instr(Program* program) bld.insert(std::move(sdwa)); } else { bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand(0u), Operand(bits)); - bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand(offset), Operand(dst.physReg(), v1)); + bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand(offset), + Operand(dst.physReg(), v1)); } } else { assert(dst.regClass() == v2b); aco_ptr sdwa{create_instruction( - aco_opcode::v_mov_b32, (Format)((uint16_t)Format::VOP1|(uint16_t)Format::SDWA), 1, 1)}; + aco_opcode::v_mov_b32, + (Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)}; sdwa->operands[0] = op; - sdwa->definitions[0] = Definition(dst.physReg().advance(-dst.physReg().byte()), v1); + sdwa->definitions[0] = + Definition(dst.physReg().advance(-dst.physReg().byte()), v1); sdwa->sel[0] = sdwa_uword; sdwa->dst_sel = sdwa_ubyte0 + dst.physReg().byte() + index; 
sdwa->dst_preserve = 1; @@ -2087,8 +2159,7 @@ void lower_to_hw_instr(Program* program) } break; } - default: - break; + default: break; } } else if (instr->isBranch()) { Pseudo_branch_instruction* branch = &instr->branch(); @@ -2132,42 +2203,41 @@ void lower_to_hw_instr(Program* program) continue; switch (instr->opcode) { - case aco_opcode::p_branch: - assert(block->linear_succs[0] == target); - bld.sopp(aco_opcode::s_branch, branch->definitions[0], target); - break; - case aco_opcode::p_cbranch_nz: - assert(block->linear_succs[1] == target); - if (branch->operands[0].physReg() == exec) - bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target); - else if (branch->operands[0].physReg() == vcc) - bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target); - else { - assert(branch->operands[0].physReg() == scc); - bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target); - } - break; - case aco_opcode::p_cbranch_z: - assert(block->linear_succs[1] == target); - if (branch->operands[0].physReg() == exec) - bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target); - else if (branch->operands[0].physReg() == vcc) - bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target); - else { - assert(branch->operands[0].physReg() == scc); - bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target); - } - break; - default: - unreachable("Unknown Pseudo branch instruction!"); + case aco_opcode::p_branch: + assert(block->linear_succs[0] == target); + bld.sopp(aco_opcode::s_branch, branch->definitions[0], target); + break; + case aco_opcode::p_cbranch_nz: + assert(block->linear_succs[1] == target); + if (branch->operands[0].physReg() == exec) + bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target); + else if (branch->operands[0].physReg() == vcc) + bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target); + else { + assert(branch->operands[0].physReg() == scc); + bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target); + } + break; + case aco_opcode::p_cbranch_z: + assert(block->linear_succs[1] == target); + if (branch->operands[0].physReg() == exec) + bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target); + else if (branch->operands[0].physReg() == vcc) + bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target); + else { + assert(branch->operands[0].physReg() == scc); + bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target); + } + break; + default: unreachable("Unknown Pseudo branch instruction!"); } } else if (instr->isReduction()) { Pseudo_reduction_instruction& reduce = instr->reduction(); emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size, - reduce.operands[1].physReg(), // tmp + reduce.operands[1].physReg(), // tmp reduce.definitions[1].physReg(), // stmp - reduce.operands[2].physReg(), // vtmp + reduce.operands[2].physReg(), // vtmp reduce.definitions[2].physReg(), // sitmp reduce.operands[0], reduce.definitions[0]); } else if (instr->isBarrier()) { @@ -2196,10 +2266,9 @@ void lower_to_hw_instr(Program* program) } else { ctx.instructions.emplace_back(std::move(instr)); } - } block->instructions.swap(ctx.instructions); } } -} +} // namespace aco diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 4faa9bad687..32c0bd8a120 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -36,8 +36,9 @@ namespace aco { 
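/* Sketch of the opcode selection in the branch lowering above, with plain
 * enums standing in for aco_opcode and PhysReg: the pseudo branch keeps its
 * condition in exec, vcc or scc, and lowering just picks the SOPP branch that
 * tests that register directly (p_branch maps to the unconditional s_branch). */
#include <cassert>

enum class cond_reg { exec, vcc, scc };
enum class branch_op {
   s_cbranch_execnz,
   s_cbranch_execz,
   s_cbranch_vccnz,
   s_cbranch_vccz,
   s_cbranch_scc1,
   s_cbranch_scc0,
};

branch_op
select_cbranch(cond_reg cond, bool branch_if_nonzero)
{
   switch (cond) {
   case cond_reg::exec:
      return branch_if_nonzero ? branch_op::s_cbranch_execnz : branch_op::s_cbranch_execz;
   case cond_reg::vcc:
      return branch_if_nonzero ? branch_op::s_cbranch_vccnz : branch_op::s_cbranch_vccz;
   case cond_reg::scc:
      return branch_if_nonzero ? branch_op::s_cbranch_scc1 : branch_op::s_cbranch_scc0;
   }
   assert(false);
   return branch_op::s_cbranch_scc0;
}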
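/* Standalone sketch of the values p_extract / p_insert compute once lowered to
 * the shift/BFE sequences above, assuming 1 <= bits and offset + bits <= 32.
 * The scalar s_bfe_* take a packed operand of the form (bits << 16) | offset,
 * as built above; the vector v_bfe_* take offset and width as separate
 * operands. For p_insert on a full dword the bits outside the field are not
 * preserved; only the sub-dword SDWA path above keeps the rest of the
 * containing dword, via dst_preserve. */
#include <cstdint>

static inline uint32_t
field_mask(unsigned bits)
{
   return bits >= 32 ? ~0u : (1u << bits) - 1u;
}

/* p_extract: take 'bits' bits starting at 'offset', zero- or sign-extended */
uint32_t
extract_u32(uint32_t src, unsigned offset, unsigned bits)
{
   return (src >> offset) & field_mask(bits);
}

int32_t
extract_i32(uint32_t src, unsigned offset, unsigned bits)
{
   uint32_t field = extract_u32(src, offset, bits);
   uint32_t sign = 1u << (bits - 1);
   return (int32_t)((field ^ sign) - sign); /* sign-extend the extracted field */
}

/* p_insert on a full dword: the low 'bits' bits of src placed at 'offset' */
uint32_t
insert_u32(uint32_t src, unsigned offset, unsigned bits)
{
   return (src & field_mask(bits)) << offset;
}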
namespace { -inline -uint32_t murmur_32_scramble(uint32_t h, uint32_t k) { +inline uint32_t +murmur_32_scramble(uint32_t h, uint32_t k) +{ k *= 0xcc9e2d51; k = (k << 15) | (k >> 17); h ^= k * 0x1b873593; @@ -46,8 +47,9 @@ uint32_t murmur_32_scramble(uint32_t h, uint32_t k) { return h; } -template -uint32_t hash_murmur_32(Instruction* instr) +template +uint32_t +hash_murmur_32(Instruction* instr) { uint32_t hash = uint32_t(instr->format) << 16 | uint32_t(instr->opcode); @@ -58,7 +60,7 @@ uint32_t hash_murmur_32(Instruction* instr) for (unsigned i = 2; i < (sizeof(T) >> 2); i++) { uint32_t u; /* Accesses it though a byte array, so doesn't violate the strict aliasing rule */ - memcpy(&u, reinterpret_cast(instr) + i * 4, 4); + memcpy(&u, reinterpret_cast(instr) + i * 4, 4); hash = murmur_32_scramble(hash, u); } @@ -92,32 +94,19 @@ struct InstrHash { return hash_murmur_32(instr); switch (instr->format) { - case Format::SMEM: - return hash_murmur_32(instr); - case Format::VINTRP: - return hash_murmur_32(instr); - case Format::DS: - return hash_murmur_32(instr); - case Format::SOPP: - return hash_murmur_32(instr); - case Format::SOPK: - return hash_murmur_32(instr); - case Format::EXP: - return hash_murmur_32(instr); - case Format::MUBUF: - return hash_murmur_32(instr); - case Format::MIMG: - return hash_murmur_32(instr); - case Format::MTBUF: - return hash_murmur_32(instr); - case Format::FLAT: - return hash_murmur_32(instr); - case Format::PSEUDO_BRANCH: - return hash_murmur_32(instr); - case Format::PSEUDO_REDUCTION: - return hash_murmur_32(instr); - default: - return hash_murmur_32(instr); + case Format::SMEM: return hash_murmur_32(instr); + case Format::VINTRP: return hash_murmur_32(instr); + case Format::DS: return hash_murmur_32(instr); + case Format::SOPP: return hash_murmur_32(instr); + case Format::SOPK: return hash_murmur_32(instr); + case Format::EXP: return hash_murmur_32(instr); + case Format::MUBUF: return hash_murmur_32(instr); + case Format::MIMG: return hash_murmur_32(instr); + case Format::MTBUF: return hash_murmur_32(instr); + case Format::FLAT: return hash_murmur_32(instr); + case Format::PSEUDO_BRANCH: return hash_murmur_32(instr); + case Format::PSEUDO_REDUCTION: return hash_murmur_32(instr); + default: return hash_murmur_32(instr); } } }; @@ -129,7 +118,8 @@ struct InstrPred { return false; if (a->opcode != b->opcode) return false; - if (a->operands.size() != b->operands.size() || a->definitions.size() != b->definitions.size()) + if (a->operands.size() != b->operands.size() || + a->definitions.size() != b->definitions.size()) return false; /* possible with pseudo-instructions */ for (unsigned i = 0; i < a->operands.size(); i++) { if (a->operands[i].isConstant()) { @@ -137,14 +127,12 @@ struct InstrPred { return false; if (a->operands[i].constantValue() != b->operands[i].constantValue()) return false; - } - else if (a->operands[i].isTemp()) { + } else if (a->operands[i].isTemp()) { if (!b->operands[i].isTemp()) return false; if (a->operands[i].tempId() != b->operands[i].tempId()) return false; - } - else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined()) + } else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined()) return false; if (a->operands[i].isFixed()) { if (!b->operands[i].isFixed()) @@ -179,154 +167,110 @@ struct InstrPred { VOP3_instruction& a3 = a->vop3(); VOP3_instruction& b3 = b->vop3(); for (unsigned i = 0; i < 3; i++) { - if (a3.abs[i] != b3.abs[i] || - a3.neg[i] != b3.neg[i]) + if (a3.abs[i] != b3.abs[i] || a3.neg[i] != b3.neg[i]) 
return false; } - return a3.clamp == b3.clamp && - a3.omod == b3.omod && - a3.opsel == b3.opsel; + return a3.clamp == b3.clamp && a3.omod == b3.omod && a3.opsel == b3.opsel; } if (a->isDPP()) { DPP_instruction& aDPP = a->dpp(); DPP_instruction& bDPP = b->dpp(); - return aDPP.pass_flags == bDPP.pass_flags && - aDPP.dpp_ctrl == bDPP.dpp_ctrl && - aDPP.bank_mask == bDPP.bank_mask && - aDPP.row_mask == bDPP.row_mask && - aDPP.bound_ctrl == bDPP.bound_ctrl && - aDPP.abs[0] == bDPP.abs[0] && - aDPP.abs[1] == bDPP.abs[1] && - aDPP.neg[0] == bDPP.neg[0] && + return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl && + aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask && + aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.abs[0] == bDPP.abs[0] && + aDPP.abs[1] == bDPP.abs[1] && aDPP.neg[0] == bDPP.neg[0] && aDPP.neg[1] == bDPP.neg[1]; } if (a->isSDWA()) { SDWA_instruction& aSDWA = a->sdwa(); SDWA_instruction& bSDWA = b->sdwa(); - return aSDWA.sel[0] == bSDWA.sel[0] && - aSDWA.sel[1] == bSDWA.sel[1] && - aSDWA.dst_sel == bSDWA.dst_sel && - aSDWA.abs[0] == bSDWA.abs[0] && - aSDWA.abs[1] == bSDWA.abs[1] && - aSDWA.neg[0] == bSDWA.neg[0] && - aSDWA.neg[1] == bSDWA.neg[1] && - aSDWA.dst_preserve == bSDWA.dst_preserve && - aSDWA.clamp == bSDWA.clamp && - aSDWA.omod == bSDWA.omod; + return aSDWA.sel[0] == bSDWA.sel[0] && aSDWA.sel[1] == bSDWA.sel[1] && + aSDWA.dst_sel == bSDWA.dst_sel && aSDWA.abs[0] == bSDWA.abs[0] && + aSDWA.abs[1] == bSDWA.abs[1] && aSDWA.neg[0] == bSDWA.neg[0] && + aSDWA.neg[1] == bSDWA.neg[1] && aSDWA.dst_preserve == bSDWA.dst_preserve && + aSDWA.clamp == bSDWA.clamp && aSDWA.omod == bSDWA.omod; } switch (a->format) { - case Format::SOPK: { - if (a->opcode == aco_opcode::s_getreg_b32) + case Format::SOPK: { + if (a->opcode == aco_opcode::s_getreg_b32) + return false; + SOPK_instruction& aK = a->sopk(); + SOPK_instruction& bK = b->sopk(); + return aK.imm == bK.imm; + } + case Format::SMEM: { + SMEM_instruction& aS = a->smem(); + SMEM_instruction& bS = b->smem(); + /* isel shouldn't be creating situations where this assertion fails */ + assert(aS.prevent_overflow == bS.prevent_overflow); + return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv && + aS.disable_wqm == bS.disable_wqm && aS.prevent_overflow == bS.prevent_overflow; + } + case Format::VINTRP: { + Interp_instruction& aI = a->vintrp(); + Interp_instruction& bI = b->vintrp(); + if (aI.attribute != bI.attribute) + return false; + if (aI.component != bI.component) + return false; + return true; + } + case Format::VOP3P: { + VOP3P_instruction& a3P = a->vop3p(); + VOP3P_instruction& b3P = b->vop3p(); + for (unsigned i = 0; i < 3; i++) { + if (a3P.neg_lo[i] != b3P.neg_lo[i] || a3P.neg_hi[i] != b3P.neg_hi[i]) return false; - SOPK_instruction& aK = a->sopk(); - SOPK_instruction& bK = b->sopk(); - return aK.imm == bK.imm; } - case Format::SMEM: { - SMEM_instruction& aS = a->smem(); - SMEM_instruction& bS = b->smem(); - /* isel shouldn't be creating situations where this assertion fails */ - assert(aS.prevent_overflow == bS.prevent_overflow); - return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && - aS.nv == bS.nv && aS.disable_wqm == bS.disable_wqm && - aS.prevent_overflow == bS.prevent_overflow; - } - case Format::VINTRP: { - Interp_instruction& aI = a->vintrp(); - Interp_instruction& bI = b->vintrp(); - if (aI.attribute != bI.attribute) - return false; - if (aI.component != bI.component) - return false; - return true; - } - case Format::VOP3P: { - 
VOP3P_instruction& a3P = a->vop3p(); - VOP3P_instruction& b3P = b->vop3p(); - for (unsigned i = 0; i < 3; i++) { - if (a3P.neg_lo[i] != b3P.neg_lo[i] || - a3P.neg_hi[i] != b3P.neg_hi[i]) - return false; - } - return a3P.opsel_lo == b3P.opsel_lo && - a3P.opsel_hi == b3P.opsel_hi && - a3P.clamp == b3P.clamp; - } - case Format::PSEUDO_REDUCTION: { - Pseudo_reduction_instruction& aR = a->reduction(); - Pseudo_reduction_instruction& bR = b->reduction(); - return aR.pass_flags == bR.pass_flags && - aR.reduce_op == bR.reduce_op && - aR.cluster_size == bR.cluster_size; - } - case Format::DS: { - assert(a->opcode == aco_opcode::ds_bpermute_b32 || - a->opcode == aco_opcode::ds_permute_b32 || - a->opcode == aco_opcode::ds_swizzle_b32); - DS_instruction& aD = a->ds(); - DS_instruction& bD = b->ds(); - return aD.sync == bD.sync && - aD.pass_flags == bD.pass_flags && - aD.gds == bD.gds && - aD.offset0 == bD.offset0 && - aD.offset1 == bD.offset1; - } - case Format::MTBUF: { - MTBUF_instruction& aM = a->mtbuf(); - MTBUF_instruction& bM = b->mtbuf(); - return aM.sync == bM.sync && - aM.dfmt == bM.dfmt && - aM.nfmt == bM.nfmt && - aM.offset == bM.offset && - aM.offen == bM.offen && - aM.idxen == bM.idxen && - aM.glc == bM.glc && - aM.dlc == bM.dlc && - aM.slc == bM.slc && - aM.tfe == bM.tfe && - aM.disable_wqm == bM.disable_wqm; - } - case Format::MUBUF: { - MUBUF_instruction& aM = a->mubuf(); - MUBUF_instruction& bM = b->mubuf(); - return aM.sync == bM.sync && - aM.offset == bM.offset && - aM.offen == bM.offen && - aM.idxen == bM.idxen && - aM.glc == bM.glc && - aM.dlc == bM.dlc && - aM.slc == bM.slc && - aM.tfe == bM.tfe && - aM.lds == bM.lds && - aM.disable_wqm == bM.disable_wqm; - } - case Format::MIMG: { - MIMG_instruction& aM = a->mimg(); - MIMG_instruction& bM = b->mimg(); - return aM.sync == bM.sync && - aM.dmask == bM.dmask && - aM.unrm == bM.unrm && - aM.glc == bM.glc && - aM.slc == bM.slc && - aM.tfe == bM.tfe && - aM.da == bM.da && - aM.lwe == bM.lwe && - aM.r128 == bM.r128 && - aM.a16 == bM.a16 && - aM.d16 == bM.d16 && - aM.disable_wqm == bM.disable_wqm; - } - case Format::FLAT: - case Format::GLOBAL: - case Format::SCRATCH: - case Format::EXP: - case Format::SOPP: - case Format::PSEUDO_BRANCH: - case Format::PSEUDO_BARRIER: - assert(false); - default: - return true; + return a3P.opsel_lo == b3P.opsel_lo && a3P.opsel_hi == b3P.opsel_hi && + a3P.clamp == b3P.clamp; + } + case Format::PSEUDO_REDUCTION: { + Pseudo_reduction_instruction& aR = a->reduction(); + Pseudo_reduction_instruction& bR = b->reduction(); + return aR.pass_flags == bR.pass_flags && aR.reduce_op == bR.reduce_op && + aR.cluster_size == bR.cluster_size; + } + case Format::DS: { + assert(a->opcode == aco_opcode::ds_bpermute_b32 || + a->opcode == aco_opcode::ds_permute_b32 || a->opcode == aco_opcode::ds_swizzle_b32); + DS_instruction& aD = a->ds(); + DS_instruction& bD = b->ds(); + return aD.sync == bD.sync && aD.pass_flags == bD.pass_flags && aD.gds == bD.gds && + aD.offset0 == bD.offset0 && aD.offset1 == bD.offset1; + } + case Format::MTBUF: { + MTBUF_instruction& aM = a->mtbuf(); + MTBUF_instruction& bM = b->mtbuf(); + return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt && + aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen && + aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe && + aM.disable_wqm == bM.disable_wqm; + } + case Format::MUBUF: { + MUBUF_instruction& aM = a->mubuf(); + MUBUF_instruction& bM = b->mubuf(); + return aM.sync == bM.sync && aM.offset == 
bM.offset && aM.offen == bM.offen && + aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && + aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm; + } + case Format::MIMG: { + MIMG_instruction& aM = a->mimg(); + MIMG_instruction& bM = b->mimg(); + return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm && + aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da && + aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 && + aM.disable_wqm == bM.disable_wqm; + } + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + case Format::EXP: + case Format::SOPP: + case Format::PSEUDO_BRANCH: + case Format::PSEUDO_BARRIER: assert(false); + default: return true; } } }; @@ -345,7 +289,8 @@ struct vn_ctx { */ uint32_t exec_id = 1; - vn_ctx(Program* program_) : program(program_) { + vn_ctx(Program* program_) : program(program_) + { static_assert(sizeof(Temp) == 4, "Temp must fit in 32bits"); unsigned size = 0; for (Block& block : program->blocks) @@ -354,11 +299,11 @@ struct vn_ctx { } }; - /* dominates() returns true if the parent block dominates the child block and * if the parent block is part of the same loop or has a smaller loop nest depth. */ -bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child) +bool +dominates(vn_ctx& ctx, uint32_t parent, uint32_t child) { unsigned parent_loop_nest_depth = ctx.program->blocks[parent].loop_nest_depth; while (parent < child && parent_loop_nest_depth <= ctx.program->blocks[child].loop_nest_depth) @@ -375,42 +320,40 @@ bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child) * Note that expr_set must not be used with instructions * which cannot be eliminated. */ -bool can_eliminate(aco_ptr& instr) +bool +can_eliminate(aco_ptr& instr) { switch (instr->format) { - case Format::FLAT: - case Format::GLOBAL: - case Format::SCRATCH: - case Format::EXP: - case Format::SOPP: - case Format::PSEUDO_BRANCH: - case Format::PSEUDO_BARRIER: + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + case Format::EXP: + case Format::SOPP: + case Format::PSEUDO_BRANCH: + case Format::PSEUDO_BARRIER: return false; + case Format::DS: + return instr->opcode == aco_opcode::ds_bpermute_b32 || + instr->opcode == aco_opcode::ds_permute_b32 || + instr->opcode == aco_opcode::ds_swizzle_b32; + case Format::SMEM: + case Format::MUBUF: + case Format::MIMG: + case Format::MTBUF: + if (!get_sync_info(instr.get()).can_reorder()) return false; - case Format::DS: - return instr->opcode == aco_opcode::ds_bpermute_b32 || - instr->opcode == aco_opcode::ds_permute_b32 || - instr->opcode == aco_opcode::ds_swizzle_b32; - case Format::SMEM: - case Format::MUBUF: - case Format::MIMG: - case Format::MTBUF: - if (!get_sync_info(instr.get()).can_reorder()) - return false; - break; - default: - break; + break; + default: break; } - if (instr->definitions.empty() || - instr->opcode == aco_opcode::p_phi || - instr->opcode == aco_opcode::p_linear_phi || - instr->definitions[0].isNoCSE()) + if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi || + instr->opcode == aco_opcode::p_linear_phi || instr->definitions[0].isNoCSE()) return false; return true; } -void process_block(vn_ctx& ctx, Block& block) +void +process_block(vn_ctx& ctx, Block& block) { std::vector> new_instructions; new_instructions.reserve(block.instructions.size()); @@ -435,8 +378,9 @@ void process_block(vn_ctx& ctx, Block& block) } /* simple copy-propagation through renaming */ - bool 
copy_instr = instr->opcode == aco_opcode::p_parallelcopy || - (instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1); + bool copy_instr = + instr->opcode == aco_opcode::p_parallelcopy || + (instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1); if (copy_instr && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() && instr->operands[0].regClass() == instr->definitions[0].regClass()) { ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp(); @@ -479,7 +423,8 @@ void process_block(vn_ctx& ctx, Block& block) block.instructions = std::move(new_instructions); } -void rename_phi_operands(Block& block, std::map& renames) +void +rename_phi_operands(Block& block, std::map& renames) { for (aco_ptr& phi : block.instructions) { if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) @@ -496,8 +441,8 @@ void rename_phi_operands(Block& block, std::map& renames) } } /* end namespace */ - -void value_numbering(Program* program) +void +value_numbering(Program* program) { vn_ctx ctx(program); std::vector loop_headers; @@ -521,10 +466,8 @@ void value_numbering(Program* program) rename_phi_operands(block, ctx.renames); /* increment exec_id when entering nested control flow */ - if (block.kind & block_kind_branch || - block.kind & block_kind_loop_preheader || - block.kind & block_kind_break || - block.kind & block_kind_continue || + if (block.kind & block_kind_branch || block.kind & block_kind_loop_preheader || + block.kind & block_kind_break || block.kind & block_kind_continue || block.kind & block_kind_discard) ctx.exec_id++; else if (block.kind & block_kind_continue_or_break) @@ -538,4 +481,4 @@ void value_numbering(Program* program) } } -} +} // namespace aco diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 9ec30d6a0c1..da0769ec301 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -23,6 +23,7 @@ */ #include "aco_ir.h" + #include "util/half_float.h" #include "util/memstream.h" @@ -33,14 +34,15 @@ namespace aco { #ifndef NDEBUG -void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr) +void +perfwarn(Program* program, bool cond, const char* msg, Instruction* instr) { if (cond) { - char *out; + char* out; size_t outsize; struct u_memstream mem; u_memstream_open(&mem, &out, &outsize); - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); fprintf(memf, "%s: ", msg); aco_print_instr(instr, memf); @@ -69,7 +71,6 @@ void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr) * instructions are removed from the sequence. 
*/ - struct mad_info { aco_ptr add_instr; uint32_t mul_temp_id; @@ -77,7 +78,8 @@ struct mad_info { bool check_literal; mad_info(aco_ptr instr, uint32_t id) - : add_instr(std::move(instr)), mul_temp_id(id), literal_idx(0), check_literal(false) {} + : add_instr(std::move(instr)), mul_temp_id(id), literal_idx(0), check_literal(false) + {} }; enum Label { @@ -112,22 +114,25 @@ enum Label { label_b2i = 1 << 27, label_fcanonicalize = 1 << 28, label_constant_16bit = 1 << 29, - label_usedef = 1 << 30, /* generic label */ + label_usedef = 1 << 30, /* generic label */ label_vop3p = 1ull << 31, /* 1ull to prevent sign extension */ label_canonicalized = 1ull << 32, label_extract = 1ull << 33, label_insert = 1ull << 34, }; -static constexpr uint64_t instr_usedef_labels = label_vec | label_mul | label_mad | label_add_sub | label_vop3p | - label_bitwise | label_uniform_bitwise | label_minmax | label_vopc | - label_usedef | label_extract; -static constexpr uint64_t instr_mod_labels = label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert; +static constexpr uint64_t instr_usedef_labels = + label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise | + label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract; +static constexpr uint64_t instr_mod_labels = + label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert; static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels; -static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | - label_scc_invert | label_b2i | label_fcanonicalize; -static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal; +static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | + label_uniform_bool | label_scc_invert | label_b2i | + label_fcanonicalize; +static constexpr uint32_t val_labels = + label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal; static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect"); static_assert((instr_labels & val_labels) == 0, "labels cannot intersect"); @@ -161,7 +166,8 @@ struct ssa_info { label &= ~(instr_labels | val_labels); /* instr, temp and val alias */ } - uint32_t const_labels = label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit; + uint32_t const_labels = + label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit; if (new_label & const_labels) { label &= ~val_labels | const_labels; label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */ @@ -179,10 +185,7 @@ struct ssa_info { instr = vec; } - bool is_vec() - { - return label & label_vec; - } + bool is_vec() { return label & label_vec; } void set_constant(chip_class chip, uint64_t constant) { @@ -210,14 +213,10 @@ struct ssa_info { bool is_constant(unsigned bits) { switch (bits) { - case 8: - return label & label_literal; - case 16: - return label & label_constant_16bit; - case 32: - return label & label_constant_32bit; - case 64: - return label & label_constant_64bit; + case 8: return label & label_literal; + case 16: return label & label_constant_16bit; + case 32: return label & label_constant_32bit; + case 64: return label & label_constant_64bit; } return false; } @@ -226,14 +225,10 @@ struct ssa_info { { bool is_lit = label & label_literal; switch (bits) { - case 8: - return false; - case 16: - return is_lit && ~(label 
& label_constant_16bit); - case 32: - return is_lit && ~(label & label_constant_32bit); - case 64: - return false; + case 8: return false; + case 16: return is_lit && ~(label & label_constant_16bit); + case 32: return is_lit && ~(label & label_constant_32bit); + case 64: return false; } return false; } @@ -252,10 +247,7 @@ struct ssa_info { temp = abs_temp; } - bool is_abs() - { - return label & label_abs; - } + bool is_abs() { return label & label_abs; } void set_neg(Temp neg_temp) { @@ -263,10 +255,7 @@ struct ssa_info { temp = neg_temp; } - bool is_neg() - { - return label & label_neg; - } + bool is_neg() { return label & label_neg; } void set_neg_abs(Temp neg_abs_temp) { @@ -280,10 +269,7 @@ struct ssa_info { instr = mul; } - bool is_mul() - { - return label & label_mul; - } + bool is_mul() { return label & label_mul; } void set_temp(Temp tmp) { @@ -291,10 +277,7 @@ struct ssa_info { temp = tmp; } - bool is_temp() - { - return label & label_temp; - } + bool is_temp() { return label & label_temp; } void set_mad(Instruction* mad, uint32_t mad_info_idx) { @@ -303,10 +286,7 @@ struct ssa_info { instr = mad; } - bool is_mad() - { - return label & label_mad; - } + bool is_mad() { return label & label_mad; } void set_omod2(Instruction* mul) { @@ -314,10 +294,7 @@ struct ssa_info { instr = mul; } - bool is_omod2() - { - return label & label_omod2; - } + bool is_omod2() { return label & label_omod2; } void set_omod4(Instruction* mul) { @@ -325,10 +302,7 @@ struct ssa_info { instr = mul; } - bool is_omod4() - { - return label & label_omod4; - } + bool is_omod4() { return label & label_omod4; } void set_omod5(Instruction* mul) { @@ -336,31 +310,19 @@ struct ssa_info { instr = mul; } - bool is_omod5() - { - return label & label_omod5; - } + bool is_omod5() { return label & label_omod5; } - void set_clamp(Instruction *med3) + void set_clamp(Instruction* med3) { add_label(label_clamp); instr = med3; } - bool is_clamp() - { - return label & label_clamp; - } + bool is_clamp() { return label & label_clamp; } - void set_undefined() - { - add_label(label_undefined); - } + void set_undefined() { add_label(label_undefined); } - bool is_undefined() - { - return label & label_undefined; - } + bool is_undefined() { return label & label_undefined; } void set_vcc(Temp vcc_val) { @@ -368,10 +330,7 @@ struct ssa_info { temp = vcc_val; } - bool is_vcc() - { - return label & label_vcc; - } + bool is_vcc() { return label & label_vcc; } void set_b2f(Temp b2f_val) { @@ -379,74 +338,47 @@ struct ssa_info { temp = b2f_val; } - bool is_b2f() - { - return label & label_b2f; - } + bool is_b2f() { return label & label_b2f; } - void set_add_sub(Instruction *add_sub_instr) + void set_add_sub(Instruction* add_sub_instr) { add_label(label_add_sub); instr = add_sub_instr; } - bool is_add_sub() - { - return label & label_add_sub; - } + bool is_add_sub() { return label & label_add_sub; } - void set_bitwise(Instruction *bitwise_instr) + void set_bitwise(Instruction* bitwise_instr) { add_label(label_bitwise); instr = bitwise_instr; } - bool is_bitwise() - { - return label & label_bitwise; - } + bool is_bitwise() { return label & label_bitwise; } - void set_uniform_bitwise() - { - add_label(label_uniform_bitwise); - } + void set_uniform_bitwise() { add_label(label_uniform_bitwise); } - bool is_uniform_bitwise() - { - return label & label_uniform_bitwise; - } + bool is_uniform_bitwise() { return label & label_uniform_bitwise; } - void set_minmax(Instruction *minmax_instr) + void set_minmax(Instruction* minmax_instr) { 
add_label(label_minmax); instr = minmax_instr; } - bool is_minmax() - { - return label & label_minmax; - } + bool is_minmax() { return label & label_minmax; } - void set_vopc(Instruction *vopc_instr) + void set_vopc(Instruction* vopc_instr) { add_label(label_vopc); instr = vopc_instr; } - bool is_vopc() - { - return label & label_vopc; - } + bool is_vopc() { return label & label_vopc; } - void set_scc_needed() - { - add_label(label_scc_needed); - } + void set_scc_needed() { add_label(label_scc_needed); } - bool is_scc_needed() - { - return label & label_scc_needed; - } + bool is_scc_needed() { return label & label_scc_needed; } void set_scc_invert(Temp scc_inv) { @@ -454,10 +386,7 @@ struct ssa_info { temp = scc_inv; } - bool is_scc_invert() - { - return label & label_scc_invert; - } + bool is_scc_invert() { return label & label_scc_invert; } void set_uniform_bool(Temp uniform_bool) { @@ -465,20 +394,11 @@ struct ssa_info { temp = uniform_bool; } - bool is_uniform_bool() - { - return label & label_uniform_bool; - } + bool is_uniform_bool() { return label & label_uniform_bool; } - void set_vcc_hint() - { - add_label(label_vcc_hint); - } + void set_vcc_hint() { add_label(label_vcc_hint); } - bool is_vcc_hint() - { - return label & label_vcc_hint; - } + bool is_vcc_hint() { return label & label_vcc_hint; } void set_b2i(Temp b2i_val) { @@ -486,21 +406,15 @@ struct ssa_info { temp = b2i_val; } - bool is_b2i() - { - return label & label_b2i; - } + bool is_b2i() { return label & label_b2i; } - void set_usedef(Instruction *label_instr) + void set_usedef(Instruction* label_instr) { add_label(label_usedef); instr = label_instr; } - bool is_usedef() - { - return label & label_usedef; - } + bool is_usedef() { return label & label_usedef; } void set_vop3p(Instruction* vop3p_instr) { @@ -508,10 +422,7 @@ struct ssa_info { instr = vop3p_instr; } - bool is_vop3p() - { - return label & label_vop3p; - } + bool is_vop3p() { return label & label_vop3p; } void set_fcanonicalize(Temp tmp) { @@ -519,42 +430,27 @@ struct ssa_info { temp = tmp; } - bool is_fcanonicalize() - { - return label & label_fcanonicalize; - } + bool is_fcanonicalize() { return label & label_fcanonicalize; } - void set_canonicalized() - { - add_label(label_canonicalized); - } + void set_canonicalized() { add_label(label_canonicalized); } - bool is_canonicalized() - { - return label & label_canonicalized; - } + bool is_canonicalized() { return label & label_canonicalized; } - void set_extract(Instruction *extract) + void set_extract(Instruction* extract) { add_label(label_extract); instr = extract; } - bool is_extract() - { - return label & label_extract; - } + bool is_extract() { return label & label_extract; } - void set_insert(Instruction *insert) + void set_insert(Instruction* insert) { add_label(label_insert); instr = insert; } - bool is_insert() - { - return label & label_insert; - } + bool is_insert() { return label & label_insert; } }; struct opt_ctx { @@ -562,7 +458,7 @@ struct opt_ctx { float_mode fp_mode; std::vector> instructions; ssa_info* info; - std::pair last_literal; + std::pair last_literal; std::vector mad_infos; std::vector uses; }; @@ -577,9 +473,10 @@ struct CmpInfo { unsigned size; }; -ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info); +ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo* info); -bool can_swap_operands(aco_ptr& instr) +bool +can_swap_operands(aco_ptr& instr) { if (instr->operands[0].isConstant() || (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr)) @@ 
-612,23 +509,12 @@ bool can_swap_operands(aco_ptr& instr) case aco_opcode::v_max_i16_e64: case aco_opcode::v_min_i16_e64: case aco_opcode::v_max_u16_e64: - case aco_opcode::v_min_u16_e64: - return true; - case aco_opcode::v_sub_f16: - instr->opcode = aco_opcode::v_subrev_f16; - return true; - case aco_opcode::v_sub_f32: - instr->opcode = aco_opcode::v_subrev_f32; - return true; - case aco_opcode::v_sub_co_u32: - instr->opcode = aco_opcode::v_subrev_co_u32; - return true; - case aco_opcode::v_sub_u16: - instr->opcode = aco_opcode::v_subrev_u16; - return true; - case aco_opcode::v_sub_u32: - instr->opcode = aco_opcode::v_subrev_u32; - return true; + case aco_opcode::v_min_u16_e64: return true; + case aco_opcode::v_sub_f16: instr->opcode = aco_opcode::v_subrev_f16; return true; + case aco_opcode::v_sub_f32: instr->opcode = aco_opcode::v_subrev_f32; return true; + case aco_opcode::v_sub_co_u32: instr->opcode = aco_opcode::v_subrev_co_u32; return true; + case aco_opcode::v_sub_u16: instr->opcode = aco_opcode::v_subrev_u16; return true; + case aco_opcode::v_sub_u32: instr->opcode = aco_opcode::v_subrev_u32; return true; default: { CmpInfo info; get_cmp_info(instr->opcode, &info); @@ -645,7 +531,8 @@ bool can_swap_operands(aco_ptr& instr) } } -bool can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) +bool +can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) { if (instr->isVOP3()) return true; @@ -659,36 +546,34 @@ bool can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) if (instr->isDPP() || instr->isSDWA()) return false; - return instr->opcode != aco_opcode::v_madmk_f32 && - instr->opcode != aco_opcode::v_madak_f32 && - instr->opcode != aco_opcode::v_madmk_f16 && - instr->opcode != aco_opcode::v_madak_f16 && - instr->opcode != aco_opcode::v_fmamk_f32 && - instr->opcode != aco_opcode::v_fmaak_f32 && - instr->opcode != aco_opcode::v_fmamk_f16 && - instr->opcode != aco_opcode::v_fmaak_f16 && + return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && + instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && + instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 && + instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 && instr->opcode != aco_opcode::v_readlane_b32 && instr->opcode != aco_opcode::v_writelane_b32 && instr->opcode != aco_opcode::v_readfirstlane_b32; } -bool pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, - Temp temp, unsigned index) +bool +pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, Temp temp, unsigned index) { if (instr->definitions.empty()) return false; - const bool vgpr = instr->opcode == aco_opcode::p_as_uniform || - std::all_of(instr->definitions.begin(), instr->definitions.end(), - [] (const Definition& def) { return def.regClass().type() == RegType::vgpr;}); + const bool vgpr = + instr->opcode == aco_opcode::p_as_uniform || + std::all_of(instr->definitions.begin(), instr->definitions.end(), + [](const Definition& def) { return def.regClass().type() == RegType::vgpr; }); /* don't propagate VGPRs into SGPR instructions */ if (temp.type() == RegType::vgpr && !vgpr) return false; - bool can_accept_sgpr = ctx.program->chip_class >= GFX9 || - std::none_of(instr->definitions.begin(), instr->definitions.end(), - [] (const Definition& def) { return def.regClass().is_subdword();}); + bool can_accept_sgpr = + ctx.program->chip_class >= GFX9 || + std::none_of(instr->definitions.begin(), instr->definitions.end(), + [](const Definition& def) { return 
def.regClass().is_subdword(); }); switch (instr->opcode) { case aco_opcode::p_phi: @@ -725,15 +610,15 @@ bool pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, if (temp.regClass() == instr->definitions[0].regClass()) instr->opcode = aco_opcode::p_parallelcopy; break; - default: - return false; + default: return false; } instr->operands[index].setTemp(temp); return true; } -bool can_apply_sgprs(opt_ctx& ctx, aco_ptr& instr) +bool +can_apply_sgprs(opt_ctx& ctx, aco_ptr& instr) { if ((instr->isSDWA() && ctx.program->chip_class < GFX9) || instr->isDPP()) return false; @@ -746,14 +631,16 @@ bool can_apply_sgprs(opt_ctx& ctx, aco_ptr& instr) instr->opcode != aco_opcode::v_permlanex16_b32; } -void to_VOP3(opt_ctx& ctx, aco_ptr& instr) +void +to_VOP3(opt_ctx& ctx, aco_ptr& instr) { if (instr->isVOP3()) return; aco_ptr tmp = std::move(instr); Format format = asVOP3(tmp->format); - instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), + tmp->definitions.size())); std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); for (unsigned i = 0; i < instr->definitions.size(); i++) { instr->definitions[i] = tmp->definitions[i]; @@ -767,12 +654,14 @@ void to_VOP3(opt_ctx& ctx, aco_ptr& instr) * been applied yet or this instruction isn't dead and so they've been ignored */ } -bool is_operand_vgpr(Operand op) +bool +is_operand_vgpr(Operand op) { return op.isTemp() && op.getTemp().type() == RegType::vgpr; } -void to_SDWA(opt_ctx& ctx, aco_ptr& instr) +void +to_SDWA(opt_ctx& ctx, aco_ptr& instr) { aco_ptr tmp = convert_to_SDWA(ctx.program->chip_class, instr); if (!tmp) @@ -786,15 +675,15 @@ void to_SDWA(opt_ctx& ctx, aco_ptr& instr) } /* only covers special cases */ -bool alu_can_accept_constant(aco_opcode opcode, unsigned operand) +bool +alu_can_accept_constant(aco_opcode opcode, unsigned operand) { switch (opcode) { case aco_opcode::v_interp_p2_f32: case aco_opcode::v_mac_f32: case aco_opcode::v_writelane_b32: case aco_opcode::v_writelane_b32_e64: - case aco_opcode::v_cndmask_b32: - return operand != 2; + case aco_opcode::v_cndmask_b32: return operand != 2; case aco_opcode::s_addk_i32: case aco_opcode::s_mulk_i32: case aco_opcode::p_wqm: @@ -804,25 +693,28 @@ bool alu_can_accept_constant(aco_opcode opcode, unsigned operand) case aco_opcode::v_readlane_b32_e64: case aco_opcode::v_readfirstlane_b32: case aco_opcode::p_extract: - case aco_opcode::p_insert: - return operand != 0; - default: - return true; + case aco_opcode::p_insert: return operand != 0; + default: return true; } } -bool valu_can_accept_vgpr(aco_ptr& instr, unsigned operand) +bool +valu_can_accept_vgpr(aco_ptr& instr, unsigned operand) { - if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 || - instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) + if (instr->opcode == aco_opcode::v_readlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32_e64 || + instr->opcode == aco_opcode::v_writelane_b32 || + instr->opcode == aco_opcode::v_writelane_b32_e64) return operand != 1; - if (instr->opcode == aco_opcode::v_permlane16_b32 || instr->opcode == aco_opcode::v_permlanex16_b32) + if (instr->opcode == aco_opcode::v_permlane16_b32 || + instr->opcode == aco_opcode::v_permlanex16_b32) return operand == 0; return true; } /* check constant bus and literal limitations */ -bool check_vop3_operands(opt_ctx& ctx, unsigned 
num_operands, Operand *operands) +bool +check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands) { int limit = ctx.program->chip_class >= GFX10 ? 2 : 1; Operand literal32(s1); @@ -869,7 +761,9 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) return true; } -bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow) +bool +parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset, + bool prevent_overflow) { Operand op = instr->operands[op_index]; @@ -879,17 +773,15 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp if (!ctx.info[tmp.id()].is_add_sub()) return false; - Instruction *add_instr = ctx.info[tmp.id()].instr; + Instruction* add_instr = ctx.info[tmp.id()].instr; switch (add_instr->opcode) { case aco_opcode::v_add_u32: case aco_opcode::v_add_co_u32: case aco_opcode::v_add_co_u32_e64: case aco_opcode::s_add_i32: - case aco_opcode::s_add_u32: - break; - default: - return false; + case aco_opcode::s_add_u32: break; + default: return false; } if (prevent_overflow && !add_instr->definitions[0].isNUW()) return false; @@ -921,11 +813,13 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp return false; } -unsigned get_operand_size(aco_ptr& instr, unsigned index) +unsigned +get_operand_size(aco_ptr& instr, unsigned index) { if (instr->isPseudo()) return instr->operands[index].bytes() * 8u; - else if (instr->opcode == aco_opcode::v_mad_u64_u32 || instr->opcode == aco_opcode::v_mad_i64_i32) + else if (instr->opcode == aco_opcode::v_mad_u64_u32 || + instr->opcode == aco_opcode::v_mad_i64_i32) return index == 2 ? 64 : 32; else if (instr->isVALU() || instr->isSALU()) return instr_info.operand_size[(int)instr->opcode]; @@ -933,19 +827,22 @@ unsigned get_operand_size(aco_ptr& instr, unsigned index) return 0; } -Operand get_constant_op(opt_ctx &ctx, ssa_info info, uint32_t bits) +Operand +get_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits) { if (bits == 64) return Operand(info.val, true); return Operand::get_const(ctx.program->chip_class, info.val, bits / 8u); } -bool fixed_to_exec(Operand op) +bool +fixed_to_exec(Operand op) { return op.isFixed() && op.physReg() == exec; } -int parse_extract(Instruction *instr) +int +parse_extract(Instruction* instr) { if (instr->opcode == aco_opcode::p_extract) { bool is_byte = instr->operands[2].constantEquals(8); @@ -961,7 +858,8 @@ int parse_extract(Instruction *instr) } } -int parse_insert(Instruction *instr) +int +parse_insert(Instruction* instr) { if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) && instr->operands[1].constantEquals(0)) { @@ -976,7 +874,8 @@ int parse_insert(Instruction *instr) } } -bool can_apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, ssa_info& info) +bool +can_apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& info) { if (idx >= 2) return false; @@ -990,7 +889,8 @@ bool can_apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, return true; } else if (can_use_SDWA(ctx.program->chip_class, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) { - if (instr->isSDWA() && (static_cast(instr.get())->sel[idx] & sdwa_asuint) != sdwa_udword) + if (instr->isSDWA() && + (static_cast(instr.get())->sel[idx] & sdwa_asuint) != sdwa_udword) return false; return true; } else if (instr->isVOP3() && (sel & sdwa_isword) && @@ 
-1005,7 +905,8 @@ bool can_apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, /* Combine an p_extract (or p_insert, in some cases) instruction with instr. * instr(p_extract(...)) -> instr() */ -void apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, ssa_info& info) +void +apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& info) { Temp tmp = info.instr->operands[0].getTemp(); unsigned sel = parse_extract(info.instr); @@ -1013,18 +914,10 @@ void apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, ssa_ if (sel == sdwa_udword || sel == sdwa_sdword) { } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel <= sdwa_ubyte3) { switch (sel) { - case sdwa_ubyte0: - instr->opcode = aco_opcode::v_cvt_f32_ubyte0; - break; - case sdwa_ubyte1: - instr->opcode = aco_opcode::v_cvt_f32_ubyte1; - break; - case sdwa_ubyte2: - instr->opcode = aco_opcode::v_cvt_f32_ubyte2; - break; - case sdwa_ubyte3: - instr->opcode = aco_opcode::v_cvt_f32_ubyte3; - break; + case sdwa_ubyte0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; + case sdwa_ubyte1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; + case sdwa_ubyte2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; + case sdwa_ubyte3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; } } else if (can_use_SDWA(ctx.program->chip_class, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) { @@ -1041,7 +934,8 @@ void apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, ssa_ ctx.info[def.tempId()].label &= label_vopc; } -void check_sdwa_extract(opt_ctx &ctx, aco_ptr& instr) +void +check_sdwa_extract(opt_ctx& ctx, aco_ptr& instr) { /* only VALU can use SDWA */ if (!instr->isVALU()) @@ -1060,7 +954,8 @@ void check_sdwa_extract(opt_ctx &ctx, aco_ptr& instr) } } -bool does_fp_op_flush_denorms(opt_ctx &ctx, aco_opcode op) +bool +does_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op) { if (ctx.program->chip_class <= GFX8) { switch (op) { @@ -1070,18 +965,17 @@ bool does_fp_op_flush_denorms(opt_ctx &ctx, aco_opcode op) case aco_opcode::v_min3_f32: case aco_opcode::v_max3_f32: case aco_opcode::v_min_f16: - case aco_opcode::v_max_f16: - return false; - default: - break; + case aco_opcode::v_max_f16: return false; + default: break; } } return op != aco_opcode::v_cndmask_b32; } -bool can_eliminate_fcanonicalize(opt_ctx &ctx, aco_ptr& instr, Temp tmp) +bool +can_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr& instr, Temp tmp) { - float_mode *fp = &ctx.fp_mode; + float_mode* fp = &ctx.fp_mode; if (ctx.info[tmp.id()].is_canonicalized() || (tmp.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep) return true; @@ -1090,14 +984,17 @@ bool can_eliminate_fcanonicalize(opt_ctx &ctx, aco_ptr& instr, Temp return instr_info.can_use_input_modifiers[(int)op] && does_fp_op_flush_denorms(ctx, op); } -bool is_copy_label(opt_ctx &ctx, aco_ptr& instr, ssa_info& info) +bool +is_copy_label(opt_ctx& ctx, aco_ptr& instr, ssa_info& info) { - return info.is_temp() || (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp)); + return info.is_temp() || + (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp)); } -bool is_op_canonicalized(opt_ctx &ctx, Operand op) +bool +is_op_canonicalized(opt_ctx& ctx, Operand op) { - float_mode *fp = &ctx.fp_mode; + float_mode* fp = &ctx.fp_mode; if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) || (op.bytes() == 4 ? 
fp->denorm32 : fp->denorm16_64) == fp_denorm_keep) return true; @@ -1112,22 +1009,24 @@ bool is_op_canonicalized(opt_ctx &ctx, Operand op) return false; } -void label_instruction(opt_ctx &ctx, aco_ptr& instr) +void +label_instruction(opt_ctx& ctx, aco_ptr& instr) { if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) { ASSERTED bool all_const = false; for (Operand& op : instr->operands) - all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32)); + all_const = + all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32)); perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get()); ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 || instr->opcode == aco_opcode::s_mov_b64 || instr->opcode == aco_opcode::v_mov_b32; - perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead", instr.get()); + perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead", + instr.get()); } - for (unsigned i = 0; i < instr->operands.size(); i++) - { + for (unsigned i = 0; i < instr->operands.size(); i++) { if (!instr->operands[i].isTemp()) continue; @@ -1161,18 +1060,22 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) /* VALU: propagate neg, abs & inline constants */ else if (instr->isVALU()) { - if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) { + if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr && + valu_can_accept_vgpr(instr, i)) { instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */ - if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) && instr->operands.size() == 1) { + if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) && + instr->operands.size() == 1) { instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } - /* for instructions other than v_cndmask_b32, the size of the instruction should match the operand size */ - unsigned can_use_mod = instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4; + /* for instructions other than v_cndmask_b32, the size of the instruction should match the + * operand size */ + unsigned can_use_mod = + instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4; can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode]; if (instr->isSDWA()) @@ -1186,7 +1089,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) { instr->opcode = i ? 
aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16; instr->operands[i].setTemp(info.temp); - } else if (info.is_neg() && can_use_mod && can_eliminate_fcanonicalize(ctx, instr, info.temp)) { + } else if (info.is_neg() && can_use_mod && + can_eliminate_fcanonicalize(ctx, instr, info.temp)) { if (!instr->isDPP() && !instr->isSDWA()) to_VOP3(ctx, instr); instr->operands[i].setTemp(info.temp); @@ -1213,7 +1117,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) && (!instr->isSDWA() || ctx.program->chip_class >= GFX9)) { Operand op = get_constant_op(ctx, info, bits); - perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get()); + perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, + "v_cndmask_b32 with a constant selector", instr.get()); if (i == 0 || instr->isSDWA() || instr->isVOP3P() || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) { @@ -1248,23 +1153,28 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->chip_class < GFX9; bool saddr_prevent_overflow = mubuf.swizzled; - if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) { + if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) && + mubuf.offset + info.val < 4096) { assert(!mubuf.idxen); instr->operands[1] = Operand(v1); mubuf.offset += info.val; mubuf.offen = false; continue; } else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) { - instr->operands[2] = Operand((uint32_t) 0); + instr->operands[2] = Operand((uint32_t)0); mubuf.offset += info.val; continue; - } else if (mubuf.offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) && + } else if (mubuf.offen && i == 1 && + parse_base_offset(ctx, instr.get(), i, &base, &offset, + vaddr_prevent_overflow) && base.regClass() == v1 && mubuf.offset + offset < 4096) { assert(!mubuf.idxen); instr->operands[1].setTemp(base); mubuf.offset += offset; continue; - } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) && + } else if (i == 2 && + parse_base_offset(ctx, instr.get(), i, &base, &offset, + saddr_prevent_overflow) && base.regClass() == s1 && mubuf.offset + offset < 4096) { instr->operands[i].setTemp(base); mubuf.offset += offset; @@ -1279,17 +1189,24 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) Temp base; uint32_t offset; bool has_usable_ds_offset = ctx.program->chip_class >= GFX7; - if (has_usable_ds_offset && - i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && + if (has_usable_ds_offset && i == 0 && + parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && base.regClass() == instr->operands[i].regClass() && instr->opcode != aco_opcode::ds_swizzle_b32) { - if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 || - instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) { - unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 0x7 : 0x3; - unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 
3 : 2; + if (instr->opcode == aco_opcode::ds_write2_b32 || + instr->opcode == aco_opcode::ds_read2_b32 || + instr->opcode == aco_opcode::ds_write2_b64 || + instr->opcode == aco_opcode::ds_read2_b64) { + unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 || + instr->opcode == aco_opcode::ds_read2_b64) + ? 0x7 + : 0x3; + unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 || + instr->opcode == aco_opcode::ds_read2_b64) + ? 3 + : 2; - if ((offset & mask) == 0 && - ds.offset0 + (offset >> shifts) <= 255 && + if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 && ds.offset1 + (offset >> shifts) <= 255) { instr->operands[i].setTemp(base); ds.offset0 += offset >> shifts; @@ -1317,18 +1234,20 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) { instr->operands[i] = Operand(info.val); continue; - } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { + } else if (i == 1 && + parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) && + base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4); - if (soe && - (!ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) || - ctx.info[smem.operands.back().tempId()].val != 0)) { + if (soe && (!ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) || + ctx.info[smem.operands.back().tempId()].val != 0)) { continue; } if (soe) { smem.operands[1] = Operand(offset); smem.operands.back() = Operand(base); } else { - SMEM_instruction *new_instr = create_instruction(smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size()); + SMEM_instruction* new_instr = create_instruction( + smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size()); new_instr->operands[0] = smem.operands[0]; new_instr->operands[1] = Operand(offset); if (smem.definitions.empty()) @@ -1350,7 +1269,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) else if (instr->isBranch()) { if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) { /* Flip the branch instruction to get rid of the scc_invert instruction */ - instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z; + instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? 
aco_opcode::p_cbranch_nz + : aco_opcode::p_cbranch_z; instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp); } } @@ -1415,7 +1335,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) if (ops.size() != instr->operands.size()) { assert(ops.size() > instr->operands.size()); Definition def = instr->definitions[0]; - instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, ops.size(), 1)); + instr.reset(create_instruction(aco_opcode::p_create_vector, + Format::PSEUDO, ops.size(), 1)); for (unsigned i = 0; i < ops.size(); i++) { if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() && ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass()) @@ -1450,16 +1371,19 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) unsigned split_offset = 0; unsigned vec_offset = 0; unsigned vec_index = 0; - for (unsigned i = 0; i < instr->definitions.size(); split_offset += instr->definitions[i++].bytes()) { + for (unsigned i = 0; i < instr->definitions.size(); + split_offset += instr->definitions[i++].bytes()) { while (vec_offset < split_offset && vec_index < vec->operands.size()) vec_offset += vec->operands[vec_index++].bytes(); - if (vec_offset != split_offset || vec->operands[vec_index].bytes() != instr->definitions[i].bytes()) + if (vec_offset != split_offset || + vec->operands[vec_index].bytes() != instr->definitions[i].bytes()) continue; Operand vec_op = vec->operands[vec_index]; if (vec_op.isConstant()) { - ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class, vec_op.constantValue64()); + ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class, + vec_op.constantValue64()); } else if (vec_op.isUndefined()) { ctx.info[instr->definitions[i].tempId()].set_undefined(); } else { @@ -1493,7 +1417,9 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) /* propagate constants */ uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u); uint32_t val = (info.val >> (dst_offset * 8u)) & mask; - instr->operands[0] = Operand::get_const(ctx.program->chip_class, val, instr->definitions[0].bytes());; + instr->operands[0] = + Operand::get_const(ctx.program->chip_class, val, instr->definitions[0].bytes()); + ; } else if (index == 0 && instr->operands[0].size() == instr->definitions[0].size()) { ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); } @@ -1512,10 +1438,11 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) /* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so * duplicate the vector instead. 
*/ - Instruction *vec = ctx.info[instr->operands[0].tempId()].instr; + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; aco_ptr old_copy = std::move(instr); - instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1)); + instr.reset(create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1)); instr->definitions[0] = old_copy->definitions[0]; std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin()); for (unsigned i = 0; i < vec->operands.size(); i++) { @@ -1534,7 +1461,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) } else if (instr->usesModifiers()) { // TODO } else if (instr->operands[0].isConstant()) { - ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, instr->operands[0].constantValue64()); + ctx.info[instr->definitions[0].tempId()].set_constant( + ctx.program->chip_class, instr->operands[0].constantValue64()); } else if (instr->operands[0].isTemp()) { ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); if (ctx.info[instr->operands[0].tempId()].is_canonicalized()) @@ -1558,11 +1486,11 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) for (unsigned i = 0; i < 2; i++) { if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) { if (!instr->isDPP() && !instr->isSDWA() && - (instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */ + (instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */ instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */ bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u); - VOP3_instruction *vop3 = instr->isVOP3() ? &instr->vop3() : NULL; + VOP3_instruction* vop3 = instr->isVOP3() ? &instr->vop3() : NULL; if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod)) continue; @@ -1580,14 +1508,18 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other); } else if (uses_mods) { continue; - } else if (instr->operands[!i].constantValue() == (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */ + } else if (instr->operands[!i].constantValue() == + (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */ ctx.info[instr->operands[i].tempId()].set_omod2(instr.get()); - } else if (instr->operands[!i].constantValue() == (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */ + } else if (instr->operands[!i].constantValue() == + (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */ ctx.info[instr->operands[i].tempId()].set_omod4(instr.get()); - } else if (instr->operands[!i].constantValue() == (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */ + } else if (instr->operands[!i].constantValue() == + (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */ ctx.info[instr->operands[i].tempId()].set_omod5(instr.get()); } else if (instr->operands[!i].constantValue() == 0u && - !(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64 : ctx.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */ + !(fp16 ? 
ctx.fp_mode.preserve_signed_zero_inf_nan16_64 + : ctx.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */ ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u); } else { continue; @@ -1609,16 +1541,14 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::v_med3_f16: case aco_opcode::v_med3_f32: { /* clamp */ VOP3_instruction& vop3 = instr->vop3(); - if (vop3.abs[0] || vop3.abs[1] || vop3.abs[2] || - vop3.neg[0] || vop3.neg[1] || vop3.neg[2] || + if (vop3.abs[0] || vop3.abs[1] || vop3.abs[2] || vop3.neg[0] || vop3.neg[1] || vop3.neg[2] || vop3.omod != 0 || vop3.opsel != 0) break; unsigned idx = 0; bool found_zero = false, found_one = false; bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16; - for (unsigned i = 0; i < 3; i++) - { + for (unsigned i = 0; i < 3; i++) { if (instr->operands[i].constantEquals(0)) found_zero = true; else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */ @@ -1631,23 +1561,22 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) break; } case aco_opcode::v_cndmask_b32: - if (instr->operands[0].constantEquals(0) && - instr->operands[1].constantEquals(0xFFFFFFFF)) + if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF)) ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp()); else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0x3f800000u)) ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp()); - else if (instr->operands[0].constantEquals(0) && - instr->operands[1].constantEquals(1)) + else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1)) ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp()); ctx.info[instr->operands[2].tempId()].set_vcc_hint(); break; case aco_opcode::v_cmp_lg_u32: if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */ - instr->operands[0].constantEquals(0) && - instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_vcc()) - ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp); + instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() && + ctx.info[instr->operands[1].tempId()].is_vcc()) + ctx.info[instr->definitions[0].tempId()].set_temp( + ctx.info[instr->operands[1].tempId()].temp); break; case aco_opcode::p_linear_phi: { /* lower_bool_phis() can create phis like this */ @@ -1656,7 +1585,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) if (all_same_temp) all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass(); for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) { - if (!instr->operands[i].isTemp() || instr->operands[i].tempId() != instr->operands[0].tempId()) + if (!instr->operands[i].isTemp() || + instr->operands[i].tempId() != instr->operands[0].tempId()) all_same_temp = false; } if (all_same_temp) { @@ -1684,10 +1614,12 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::s_not_b64: if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) { ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); - ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].temp); + ctx.info[instr->definitions[1].tempId()].set_scc_invert( + ctx.info[instr->operands[0].tempId()].temp); } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) { 
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); - ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); + ctx.info[instr->definitions[1].tempId()].set_scc_invert( + ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); } ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); break; @@ -1695,21 +1627,29 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::s_and_b64: if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) { if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) { - /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a uniform bool into divergent */ - ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].temp); - ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].temp); + /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a + * uniform bool into divergent */ + ctx.info[instr->definitions[1].tempId()].set_temp( + ctx.info[instr->operands[0].tempId()].temp); + ctx.info[instr->definitions[0].tempId()].set_uniform_bool( + ctx.info[instr->operands[0].tempId()].temp); break; } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) { - /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction already produces the same SCC */ - ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); - ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); + /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction + * already produces the same SCC */ + ctx.info[instr->definitions[1].tempId()].set_temp( + ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); + ctx.info[instr->definitions[0].tempId()].set_uniform_bool( + ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); break; } else if (ctx.info[instr->operands[0].tempId()].is_vopc()) { Instruction* vopc_instr = ctx.info[instr->operands[0].tempId()].instr; - /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus already produces the same result */ + /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus + * already produces the same result */ if (vopc_instr->pass_flags == instr->pass_flags) { assert(instr->pass_flags > 0); - ctx.info[instr->definitions[0].tempId()].set_temp(vopc_instr->definitions[0].getTemp()); + ctx.info[instr->definitions[0].tempId()].set_temp( + vopc_instr->definitions[0].getTemp()); break; } } @@ -1719,8 +1659,11 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::s_or_b64: case aco_opcode::s_xor_b32: case aco_opcode::s_xor_b64: - if (std::all_of(instr->operands.begin(), instr->operands.end(), [&ctx](const Operand& op) { - return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || ctx.info[op.tempId()].is_uniform_bitwise()); + if (std::all_of(instr->operands.begin(), instr->operands.end(), + [&ctx](const Operand& op) + { + return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || + ctx.info[op.tempId()].is_uniform_bitwise()); })) { ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); } @@ -1749,8 +1692,7 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) break; case 
aco_opcode::s_cselect_b64: case aco_opcode::s_cselect_b32: - if (instr->operands[0].constantEquals((unsigned) -1) && - instr->operands[1].constantEquals(0)) { + if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) { /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */ ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp()); } @@ -1761,8 +1703,7 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) } break; case aco_opcode::p_wqm: - if (instr->operands[0].isTemp() && - ctx.info[instr->operands[0].tempId()].is_scc_invert()) { + if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) { ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); } break; @@ -1790,8 +1731,7 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) } break; } - default: - break; + default: break; } /* Don't remove label_extract if we can't apply the extract to @@ -1800,93 +1740,104 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) check_sdwa_extract(ctx, instr); } -ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info) +ALWAYS_INLINE bool +get_cmp_info(aco_opcode op, CmpInfo* info) { info->ordered = aco_opcode::num_opcodes; info->unordered = aco_opcode::num_opcodes; info->ordered_swapped = aco_opcode::num_opcodes; info->unordered_swapped = aco_opcode::num_opcodes; switch (op) { - #define CMP2(ord, unord, ord_swap, unord_swap, sz) \ - case aco_opcode::v_cmp_##ord##_f##sz:\ - case aco_opcode::v_cmp_n##unord##_f##sz:\ - info->ordered = aco_opcode::v_cmp_##ord##_f##sz;\ - info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;\ - info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz;\ - info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;\ - info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz : aco_opcode::v_cmp_n##ord##_f##sz;\ - info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 : aco_opcode::v_cmp_n##unord##_f32;\ - info->size = sz;\ + // clang-format off +#define CMP2(ord, unord, ord_swap, unord_swap, sz) \ + case aco_opcode::v_cmp_##ord##_f##sz: \ + case aco_opcode::v_cmp_n##unord##_f##sz: \ + info->ordered = aco_opcode::v_cmp_##ord##_f##sz; \ + info->unordered = aco_opcode::v_cmp_n##unord##_f##sz; \ + info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz; \ + info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz; \ + info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \ + : aco_opcode::v_cmp_n##ord##_f##sz; \ + info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? 
aco_opcode::v_cmp_##ord##_f32 \ + : aco_opcode::v_cmp_n##unord##_f32; \ + info->size = sz; \ return true; - #define CMP(ord, unord, ord_swap, unord_swap) \ - CMP2(ord, unord, ord_swap, unord_swap, 16)\ - CMP2(ord, unord, ord_swap, unord_swap, 32)\ +#define CMP(ord, unord, ord_swap, unord_swap) \ + CMP2(ord, unord, ord_swap, unord_swap, 16) \ + CMP2(ord, unord, ord_swap, unord_swap, 32) \ CMP2(ord, unord, ord_swap, unord_swap, 64) - CMP(lt, /*n*/ge, gt, /*n*/le) - CMP(eq, /*n*/lg, eq, /*n*/lg) - CMP(le, /*n*/gt, ge, /*n*/lt) - CMP(gt, /*n*/le, lt, /*n*/le) - CMP(lg, /*n*/eq, lg, /*n*/eq) - CMP(ge, /*n*/lt, le, /*n*/gt) - #undef CMP - #undef CMP2 - #define ORD_TEST(sz) \ - case aco_opcode::v_cmp_u_f##sz:\ - info->f32 = aco_opcode::v_cmp_u_f32;\ - info->inverse = aco_opcode::v_cmp_o_f##sz;\ - info->size = sz;\ - return true;\ - case aco_opcode::v_cmp_o_f##sz:\ - info->f32 = aco_opcode::v_cmp_o_f32;\ - info->inverse = aco_opcode::v_cmp_u_f##sz;\ - info->size = sz;\ + CMP(lt, /*n*/ge, gt, /*n*/le) + CMP(eq, /*n*/lg, eq, /*n*/lg) + CMP(le, /*n*/gt, ge, /*n*/lt) + CMP(gt, /*n*/le, lt, /*n*/le) + CMP(lg, /*n*/eq, lg, /*n*/eq) + CMP(ge, /*n*/lt, le, /*n*/gt) +#undef CMP +#undef CMP2 +#define ORD_TEST(sz) \ + case aco_opcode::v_cmp_u_f##sz: \ + info->f32 = aco_opcode::v_cmp_u_f32; \ + info->inverse = aco_opcode::v_cmp_o_f##sz; \ + info->size = sz; \ + return true; \ + case aco_opcode::v_cmp_o_f##sz: \ + info->f32 = aco_opcode::v_cmp_o_f32; \ + info->inverse = aco_opcode::v_cmp_u_f##sz; \ + info->size = sz; \ return true; - ORD_TEST(16) - ORD_TEST(32) - ORD_TEST(64) - #undef ORD_TEST - default: - return false; + ORD_TEST(16) + ORD_TEST(32) + ORD_TEST(64) +#undef ORD_TEST + // clang-format on + default: return false; } } -aco_opcode get_ordered(aco_opcode op) +aco_opcode +get_ordered(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes; } -aco_opcode get_unordered(aco_opcode op) +aco_opcode +get_unordered(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes; } -aco_opcode get_inverse(aco_opcode op) +aco_opcode +get_inverse(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes; } -aco_opcode get_f32_cmp(aco_opcode op) +aco_opcode +get_f32_cmp(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes; } -unsigned get_cmp_bitsize(aco_opcode op) +unsigned +get_cmp_bitsize(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? 
info.size : 0; } -bool is_cmp(aco_opcode op) +bool +is_cmp(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes; } -unsigned original_temp_id(opt_ctx &ctx, Temp tmp) +unsigned +original_temp_id(opt_ctx& ctx, Temp tmp) { if (ctx.info[tmp.id()].is_temp()) return ctx.info[tmp.id()].temp.id(); @@ -1894,7 +1845,8 @@ unsigned original_temp_id(opt_ctx &ctx, Temp tmp) return tmp.id(); } -void decrease_uses(opt_ctx &ctx, Instruction* instr) +void +decrease_uses(opt_ctx& ctx, Instruction* instr) { if (!--ctx.uses[instr->definitions[0].tempId()]) { for (const Operand& op : instr->operands) { @@ -1904,14 +1856,15 @@ void decrease_uses(opt_ctx &ctx, Instruction* instr) } } -Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false) +Instruction* +follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false) { if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels)) return nullptr; if (!ignore_uses && ctx.uses[op.tempId()] > 1) return nullptr; - Instruction *instr = ctx.info[op.tempId()].instr; + Instruction* instr = ctx.info[op.tempId()].instr; if (instr->definitions.size() == 2) { assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId()); @@ -1924,7 +1877,8 @@ Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false) /* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b) * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */ -bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) +bool +combine_ordering_test(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions[0].regClass() != ctx.program->lane_mask) return false; @@ -1936,7 +1890,7 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) bool neg[2] = {false, false}; bool abs[2] = {false, false}; uint8_t opsel = 0; - Instruction *op_instr[2]; + Instruction* op_instr[2]; Temp op[2]; unsigned bitsize = 0; @@ -1957,7 +1911,8 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) if (op_instr[i]->isVOP3()) { VOP3_instruction& vop3 = op_instr[i]->vop3(); - if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || vop3.opsel == 2) + if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || + vop3.opsel == 2) return false; neg[i] = vop3.neg[0]; abs[i] = vop3.abs[0]; @@ -1988,25 +1943,20 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) aco_opcode new_op = aco_opcode::num_opcodes; switch (bitsize) { - case 16: - new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; - break; - case 32: - new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; - break; - case 64: - new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; - break; + case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break; + case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break; + case 64: new_op = is_or ? 
aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break; } - Instruction *new_instr; + Instruction* new_instr; if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) { - VOP3_instruction *vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3_instruction* vop3 = + create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); for (unsigned i = 0; i < 2; i++) { vop3->neg[i] = neg[i]; vop3->abs[i] = abs[i]; } vop3->opsel = opsel; - new_instr = static_cast(vop3); + new_instr = static_cast(vop3); } else { new_instr = create_instruction(new_op, Format::VOPC, 2, 1); instr->definitions[0].setHint(vcc); @@ -2025,7 +1975,8 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) /* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b) * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */ -bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) +bool +combine_comparison_ordering(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions[0].regClass() != ctx.program->lane_mask) return false; @@ -2035,8 +1986,8 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32; aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; - Instruction *nan_test = follow_operand(ctx, instr->operands[0], true); - Instruction *cmp = follow_operand(ctx, instr->operands[1], true); + Instruction* nan_test = follow_operand(ctx, instr->operands[0], true); + Instruction* cmp = follow_operand(ctx, instr->operands[1], true); if (!nan_test || !cmp) return false; if (nan_test->isSDWA() || cmp->isSDWA()) @@ -2070,9 +2021,10 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) decrease_uses(ctx, cmp); aco_opcode new_op = is_or ? 
get_unordered(cmp->opcode) : get_ordered(cmp->opcode); - Instruction *new_instr; + Instruction* new_instr; if (cmp->isVOP3()) { - VOP3_instruction *new_vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3_instruction* new_vop3 = + create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); VOP3_instruction& cmp_vop3 = cmp->vop3(); memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs)); memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg)); @@ -2096,7 +2048,8 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) return true; } -bool is_operand_constant(opt_ctx &ctx, Operand op, unsigned bit_size, uint64_t *value) +bool +is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value) { if (op.isConstant()) { *value = op.constantValue64(); @@ -2111,7 +2064,8 @@ bool is_operand_constant(opt_ctx &ctx, Operand op, unsigned bit_size, uint64_t * return false; } -bool is_constant_nan(uint64_t value, unsigned bit_size) +bool +is_constant_nan(uint64_t value, unsigned bit_size) { if (bit_size == 16) return ((value >> 10) & 0x1f) == 0x1f && (value & 0x3ff); @@ -2123,7 +2077,8 @@ bool is_constant_nan(uint64_t value, unsigned bit_size) /* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b) * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */ -bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) +bool +combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions[0].regClass() != ctx.program->lane_mask) return false; @@ -2132,8 +2087,8 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32; - Instruction *nan_test = follow_operand(ctx, instr->operands[0], true); - Instruction *cmp = follow_operand(ctx, instr->operands[1], true); + Instruction* nan_test = follow_operand(ctx, instr->operands[0], true); + Instruction* cmp = follow_operand(ctx, instr->operands[1], true); if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA()) return false; @@ -2162,13 +2117,15 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in if (nan_test->isVOP3()) { VOP3_instruction& vop3 = nan_test->vop3(); - if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || vop3.opsel == 2) + if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || + vop3.opsel == 2) return false; } int constant_operand = -1; for (unsigned i = 0; i < 2; i++) { - if (cmp->operands[i].isTemp() && original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) { + if (cmp->operands[i].isTemp() && + original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) { constant_operand = !i; break; } @@ -2190,9 +2147,10 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in decrease_uses(ctx, cmp); aco_opcode new_op = is_or ? 
get_unordered(cmp->opcode) : get_ordered(cmp->opcode); - Instruction *new_instr; + Instruction* new_instr; if (cmp->isVOP3()) { - VOP3_instruction *new_vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3_instruction* new_vop3 = + create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); VOP3_instruction& cmp_vop3 = cmp->vop3(); memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs)); memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg)); @@ -2217,14 +2175,15 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in } /* s_andn2(exec, cmp(a, b)) -> get_inverse(cmp)(a, b) */ -bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) +bool +combine_inverse_comparison(opt_ctx& ctx, aco_ptr& instr) { if (!instr->operands[0].isFixed() || instr->operands[0].physReg() != exec) return false; if (ctx.uses[instr->definitions[1].tempId()]) return false; - Instruction *cmp = follow_operand(ctx, instr->operands[1]); + Instruction* cmp = follow_operand(ctx, instr->operands[1]); if (!cmp) return false; @@ -2240,9 +2199,10 @@ bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) /* This creates a new instruction instead of modifying the existing * comparison so that the comparison is done with the correct exec mask. */ - Instruction *new_instr; + Instruction* new_instr; if (cmp->isVOP3()) { - VOP3_instruction *new_vop3 = create_instruction(new_opcode, asVOP3(Format::VOPC), 2, 1); + VOP3_instruction* new_vop3 = + create_instruction(new_opcode, asVOP3(Format::VOPC), 2, 1); VOP3_instruction& cmp_vop3 = cmp->vop3(); memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs)); memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg)); @@ -2251,7 +2211,7 @@ bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) new_vop3->opsel = cmp_vop3.opsel; new_instr = new_vop3; } else if (cmp->isSDWA()) { - SDWA_instruction *new_sdwa = create_instruction( + SDWA_instruction* new_sdwa = create_instruction( new_opcode, (Format)((uint16_t)Format::SDWA | (uint16_t)Format::VOPC), 2, 1); SDWA_instruction& cmp_sdwa = cmp->sdwa(); memcpy(new_sdwa->abs, cmp_sdwa.abs, sizeof(new_sdwa->abs)); @@ -2280,25 +2240,24 @@ bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) /* op1(op2(1, 2), 0) if swap = false * op1(0, op2(1, 2)) if swap = true */ -bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2, - Instruction* op1_instr, bool swap, const char *shuffle_str, - Operand operands[3], bool neg[3], bool abs[3], uint8_t *opsel, - bool *op1_clamp, uint8_t *op1_omod, - bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel, - bool *precise) +bool +match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap, + const char* shuffle_str, Operand operands[3], bool neg[3], bool abs[3], + uint8_t* opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg, + bool* inbetween_abs, bool* inbetween_opsel, bool* precise) { /* checks */ if (op1_instr->opcode != op1) return false; - Instruction *op2_instr = follow_operand(ctx, op1_instr->operands[swap]); + Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]); if (!op2_instr || op2_instr->opcode != op2) return false; if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1])) return false; - VOP3_instruction *op1_vop3 = op1_instr->isVOP3() ? &op1_instr->vop3() : NULL; - VOP3_instruction *op2_vop3 = op2_instr->isVOP3() ? &op2_instr->vop3() : NULL; + VOP3_instruction* op1_vop3 = op1_instr->isVOP3() ? 
&op1_instr->vop3() : NULL; + VOP3_instruction* op2_vop3 = op2_instr->isVOP3() ? &op2_instr->vop3() : NULL; if (op1_instr->isSDWA() || op2_instr->isSDWA()) return false; @@ -2326,8 +2285,7 @@ bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2, else if (op1_vop3 && op1_vop3->opsel & (1 << (unsigned)swap)) return false; - *precise = op1_instr->definitions[0].isPrecise() || - op2_instr->definitions[0].isPrecise(); + *precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise(); int shuffle[3]; shuffle[shuffle_str[0] - '0'] = 0; @@ -2355,11 +2313,12 @@ bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2, return true; } -void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr& instr, - Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, - bool clamp, unsigned omod) +void +create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr& instr, + Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, bool clamp, + unsigned omod) { - VOP3_instruction *new_instr = create_instruction(opcode, Format::VOP3, 3, 1); + VOP3_instruction* new_instr = create_instruction(opcode, Format::VOP3, 3, 1); memcpy(new_instr->abs, abs, sizeof(bool[3])); memcpy(new_instr->neg, neg, sizeof(bool[3])); new_instr->clamp = clamp; @@ -2374,7 +2333,9 @@ void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr& instr.reset(new_instr); } -bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops) +bool +combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode op2, aco_opcode new_op, + const char* shuffle, uint8_t ops) { for (unsigned swap = 0; swap < 2; swap++) { if (!((1 << swap) & ops)) @@ -2383,10 +2344,8 @@ bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode Operand operands[3]; bool neg[3], abs[3], clamp, precise; uint8_t opsel = 0, omod = 0; - if (match_op3_for_vop3(ctx, instr->opcode, op2, - instr.get(), swap, shuffle, - operands, neg, abs, &opsel, - &clamp, &omod, NULL, NULL, NULL, &precise)) { + if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg, + abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) { ctx.uses[instr->operands[swap].tempId()]--; create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod); return true; @@ -2396,14 +2355,17 @@ bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode } /* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */ -bool combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) +bool +combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) { bool is_or = instr->opcode == aco_opcode::v_or_b32; aco_opcode new_op_lshl = is_or ? 
aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32; - if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) + if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32, + "120", 1 | 2)) return true; - if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) + if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, + "120", 1 | 2)) return true; if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2)) return true; @@ -2419,7 +2381,7 @@ bool combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) * v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b) */ for (unsigned i = 0; i < 2; i++) { - Instruction *extins = follow_operand(ctx, instr->operands[i]); + Instruction* extins = follow_operand(ctx, instr->operands[i]); if (!extins) continue; @@ -2429,14 +2391,17 @@ bool combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) if (extins->opcode == aco_opcode::p_insert && (extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) { op = new_op_lshl; - operands[1] = Operand(extins->operands[1].constantValue() * extins->operands[2].constantValue()); - } else if (is_or && (extins->opcode == aco_opcode::p_insert || - (extins->opcode == aco_opcode::p_extract && extins->operands[3].constantEquals(0))) && + operands[1] = + Operand(extins->operands[1].constantValue() * extins->operands[2].constantValue()); + } else if (is_or && + (extins->opcode == aco_opcode::p_insert || + (extins->opcode == aco_opcode::p_extract && + extins->operands[3].constantEquals(0))) && extins->operands[1].constantEquals(0)) { op = aco_opcode::v_and_or_b32; operands[1] = Operand(extins->operands[2].constantEquals(8) ? 
0xffu : 0xffffu); } else { - continue; + continue; } operands[0] = extins->operands[0]; @@ -2459,7 +2424,8 @@ bool combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) return false; } -bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposite, aco_opcode minmax3) +bool +combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposite, aco_opcode minmax3) { /* TODO: this can handle SDWA min/max instructions by using opsel */ if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2)) @@ -2472,10 +2438,8 @@ bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposi bool neg[3], abs[3], clamp, precise; uint8_t opsel = 0, omod = 0; bool inbetween_neg; - if (match_op3_for_vop3(ctx, instr->opcode, opposite, - instr.get(), swap, "012", - operands, neg, abs, &opsel, - &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) && + if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "012", operands, neg, + abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) && inbetween_neg) { ctx.uses[instr->operands[swap].tempId()]--; neg[1] = !neg[1]; @@ -2493,7 +2457,8 @@ bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposi * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b) * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b) * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */ -bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) +bool +combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) { /* checks */ if (!instr->operands[0].isTemp()) @@ -2501,7 +2466,7 @@ bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) return false; - Instruction *op2_instr = follow_operand(ctx, instr->operands[0]); + Instruction* op2_instr = follow_operand(ctx, instr->operands[0]); if (!op2_instr) return false; switch (op2_instr->opcode) { @@ -2510,10 +2475,8 @@ bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) case aco_opcode::s_xor_b32: case aco_opcode::s_and_b64: case aco_opcode::s_or_b64: - case aco_opcode::s_xor_b64: - break; - default: - return false; + case aco_opcode::s_xor_b64: break; + default: return false; } /* create instruction */ @@ -2523,26 +2486,13 @@ bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) ctx.info[op2_instr->definitions[0].tempId()].label = 0; switch (op2_instr->opcode) { - case aco_opcode::s_and_b32: - op2_instr->opcode = aco_opcode::s_nand_b32; - break; - case aco_opcode::s_or_b32: - op2_instr->opcode = aco_opcode::s_nor_b32; - break; - case aco_opcode::s_xor_b32: - op2_instr->opcode = aco_opcode::s_xnor_b32; - break; - case aco_opcode::s_and_b64: - op2_instr->opcode = aco_opcode::s_nand_b64; - break; - case aco_opcode::s_or_b64: - op2_instr->opcode = aco_opcode::s_nor_b64; - break; - case aco_opcode::s_xor_b64: - op2_instr->opcode = aco_opcode::s_xnor_b64; - break; - default: - break; + case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break; + case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break; + case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break; + case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break; + case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break; + case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break; + default: break; } return true; @@ -2552,14 +2502,16 @@ bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) * s_or_b32(a, s_not_b32(b)) -> 
s_orn2_b32(a, b) * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b) * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */ -bool combine_salu_n2(opt_ctx& ctx, aco_ptr& instr) +bool +combine_salu_n2(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool()) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op2_instr = follow_operand(ctx, instr->operands[i]); - if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && op2_instr->opcode != aco_opcode::s_not_b64)) + Instruction* op2_instr = follow_operand(ctx, instr->operands[i]); + if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && + op2_instr->opcode != aco_opcode::s_not_b64)) continue; if (ctx.uses[op2_instr->definitions[1].tempId()] || fixed_to_exec(op2_instr->operands[0])) continue; @@ -2574,20 +2526,11 @@ bool combine_salu_n2(opt_ctx& ctx, aco_ptr& instr) ctx.info[instr->definitions[0].tempId()].label = 0; switch (instr->opcode) { - case aco_opcode::s_and_b32: - instr->opcode = aco_opcode::s_andn2_b32; - break; - case aco_opcode::s_or_b32: - instr->opcode = aco_opcode::s_orn2_b32; - break; - case aco_opcode::s_and_b64: - instr->opcode = aco_opcode::s_andn2_b64; - break; - case aco_opcode::s_or_b64: - instr->opcode = aco_opcode::s_orn2_b64; - break; - default: - break; + case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break; + case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break; + case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break; + case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break; + default: break; } return true; @@ -2596,13 +2539,14 @@ bool combine_salu_n2(opt_ctx& ctx, aco_ptr& instr) } /* s_add_{i32,u32}(a, s_lshl_b32(b, )) -> s_lshl_add_u32(a, b) */ -bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr& instr) +bool +combine_salu_lshl_add(opt_ctx& ctx, aco_ptr& instr) { if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()]) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op2_instr = follow_operand(ctx, instr->operands[i], true); + Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true); if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 || ctx.uses[op2_instr->definitions[1].tempId()]) continue; @@ -2622,17 +2566,17 @@ bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr& instr) instr->operands[0] = op2_instr->operands[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - instr->opcode = std::array{aco_opcode::s_lshl1_add_u32, - aco_opcode::s_lshl2_add_u32, - aco_opcode::s_lshl3_add_u32, - aco_opcode::s_lshl4_add_u32}[shift - 1]; + instr->opcode = std::array{ + aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32, + aco_opcode::s_lshl4_add_u32}[shift - 1]; return true; } return false; } -bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr& instr, aco_opcode new_op, uint8_t ops) +bool +combine_add_sub_b2i(opt_ctx& ctx, aco_ptr& instr, aco_opcode new_op, uint8_t ops) { if (instr->usesModifiers()) return false; @@ -2640,16 +2584,17 @@ bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr& instr, aco_opcode n for (unsigned i = 0; i < 2; i++) { if (!((1 << i) & ops)) continue; - if (instr->operands[i].isTemp() && - ctx.info[instr->operands[i].tempId()].is_b2i() && + if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() && ctx.uses[instr->operands[i].tempId()] == 1) { aco_ptr new_instr; - if (instr->operands[!i].isTemp() && 
instr->operands[!i].getTemp().type() == RegType::vgpr) { + if (instr->operands[!i].isTemp() && + instr->operands[!i].getTemp().type() == RegType::vgpr) { new_instr.reset(create_instruction(new_op, Format::VOP2, 3, 2)); } else if (ctx.program->chip_class >= GFX10 || (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) { - new_instr.reset(create_instruction(new_op, asVOP3(Format::VOP2), 3, 2)); + new_instr.reset( + create_instruction(new_op, asVOP3(Format::VOP2), 3, 2)); } else { return false; } @@ -2678,19 +2623,20 @@ bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr& instr, aco_opcode n return false; } -bool combine_add_bcnt(opt_ctx& ctx, aco_ptr& instr) +bool +combine_add_bcnt(opt_ctx& ctx, aco_ptr& instr) { if (instr->usesModifiers()) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op_instr = follow_operand(ctx, instr->operands[i]); - if (op_instr && - op_instr->opcode == aco_opcode::v_bcnt_u32_b32 && + Instruction* op_instr = follow_operand(ctx, instr->operands[i]); + if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 && op_instr->operands[0].isTemp() && op_instr->operands[0].getTemp().type() == RegType::vgpr && op_instr->operands[1].constantEquals(0)) { - aco_ptr new_instr{create_instruction(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)}; + aco_ptr new_instr{ + create_instruction(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)}; ctx.uses[instr->operands[i].tempId()]--; new_instr->operands[0] = op_instr->operands[0]; new_instr->operands[1] = instr->operands[!i]; @@ -2705,36 +2651,40 @@ bool combine_add_bcnt(opt_ctx& ctx, aco_ptr& instr) return false; } -bool get_minmax_info(aco_opcode op, aco_opcode *min, aco_opcode *max, aco_opcode *min3, aco_opcode *max3, aco_opcode *med3, bool *some_gfx9_only) +bool +get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3, + aco_opcode* med3, bool* some_gfx9_only) { switch (op) { - #define MINMAX(type, gfx9) \ - case aco_opcode::v_min_##type:\ - case aco_opcode::v_max_##type:\ - case aco_opcode::v_med3_##type:\ - *min = aco_opcode::v_min_##type;\ - *max = aco_opcode::v_max_##type;\ - *med3 = aco_opcode::v_med3_##type;\ - *min3 = aco_opcode::v_min3_##type;\ - *max3 = aco_opcode::v_max3_##type;\ - *some_gfx9_only = gfx9;\ +#define MINMAX(type, gfx9) \ + case aco_opcode::v_min_##type: \ + case aco_opcode::v_max_##type: \ + case aco_opcode::v_med3_##type: \ + *min = aco_opcode::v_min_##type; \ + *max = aco_opcode::v_max_##type; \ + *med3 = aco_opcode::v_med3_##type; \ + *min3 = aco_opcode::v_min3_##type; \ + *max3 = aco_opcode::v_max3_##type; \ + *some_gfx9_only = gfx9; \ return true; - MINMAX(f32, false) - MINMAX(u32, false) - MINMAX(i32, false) - MINMAX(f16, true) - MINMAX(u16, true) - MINMAX(i16, true) - #undef MINMAX - default: - return false; + MINMAX(f32, false) + MINMAX(u32, false) + MINMAX(i32, false) + MINMAX(f16, true) + MINMAX(u16, true) + MINMAX(i16, true) +#undef MINMAX + default: return false; } } -/* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb - * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb */ -bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, - aco_opcode min, aco_opcode max, aco_opcode med) +/* when ub > lb: + * v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) + * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) + */ +bool +combine_clamp(opt_ctx& ctx, 
aco_ptr& instr, aco_opcode min, aco_opcode max, + aco_opcode med) { /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if @@ -2751,9 +2701,8 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, Operand operands[3]; bool neg[3], abs[3], clamp, precise; uint8_t opsel = 0, omod = 0; - if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, - "012", operands, neg, abs, &opsel, - &clamp, &omod, NULL, NULL, NULL, &precise)) { + if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg, + abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) { /* max(min(src, upper), lower) returns upper if src is NaN, but * med3(src, lower, upper) returns lower. */ @@ -2766,7 +2715,8 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, uint32_t val; if (operands[i].isConstant()) { val = operands[i].constantValue(); - } else if (operands[i].isTemp() && ctx.info[operands[i].tempId()].is_constant_or_literal(32)) { + } else if (operands[i].isTemp() && + ctx.info[operands[i].tempId()].is_constant_or_literal(32)) { val = ctx.info[operands[i].tempId()].val; } else { continue; @@ -2799,10 +2749,14 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, const0_f = _mesa_half_to_float(const0); const1_f = _mesa_half_to_float(const1); } - if (abs[const0_idx]) const0_f = fabsf(const0_f); - if (abs[const1_idx]) const1_f = fabsf(const1_f); - if (neg[const0_idx]) const0_f = -const0_f; - if (neg[const1_idx]) const1_f = -const1_f; + if (abs[const0_idx]) + const0_f = fabsf(const0_f); + if (abs[const1_idx]) + const1_f = fabsf(const1_f); + if (neg[const0_idx]) + const0_f = -const0_f; + if (neg[const1_idx]) + const1_f = -const1_f; lower_idx = const0_f < const1_f ? const0_idx : const1_idx; break; } @@ -2815,8 +2769,10 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, break; } case aco_opcode::v_min_i32: { - int32_t const0_i = const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0; - int32_t const1_i = const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1; + int32_t const0_i = + const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0; + int32_t const1_i = + const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1; lower_idx = const0_i < const1_i ? const0_idx : const1_idx; break; } @@ -2826,8 +2782,7 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, lower_idx = const0_i < const1_i ? const0_idx : const1_idx; break; } - default: - break; + default: break; } int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx; @@ -2849,8 +2804,8 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, return false; } - -void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) +void +apply_sgprs(opt_ctx& ctx, aco_ptr& instr) { bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 || instr->opcode == aco_opcode::v_lshrrev_b64 || @@ -2904,8 +2859,8 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) /* Applying two sgprs require making it VOP3, so don't do it unless it's * definitively beneficial. * TODO: this is too conservative because later the use count could be reduced to 1 */ - if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && - !instr->isVOP3() && !instr->isSDWA() && instr->format != Format::VOP3P) + if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() && + !instr->isSDWA() && instr->format != Format::VOP3P) break; Temp sgpr = info.is_extract() ? 
info.instr->operands[0].getTemp() : info.temp; @@ -2913,7 +2868,8 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) if (new_sgpr && num_sgprs >= max_sgprs) continue; - if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() || info.is_extract()) { + if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() || + info.is_extract()) { /* can_apply_extract() checks SGPR encoding restrictions */ if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info)) apply_extract(ctx, instr, sgpr_idx, info); @@ -2946,7 +2902,8 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) } template -bool apply_omod_clamp_helper(opt_ctx &ctx, T *instr, ssa_info& def_info) +bool +apply_omod_clamp_helper(opt_ctx& ctx, T* instr, ssa_info& def_info) { if (!def_info.is_clamp() && (instr->clamp || instr->omod)) return false; @@ -2964,7 +2921,8 @@ bool apply_omod_clamp_helper(opt_ctx &ctx, T *instr, ssa_info& def_info) } /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */ -bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) +bool +apply_omod_clamp(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 || !instr_info.can_use_output_modifiers[(int)instr->opcode]) @@ -2977,8 +2935,8 @@ bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) /* omod flushes -0 to +0 and has no effect if denormals are enabled */ bool can_use_omod = (can_vop3 || ctx.program->chip_class >= GFX9); /* SDWA omod is GFX9+ */ if (instr->definitions[0].bytes() == 4) - can_use_omod = can_use_omod && ctx.fp_mode.denorm32 == 0 && - !ctx.fp_mode.preserve_signed_zero_inf_nan32; + can_use_omod = + can_use_omod && ctx.fp_mode.denorm32 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan32; else can_use_omod = can_use_omod && ctx.fp_mode.denorm16_64 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan16_64; @@ -3015,7 +2973,8 @@ bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) /* Combine an p_insert (or p_extract, in some cases) instruction with instr. * p_insert(instr(...)) -> instr_insert(). 
*/ -bool apply_insert(opt_ctx &ctx, aco_ptr& instr) +bool +apply_insert(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1) return false; @@ -3057,25 +3016,27 @@ bool apply_insert(opt_ctx &ctx, aco_ptr& instr) } /* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */ -bool combine_and_subbrev(opt_ctx& ctx, aco_ptr& instr) +bool +combine_and_subbrev(opt_ctx& ctx, aco_ptr& instr) { if (instr->usesModifiers()) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op_instr = follow_operand(ctx, instr->operands[i], true); - if (op_instr && - op_instr->opcode == aco_opcode::v_subbrev_co_u32 && - op_instr->operands[0].constantEquals(0) && - op_instr->operands[1].constantEquals(0) && + Instruction* op_instr = follow_operand(ctx, instr->operands[i], true); + if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 && + op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) && !op_instr->usesModifiers()) { aco_ptr new_instr; - if (instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) { - new_instr.reset(create_instruction(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)); + if (instr->operands[!i].isTemp() && + instr->operands[!i].getTemp().type() == RegType::vgpr) { + new_instr.reset( + create_instruction(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)); } else if (ctx.program->chip_class >= GFX10 || (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) { - new_instr.reset(create_instruction(aco_opcode::v_cndmask_b32, asVOP3(Format::VOP2), 3, 1)); + new_instr.reset(create_instruction(aco_opcode::v_cndmask_b32, + asVOP3(Format::VOP2), 3, 1)); } else { return false; } @@ -3099,13 +3060,14 @@ bool combine_and_subbrev(opt_ctx& ctx, aco_ptr& instr) /* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1< v_mad_u32_u24(b, 1<& instr) +bool +combine_add_lshl(opt_ctx& ctx, aco_ptr& instr) { if (instr->usesModifiers()) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op_instr = follow_operand(ctx, instr->operands[i]); + Instruction* op_instr = follow_operand(ctx, instr->operands[i]); if (!op_instr) continue; @@ -3113,10 +3075,8 @@ bool combine_add_lshl(opt_ctx& ctx, aco_ptr& instr) op_instr->opcode != aco_opcode::v_lshlrev_b32) continue; - if (op_instr->opcode == aco_opcode::v_lshlrev_b32 && - op_instr->operands[1].isTemp() && - op_instr->operands[1].getTemp().type() == RegType::sgpr && - instr->operands[!i].isTemp() && + if (op_instr->opcode == aco_opcode::v_lshlrev_b32 && op_instr->operands[1].isTemp() && + op_instr->operands[1].getTemp().type() == RegType::sgpr && instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::sgpr) return false; @@ -3129,7 +3089,8 @@ bool combine_add_lshl(opt_ctx& ctx, aco_ptr& instr) ctx.uses[instr->operands[i].tempId()]--; - aco_ptr new_instr{create_instruction(aco_opcode::v_mad_u32_u24, Format::VOP3, 3, 1)}; + aco_ptr new_instr{ + create_instruction(aco_opcode::v_mad_u32_u24, Format::VOP3, 3, 1)}; new_instr->operands[0] = op_instr->operands[!shift_op_idx]; new_instr->operands[1] = Operand(multiplier); new_instr->operands[2] = instr->operands[!i]; @@ -3143,7 +3104,8 @@ bool combine_add_lshl(opt_ctx& ctx, aco_ptr& instr) return false; } -void propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi) +void +propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi) { /* propagate swizzles which apply to a result down to the instruction's 
operands: * result = a.xy + b.xx -> result.yx = a.yx + b.xx */ @@ -3151,8 +3113,8 @@ void propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opse assert((opsel_hi & 1) == opsel_hi); uint8_t tmp_lo = instr->opsel_lo; uint8_t tmp_hi = instr->opsel_hi; - bool neg_lo[3] = { instr->neg_lo[0], instr->neg_lo[1], instr->neg_lo[2] }; - bool neg_hi[3] = { instr->neg_hi[0], instr->neg_hi[1], instr->neg_hi[2] }; + bool neg_lo[3] = {instr->neg_lo[0], instr->neg_lo[1], instr->neg_lo[2]}; + bool neg_hi[3] = {instr->neg_hi[0], instr->neg_hi[1], instr->neg_hi[2]}; if (opsel_lo == 1) { instr->opsel_lo = tmp_hi; for (unsigned i = 0; i < 3; i++) @@ -3165,16 +3127,14 @@ void propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opse } } -void combine_vop3p(opt_ctx &ctx, aco_ptr& instr) +void +combine_vop3p(opt_ctx& ctx, aco_ptr& instr) { VOP3P_instruction* vop3p = &instr->vop3p(); /* apply clamp */ - if (instr->opcode == aco_opcode::v_pk_mul_f16 && - instr->operands[1].constantEquals(0x3C00) && - vop3p->clamp && - instr->operands[0].isTemp() && - ctx.uses[instr->operands[0].tempId()] == 1) { + if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) && + vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1) { ssa_info& info = ctx.info[instr->operands[0].tempId()]; if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) { @@ -3240,12 +3200,12 @@ void combine_vop3p(opt_ctx &ctx, aco_ptr& instr) if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_vop3p()) continue; ssa_info& info = ctx.info[instr->operands[i].tempId()]; - if (info.instr->opcode != aco_opcode::v_pk_mul_f16 || info.instr->definitions[0].isPrecise()) + if (info.instr->opcode != aco_opcode::v_pk_mul_f16 || + info.instr->definitions[0].isPrecise()) continue; Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]}; - if (ctx.uses[instr->operands[i].tempId()] >= uses || - !check_vop3_operands(ctx, 3, op)) + if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op)) continue; /* no clamp allowed between mul and add */ @@ -3274,7 +3234,8 @@ void combine_vop3p(opt_ctx &ctx, aco_ptr& instr) /* turn packed mul+add into v_pk_fma_f16 */ assert(mul_instr->isVOP3P()); - aco_ptr fma{create_instruction(aco_opcode::v_pk_fma_f16, Format::VOP3P, 3, 1)}; + aco_ptr fma{ + create_instruction(aco_opcode::v_pk_fma_f16, Format::VOP3P, 3, 1)}; VOP3P_instruction* mul = &mul_instr->vop3p(); for (unsigned i = 0; i < 2; i++) { fma->operands[i] = op[i]; @@ -3302,7 +3263,8 @@ void combine_vop3p(opt_ctx &ctx, aco_ptr& instr) // TODO: we could possibly move the whole label_instruction pass to combine_instruction: // this would mean that we'd have to fix the instruction uses while value propagation -void combine_instruction(opt_ctx &ctx, aco_ptr& instr) +void +combine_instruction(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions.empty() || is_dead(ctx.uses, instr.get())) return; @@ -3315,8 +3277,9 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) if (!op.isTemp()) continue; ssa_info& info = ctx.info[op.tempId()]; - if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr || - instr->operands[i].getTemp().type() == RegType::sgpr) && + if (info.is_extract() && + (info.instr->operands[0].getTemp().type() == RegType::vgpr || + instr->operands[i].getTemp().type() == RegType::sgpr) && can_apply_extract(ctx, instr, i, info)) { 
apply_extract(ctx, instr, i, info); ctx.uses[instr->operands[i].tempId()]--; @@ -3326,7 +3289,8 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) if (can_apply_sgprs(ctx, instr)) apply_sgprs(ctx, instr); - while (apply_omod_clamp(ctx, instr)) ; + while (apply_omod_clamp(ctx, instr)) + ; apply_insert(ctx, instr); } @@ -3351,7 +3315,8 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) * floats. */ /* neg(mul(a, b)) -> mul(neg(a), b) */ - if (ctx.info[instr->definitions[0].tempId()].is_neg() && ctx.uses[instr->operands[1].tempId()] == 1) { + if (ctx.info[instr->definitions[0].tempId()].is_neg() && + ctx.uses[instr->operands[1].tempId()] == 1) { Temp val = ctx.info[instr->definitions[0].tempId()].temp; if (!ctx.info[val.id()].is_mul()) @@ -3371,7 +3336,8 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) Definition def = instr->definitions[0]; /* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */ bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs(); - instr.reset(create_instruction(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1)); + instr.reset( + create_instruction(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1)); instr->operands[0] = mul_instr->operands[0]; instr->operands[1] = mul_instr->operands[1]; instr->definitions[0] = def; @@ -3392,15 +3358,13 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) } /* combine mul+add -> mad */ - bool mad32 = instr->opcode == aco_opcode::v_add_f32 || - instr->opcode == aco_opcode::v_sub_f32 || + bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_subrev_f32; - bool mad16 = instr->opcode == aco_opcode::v_add_f16 || - instr->opcode == aco_opcode::v_sub_f16 || + bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 || instr->opcode == aco_opcode::v_subrev_f16; if (mad16 || mad32) { - bool need_fma = mad32 ? (ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) : - (ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10); + bool need_fma = mad32 ? 
(ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) + : (ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10); if (need_fma && instr->definitions[0].isPrecise()) return; if (need_fma && mad32 && !ctx.program->dev.has_fast_fma32) @@ -3423,8 +3387,7 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) continue; Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]}; - if (info.instr->isSDWA() || - !check_vop3_operands(ctx, 3, op) || + if (info.instr->isSDWA() || !check_vop3_operands(ctx, 3, op) || ctx.uses[instr->operands[i].tempId()] >= uses) continue; @@ -3435,7 +3398,8 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) if (mul_instr) { /* turn mul+add into v_mad/v_fma */ - Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], instr->operands[add_op_idx]}; + Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], + instr->operands[add_op_idx]}; ctx.uses[mul_instr->definitions[0].tempId()]--; if (ctx.uses[mul_instr->definitions[0].tempId()]) { if (op[0].isTemp()) @@ -3475,15 +3439,19 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) } if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16) neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true; - else if (instr->opcode == aco_opcode::v_subrev_f32 || instr->opcode == aco_opcode::v_subrev_f16) + else if (instr->opcode == aco_opcode::v_subrev_f32 || + instr->opcode == aco_opcode::v_subrev_f16) neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true; aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; if (mad16) - mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) : - (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16); + mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 + : aco_opcode::v_fma_f16) + : (ctx.program->chip_class == GFX8 ? 
aco_opcode::v_mad_legacy_f16 + : aco_opcode::v_mad_f16); - aco_ptr mad{create_instruction(mad_op, Format::VOP3, 3, 1)}; + aco_ptr mad{ + create_instruction(mad_op, Format::VOP3, 3, 1)}; for (unsigned i = 0; i < 3; i++) { mad->operands[i] = op[i]; mad->neg[i] = neg[i]; @@ -3504,12 +3472,13 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) { for (unsigned i = 0; i < 2; i++) { if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() && - ctx.uses[instr->operands[i].tempId()] == 1 && - instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) { + ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() && + instr->operands[!i].getTemp().type() == RegType::vgpr) { ctx.uses[instr->operands[i].tempId()]--; ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++; - aco_ptr new_instr{create_instruction(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)}; + aco_ptr new_instr{ + create_instruction(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)}; new_instr->operands[0] = Operand(0u); new_instr->operands[1] = instr->operands[!i]; new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp); @@ -3520,34 +3489,49 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) } } } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) { - if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ; - else combine_add_or_then_and_lshl(ctx, instr) ; + if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012", + 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, + "012", 1 | 2)) { + } else if (combine_add_or_then_and_lshl(ctx, instr)) { + } } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->chip_class >= GFX10) { - if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2)) ; - else combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2); + if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", + 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, + "012", 1 | 2)) { + } } else if (instr->opcode == aco_opcode::v_add_u32) { - if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ; - else if (combine_add_bcnt(ctx, instr)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2)) ; - else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) { - if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16, aco_opcode::v_mad_u32_u16, "120", 1 | 2)) ; - else combine_add_or_then_and_lshl(ctx, instr) ; + if 
(combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) { + } else if (combine_add_bcnt(ctx, instr)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, + aco_opcode::v_mad_u32_u24, "120", 1 | 2)) { + } else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) { + if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", + 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, + "120", 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, + "012", 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, + "012", 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, + "012", 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16, + aco_opcode::v_mad_u32_u16, "120", 1 | 2)) { + } else if (combine_add_or_then_and_lshl(ctx, instr)) { + } } } else if (instr->opcode == aco_opcode::v_add_co_u32 || instr->opcode == aco_opcode::v_add_co_u32_e64) { bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0; - if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ; - else if (!carry_out && combine_add_bcnt(ctx, instr)) ; - else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2)) ; - else if (!carry_out) combine_add_lshl(ctx, instr); - } else if (instr->opcode == aco_opcode::v_sub_u32 || - instr->opcode == aco_opcode::v_sub_co_u32 || + if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) { + } else if (!carry_out && combine_add_bcnt(ctx, instr)) { + } else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, + aco_opcode::v_mad_u32_u24, "120", 1 | 2)) { + } else if (!carry_out && combine_add_lshl(ctx, instr)) { + } + } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 || instr->opcode == aco_opcode::v_sub_co_u32_e64) { combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2); } else if (instr->opcode == aco_opcode::v_subrev_u32 || @@ -3555,17 +3539,20 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) instr->opcode == aco_opcode::v_subrev_co_u32_e64) { combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1); } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) { - combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2); - } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) { + combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", + 2); + } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && + ctx.program->chip_class >= GFX9) { combine_salu_lshl_add(ctx, instr); } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) { combine_salu_not_bitwise(ctx, instr); } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 || instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) { - if (combine_ordering_test(ctx, instr)) ; - else if (combine_comparison_ordering(ctx, instr)) ; - else if (combine_constant_comparison_ordering(ctx, instr)) ; - else combine_salu_n2(ctx, instr); + if 
(combine_ordering_test(ctx, instr)) { + } else if (combine_comparison_ordering(ctx, instr)) { + } else if (combine_constant_comparison_ordering(ctx, instr)) { + } else if (combine_salu_n2(ctx, instr)) { + } } else if (instr->opcode == aco_opcode::v_and_b32) { combine_and_subbrev(ctx, instr); } else { @@ -3573,8 +3560,11 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) bool some_gfx9_only; if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) && (!some_gfx9_only || ctx.program->chip_class >= GFX9)) { - if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, instr->opcode == min ? min3 : max3)) ; - else combine_clamp(ctx, instr, min, max, med3); + if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, + instr->opcode == min ? min3 : max3)) { + } else { + combine_clamp(ctx, instr, min, max, med3); + } } } @@ -3583,27 +3573,22 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) combine_inverse_comparison(ctx, instr); } -bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr &instr) +bool +to_uniform_bool_instr(opt_ctx& ctx, aco_ptr& instr) { switch (instr->opcode) { - case aco_opcode::s_and_b32: - case aco_opcode::s_and_b64: - instr->opcode = aco_opcode::s_and_b32; - break; - case aco_opcode::s_or_b32: - case aco_opcode::s_or_b64: - instr->opcode = aco_opcode::s_or_b32; - break; - case aco_opcode::s_xor_b32: - case aco_opcode::s_xor_b64: - instr->opcode = aco_opcode::s_absdiff_i32; - break; - default: - /* Don't transform other instructions. They are very unlikely to appear here. */ - return false; + case aco_opcode::s_and_b32: + case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break; + case aco_opcode::s_or_b32: + case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break; + case aco_opcode::s_xor_b32: + case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break; + default: + /* Don't transform other instructions. They are very unlikely to appear here. */ + return false; } - for (Operand &op : instr->operands) { + for (Operand& op : instr->operands) { ctx.uses[op.tempId()]--; if (ctx.info[op.tempId()].is_uniform_bool()) { @@ -3611,12 +3596,14 @@ bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr &instr) op.setTemp(ctx.info[op.tempId()].temp); } else if (ctx.info[op.tempId()].is_uniform_bitwise()) { /* Use the SCC definition of the predecessor instruction. - * This allows the predecessor to get picked up by the same optimization (if it has no divergent users), - * and it also makes sure that the current instruction will keep working even if the predecessor won't be transformed. + * This allows the predecessor to get picked up by the same optimization (if it has no + * divergent users), and it also makes sure that the current instruction will keep working + * even if the predecessor won't be transformed. 
*/ - Instruction *pred_instr = ctx.info[op.tempId()].instr; + Instruction* pred_instr = ctx.info[op.tempId()].instr; assert(pred_instr->definitions.size() >= 2); - assert(pred_instr->definitions[1].isFixed() && pred_instr->definitions[1].physReg() == scc); + assert(pred_instr->definitions[1].isFixed() && + pred_instr->definitions[1].physReg() == scc); op.setTemp(pred_instr->definitions[1].getTemp()); } else { unreachable("Invalid operand on uniform bitwise instruction."); @@ -3631,7 +3618,8 @@ bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr &instr) return true; } -void select_mul_u32_u24(opt_ctx &ctx, aco_ptr& instr) +void +select_mul_u32_u24(opt_ctx& ctx, aco_ptr& instr) { if (instr->usesModifiers()) return; @@ -3655,12 +3643,12 @@ void select_mul_u32_u24(opt_ctx &ctx, aco_ptr& instr) /* VOP2 instructions can only take constants/sgprs in operand 0. */ if ((instr->operands[1].isConstant() || - (instr->operands[1].hasRegClass() && - instr->operands[1].regClass().type() == RegType::sgpr))) { + (instr->operands[1].hasRegClass() && + instr->operands[1].regClass().type() == RegType::sgpr))) { swap = true; if ((instr->operands[0].isConstant() || - (instr->operands[0].hasRegClass() && - instr->operands[0].regClass().type() == RegType::sgpr))) { + (instr->operands[0].hasRegClass() && + instr->operands[0].regClass().type() == RegType::sgpr))) { /* VOP2 can't take both constants/sgprs, keep v_mad_u32_u16 because * v_mul_u32_u24 has no advantages. */ @@ -3668,14 +3656,16 @@ void select_mul_u32_u24(opt_ctx &ctx, aco_ptr& instr) } } - VOP2_instruction *new_instr = create_instruction(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1); + VOP2_instruction* new_instr = + create_instruction(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1); new_instr->operands[0] = instr->operands[swap]; new_instr->operands[1] = instr->operands[!swap]; new_instr->definitions[0] = instr->definitions[0]; instr.reset(new_instr); } -void select_instruction(opt_ctx &ctx, aco_ptr& instr) +void +select_instruction(opt_ctx& ctx, aco_ptr& instr) { const uint32_t threshold = 4; @@ -3689,7 +3679,8 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) unsigned num_used = 0; unsigned idx = 0; unsigned split_offset = 0; - for (unsigned i = 0, offset = 0; i < instr->definitions.size(); offset += instr->definitions[i++].bytes()) { + for (unsigned i = 0, offset = 0; i < instr->definitions.size(); + offset += instr->definitions[i++].bytes()) { if (ctx.uses[instr->definitions[i].tempId()]) { num_used++; idx = i; @@ -3699,7 +3690,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) bool done = false; if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() && ctx.uses[instr->operands[0].tempId()] == 1) { - Instruction *vec = ctx.info[instr->operands[0].tempId()].instr; + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; unsigned off = 0; Operand op; @@ -3719,7 +3710,8 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (op.isTemp()) ctx.uses[op.tempId()]++; - aco_ptr extract{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)}; + aco_ptr extract{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)}; extract->operands[0] = op; extract->definitions[0] = instr->definitions[idx]; instr.reset(extract.release()); @@ -3731,9 +3723,10 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (!done && num_used == 1 && instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 && split_offset % instr->definitions[idx].bytes() == 0) { - aco_ptr 
extract{create_instruction(aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)}; + aco_ptr extract{create_instruction( + aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)}; extract->operands[0] = instr->operands[0]; - extract->operands[1] = Operand((uint32_t) split_offset / instr->definitions[idx].bytes()); + extract->operands[1] = Operand((uint32_t)split_offset / instr->definitions[idx].bytes()); extract->definitions[0] = instr->definitions[idx]; instr.reset(extract.release()); } @@ -3762,8 +3755,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) bool sgpr_used = false; uint32_t literal_idx = 0; uint32_t literal_uses = UINT32_MAX; - for (unsigned i = 0; i < instr->operands.size(); i++) - { + for (unsigned i = 0; i < instr->operands.size(); i++) { if (instr->operands[i].isConstant() && i > 0) { literal_uses = UINT32_MAX; break; @@ -3771,8 +3763,10 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (!instr->operands[i].isTemp()) continue; unsigned bits = get_operand_size(instr, i); - /* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10 or operands other than the 1st */ - if (instr->operands[i].getTemp().type() == RegType::sgpr && (i > 0 || ctx.program->chip_class < GFX10)) { + /* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10 + * or operands other than the 1st */ + if (instr->operands[i].getTemp().type() == RegType::sgpr && + (i > 0 || ctx.program->chip_class < GFX10)) { if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits)) { literal_uses = ctx.uses[instr->operands[i].tempId()]; literal_idx = i; @@ -3781,8 +3775,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) } sgpr_used = true; /* don't break because we still need to check constants */ - } else if (!sgpr_used && - ctx.info[instr->operands[i].tempId()].is_literal(bits) && + } else if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits) && ctx.uses[instr->operands[i].tempId()] < literal_uses) { literal_uses = ctx.uses[instr->operands[i].tempId()]; literal_idx = i; @@ -3805,20 +3798,17 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) } } - /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions when it isn't beneficial */ - if (instr->isBranch() && - instr->operands.size() && - instr->operands[0].isTemp() && - instr->operands[0].isFixed() && - instr->operands[0].physReg() == scc) { + /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions + * when it isn't beneficial */ + if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() && + instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) { ctx.info[instr->operands[0].tempId()].set_scc_needed(); return; } else if ((instr->opcode == aco_opcode::s_cselect_b64 || instr->opcode == aco_opcode::s_cselect_b32) && instr->operands[2].isTemp()) { ctx.info[instr->operands[2].tempId()].set_scc_needed(); - } else if (instr->opcode == aco_opcode::p_wqm && - instr->operands[0].isTemp() && + } else if (instr->opcode == aco_opcode::p_wqm && instr->operands[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_scc_needed()) { /* Propagate label so it is correctly detected by the uniform bool transform */ ctx.info[instr->operands[0].tempId()].set_scc_needed(); @@ -3832,13 +3822,13 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) return; /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. 
*/ - if (instr->definitions.size() && - ctx.uses[instr->definitions[0].tempId()] == 0 && + if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 && ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) { bool transform_done = to_uniform_bool_instr(ctx, instr); if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) { - /* Swap the two definition IDs in order to avoid overusing the SCC. This reduces extra moves generated by RA. */ + /* Swap the two definition IDs in order to avoid overusing the SCC. + * This reduces extra moves generated by RA. */ uint32_t def0_id = instr->definitions[0].getTemp().id(); uint32_t def1_id = instr->definitions[1].getTemp().id(); instr->definitions[0].setTemp(Temp(def1_id, s1)); @@ -3851,8 +3841,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (instr->opcode == aco_opcode::v_mad_u32_u16) select_mul_u32_u24(ctx, instr); - if (instr->isSDWA() || instr->isDPP() || - (instr->isVOP3() && ctx.program->chip_class < GFX10) || + if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10) || (instr->isVOP3P() && ctx.program->chip_class < GFX10)) return; /* some encodings can't ever take literals */ @@ -3864,8 +3853,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) Operand literal(s1); unsigned num_operands = 1; if (instr->isSALU() || - (ctx.program->chip_class >= GFX10 && - (can_use_VOP3(ctx, instr) || instr->isVOP3P()))) + (ctx.program->chip_class >= GFX10 && (can_use_VOP3(ctx, instr) || instr->isVOP3P()))) num_operands = instr->operands.size(); /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */ else if (instr->isVALU() && instr->operands.size() >= 3) @@ -3905,7 +3893,6 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) mask |= (op.tempId() == literal_id) << i; } - /* don't go over the constant bus limit */ bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 || instr->opcode == aco_opcode::v_lshrrev_b64 || @@ -3931,8 +3918,8 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) } } - -void apply_literals(opt_ctx &ctx, aco_ptr& instr) +void +apply_literals(opt_ctx& ctx, aco_ptr& instr) { /* Cleanup Dead Instructions */ if (!instr) @@ -3945,10 +3932,12 @@ void apply_literals(opt_ctx &ctx, aco_ptr& instr) (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) { aco_ptr new_mad; - aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32; + aco_opcode new_op = + info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32; if (instr->opcode == aco_opcode::v_fma_f32) new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32; - else if (instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16) + else if (instr->opcode == aco_opcode::v_mad_f16 || + instr->opcode == aco_opcode::v_mad_legacy_f16) new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16; else if (instr->opcode == aco_opcode::v_fma_f16) new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16; @@ -3985,8 +3974,8 @@ void apply_literals(opt_ctx &ctx, aco_ptr& instr) ctx.instructions.emplace_back(std::move(instr)); } - -void optimize(Program* program) +void +optimize(Program* program) { opt_ctx ctx; ctx.program = program; @@ -4010,10 +3999,12 @@ void optimize(Program* program) } /* 3. 
Top-Down DAG pass (backward) to select instructions (includes DCE) */ - for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend(); ++block_rit) { + for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend(); + ++block_rit) { Block* block = &(*block_rit); ctx.fp_mode = block->fp_mode; - for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend(); ++instr_rit) + for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend(); + ++instr_rit) select_instruction(ctx, *instr_rit); } @@ -4025,7 +4016,6 @@ void optimize(Program* program) apply_literals(ctx, instr); block.instructions.swap(ctx.instructions); } - } -} +} // namespace aco diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index 590b9e3f1e9..2e426cf81a3 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -24,9 +24,9 @@ #include "aco_ir.h" -#include #include #include +#include #include namespace aco { @@ -41,15 +41,14 @@ enum { written_by_multiple_instrs = -4, }; -struct pr_opt_ctx -{ - Program *program; - Block *current_block; +struct pr_opt_ctx { + Program* program; + Block* current_block; int current_instr_idx; std::vector uses; std::array instr_idx_by_regs; - void reset_block(Block *block) + void reset_block(Block* block) { current_block = block; current_instr_idx = -1; @@ -57,9 +56,10 @@ struct pr_opt_ctx } }; -void save_reg_writes(pr_opt_ctx &ctx, aco_ptr &instr) +void +save_reg_writes(pr_opt_ctx& ctx, aco_ptr& instr) { - for (const Definition &def : instr->definitions) { + for (const Definition& def : instr->definitions) { assert(def.regClass().type() != RegType::sgpr || def.physReg().reg() <= 255); assert(def.regClass().type() != RegType::vgpr || def.physReg().reg() >= 256); @@ -75,20 +75,21 @@ void save_reg_writes(pr_opt_ctx &ctx, aco_ptr &instr) } } -int last_writer_idx(pr_opt_ctx &ctx, PhysReg physReg, RegClass rc) +int +last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc) { /* Verify that all of the operand's registers are written by the same instruction. */ int instr_idx = ctx.instr_idx_by_regs[physReg.reg()]; unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u); unsigned r = physReg.reg(); - bool all_same = std::all_of( - &ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], - [instr_idx](int i) { return i == instr_idx; }); + bool all_same = std::all_of(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], + [instr_idx](int i) { return i == instr_idx; }); return all_same ? 
instr_idx : written_by_multiple_instrs; } -int last_writer_idx(pr_opt_ctx &ctx, const Operand &op) +int +last_writer_idx(pr_opt_ctx& ctx, const Operand& op) { if (op.isConstant() || op.isUndefined()) return const_or_undef; @@ -104,7 +105,8 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op) return instr_idx; } -void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr &instr) +void +try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr& instr) { /* We are looking for the following pattern: * @@ -123,8 +125,7 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr &instr) if (ctx.program->chip_class < GFX8) return; - if (instr->format != Format::PSEUDO_BRANCH || - instr->operands.size() == 0 || + if (instr->format != Format::PSEUDO_BRANCH || instr->operands.size() == 0 || instr->operands[0].physReg() != scc) return; @@ -141,13 +142,12 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr &instr) last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block) return; - aco_ptr &op0_instr = ctx.current_block->instructions[op0_instr_idx]; - aco_ptr &last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx]; + aco_ptr& op0_instr = ctx.current_block->instructions[op0_instr_idx]; + aco_ptr& last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx]; if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ && op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) || - op0_instr->operands[0].physReg() != vcc || - op0_instr->operands[1].physReg() != exec || + op0_instr->operands[0].physReg() != vcc || op0_instr->operands[1].physReg() != exec || !last_vcc_wr->isVOPC()) return; @@ -159,7 +159,8 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr &instr) instr->operands[0] = op0_instr->operands[0]; } -void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) +void +try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr& instr) { /* We are looking for the following pattern: * @@ -180,8 +181,7 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) if (instr->isSOPC() && (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 || instr->opcode == aco_opcode::s_cmp_lg_u32 || instr->opcode == aco_opcode::s_cmp_lg_i32 || - instr->opcode == aco_opcode::s_cmp_eq_u64 || - instr->opcode == aco_opcode::s_cmp_lg_u64) && + instr->opcode == aco_opcode::s_cmp_eq_u64 || instr->opcode == aco_opcode::s_cmp_lg_u64) && (instr->operands[0].constantEquals(0) || instr->operands[1].constantEquals(0)) && (instr->operands[0].isTemp() || instr->operands[1].isTemp())) { /* Make sure the constant is always in operand 1 */ @@ -197,8 +197,9 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) if (wr_idx < 0 || wr_idx != sccwr_idx) return; - aco_ptr &wr_instr = ctx.current_block->instructions[wr_idx]; - if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || wr_instr->definitions[1].physReg() != scc) + aco_ptr& wr_instr = ctx.current_block->instructions[wr_idx]; + if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || + wr_instr->definitions[1].physReg() != scc) return; /* Look for instructions which set SCC := (D != 0) */ @@ -232,10 +233,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) case aco_opcode::s_ashr_i32: case aco_opcode::s_ashr_i64: case aco_opcode::s_abs_i32: - case aco_opcode::s_absdiff_i32: - break; - default: - return; + case aco_opcode::s_absdiff_i32: break; + default: return; } /* Use the SCC def from wr_instr */ @@ -245,13 +244,12 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) 
/* Set the opcode and operand to 32-bit */ instr->operands[1] = Operand(0u); - instr->opcode = (instr->opcode == aco_opcode::s_cmp_eq_u32 || - instr->opcode == aco_opcode::s_cmp_eq_i32 || - instr->opcode == aco_opcode::s_cmp_eq_u64) - ? aco_opcode::s_cmp_eq_u32 - : aco_opcode::s_cmp_lg_u32; - } else if ((instr->format == Format::PSEUDO_BRANCH && - instr->operands.size() == 1 && + instr->opcode = + (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 || + instr->opcode == aco_opcode::s_cmp_eq_u64) + ? aco_opcode::s_cmp_eq_u32 + : aco_opcode::s_cmp_lg_u32; + } else if ((instr->format == Format::PSEUDO_BRANCH && instr->operands.size() == 1 && instr->operands[0].physReg() == scc) || instr->opcode == aco_opcode::s_cselect_b32) { @@ -265,10 +263,11 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) if (wr_idx < 0) return; - aco_ptr &wr_instr = ctx.current_block->instructions[wr_idx]; + aco_ptr& wr_instr = ctx.current_block->instructions[wr_idx]; /* Check if we found the pattern above. */ - if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && wr_instr->opcode != aco_opcode::s_cmp_lg_u32) + if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && + wr_instr->opcode != aco_opcode::s_cmp_lg_u32) return; if (wr_instr->operands[0].physReg() != scc) return; @@ -282,11 +281,13 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) if (wr_instr->opcode == aco_opcode::s_cmp_eq_u32) { /* Flip the meaning of the instruction to correctly use the SCC. */ if (instr->format == Format::PSEUDO_BRANCH) - instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z; + instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz + : aco_opcode::p_cbranch_z; else if (instr->opcode == aco_opcode::s_cselect_b32) std::swap(instr->operands[0], instr->operands[1]); else - unreachable("scc_nocompare optimization is only implemented for p_cbranch and s_cselect"); + unreachable( + "scc_nocompare optimization is only implemented for p_cbranch and s_cselect"); } /* Use the SCC def from the original instruction, not the comparison */ @@ -295,7 +296,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) } } -void process_instruction(pr_opt_ctx &ctx, aco_ptr &instr) +void +process_instruction(pr_opt_ctx& ctx, aco_ptr& instr) { ctx.current_instr_idx++; @@ -307,9 +309,10 @@ void process_instruction(pr_opt_ctx &ctx, aco_ptr &instr) save_reg_writes(ctx, instr); } -} /* End of empty namespace */ +} // namespace -void optimize_postRA(Program* program) +void +optimize_postRA(Program* program) { pr_opt_ctx ctx; ctx.program = program; @@ -319,10 +322,10 @@ void optimize_postRA(Program* program) * Goes through each instruction exactly once, and can transform * instructions or adjust the use counts of temps. */ - for (auto &block : program->blocks) { + for (auto& block : program->blocks) { ctx.reset_block(&block); - for (aco_ptr &instr : block.instructions) + for (aco_ptr& instr : block.instructions) process_instruction(ctx, instr); } @@ -330,13 +333,12 @@ void optimize_postRA(Program* program) * Gets rid of instructions which are manually deleted or * no longer have any uses. 
*/ - for (auto &block : program->blocks) { - auto new_end = std::remove_if( - block.instructions.begin(), block.instructions.end(), - [&ctx](const aco_ptr &instr) { return !instr || is_dead(ctx.uses, instr.get()); }); + for (auto& block : program->blocks) { + auto new_end = std::remove_if(block.instructions.begin(), block.instructions.end(), + [&ctx](const aco_ptr& instr) + { return !instr || is_dead(ctx.uses, instr.get()); }); block.instructions.resize(new_end - block.instructions.begin()); } } -} /* End of aco namespace */ - +} // namespace aco diff --git a/src/amd/compiler/aco_print_asm.cpp b/src/amd/compiler/aco_print_asm.cpp index ec86327e212..dcc7c4bc747 100644 --- a/src/amd/compiler/aco_print_asm.cpp +++ b/src/amd/compiler/aco_print_asm.cpp @@ -39,17 +39,17 @@ namespace { /* LLVM disassembler only supports GFX8+, try to disassemble with CLRXdisasm * for GFX6-GFX7 if found on the system, this is better than nothing. -*/ -bool print_asm_gfx6_gfx7(Program *program, std::vector& binary, - FILE *output) + */ +bool +print_asm_gfx6_gfx7(Program* program, std::vector& binary, FILE* output) { #ifdef _WIN32 return true; #else char path[] = "/tmp/fileXXXXXX"; char line[2048], command[128]; - const char *gpu_type; - FILE *p; + const char* gpu_type; + FILE* p; int fd; /* Dump the binary into a temporary file. */ @@ -57,8 +57,7 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector& binary, if (fd < 0) return true; - for (uint32_t w : binary) - { + for (uint32_t w : binary) { if (write(fd, &w, sizeof(w)) == -1) goto fail; } @@ -69,30 +68,16 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector& binary, switch (program->chip_class) { case GFX6: switch (program->family) { - case CHIP_TAHITI: - gpu_type = "tahiti"; - break; - case CHIP_PITCAIRN: - gpu_type = "pitcairn"; - break; - case CHIP_VERDE: - gpu_type = "capeverde"; - break; - case CHIP_OLAND: - gpu_type = "oland"; - break; - case CHIP_HAINAN: - gpu_type = "hainan"; - break; - default: - unreachable("Invalid GFX6 family!"); + case CHIP_TAHITI: gpu_type = "tahiti"; break; + case CHIP_PITCAIRN: gpu_type = "pitcairn"; break; + case CHIP_VERDE: gpu_type = "capeverde"; break; + case CHIP_OLAND: gpu_type = "oland"; break; + case CHIP_HAINAN: gpu_type = "hainan"; break; + default: unreachable("Invalid GFX6 family!"); } break; - case GFX7: - gpu_type = "gfx700"; - break; - default: - unreachable("Invalid chip class!"); + case GFX7: gpu_type = "gfx700"; break; + default: unreachable("Invalid chip class!"); } sprintf(command, "clrxdisasm --gpuType=%s -r %s", gpu_type, path); @@ -121,22 +106,21 @@ fail: #endif } -std::pair disasm_instr(chip_class chip, LLVMDisasmContextRef disasm, - uint32_t *binary, unsigned exec_size, size_t pos, - char *outline, unsigned outline_size) +std::pair +disasm_instr(chip_class chip, LLVMDisasmContextRef disasm, uint32_t* binary, unsigned exec_size, + size_t pos, char* outline, unsigned outline_size) { /* mask out src2 on v_writelane_b32 */ if (((chip == GFX8 || chip == GFX9) && (binary[pos] & 0xffff8000) == 0xd28a0000) || (chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7610000)) { - binary[pos+1] = binary[pos+1] & 0xF803FFFF; + binary[pos + 1] = binary[pos + 1] & 0xF803FFFF; } - size_t l = LLVMDisasmInstruction(disasm, (uint8_t *) &binary[pos], - (exec_size - pos) * sizeof(uint32_t), pos * 4, - outline, outline_size); + size_t l = + LLVMDisasmInstruction(disasm, (uint8_t*)&binary[pos], (exec_size - pos) * sizeof(uint32_t), + pos * 4, outline, outline_size); - if (chip >= GFX10 && l == 8 && - ((binary[pos] & 
0xffff0000) == 0xd7610000) && + if (chip >= GFX10 && l == 8 && ((binary[pos] & 0xffff0000) == 0xd7610000) && ((binary[pos + 1] & 0x1ff) == 0xff)) { /* v_writelane with literal uses 3 dwords but llvm consumes only 2 */ l += 4; @@ -145,14 +129,14 @@ std::pair disasm_instr(chip_class chip, LLVMDisasmContextRef disas bool invalid = false; size_t size; if (!l && - ((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */ + ((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */ (chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7038000) || /* v_add_u16_e64 + clamp */ - (chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */ + (chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */ (chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd76d8000) || /* v_add3_u32 + clamp */ (chip == GFX9 && (binary[pos] & 0xffff8000) == 0xd1ff8000)) /* v_add3_u32 + clamp */) { strcpy(outline, "\tinteger addition + clamp"); - bool has_literal = chip >= GFX10 && - (((binary[pos+1] & 0x1ff) == 0xff) || (((binary[pos+1] >> 9) & 0x1ff) == 0xff)); + bool has_literal = chip >= GFX10 && (((binary[pos + 1] & 0x1ff) == 0xff) || + (((binary[pos + 1] >> 9) & 0x1ff) == 0xff)); size = 2 + has_literal; } else if (chip >= GFX10 && l == 4 && ((binary[pos] & 0xfe0001ff) == 0x020000f9)) { strcpy(outline, "\tv_cndmask_b32 + sdwa"); @@ -170,8 +154,8 @@ std::pair disasm_instr(chip_class chip, LLVMDisasmContextRef disas } } /* end namespace */ -bool print_asm(Program *program, std::vector& binary, - unsigned exec_size, FILE *output) +bool +print_asm(Program* program, std::vector& binary, unsigned exec_size, FILE* output) { if (program->chip_class <= GFX7) { /* Do not abort if clrxdisasm isn't found. 
*/ @@ -187,7 +171,7 @@ bool print_asm(Program *program, std::vector& binary, } std::vector symbols; - std::vector> block_names; + std::vector> block_names; block_names.reserve(program->blocks.size()); for (Block& block : program->blocks) { if (!referenced_blocks[block.index]) @@ -195,18 +179,18 @@ bool print_asm(Program *program, std::vector& binary, std::array name; sprintf(name.data(), "BB%u", block.index); block_names.push_back(name); - symbols.emplace_back(block.offset * 4, llvm::StringRef(block_names[block_names.size() - 1].data()), 0); + symbols.emplace_back(block.offset * 4, + llvm::StringRef(block_names[block_names.size() - 1].data()), 0); } - const char *features = ""; + const char* features = ""; if (program->chip_class >= GFX10 && program->wave_size == 64) { features = "+wavefrontsize64"; } - LLVMDisasmContextRef disasm = LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d", - ac_get_llvm_processor_name(program->family), - features, - &symbols, 0, NULL, NULL); + LLVMDisasmContextRef disasm = + LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d", ac_get_llvm_processor_name(program->family), + features, &symbols, 0, NULL, NULL); size_t pos = 0; bool invalid = false; @@ -216,7 +200,8 @@ bool print_asm(Program *program, std::vector& binary, unsigned prev_pos = 0; unsigned repeat_count = 0; while (pos < exec_size) { - bool new_block = next_block < program->blocks.size() && pos == program->blocks[next_block].offset; + bool new_block = + next_block < program->blocks.size() && pos == program->blocks[next_block].offset; if (pos + prev_size <= exec_size && prev_pos != pos && !new_block && memcmp(&binary[prev_pos], &binary[pos], prev_size * 4) == 0) { repeat_count++; @@ -235,8 +220,8 @@ bool print_asm(Program *program, std::vector& binary, } char outline[1024]; - std::pair res = disasm_instr( - program->chip_class, disasm, binary.data(), exec_size, pos, outline, sizeof(outline)); + std::pair res = disasm_instr(program->chip_class, disasm, binary.data(), + exec_size, pos, outline, sizeof(outline)); invalid |= res.first; fprintf(output, "%-60s ;", outline); @@ -271,4 +256,4 @@ bool print_asm(Program *program, std::vector& binary, return invalid; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index c45e823ca65..339b938c3eb 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -86,36 +86,38 @@ const std::array reduce_ops = []() return ret; }(); -static void print_reg_class(const RegClass rc, FILE *output) +static void +print_reg_class(const RegClass rc, FILE* output) { switch (rc) { - case RegClass::s1: fprintf(output, " s1: "); return; - case RegClass::s2: fprintf(output, " s2: "); return; - case RegClass::s3: fprintf(output, " s3: "); return; - case RegClass::s4: fprintf(output, " s4: "); return; - case RegClass::s6: fprintf(output, " s6: "); return; - case RegClass::s8: fprintf(output, " s8: "); return; - case RegClass::s16: fprintf(output, "s16: "); return; - case RegClass::v1: fprintf(output, " v1: "); return; - case RegClass::v2: fprintf(output, " v2: "); return; - case RegClass::v3: fprintf(output, " v3: "); return; - case RegClass::v4: fprintf(output, " v4: "); return; - case RegClass::v5: fprintf(output, " v5: "); return; - case RegClass::v6: fprintf(output, " v6: "); return; - case RegClass::v7: fprintf(output, " v7: "); return; - case RegClass::v8: fprintf(output, " v8: "); return; - case RegClass::v1b: fprintf(output, " v1b: "); return; - case RegClass::v2b: fprintf(output, " v2b: "); return; 
- case RegClass::v3b: fprintf(output, " v3b: "); return; - case RegClass::v4b: fprintf(output, " v4b: "); return; - case RegClass::v6b: fprintf(output, " v6b: "); return; - case RegClass::v8b: fprintf(output, " v8b: "); return; - case RegClass::v1_linear: fprintf(output, " v1: "); return; - case RegClass::v2_linear: fprintf(output, " v2: "); return; + case RegClass::s1: fprintf(output, " s1: "); return; + case RegClass::s2: fprintf(output, " s2: "); return; + case RegClass::s3: fprintf(output, " s3: "); return; + case RegClass::s4: fprintf(output, " s4: "); return; + case RegClass::s6: fprintf(output, " s6: "); return; + case RegClass::s8: fprintf(output, " s8: "); return; + case RegClass::s16: fprintf(output, "s16: "); return; + case RegClass::v1: fprintf(output, " v1: "); return; + case RegClass::v2: fprintf(output, " v2: "); return; + case RegClass::v3: fprintf(output, " v3: "); return; + case RegClass::v4: fprintf(output, " v4: "); return; + case RegClass::v5: fprintf(output, " v5: "); return; + case RegClass::v6: fprintf(output, " v6: "); return; + case RegClass::v7: fprintf(output, " v7: "); return; + case RegClass::v8: fprintf(output, " v8: "); return; + case RegClass::v1b: fprintf(output, " v1b: "); return; + case RegClass::v2b: fprintf(output, " v2b: "); return; + case RegClass::v3b: fprintf(output, " v3b: "); return; + case RegClass::v4b: fprintf(output, " v4b: "); return; + case RegClass::v6b: fprintf(output, " v6b: "); return; + case RegClass::v8b: fprintf(output, " v8b: "); return; + case RegClass::v1_linear: fprintf(output, " v1: "); return; + case RegClass::v2_linear: fprintf(output, " v2: "); return; } } -void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags) +void +print_physReg(PhysReg reg, unsigned bytes, FILE* output, unsigned flags) { if (reg == 124) { fprintf(output, "m0"); @@ -134,16 +136,17 @@ void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags) } else { fprintf(output, "%c[%d", is_vgpr ? 
'v' : 's', r); if (size > 1) - fprintf(output, "-%d]", r + size -1); + fprintf(output, "-%d]", r + size - 1); else fprintf(output, "]"); } if (reg.byte() || bytes % 4) - fprintf(output, "[%d:%d]", reg.byte()*8, (reg.byte()+bytes) * 8); + fprintf(output, "[%d:%d]", reg.byte() * 8, (reg.byte() + bytes) * 8); } } -static void print_constant(uint8_t reg, FILE *output) +static void +print_constant(uint8_t reg, FILE* output) { if (reg >= 128 && reg <= 192) { fprintf(output, "%d", reg - 128); @@ -154,37 +157,20 @@ static void print_constant(uint8_t reg, FILE *output) } switch (reg) { - case 240: - fprintf(output, "0.5"); - break; - case 241: - fprintf(output, "-0.5"); - break; - case 242: - fprintf(output, "1.0"); - break; - case 243: - fprintf(output, "-1.0"); - break; - case 244: - fprintf(output, "2.0"); - break; - case 245: - fprintf(output, "-2.0"); - break; - case 246: - fprintf(output, "4.0"); - break; - case 247: - fprintf(output, "-4.0"); - break; - case 248: - fprintf(output, "1/(2*PI)"); - break; + case 240: fprintf(output, "0.5"); break; + case 241: fprintf(output, "-0.5"); break; + case 242: fprintf(output, "1.0"); break; + case 243: fprintf(output, "-1.0"); break; + case 244: fprintf(output, "2.0"); break; + case 245: fprintf(output, "-2.0"); break; + case 246: fprintf(output, "4.0"); break; + case 247: fprintf(output, "-4.0"); break; + case 248: fprintf(output, "1/(2*PI)"); break; } } -void aco_print_operand(const Operand *operand, FILE *output, unsigned flags) +void +aco_print_operand(const Operand* operand, FILE* output, unsigned flags) { if (operand->isLiteral() || (operand->isConstant() && operand->bytes() == 1)) { if (operand->bytes() == 1) @@ -216,7 +202,8 @@ void aco_print_operand(const Operand *operand, FILE *output, unsigned flags) } } -static void print_definition(const Definition *definition, FILE *output, unsigned flags) +static void +print_definition(const Definition* definition, FILE* output, unsigned flags) { if (!(flags & print_no_ssa)) print_reg_class(definition->regClass(), output); @@ -235,7 +222,8 @@ static void print_definition(const Definition *definition, FILE *output, unsigne print_physReg(definition->physReg(), definition->bytes(), output, flags); } -static void print_storage(storage_class storage, FILE *output) +static void +print_storage(storage_class storage, FILE* output) { fprintf(output, " storage:"); int printed = 0; @@ -255,7 +243,8 @@ static void print_storage(storage_class storage, FILE *output) printed += fprintf(output, "%svgpr_spill", printed ? "," : ""); } -static void print_semantics(memory_semantics sem, FILE *output) +static void +print_semantics(memory_semantics sem, FILE* output) { fprintf(output, " semantics:"); int printed = 0; @@ -275,36 +264,29 @@ static void print_semantics(memory_semantics sem, FILE *output) printed += fprintf(output, "%srmw", printed ? 
"," : ""); } -static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope") +static void +print_scope(sync_scope scope, FILE* output, const char* prefix = "scope") { fprintf(output, " %s:", prefix); switch (scope) { - case scope_invocation: - fprintf(output, "invocation"); - break; - case scope_subgroup: - fprintf(output, "subgroup"); - break; - case scope_workgroup: - fprintf(output, "workgroup"); - break; - case scope_queuefamily: - fprintf(output, "queuefamily"); - break; - case scope_device: - fprintf(output, "device"); - break; + case scope_invocation: fprintf(output, "invocation"); break; + case scope_subgroup: fprintf(output, "subgroup"); break; + case scope_workgroup: fprintf(output, "workgroup"); break; + case scope_queuefamily: fprintf(output, "queuefamily"); break; + case scope_device: fprintf(output, "device"); break; } } -static void print_sync(memory_sync_info sync, FILE *output) +static void +print_sync(memory_sync_info sync, FILE* output) { print_storage(sync.storage, output); print_semantics(sync.semantics, output); print_scope(sync.scope, output); } -static void print_instr_format_specific(const Instruction *instr, FILE *output) +static void +print_instr_format_specific(const Instruction* instr, FILE* output) { switch (instr->format) { case Format::SOPK: { @@ -319,9 +301,12 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) /* we usually should check the chip class for vmcnt/lgkm, but * insert_waitcnt() should fill it in regardless. */ unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10); - if (vmcnt != 63) fprintf(output, " vmcnt(%d)", vmcnt); - if (((imm >> 4) & 0x7) < 0x7) fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7); - if (((imm >> 8) & 0x3F) < 0x3F) fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F); + if (vmcnt != 63) + fprintf(output, " vmcnt(%d)", vmcnt); + if (((imm >> 4) & 0x7) < 0x7) + fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7); + if (((imm >> 8) & 0x3F) < 0x3F) + fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F); break; } case aco_opcode::s_endpgm: @@ -337,35 +322,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) case aco_opcode::s_sendmsg: { unsigned id = imm & sendmsg_id_mask; switch (id) { - case sendmsg_none: - fprintf(output, " sendmsg(MSG_NONE)"); - break; + case sendmsg_none: fprintf(output, " sendmsg(MSG_NONE)"); break; case _sendmsg_gs: - fprintf(output, " sendmsg(gs%s%s, %u)", - imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8); + fprintf(output, " sendmsg(gs%s%s, %u)", imm & 0x10 ? ", cut" : "", + imm & 0x20 ? ", emit" : "", imm >> 8); break; case _sendmsg_gs_done: - fprintf(output, " sendmsg(gs_done%s%s, %u)", - imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8); - break; - case sendmsg_save_wave: - fprintf(output, " sendmsg(save_wave)"); - break; - case sendmsg_stall_wave_gen: - fprintf(output, " sendmsg(stall_wave_gen)"); - break; - case sendmsg_halt_waves: - fprintf(output, " sendmsg(halt_waves)"); - break; - case sendmsg_ordered_ps_done: - fprintf(output, " sendmsg(ordered_ps_done)"); - break; - case sendmsg_early_prim_dealloc: - fprintf(output, " sendmsg(early_prim_dealloc)"); - break; - case sendmsg_gs_alloc_req: - fprintf(output, " sendmsg(gs_alloc_req)"); + fprintf(output, " sendmsg(gs_done%s%s, %u)", imm & 0x10 ? ", cut" : "", + imm & 0x20 ? 
", emit" : "", imm >> 8); break; + case sendmsg_save_wave: fprintf(output, " sendmsg(save_wave)"); break; + case sendmsg_stall_wave_gen: fprintf(output, " sendmsg(stall_wave_gen)"); break; + case sendmsg_halt_waves: fprintf(output, " sendmsg(halt_waves)"); break; + case sendmsg_ordered_ps_done: fprintf(output, " sendmsg(ordered_ps_done)"); break; + case sendmsg_early_prim_dealloc: fprintf(output, " sendmsg(early_prim_dealloc)"); break; + case sendmsg_gs_alloc_req: fprintf(output, " sendmsg(gs_alloc_req)"); break; } break; } @@ -433,40 +404,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) } case Format::MIMG: { const MIMG_instruction& mimg = instr->mimg(); - unsigned identity_dmask = !instr->definitions.empty() ? - (1 << instr->definitions[0].size()) - 1 : - 0xf; + unsigned identity_dmask = + !instr->definitions.empty() ? (1 << instr->definitions[0].size()) - 1 : 0xf; if ((mimg.dmask & identity_dmask) != identity_dmask) - fprintf(output, " dmask:%s%s%s%s", - mimg.dmask & 0x1 ? "x" : "", - mimg.dmask & 0x2 ? "y" : "", - mimg.dmask & 0x4 ? "z" : "", + fprintf(output, " dmask:%s%s%s%s", mimg.dmask & 0x1 ? "x" : "", + mimg.dmask & 0x2 ? "y" : "", mimg.dmask & 0x4 ? "z" : "", mimg.dmask & 0x8 ? "w" : ""); switch (mimg.dim) { - case ac_image_1d: - fprintf(output, " 1d"); - break; - case ac_image_2d: - fprintf(output, " 2d"); - break; - case ac_image_3d: - fprintf(output, " 3d"); - break; - case ac_image_cube: - fprintf(output, " cube"); - break; - case ac_image_1darray: - fprintf(output, " 1darray"); - break; - case ac_image_2darray: - fprintf(output, " 2darray"); - break; - case ac_image_2dmsaa: - fprintf(output, " 2dmsaa"); - break; - case ac_image_2darraymsaa: - fprintf(output, " 2darraymsaa"); - break; + case ac_image_1d: fprintf(output, " 1d"); break; + case ac_image_2d: fprintf(output, " 2d"); break; + case ac_image_3d: fprintf(output, " 3d"); break; + case ac_image_cube: fprintf(output, " cube"); break; + case ac_image_1darray: fprintf(output, " 1darray"); break; + case ac_image_2darray: fprintf(output, " 2darray"); break; + case ac_image_2dmsaa: fprintf(output, " 2dmsaa"); break; + case ac_image_2darraymsaa: fprintf(output, " 2darraymsaa"); break; } if (mimg.unrm) fprintf(output, " unrm"); @@ -495,10 +447,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) const Export_instruction& exp = instr->exp(); unsigned identity_mask = exp.compressed ? 0x5 : 0xf; if ((exp.enabled_mask & identity_mask) != identity_mask) - fprintf(output, " en:%c%c%c%c", - exp.enabled_mask & 0x1 ? 'r' : '*', - exp.enabled_mask & 0x2 ? 'g' : '*', - exp.enabled_mask & 0x4 ? 'b' : '*', + fprintf(output, " en:%c%c%c%c", exp.enabled_mask & 0x1 ? 'r' : '*', + exp.enabled_mask & 0x2 ? 'g' : '*', exp.enabled_mask & 0x4 ? 'b' : '*', exp.enabled_mask & 0x8 ? 
'a' : '*'); if (exp.compressed) fprintf(output, " compr"); @@ -624,15 +574,9 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) if (instr->isVOP3()) { const VOP3_instruction& vop3 = instr->vop3(); switch (vop3.omod) { - case 1: - fprintf(output, " *2"); - break; - case 2: - fprintf(output, " *4"); - break; - case 3: - fprintf(output, " *0.5"); - break; + case 1: fprintf(output, " *2"); break; + case 2: fprintf(output, " *4"); break; + case 3: fprintf(output, " *0.5"); break; } if (vop3.clamp) fprintf(output, " clamp"); @@ -641,8 +585,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) } else if (instr->isDPP()) { const DPP_instruction& dpp = instr->dpp(); if (dpp.dpp_ctrl <= 0xff) { - fprintf(output, " quad_perm:[%d,%d,%d,%d]", - dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3, + fprintf(output, " quad_perm:[%d,%d,%d,%d]", dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3, (dpp.dpp_ctrl >> 4) & 0x3, (dpp.dpp_ctrl >> 6) & 0x3); } else if (dpp.dpp_ctrl >= 0x101 && dpp.dpp_ctrl <= 0x10f) { fprintf(output, " row_shl:%d", dpp.dpp_ctrl & 0xf); @@ -678,21 +621,14 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) } else if (instr->isSDWA()) { const SDWA_instruction& sdwa = instr->sdwa(); switch (sdwa.omod) { - case 1: - fprintf(output, " *2"); - break; - case 2: - fprintf(output, " *4"); - break; - case 3: - fprintf(output, " *0.5"); - break; + case 1: fprintf(output, " *2"); break; + case 2: fprintf(output, " *4"); break; + case 3: fprintf(output, " *0.5"); break; } if (sdwa.clamp) fprintf(output, " clamp"); switch (sdwa.dst_sel & sdwa_asuint) { - case sdwa_udword: - break; + case sdwa_udword: break; case sdwa_ubyte0: case sdwa_ubyte1: case sdwa_ubyte2: @@ -711,7 +647,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) } } -void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags) +void +aco_print_instr(const Instruction* instr, FILE* output, unsigned flags) { if (!instr->definitions.empty()) { for (unsigned i = 0; i < instr->definitions.size(); ++i) { @@ -723,10 +660,10 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags) } fprintf(output, "%s", instr_info.name[(int)instr->opcode]); if (instr->operands.size()) { - bool *const abs = (bool *)alloca(instr->operands.size() * sizeof(bool)); - bool *const neg = (bool *)alloca(instr->operands.size() * sizeof(bool)); - bool *const opsel = (bool *)alloca(instr->operands.size() * sizeof(bool)); - uint8_t *const sel = (uint8_t *)alloca(instr->operands.size() * sizeof(uint8_t)); + bool* const abs = (bool*)alloca(instr->operands.size() * sizeof(bool)); + bool* const neg = (bool*)alloca(instr->operands.size() * sizeof(bool)); + bool* const opsel = (bool*)alloca(instr->operands.size() * sizeof(bool)); + uint8_t* const sel = (uint8_t*)alloca(instr->operands.size() * sizeof(uint8_t)); for (unsigned i = 0; i < instr->operands.size(); ++i) { abs[i] = false; neg[i] = false; @@ -792,8 +729,7 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags) if (instr->isVOP3P()) { const VOP3P_instruction& vop3 = instr->vop3p(); if ((vop3.opsel_lo & (1 << i)) || !(vop3.opsel_hi & (1 << i))) { - fprintf(output, ".%c%c", - vop3.opsel_lo & (1 << i) ? 'y' : 'x', + fprintf(output, ".%c%c", vop3.opsel_lo & (1 << i) ? 'y' : 'x', vop3.opsel_hi & (1 << i) ? 
'y' : 'x'); } if (vop3.neg_lo[i] && vop3.neg_hi[i]) @@ -808,7 +744,8 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags) print_instr_format_specific(instr, output); } -static void print_block_kind(uint16_t kind, FILE *output) +static void +print_block_kind(uint16_t kind, FILE* output) { if (kind & block_kind_uniform) fprintf(output, "uniform, "); @@ -844,7 +781,8 @@ static void print_block_kind(uint16_t kind, FILE *output) fprintf(output, "export_end, "); } -static void print_stage(Stage stage, FILE *output) +static void +print_stage(Stage stage, FILE* output) { fprintf(output, "ACO shader stage: "); @@ -888,7 +826,8 @@ static void print_stage(Stage stage, FILE *output) fprintf(output, "\n"); } -void aco_print_block(const Block* block, FILE *output, unsigned flags, const live& live_vars) +void +aco_print_block(const Block* block, FILE* output, unsigned flags, const live& live_vars) { fprintf(output, "BB%d\n", block->index); fprintf(output, "/* logical preds: "); @@ -927,19 +866,16 @@ void aco_print_block(const Block* block, FILE *output, unsigned flags, const liv } } -void aco_print_program(const Program *program, FILE *output, const live& live_vars, unsigned flags) +void +aco_print_program(const Program* program, FILE* output, const live& live_vars, unsigned flags) { switch (program->progress) { - case CompilationProgress::after_isel: - fprintf(output, "After Instruction Selection:\n"); - break; + case CompilationProgress::after_isel: fprintf(output, "After Instruction Selection:\n"); break; case CompilationProgress::after_spilling: fprintf(output, "After Spilling:\n"); flags |= print_kill; break; - case CompilationProgress::after_ra: - fprintf(output, "After RA:\n"); - break; + case CompilationProgress::after_ra: fprintf(output, "After RA:\n"); break; } print_stage(program->stage, output); @@ -965,9 +901,10 @@ void aco_print_program(const Program *program, FILE *output, const live& live_va fprintf(output, "\n"); } -void aco_print_program(const Program *program, FILE *output, unsigned flags) +void +aco_print_program(const Program* program, FILE* output, unsigned flags) { aco_print_program(program, output, live(), flags); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp index c7ba4ff16a2..ce99779327b 100644 --- a/src/amd/compiler/aco_reduce_assign.cpp +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -36,7 +36,8 @@ namespace aco { -void setup_reduce_temp(Program* program) +void +setup_reduce_temp(Program* program) { unsigned last_top_level_block_idx = 0; unsigned maxSize = 0; @@ -69,7 +70,8 @@ void setup_reduce_temp(Program* program) if (reduceTmp_in_loop && block.loop_nest_depth == 0) { assert(inserted_at == (int)last_top_level_block_idx); - aco_ptr end{create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)}; + aco_ptr end{create_instruction( + aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 
2 : 1, 0)}; end->operands[0] = Operand(reduceTmp); if (vtmp_in_loop) end->operands[1] = Operand(vtmp); @@ -89,7 +91,7 @@ void setup_reduce_temp(Program* program) std::vector>::iterator it; for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { - Instruction *instr = (*it).get(); + Instruction* instr = (*it).get(); if (instr->format != Format::PSEUDO_REDUCTION) continue; @@ -98,7 +100,8 @@ void setup_reduce_temp(Program* program) if ((int)last_top_level_block_idx != inserted_at) { reduceTmp = program->allocateTmp(reduceTmp.regClass()); - aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + aco_ptr create{create_instruction( + aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(reduceTmp); /* find the right place to insert this definition */ if (last_top_level_block_idx == block.index) { @@ -110,18 +113,19 @@ void setup_reduce_temp(Program* program) } else { assert(last_top_level_block_idx < block.index); /* insert before the branch at last top level block */ - std::vector>& instructions = program->blocks[last_top_level_block_idx].instructions; - instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + std::vector>& instructions = + program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), + std::move(create)); inserted_at = last_top_level_block_idx; } } /* same as before, except for the vector temporary instead of the reduce temporary */ unsigned cluster_size = instr->reduction().cluster_size; - bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || - op == fmin64 || op == fmax64 || op == umin64 || - op == umax64 || op == imin64 || op == imax64 || - op == imul64; + bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 || + op == fmax64 || op == umin64 || op == umax64 || op == imin64 || + op == imax64 || op == imul64; bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 || op == imul16 || op == imax16 || op == imin16 || op == umin16 || op == iadd64; @@ -138,15 +142,18 @@ void setup_reduce_temp(Program* program) vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0; if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) { vtmp = program->allocateTmp(vtmp.regClass()); - aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + aco_ptr create{create_instruction( + aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(vtmp); if (last_top_level_block_idx == block.index) { it = block.instructions.insert(it, std::move(create)); it++; } else { assert(last_top_level_block_idx < block.index); - std::vector>& instructions = program->blocks[last_top_level_block_idx].instructions; - instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + std::vector>& instructions = + program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), + std::move(create)); vtmp_inserted_at = last_top_level_block_idx; } } @@ -158,5 +165,4 @@ void setup_reduce_temp(Program* program) } } -}; - +}; // namespace aco diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 9723caddc47..3ec0b21db48 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ 
b/src/amd/compiler/aco_register_allocation.cpp @@ -37,10 +37,14 @@ namespace { struct ra_ctx; -unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, unsigned idx, RegClass rc); -void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, unsigned byte, RegClass rc); -std::pair get_subdword_definition_info(Program *program, const aco_ptr& instr, RegClass rc); -void add_subdword_definition(Program *program, aco_ptr& instr, unsigned idx, PhysReg reg); +unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, + unsigned idx, RegClass rc); +void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, unsigned byte, + RegClass rc); +std::pair +get_subdword_definition_info(Program* program, const aco_ptr& instr, RegClass rc); +void add_subdword_definition(Program* program, aco_ptr& instr, unsigned idx, + PhysReg reg); struct assignment { PhysReg reg; @@ -71,12 +75,11 @@ struct ra_ctx { ra_test_policy policy; ra_ctx(Program* program_, ra_test_policy policy_) - : program(program_), - assignments(program->peekAllocationId()), - renames(program->blocks.size()), - policy(policy_) + : program(program_), assignments(program->peekAllocationId()), + renames(program->blocks.size()), policy(policy_) { - pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); + pseudo_dummy.reset( + create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); } @@ -92,31 +95,25 @@ struct PhysRegIterator { PhysReg reg; - PhysReg operator*() const { - return reg; - } + PhysReg operator*() const { return reg; } - PhysRegIterator& operator++() { + PhysRegIterator& operator++() + { reg.reg_b += 4; return *this; } - PhysRegIterator& operator--() { + PhysRegIterator& operator--() + { reg.reg_b -= 4; return *this; } - bool operator==(PhysRegIterator oth) const { - return reg == oth.reg; - } + bool operator==(PhysRegIterator oth) const { return reg == oth.reg; } - bool operator!=(PhysRegIterator oth) const { - return reg != oth.reg; - } + bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; } - bool operator<(PhysRegIterator oth) const { - return reg < oth.reg; - } + bool operator<(PhysRegIterator oth) const { return reg < oth.reg; } }; /* Half-open register interval used in "sliding window"-style for-loops */ @@ -125,72 +122,65 @@ struct PhysRegInterval { unsigned size; /* Inclusive lower bound */ - PhysReg lo() const { - return lo_; - } + PhysReg lo() const { return lo_; } /* Exclusive upper bound */ - PhysReg hi() const { - return PhysReg { lo() + size }; - } + PhysReg hi() const { return PhysReg{lo() + size}; } - PhysRegInterval& operator+=(uint32_t stride) { - lo_ = PhysReg { lo_.reg() + stride }; + PhysRegInterval& operator+=(uint32_t stride) + { + lo_ = PhysReg{lo_.reg() + stride}; return *this; } - bool operator!=(const PhysRegInterval& oth) const { - return lo_ != oth.lo_ || size != oth.size; - } + bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; } /* Construct a half-open interval, excluding the end register */ - static PhysRegInterval from_until(PhysReg first, PhysReg end) { - return { first, end - first }; - } + static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; } - bool contains(PhysReg reg) const { - return lo() <= reg && reg < hi(); - } + bool contains(PhysReg reg) const { return lo() 
<= reg && reg < hi(); } - bool contains(const PhysRegInterval& needle) const { + bool contains(const PhysRegInterval& needle) const + { return needle.lo() >= lo() && needle.hi() <= hi(); } - PhysRegIterator begin() const { - return { lo_ }; - } + PhysRegIterator begin() const { return {lo_}; } - PhysRegIterator end() const { - return { PhysReg { lo_ + size } }; - } + PhysRegIterator end() const { return {PhysReg{lo_ + size}}; } }; -bool intersects(const PhysRegInterval& a, const PhysRegInterval& b) { - return ((a.lo() >= b.lo() && a.lo() < b.hi()) || - (a.hi() > b.lo() && a.hi() <= b.hi())); +bool +intersects(const PhysRegInterval& a, const PhysRegInterval& b) +{ + return ((a.lo() >= b.lo() && a.lo() < b.hi()) || (a.hi() > b.lo() && a.hi() <= b.hi())); } /* Gets the stride for full (non-subdword) registers */ -uint32_t get_stride(RegClass rc) { - if (rc.type() == RegType::vgpr) { - return 1; - } else { - uint32_t size = rc.size(); - if (size == 2) { - return 2; - } else if (size >= 4) { - return 4; - } else { - return 1; - } - } +uint32_t +get_stride(RegClass rc) +{ + if (rc.type() == RegType::vgpr) { + return 1; + } else { + uint32_t size = rc.size(); + if (size == 2) { + return 2; + } else if (size >= 4) { + return 4; + } else { + return 1; + } + } } -PhysRegInterval get_reg_bounds(Program* program, RegType type) { +PhysRegInterval +get_reg_bounds(Program* program, RegType type) +{ if (type == RegType::vgpr) { - return { PhysReg { 256 }, (unsigned)program->max_reg_demand.vgpr }; + return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr}; } else { - return { PhysReg { 0 }, (unsigned)program->max_reg_demand.sgpr }; + return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr}; } } @@ -200,7 +190,8 @@ struct DefInfo { uint8_t stride; RegClass rc; - DefInfo(ra_ctx& ctx, aco_ptr& instr, RegClass rc_, int operand) : rc(rc_) { + DefInfo(ra_ctx& ctx, aco_ptr& instr, RegClass rc_, int operand) : rc(rc_) + { size = rc.size(); stride = get_stride(rc); @@ -229,20 +220,17 @@ struct DefInfo { class RegisterFile { public: - RegisterFile() {regs.fill(0);} + RegisterFile() { regs.fill(0); } std::array regs; std::map> subdword_regs; - const uint32_t& operator [] (PhysReg index) const { - return regs[index]; - } + const uint32_t& operator[](PhysReg index) const { return regs[index]; } - uint32_t& operator [] (PhysReg index) { - return regs[index]; - } + uint32_t& operator[](PhysReg index) { return regs[index]; } - unsigned count_zero(PhysRegInterval reg_interval) { + unsigned count_zero(PhysRegInterval reg_interval) + { unsigned res = 0; for (PhysReg reg : reg_interval) res += !regs[reg]; @@ -250,7 +238,8 @@ public: } /* Returns true if any of the bytes in the given range are allocated or blocked */ - bool test(PhysReg start, unsigned num_bytes) { + bool test(PhysReg start, unsigned num_bytes) + { for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) { assert(i <= 511); if (regs[i] & 0x0FFFFFFF) @@ -266,14 +255,16 @@ public: return false; } - void block(PhysReg start, RegClass rc) { + void block(PhysReg start, RegClass rc) + { if (rc.is_subdword()) fill_subdword(start, rc.bytes(), 0xFFFFFFFF); else fill(start, rc.size(), 0xFFFFFFFF); } - bool is_blocked(PhysReg start) { + bool is_blocked(PhysReg start) + { if (regs[start] == 0xFFFFFFFF) return true; if (regs[start] == 0xF0000000) { @@ -284,7 +275,8 @@ public: return false; } - bool is_empty_or_blocked(PhysReg start) { + bool is_empty_or_blocked(PhysReg start) + { /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we 
compare the * incremented value to 1 */ if (regs[start] == 0xF0000000) { @@ -293,50 +285,53 @@ public: return regs[start] + 1 <= 1; } - void clear(PhysReg start, RegClass rc) { + void clear(PhysReg start, RegClass rc) + { if (rc.is_subdword()) fill_subdword(start, rc.bytes(), 0); else fill(start, rc.size(), 0); } - void fill(Operand op) { + void fill(Operand op) + { if (op.regClass().is_subdword()) fill_subdword(op.physReg(), op.bytes(), op.tempId()); else fill(op.physReg(), op.size(), op.tempId()); } - void clear(Operand op) { - clear(op.physReg(), op.regClass()); - } + void clear(Operand op) { clear(op.physReg(), op.regClass()); } - void fill(Definition def) { + void fill(Definition def) + { if (def.regClass().is_subdword()) fill_subdword(def.physReg(), def.bytes(), def.tempId()); else fill(def.physReg(), def.size(), def.tempId()); } - void clear(Definition def) { - clear(def.physReg(), def.regClass()); - } + void clear(Definition def) { clear(def.physReg(), def.regClass()); } - unsigned get_id(PhysReg reg) { + unsigned get_id(PhysReg reg) + { return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg]; } private: - void fill(PhysReg start, unsigned size, uint32_t val) { + void fill(PhysReg start, unsigned size, uint32_t val) + { for (unsigned i = 0; i < size; i++) regs[start + i] = val; } - void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) { + void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) + { fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000); for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) { /* emplace or get */ - std::array& sub = subdword_regs.emplace(i, std::array{0, 0, 0, 0}).first->second; + std::array& sub = + subdword_regs.emplace(i, std::array{0, 0, 0, 0}).first->second; for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) sub[j] = val; @@ -348,22 +343,25 @@ private: } }; - std::set> find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval); /* helper function for debugging */ -UNUSED void print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable) { +UNUSED void +print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable) +{ if (reg_file[reg] == 0xFFFFFFFF) { printf("☐"); } else if (reg_file[reg]) { const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000); if (show_subdword_alloc) { const char* block_chars[] = { + // clang-format off "?", "▘", "▝", "▀", "▖", "▌", "▞", "▛", "▗", "▚", "▐", "▜", "▄", "▙", "▟", "▉" + // clang-format on }; unsigned index = 0; for (int i = 0; i < 4; ++i) { @@ -387,7 +385,8 @@ UNUSED void print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjace } /* helper function for debugging */ -UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) +UNUSED void +print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) { PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr); char reg_char = vgprs ? 
'v' : 's'; @@ -403,7 +402,8 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) /* print usage */ auto line_begin_it = regs.begin(); while (line_begin_it != regs.end()) { - const int regs_in_line = std::min(max_regs_per_line, std::distance(line_begin_it, regs.end())); + const int regs_in_line = + std::min(max_regs_per_line, std::distance(line_begin_it, regs.end())); if (line_begin_it == regs.begin()) { printf("%cgprs: ", reg_char); @@ -413,9 +413,9 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) const auto line_end_it = std::next(line_begin_it, regs_in_line); for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) { - bool has_adjacent_variable = (std::next(reg_it) != line_end_it && - reg_file[*reg_it] != reg_file[*std::next(reg_it)] && - reg_file[*std::next(reg_it)]); + bool has_adjacent_variable = + (std::next(reg_it) != line_end_it && + reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]); print_reg(reg_file, *reg_it, has_adjacent_variable); } @@ -423,11 +423,13 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) printf("\n"); } - const unsigned free_regs = std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; }); + const unsigned free_regs = + std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; }); printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size); /* print assignments ordered by registers */ - std::map> regs_to_vars; /* maps to byte size and temp id */ + std::map> + regs_to_vars; /* maps to byte size and temp id */ for (const auto& size_id : find_vars(ctx, reg_file, regs)) { auto reg = ctx.assignments[size_id.second].reg; ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id); @@ -439,7 +441,8 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) const auto& size_id = reg_and_var.second; printf("%%%u ", size_id.second); - if (ctx.orig_names.count(size_id.second) && ctx.orig_names[size_id.second].id() != size_id.second) { + if (ctx.orig_names.count(size_id.second) && + ctx.orig_names[size_id.second].id() != size_id.second) { printf("(was %%%d) ", ctx.orig_names[size_id.second].id()); } printf("= %c[%d", reg_char, first_reg.reg() - regs.lo()); @@ -456,8 +459,9 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) } } - -unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, unsigned idx, RegClass rc) +unsigned +get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, unsigned idx, + RegClass rc) { /* v_readfirstlane_b32 cannot use SDWA */ if (instr->opcode == aco_opcode::p_as_uniform) @@ -477,8 +481,7 @@ unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr switch (instr->opcode) { case aco_opcode::ds_write_b8: - case aco_opcode::ds_write_b16: - return chip >= GFX8 ? 2 : 4; + case aco_opcode::ds_write_b16: return chip >= GFX8 ? 2 : 4; case aco_opcode::buffer_store_byte: case aco_opcode::buffer_store_short: case aco_opcode::flat_store_byte: @@ -486,16 +489,16 @@ unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr case aco_opcode::scratch_store_byte: case aco_opcode::scratch_store_short: case aco_opcode::global_store_byte: - case aco_opcode::global_store_short: - return chip >= GFX9 ? 2 : 4; - default: - break; + case aco_opcode::global_store_short: return chip >= GFX9 ? 
2 : 4; + default: break; } return 4; } -void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, unsigned byte, RegClass rc) +void +add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, unsigned byte, + RegClass rc) { chip_class chip = ctx.program->chip_class; if (instr->isPseudo() || byte == 0) @@ -505,18 +508,10 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx if (!instr->usesModifiers() && instr->opcode == aco_opcode::v_cvt_f32_ubyte0) { switch (byte) { - case 0: - instr->opcode = aco_opcode::v_cvt_f32_ubyte0; - break; - case 1: - instr->opcode = aco_opcode::v_cvt_f32_ubyte1; - break; - case 2: - instr->opcode = aco_opcode::v_cvt_f32_ubyte2; - break; - case 3: - instr->opcode = aco_opcode::v_cvt_f32_ubyte3; - break; + case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; + case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; + case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; + case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; } return; } else if (can_use_SDWA(chip, instr, false)) { @@ -565,7 +560,8 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx } /* minimum_stride, bytes_written */ -std::pair get_subdword_definition_info(Program *program, const aco_ptr& instr, RegClass rc) +std::pair +get_subdword_definition_info(Program* program, const aco_ptr& instr, RegClass rc) { chip_class chip = program->chip_class; @@ -581,11 +577,8 @@ std::pair get_subdword_definition_info(Program *program, con case aco_opcode::v_mad_i16: case aco_opcode::v_fma_f16: case aco_opcode::v_div_fixup_f16: - case aco_opcode::v_interp_p2_f16: - bytes_written = chip >= GFX9 ? rc.bytes() : 4u; - break; - default: - break; + case aco_opcode::v_interp_p2_f16: bytes_written = chip >= GFX9 ? rc.bytes() : 4u; break; + default: break; } bytes_written = bytes_written > 4 ? 
align(bytes_written, 4) : bytes_written; bytes_written = MAX2(bytes_written, instr_info.definition_size[(int)instr->opcode] / 8u); @@ -611,16 +604,15 @@ std::pair get_subdword_definition_info(Program *program, con return std::make_pair(2u, 2u); else return std::make_pair(2u, 4u); - case aco_opcode::v_fma_mixlo_f16: - return std::make_pair(2u, 2u); - default: - break; + case aco_opcode::v_fma_mixlo_f16: return std::make_pair(2u, 2u); + default: break; } return std::make_pair(4u, bytes_written); } -void add_subdword_definition(Program *program, aco_ptr& instr, unsigned idx, PhysReg reg) +void +add_subdword_definition(Program* program, aco_ptr& instr, unsigned idx, PhysReg reg) { RegClass rc = instr->definitions[idx].regClass(); chip_class chip = program->chip_class; @@ -632,7 +624,8 @@ void add_subdword_definition(Program *program, aco_ptr& instr, unsi if (reg.byte() || chip < GFX10 || def_size > rc.bytes() * 8u) convert_to_SDWA(chip, instr); return; - } else if (reg.byte() && rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, reg.byte() / 2)) { + } else if (reg.byte() && rc.bytes() == 2 && + can_use_opsel(chip, instr->opcode, -1, reg.byte() / 2)) { VOP3_instruction& vop3 = instr->vop3(); if (reg.byte() == 2) vop3.opsel |= (1 << 3); /* dst in high half */ @@ -667,7 +660,8 @@ void add_subdword_definition(Program *program, aco_ptr& instr, unsi } } -void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) +void +adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) { uint16_t max_addressible_sgpr = ctx.sgpr_limit; unsigned size = rc.size(); @@ -687,9 +681,10 @@ enum UpdateRenames { }; MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames); -void update_renames(ra_ctx& ctx, RegisterFile& reg_file, - std::vector>& parallelcopies, - aco_ptr& instr, UpdateRenames flags) +void +update_renames(ra_ctx& ctx, RegisterFile& reg_file, + std::vector>& parallelcopies, + aco_ptr& instr, UpdateRenames flags) { /* clear operands */ for (std::pair& copy : parallelcopies) { @@ -765,9 +760,9 @@ void update_renames(ra_ctx& ctx, RegisterFile& reg_file, bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef(); for (std::pair& pc : parallelcopies) { PhysReg def_reg = pc.second.physReg(); - omit_renaming &= def_reg > copy.first.physReg() ? - (copy.first.physReg() + copy.first.size() <= def_reg.reg()) : - (def_reg + pc.second.size() <= copy.first.physReg().reg()); + omit_renaming &= def_reg > copy.first.physReg() + ? (copy.first.physReg() + copy.first.size() <= def_reg.reg()) + : (def_reg + pc.second.size() <= copy.first.physReg().reg()); } if (omit_renaming) { if (first) @@ -791,9 +786,8 @@ void update_renames(ra_ctx& ctx, RegisterFile& reg_file, } } -std::pair get_reg_simple(ra_ctx& ctx, - RegisterFile& reg_file, - DefInfo info) +std::pair +get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) { const PhysRegInterval& bounds = info.bounds; uint32_t size = info.size; @@ -811,15 +805,18 @@ std::pair get_reg_simple(ra_ctx& ctx, return res; } - auto is_free = [&](PhysReg reg_index) { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; }; + auto is_free = [&](PhysReg reg_index) + { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; }; if (stride == 1) { /* best fit algorithm: find the smallest gap to fit in the variable */ - PhysRegInterval best_gap { PhysReg { 0 }, UINT_MAX }; - const unsigned max_gpr = (rc.type() == RegType::vgpr) ? 
(256 + ctx.max_used_vgpr) : ctx.max_used_sgpr; + PhysRegInterval best_gap{PhysReg{0}, UINT_MAX}; + const unsigned max_gpr = + (rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr; PhysRegIterator reg_it = bounds.begin(); - const PhysRegIterator end_it = std::min(bounds.end(), std::max(PhysRegIterator { PhysReg { max_gpr + 1 } }, reg_it)); + const PhysRegIterator end_it = + std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it)); while (reg_it != bounds.end()) { /* Find the next chunk of available register slots */ reg_it = std::find_if(reg_it, end_it, is_free); @@ -859,14 +856,15 @@ std::pair get_reg_simple(ra_ctx& ctx, if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) || ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) || ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0)) - best_gap = { PhysReg { best_gap.lo() + buffer }, best_gap.size - buffer }; + best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer}; } adjust_max_used_regs(ctx, rc, best_gap.lo()); return {best_gap.lo(), true}; } - for (PhysRegInterval reg_win = { bounds.lo(), size }; reg_win.hi() <= bounds.hi(); reg_win += stride) { + for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi(); + reg_win += stride) { if (reg_file[reg_win.lo()] != 0) { continue; } @@ -887,14 +885,15 @@ std::pair get_reg_simple(ra_ctx& ctx, if (!bounds.contains(PhysReg{entry.first})) continue; - for (unsigned i = 0; i < 4; i+= info.stride) { + for (unsigned i = 0; i < 4; i += info.stride) { /* check if there's a block of free bytes large enough to hold the register */ - bool reg_found = std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())], - [](unsigned v) { return v == 0; }); + bool reg_found = + std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())], + [](unsigned v) { return v == 0; }); /* check if also the neighboring reg is free if needed */ if (reg_found && i + rc.bytes() > 4) - reg_found = (reg_file[PhysReg{entry.first + 1}] == 0); + reg_found = (reg_file[PhysReg{entry.first + 1}] == 0); if (reg_found) { PhysReg res{entry.first}; @@ -910,8 +909,8 @@ std::pair get_reg_simple(ra_ctx& ctx, } /* collect variables from a register area and clear reg_file */ -std::set> find_vars(ra_ctx& ctx, RegisterFile& reg_file, - const PhysRegInterval reg_interval) +std::set> +find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) { std::set> vars; for (PhysReg j : reg_interval) { @@ -935,8 +934,8 @@ std::set> find_vars(ra_ctx& ctx, RegisterFile& reg } /* collect variables from a register area and clear reg_file */ -std::set> collect_vars(ra_ctx& ctx, RegisterFile& reg_file, - const PhysRegInterval reg_interval) +std::set> +collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) { std::set> vars = find_vars(ctx, reg_file, reg_interval); for (std::pair size_id : vars) { @@ -946,17 +945,18 @@ std::set> collect_vars(ra_ctx& ctx, RegisterFile& return vars; } -bool get_regs_for_copies(ra_ctx& ctx, - RegisterFile& reg_file, - std::vector>& parallelcopies, - const std::set> &vars, - const PhysRegInterval bounds, - aco_ptr& instr, - const PhysRegInterval def_reg) +bool +get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, + std::vector>& parallelcopies, + const std::set>& vars, + const PhysRegInterval bounds, aco_ptr& instr, + const PhysRegInterval def_reg) { /* variables are sorted from small sized to large */ - /* NOTE: 
variables are also sorted by ID. this only affects a very small number of shaders slightly though. */ - for (std::set>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) { + /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders + * slightly though. */ + for (std::set>::const_reverse_iterator it = vars.rbegin(); + it != vars.rend(); ++it) { unsigned id = it->second; assignment& var = ctx.assignments[id]; DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1); @@ -980,7 +980,8 @@ bool get_regs_for_copies(ra_ctx& ctx, PhysReg reg(def_reg.lo()); for (unsigned i = 0; i < instr->operands.size(); i++) { if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { - res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && !reg_file.test(reg, var.rc.bytes())}; + res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && + !reg_file.test(reg, var.rc.bytes())}; break; } reg.reg_b += instr->operands[i].bytes(); @@ -1021,8 +1022,8 @@ bool get_regs_for_copies(ra_ctx& ctx, /* we use a sliding window to find potential positions */ unsigned stride = var.rc.is_subdword() ? 1 : info.stride; - for (PhysRegInterval reg_win { bounds.lo(), size }; - reg_win.hi() <= bounds.hi(); reg_win += stride) { + for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi(); + reg_win += stride) { if (!is_dead_operand && intersects(reg_win, def_reg)) continue; @@ -1082,7 +1083,7 @@ bool get_regs_for_copies(ra_ctx& ctx, if (num_moves == 0xFF) return false; - PhysRegInterval reg_win { best_pos, size }; + PhysRegInterval reg_win{best_pos, size}; /* collect variables and block reg file */ std::set> new_vars = collect_vars(ctx, reg_file, reg_win); @@ -1105,12 +1106,10 @@ bool get_regs_for_copies(ra_ctx& ctx, return true; } - -std::pair get_reg_impl(ra_ctx& ctx, - RegisterFile& reg_file, - std::vector>& parallelcopies, - const DefInfo& info, - aco_ptr& instr) +std::pair +get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, + std::vector>& parallelcopies, const DefInfo& info, + aco_ptr& instr) { const PhysRegInterval& bounds = info.bounds; uint32_t size = info.size; @@ -1125,9 +1124,7 @@ std::pair get_reg_impl(ra_ctx& ctx, std::bitset<256> is_killed_operand; /* per-register */ for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) { Operand& op = instr->operands[j]; - if (op.isTemp() && - op.isFirstKillBeforeDef() && - bounds.contains(op.physReg()) && + if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) && !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) { assert(op.isFixed()); @@ -1147,12 +1144,13 @@ std::pair get_reg_impl(ra_ctx& ctx, op_moves = size - (regs_free - killed_ops); /* find the best position to place the definition */ - PhysRegInterval best_win = { bounds.lo(), size }; + PhysRegInterval best_win = {bounds.lo(), size}; unsigned num_moves = 0xFF; unsigned num_vars = 0; /* we use a sliding window to check potential positions */ - for (PhysRegInterval reg_win = { bounds.lo(), size }; reg_win.hi() <= bounds.hi(); reg_win += stride) { + for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi(); + reg_win += stride) { /* first check if the register window starts in the middle of an * allocated variable: this is what we have to fix to allow for * num_moves > size */ @@ -1232,12 +1230,10 @@ std::pair get_reg_impl(ra_ctx& ctx, * or which are in the definition space */ PhysReg reg = best_win.lo(); for (Operand& op 
: instr->operands) { - if (op.isTemp() && op.isFirstKillBeforeDef() && - op.getTemp().type() == rc.type()) { - if (op.physReg() != reg && - (ctx.program->chip_class >= GFX9 || - (op.physReg().advance(op.bytes()) > best_win.lo() && - op.physReg() < best_win.hi()))) { + if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) { + if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 || + (op.physReg().advance(op.bytes()) > best_win.lo() && + op.physReg() < best_win.hi()))) { vars.emplace(op.bytes(), op.tempId()); tmp_file.clear(op); } else { @@ -1264,11 +1260,9 @@ std::pair get_reg_impl(ra_ctx& ctx, return {best_win.lo(), true}; } -bool get_reg_specified(ra_ctx& ctx, - RegisterFile& reg_file, - RegClass rc, - aco_ptr& instr, - PhysReg reg) +bool +get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr& instr, + PhysReg reg) { /* catch out-of-range registers */ if (reg >= PhysReg{512}) @@ -1286,9 +1280,9 @@ bool get_reg_specified(ra_ctx& ctx, if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0) return false; - PhysRegInterval reg_win = { reg, rc.size() }; + PhysRegInterval reg_win = {reg, rc.size()}; PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type()); - PhysRegInterval vcc_win = { vcc, 2 }; + PhysRegInterval vcc_win = {vcc, 2}; /* VCC is outside the bounds */ bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win); bool is_m0 = rc == s1 && reg == m0; @@ -1309,11 +1303,15 @@ bool get_reg_specified(ra_ctx& ctx, return true; } -bool increase_register_file(ra_ctx& ctx, RegType type) { +bool +increase_register_file(ra_ctx& ctx, RegType type) +{ if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) { - update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); + update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, + ctx.program->max_reg_demand.sgpr)); } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) { - update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1)); + update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, + ctx.program->max_reg_demand.sgpr + 1)); } else { return false; } @@ -1338,9 +1336,9 @@ struct IDAndInfo { * one. If one of the variables has 0xffffffff as an ID, the register assigned * for that variable will be returned. */ -PhysReg compact_relocate_vars(ra_ctx& ctx, const std::vector& vars, - std::vector>& parallelcopies, - PhysReg start) +PhysReg +compact_relocate_vars(ra_ctx& ctx, const std::vector& vars, + std::vector>& parallelcopies, PhysReg start) { /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword * temporary sizes to dwords. @@ -1351,18 +1349,21 @@ PhysReg compact_relocate_vars(ra_ctx& ctx, const std::vector& var sorted.emplace_back(var.id, info); } - std::sort(sorted.begin(), sorted.end(), [&ctx](const IDAndInfo& a, - const IDAndInfo& b) { - unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4); - unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 
1 : 4); - if (a_stride > b_stride) - return true; - if (a_stride < b_stride) - return false; - if (a.id == 0xffffffff || b.id == 0xffffffff) - return a.id == 0xffffffff; /* place 0xffffffff before others if possible, not for any reason */ - return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg; - }); + std::sort( + sorted.begin(), sorted.end(), + [&ctx](const IDAndInfo& a, const IDAndInfo& b) + { + unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4); + unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4); + if (a_stride > b_stride) + return true; + if (a_stride < b_stride) + return false; + if (a.id == 0xffffffff || b.id == 0xffffffff) + return a.id == + 0xffffffff; /* place 0xffffffff before others if possible, not for any reason */ + return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg; + }); PhysReg next_reg = start; PhysReg space_reg; @@ -1395,7 +1396,8 @@ PhysReg compact_relocate_vars(ra_ctx& ctx, const std::vector& var return space_reg; } -bool is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction *instr) +bool +is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr) { PhysReg first{512}; for (unsigned i = 0; i < instr->operands.size() - 3u; i++) { @@ -1424,10 +1426,8 @@ bool is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction *inst return true; } -std::pair get_reg_vector(ra_ctx& ctx, - RegisterFile& reg_file, - Temp temp, - aco_ptr& instr) +std::pair +get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr& instr) { Instruction* vec = ctx.vectors[temp.id()]; unsigned first_operand = vec->format == Format::MIMG ? 3 : 0; @@ -1448,9 +1448,7 @@ std::pair get_reg_vector(ra_ctx& ctx, */ for (unsigned i = first_operand; i < vec->operands.size(); i++) { Operand& op = vec->operands[i]; - if (op.isTemp() && - op.tempId() != temp.id() && - op.getTemp().type() == temp.type() && + if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() && ctx.assignments[op.tempId()].assigned) { PhysReg reg = ctx.assignments[op.tempId()].reg; reg.reg_b += (our_offset - their_offset); @@ -1477,12 +1475,10 @@ std::pair get_reg_vector(ra_ctx& ctx, return {{}, false}; } -PhysReg get_reg(ra_ctx& ctx, - RegisterFile& reg_file, - Temp temp, - std::vector>& parallelcopies, - aco_ptr& instr, - int operand_index=-1) +PhysReg +get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, + std::vector>& parallelcopies, aco_ptr& instr, + int operand_index = -1) { auto split_vec = ctx.split_vectors.find(temp.id()); if (split_vec != ctx.split_vectors.end()) { @@ -1581,11 +1577,10 @@ PhysReg get_reg(ra_ctx& ctx, return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index); } -PhysReg get_reg_create_vector(ra_ctx& ctx, - RegisterFile& reg_file, - Temp temp, - std::vector>& parallelcopies, - aco_ptr& instr) +PhysReg +get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, + std::vector>& parallelcopies, + aco_ptr& instr) { RegClass rc = temp.regClass(); /* create_vector instructions have different costs w.r.t. 
register coalescing */ @@ -1594,16 +1589,18 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, uint32_t stride = get_stride(rc); PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type()); - //TODO: improve p_create_vector for sub-dword vectors + // TODO: improve p_create_vector for sub-dword vectors - PhysReg best_pos { 0xFFF }; + PhysReg best_pos{0xFFF}; unsigned num_moves = 0xFF; bool best_war_hint = true; /* test for each operand which definition placement causes the least shuffle instructions */ - for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) { + for (unsigned i = 0, offset = 0; i < instr->operands.size(); + offset += instr->operands[i].bytes(), i++) { // TODO: think about, if we can alias live operands on the same register - if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type()) + if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || + instr->operands[i].getTemp().type() != rc.type()) continue; if (offset > instr->operands[i].physReg().reg_b) @@ -1612,7 +1609,7 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, unsigned reg_lower = instr->operands[i].physReg().reg_b - offset; if (reg_lower % 4) continue; - PhysRegInterval reg_win = { PhysReg { reg_lower / 4 }, size }; + PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size}; unsigned k = 0; /* no need to check multiple times */ @@ -1623,9 +1620,11 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, // TODO: this can be improved */ if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0) continue; - if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 && reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1))) + if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 && + reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1))) continue; - if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 && reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi())) + if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 && + reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi())) continue; /* count variables to be moved and check war_hint */ @@ -1656,9 +1655,9 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, continue; /* count operands in wrong positions */ - for (unsigned j = 0, offset2 = 0; j < instr->operands.size(); offset2 += instr->operands[j].bytes(), j++) { - if (j == i || - !instr->operands[j].isTemp() || + for (unsigned j = 0, offset2 = 0; j < instr->operands.size(); + offset2 += instr->operands[j].bytes(), j++) { + if (j == i || !instr->operands[j].isTemp() || instr->operands[j].getTemp().type() != rc.type()) continue; if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2) @@ -1678,17 +1677,19 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, /* re-enable killed operands which are in the wrong position */ RegisterFile tmp_file(reg_file); - for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) { - if (instr->operands[i].isTemp() && - instr->operands[i].isFirstKillBeforeDef() && + for (unsigned i = 0, offset = 0; i < instr->operands.size(); + offset += instr->operands[i].bytes(), i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() && instr->operands[i].physReg().reg_b != best_pos.reg_b + offset) tmp_file.fill(instr->operands[i]); } /* collect variables to be moved */ - 
std::set> vars = collect_vars(ctx, tmp_file, PhysRegInterval { best_pos, size }); + std::set> vars = + collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); - for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) { + for (unsigned i = 0, offset = 0; i < instr->operands.size(); + offset += instr->operands[i].bytes(), i++) { if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type()) continue; @@ -1700,14 +1701,15 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, if (ctx.program->chip_class >= GFX9 && !correct_pos) { vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId()); tmp_file.clear(instr->operands[i]); - /* fill operands which are in the correct position to avoid overwriting */ + /* fill operands which are in the correct position to avoid overwriting */ } else if (correct_pos) { tmp_file.fill(instr->operands[i]); } } bool success = false; std::vector> pc; - success = get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval { best_pos, size }); + success = + get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size}); if (!success) { if (!increase_register_file(ctx, temp.type())) { @@ -1723,9 +1725,8 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, return best_pos; } -void handle_pseudo(ra_ctx& ctx, - const RegisterFile& reg_file, - Instruction* instr) +void +handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) { if (instr->format != Format::PSEUDO) return; @@ -1736,10 +1737,8 @@ void handle_pseudo(ra_ctx& ctx, case aco_opcode::p_create_vector: case aco_opcode::p_split_vector: case aco_opcode::p_parallelcopy: - case aco_opcode::p_wqm: - break; - default: - return; + case aco_opcode::p_wqm: break; + default: return; } /* if all definitions are vgpr, no need to care for SCC */ @@ -1761,8 +1760,8 @@ void handle_pseudo(ra_ctx& ctx, if (op.isTemp() && op.regClass().is_subdword()) reads_subdword = true; } - bool needs_scratch_reg = (writes_sgpr && reads_sgpr) || - (ctx.program->chip_class <= GFX7 && reads_subdword); + bool needs_scratch_reg = + (writes_sgpr && reads_sgpr) || (ctx.program->chip_class <= GFX7 && reads_subdword); if (!needs_scratch_reg) return; @@ -1789,7 +1788,9 @@ void handle_pseudo(ra_ctx& ctx, } } -bool operand_can_use_reg(chip_class chip, aco_ptr& instr, unsigned idx, PhysReg reg, RegClass rc) +bool +operand_can_use_reg(chip_class chip, aco_ptr& instr, unsigned idx, PhysReg reg, + RegClass rc) { if (instr->operands[idx].isFixed()) return instr->operands[idx].physReg() == reg; @@ -1798,9 +1799,9 @@ bool operand_can_use_reg(chip_class chip, aco_ptr& instr, unsigned instr->opcode == aco_opcode::v_writelane_b32_e64; if (chip <= GFX9 && is_writelane && idx <= 1) { /* v_writelane_b32 can take two sgprs but only if one is m0. 
*/ - bool is_other_sgpr = instr->operands[!idx].isTemp() && - (!instr->operands[!idx].isFixed() || - instr->operands[!idx].physReg() != m0); + bool is_other_sgpr = + instr->operands[!idx].isTemp() && + (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0); if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) { instr->operands[idx].setFixed(m0); return reg == m0; @@ -1815,19 +1816,20 @@ bool operand_can_use_reg(chip_class chip, aco_ptr& instr, unsigned switch (instr->format) { case Format::SMEM: - return reg != scc && - reg != exec && + return reg != scc && reg != exec && (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */ - (reg != vcc || (instr->definitions.empty() && idx == 2) || chip >= GFX10); /* sdata can be vcc */ + (reg != vcc || (instr->definitions.empty() && idx == 2) || + chip >= GFX10); /* sdata can be vcc */ default: // TODO: there are more instructions with restrictions on registers return true; } } -void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, - std::vector>& parallelcopy, - aco_ptr& instr, Operand& operand, unsigned operand_index) +void +get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, + std::vector>& parallelcopy, + aco_ptr& instr, Operand& operand, unsigned operand_index) { /* check if the operand is fixed */ PhysReg src = ctx.assignments[operand.tempId()].reg; @@ -1841,31 +1843,34 @@ void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, RegisterFile tmp_file(register_file); - std::set> blocking_vars = collect_vars(ctx, tmp_file, target); + std::set> blocking_vars = + collect_vars(ctx, tmp_file, target); - tmp_file.clear(src, operand.regClass()); //TODO: try to avoid moving block vars to src + tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src tmp_file.block(operand.physReg(), operand.regClass()); DefInfo info(ctx, instr, operand.regClass(), -1); - get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr, PhysRegInterval()); + get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr, + PhysRegInterval()); } dst = operand.physReg(); } else { dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index); - update_renames(ctx, register_file, parallelcopy, instr, - instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops : (UpdateRenames)0); + update_renames( + ctx, register_file, parallelcopy, instr, + instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops : (UpdateRenames)0); } Operand pc_op = operand; pc_op.setFixed(src); Definition pc_def = Definition(dst, pc_op.regClass()); parallelcopy.emplace_back(pc_op, pc_def); - update_renames(ctx, register_file, parallelcopy, instr, - rename_not_killed_ops | fill_killed_ops); + update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops); } -Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) +Temp +read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) { std::unordered_map::iterator it = ctx.renames[block_idx].find(val.id()); if (it == ctx.renames[block_idx].end()) @@ -1874,7 +1879,8 @@ Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) return it->second; } -Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) +Temp +handle_live_in(ra_ctx& ctx, Temp val, Block* block) { std::vector& preds = val.is_linear() ? 
block->linear_preds : block->logical_preds; if (preds.size() == 0 || val.regClass() == val.regClass().as_linear()) @@ -1886,7 +1892,7 @@ Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) } /* there are multiple predecessors and the block is sealed */ - Temp *const ops = (Temp *)alloca(preds.size() * sizeof(Temp)); + Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp)); /* get the rename from each predecessor and check if they are the same */ Temp new_val; @@ -1902,7 +1908,8 @@ Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) if (needs_phi) { /* the variable has been renamed differently in the predecessors: we need to insert a phi */ aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; - aco_ptr phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{ + create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; new_val = ctx.program->allocateTmp(val.regClass()); phi->definitions[0] = Definition(new_val); for (unsigned i = 0; i < preds.size(); i++) { @@ -1921,8 +1928,9 @@ Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) return new_val; } -void handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, - uint32_t loop_header_idx, uint32_t loop_exit_idx) +void +handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx, + uint32_t loop_exit_idx) { Block& loop_header = ctx.program->blocks[loop_header_idx]; std::unordered_map renames; @@ -1963,9 +1971,8 @@ void handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, aco_ptr& phi = loop_header.instructions[i]; if (!is_phi(phi)) break; - const std::vector& preds = phi->opcode == aco_opcode::p_phi ? - loop_header.logical_preds : - loop_header.linear_preds; + const std::vector& preds = + phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds; for (unsigned j = 1; j < phi->operands.size(); j++) { Operand& op = phi->operands[j]; if (!op.isTemp()) @@ -2016,7 +2023,8 @@ void handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, * Reg-to-reg moves (renames) from previous blocks are taken into account and * the SSA is repaired by inserting corresponding phi-nodes. */ -RegisterFile init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_block, Block& block) +RegisterFile +init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_block, Block& block) { if (block.kind & block_kind_loop_exit) { uint32_t header = ctx.loop_header.back(); @@ -2054,9 +2062,8 @@ RegisterFile init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_b for (aco_ptr& instr : block.instructions) { if (!is_phi(instr)) break; - const std::vector& preds = instr->opcode == aco_opcode::p_phi ? - block.logical_preds : - block.linear_preds; + const std::vector& preds = + instr->opcode == aco_opcode::p_phi ? 
block.logical_preds : block.linear_preds; for (unsigned i = 0; i < instr->operands.size(); i++) { Operand& operand = instr->operands[i]; @@ -2084,12 +2091,14 @@ RegisterFile init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_b return register_file; } -void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) +void +get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) { std::vector> phi_ressources; std::unordered_map temp_to_phi_ressources; - for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend(); block_rit++) { + for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend(); + block_rit++) { Block& block = *block_rit; /* first, compute the death points of all live vars within the block */ @@ -2109,7 +2118,8 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) affinity_related.emplace_back(instr->definitions[0].getTemp()); affinity_related.emplace_back(instr->definitions[0].getTemp()); for (const Operand& op : instr->operands) { - if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) { + if (op.isTemp() && op.isKill() && + op.regClass() == instr->definitions[0].regClass()) { affinity_related.emplace_back(op.getTemp()); temp_to_phi_ressources[op.tempId()] = phi_ressources.size(); } @@ -2119,7 +2129,8 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) /* add vector affinities */ if (instr->opcode == aco_opcode::p_create_vector) { for (const Operand& op : instr->operands) { - if (op.isTemp() && op.isFirstKill() && op.getTemp().type() == instr->definitions[0].getTemp().type()) + if (op.isTemp() && op.isFirstKill() && + op.getTemp().type() == instr->definitions[0].getTemp().type()) ctx.vectors[op.tempId()] = instr.get(); } } else if (instr->format == Format::MIMG && instr->operands.size() > 4) { @@ -2127,7 +2138,8 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) ctx.vectors[instr->operands[i].tempId()] = instr.get(); } - if (instr->opcode == aco_opcode::p_split_vector && instr->operands[0].isFirstKillBeforeDef()) + if (instr->opcode == aco_opcode::p_split_vector && + instr->operands[0].isFirstKillBeforeDef()) ctx.split_vectors[instr->operands[0].tempId()] = instr.get(); /* add operands to live variables */ @@ -2144,28 +2156,26 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) continue; live.erase(def.tempId()); /* mark last-seen phi operand */ - std::unordered_map::iterator it = temp_to_phi_ressources.find(def.tempId()); - if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) { + std::unordered_map::iterator it = + temp_to_phi_ressources.find(def.tempId()); + if (it != temp_to_phi_ressources.end() && + def.regClass() == phi_ressources[it->second][0].regClass()) { phi_ressources[it->second][0] = def.getTemp(); /* try to coalesce phi affinities with parallelcopies */ Operand op = Operand(); switch (instr->opcode) { - case aco_opcode::p_parallelcopy: - op = instr->operands[i]; - break; + case aco_opcode::p_parallelcopy: op = instr->operands[i]; break; case aco_opcode::v_interp_p2_f32: case aco_opcode::v_writelane_b32: - case aco_opcode::v_writelane_b32_e64: - op = instr->operands[2]; - break; + case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break; case aco_opcode::v_fma_f32: case aco_opcode::v_fma_f16: case aco_opcode::v_pk_fma_f16: if (ctx.program->chip_class < GFX10) continue; - FALLTHROUGH; + FALLTHROUGH; case aco_opcode::v_mad_f32: case 
aco_opcode::v_mad_f16: if (instr->usesModifiers()) @@ -2173,8 +2183,7 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) op = instr->operands[2]; break; - default: - continue; + default: continue; } if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) { @@ -2196,8 +2205,8 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) } /* end namespace */ - -void register_allocation(Program *program, std::vector& live_out_per_block, ra_test_policy policy) +void +register_allocation(Program* program, std::vector& live_out_per_block, ra_test_policy policy) { ra_ctx ctx(program, policy); get_affinities(ctx, live_out_per_block); @@ -2217,22 +2226,26 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc * We consider them incomplete phis and only handle the definition. */ /* look up the affinities */ - for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); ++instr_it) { + for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); + ++instr_it) { aco_ptr& phi = *instr_it; if (!is_phi(phi)) break; Definition& definition = phi->definitions[0]; if (definition.isKill() || definition.isFixed()) - continue; + continue; if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() && ctx.assignments[ctx.affinities[definition.tempId()]].assigned) { - assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == definition.regClass()); + assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == + definition.regClass()); PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg; if (reg == scc) { /* only use scc if all operands are already placed there */ - bool use_scc = std::all_of(phi->operands.begin(), phi->operands.end(), - [] (const Operand& op) { return op.isTemp() && op.isFixed() && op.physReg() == scc;}); + bool use_scc = + std::all_of(phi->operands.begin(), phi->operands.end(), + [](const Operand& op) + { return op.isTemp() && op.isFixed() && op.physReg() == scc; }); if (!use_scc) continue; } @@ -2247,7 +2260,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } /* find registers for phis without affinity or where the register was blocked */ - for (instr_it = block.instructions.begin();instr_it != block.instructions.end(); ++instr_it) { + for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); + ++instr_it) { aco_ptr& phi = *instr_it; if (!is_phi(phi)) break; @@ -2274,16 +2288,18 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } } if (!definition.isFixed()) { - definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi)); + definition.setFixed( + get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi)); update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops); } /* process parallelcopy */ for (std::pair pc : parallelcopy) { /* see if it's a copy from a different phi */ - //TODO: prefer moving some previous phis over live-ins - //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec) - Instruction *prev_phi = NULL; + // TODO: prefer moving some previous phis over live-ins + // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a + // problem in practice since they can only be fixed to exec) + Instruction* prev_phi = NULL; std::vector>::iterator phi_it; for (phi_it = instructions.begin(); 
phi_it != instructions.end(); ++phi_it) { if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) @@ -2298,13 +2314,15 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* if so, just update that phi's register */ register_file.clear(prev_phi->definitions[0]); prev_phi->definitions[0].setFixed(pc.second.physReg()); - ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()}; + ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), + pc.second.regClass()}; register_file.fill(prev_phi->definitions[0]); continue; } /* rename */ - std::unordered_map::iterator orig_it = ctx.orig_names.find(pc.first.tempId()); + std::unordered_map::iterator orig_it = + ctx.orig_names.find(pc.first.tempId()); Temp orig = pc.first.getTemp(); if (orig_it != ctx.orig_names.end()) orig = orig_it->second; @@ -2314,9 +2332,12 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* otherwise, this is a live-in and we need to create a new phi * to move it in this block's predecessors */ - aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; - std::vector& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; - aco_ptr new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + aco_opcode opcode = + pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + std::vector& preds = + pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; + aco_ptr new_phi{ + create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; new_phi->definitions[0] = pc.second; for (unsigned i = 0; i < preds.size(); i++) new_phi->operands[i] = Operand(pc.first); @@ -2370,7 +2391,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc if (phi->operands[idx].isTemp() && phi->operands[idx].getTemp().type() == RegType::sgpr && phi->operands[idx].isFirstKillBeforeDef()) { - Definition phi_op(read_variable(ctx, phi->operands[idx].getTemp(), block.index)); + Definition phi_op( + read_variable(ctx, phi->operands[idx].getTemp(), block.index)); phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg); register_file.clear(phi_op); } @@ -2404,8 +2426,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc else get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i); - if (instr->isEXP() || - (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) || + if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) || (instr->isDS() && instr->ds().gds)) { for (unsigned j = 0; j < operand.size(); j++) ctx.war_hint.set(operand.physReg().reg() + j); @@ -2425,14 +2446,10 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc instr->opcode == aco_opcode::v_mad_legacy_f16 || (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) || (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10)) && - instr->operands[2].isTemp() && - instr->operands[2].isKillBeforeDef() && - instr->operands[2].getTemp().type() == RegType::vgpr && - instr->operands[1].isTemp() && - instr->operands[1].getTemp().type() == RegType::vgpr && - !instr->usesModifiers() && - instr->operands[0].physReg().byte() == 0 && - instr->operands[1].physReg().byte() == 0 && + instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() && + instr->operands[2].getTemp().type() == RegType::vgpr && 
instr->operands[1].isTemp() && + instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() && + instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 && instr->operands[2].physReg().byte() == 0) { unsigned def_id = instr->definitions[0].tempId(); auto it = ctx.affinities.find(def_id); @@ -2441,34 +2458,21 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) { instr->format = Format::VOP2; switch (instr->opcode) { - case aco_opcode::v_mad_f32: - instr->opcode = aco_opcode::v_mac_f32; - break; - case aco_opcode::v_fma_f32: - instr->opcode = aco_opcode::v_fmac_f32; - break; + case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break; + case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break; case aco_opcode::v_mad_f16: - case aco_opcode::v_mad_legacy_f16: - instr->opcode = aco_opcode::v_mac_f16; - break; - case aco_opcode::v_fma_f16: - instr->opcode = aco_opcode::v_fmac_f16; - break; - case aco_opcode::v_pk_fma_f16: - instr->opcode = aco_opcode::v_pk_fmac_f16; - break; - default: - break; + case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break; + case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break; + case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break; + default: break; } } } /* handle definitions which must have the same register as an operand */ if (instr->opcode == aco_opcode::v_interp_p2_f32 || - instr->opcode == aco_opcode::v_mac_f32 || - instr->opcode == aco_opcode::v_fmac_f32 || - instr->opcode == aco_opcode::v_mac_f16 || - instr->opcode == aco_opcode::v_fmac_f16 || + instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 || + instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) { @@ -2476,12 +2480,10 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32) { instr->definitions[0].setFixed(instr->operands[0].physReg()); - } else if (instr->isMUBUF() && - instr->definitions.size() == 1 && + } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) { instr->definitions[0].setFixed(instr->operands[3].physReg()); - } else if (instr->isMIMG() && - instr->definitions.size() == 1 && + } else if (instr->isMIMG() && instr->definitions.size() == 1 && !instr->operands[2].isUndefined()) { instr->definitions[0].setFixed(instr->operands[2].physReg()); } @@ -2497,10 +2499,11 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); /* check if the target register is blocked */ if (register_file.test(definition.physReg(), definition.bytes())) { - const PhysRegInterval def_regs { definition.physReg(), definition.size() }; + const PhysRegInterval def_regs{definition.physReg(), definition.size()}; /* create parallelcopy pair to move blocking vars */ - std::set> vars = collect_vars(ctx, register_file, def_regs); + std::set> vars = + collect_vars(ctx, register_file, def_regs); RegisterFile tmp_file(register_file); /* re-enable the killed operands, so that we don't move the blocking vars there */ @@ -2511,8 
+2514,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc ASSERTED bool success = false; DefInfo info(ctx, instr, definition.regClass(), -1); - success = get_regs_for_copies(ctx, tmp_file, parallelcopy, - vars, info.bounds, instr, + success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr, def_regs); assert(success); @@ -2529,13 +2531,15 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* handle all other definitions */ for (unsigned i = 0; i < instr->definitions.size(); ++i) { - Definition *definition = &instr->definitions[i]; + Definition* definition = &instr->definitions[i]; if (definition->isFixed() || !definition->isTemp()) continue; /* find free reg */ - if (definition->hasHint() && get_reg_specified(ctx, register_file, definition->regClass(), instr, definition->physReg())) { + if (definition->hasHint() && + get_reg_specified(ctx, register_file, definition->regClass(), instr, + definition->physReg())) { definition->setFixed(definition->physReg()); } else if (instr->opcode == aco_opcode::p_split_vector) { PhysReg reg = instr->operands[0].physReg(); @@ -2543,7 +2547,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc reg.reg_b += instr->definitions[j].bytes(); if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg)) definition->setFixed(reg); - } else if (instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_parallelcopy) { + } else if (instr->opcode == aco_opcode::p_wqm || + instr->opcode == aco_opcode::p_parallelcopy) { PhysReg reg = instr->operands[i].physReg(); if (instr->operands[i].isTemp() && instr->operands[i].getTemp().type() == definition->getTemp().type() && @@ -2568,17 +2573,21 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc definition->setFixed(reg); if (reg.byte() || register_file.test(reg, 4)) { add_subdword_definition(program, instr, i, reg); - definition = &instr->definitions[i]; /* add_subdword_definition can invalidate the reference */ + definition = &instr->definitions[i]; /* add_subdword_definition can invalidate + the reference */ } } else { definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr)); } update_renames(ctx, register_file, parallelcopy, instr, - instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops : (UpdateRenames)0); + instr->opcode != aco_opcode::p_create_vector ? 
rename_not_killed_ops + : (UpdateRenames)0); } - assert(definition->isFixed() && ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) || - (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256))); + assert( + definition->isFixed() && + ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) || + (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256))); ctx.defs_done.set(i); ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()}; register_file.fill(*definition); @@ -2586,10 +2595,11 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc handle_pseudo(ctx, register_file, instr.get()); - /* kill definitions and late-kill operands and ensure that sub-dword operands can actually be read */ + /* kill definitions and late-kill operands and ensure that sub-dword operands can actually + * be read */ for (const Definition& def : instr->definitions) { - if (def.isTemp() && def.isKill()) - register_file.clear(def); + if (def.isTemp() && def.isKill()) + register_file.clear(def); } for (unsigned i = 0; i < instr->operands.size(); i++) { const Operand& op = instr->operands[i]; @@ -2602,11 +2612,14 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* emit parallelcopy */ if (!parallelcopy.empty()) { aco_ptr pc; - pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size())); + pc.reset(create_instruction(aco_opcode::p_parallelcopy, + Format::PSEUDO, parallelcopy.size(), + parallelcopy.size())); bool sgpr_operands_alias_defs = false; uint64_t sgpr_operands[4] = {0, 0, 0, 0}; for (unsigned i = 0; i < parallelcopy.size(); i++) { - if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) { + if (temp_in_scc && parallelcopy[i].first.isTemp() && + parallelcopy[i].first.getTemp().type() == RegType::sgpr) { if (!sgpr_operands_alias_defs) { unsigned reg = parallelcopy[i].first.physReg().reg(); unsigned size = parallelcopy[i].first.getTemp().size(); @@ -2623,8 +2636,10 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc pc->definitions[i] = parallelcopy[i].second; assert(pc->operands[i].size() == pc->definitions[i].size()); - /* it might happen that the operand is already renamed. we have to restore the original name. */ - std::unordered_map::iterator it = ctx.orig_names.find(pc->operands[i].tempId()); + /* it might happen that the operand is already renamed. we have to restore the + * original name. */ + std::unordered_map::iterator it = + ctx.orig_names.find(pc->operands[i].tempId()); Temp orig = it != ctx.orig_names.end() ? 
it->second : pc->operands[i].getTemp(); ctx.orig_names[pc->definitions[i].tempId()] = orig; ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp(); @@ -2651,24 +2666,27 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */ - bool instr_needs_vop3 = !instr->isVOP3() && - ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || - (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) || - ((instr->opcode == aco_opcode::v_add_co_u32 || - instr->opcode == aco_opcode::v_addc_co_u32 || - instr->opcode == aco_opcode::v_sub_co_u32 || - instr->opcode == aco_opcode::v_subb_co_u32 || - instr->opcode == aco_opcode::v_subrev_co_u32 || - instr->opcode == aco_opcode::v_subbrev_co_u32) && - !(instr->definitions[1].physReg() == vcc)) || - ((instr->opcode == aco_opcode::v_addc_co_u32 || - instr->opcode == aco_opcode::v_subb_co_u32 || - instr->opcode == aco_opcode::v_subbrev_co_u32) && - !(instr->operands[2].physReg() == vcc))); + bool instr_needs_vop3 = + !instr->isVOP3() && + ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || + (instr->opcode == aco_opcode::v_cndmask_b32 && + !(instr->operands[2].physReg() == vcc)) || + ((instr->opcode == aco_opcode::v_add_co_u32 || + instr->opcode == aco_opcode::v_addc_co_u32 || + instr->opcode == aco_opcode::v_sub_co_u32 || + instr->opcode == aco_opcode::v_subb_co_u32 || + instr->opcode == aco_opcode::v_subrev_co_u32 || + instr->opcode == aco_opcode::v_subbrev_co_u32) && + !(instr->definitions[1].physReg() == vcc)) || + ((instr->opcode == aco_opcode::v_addc_co_u32 || + instr->opcode == aco_opcode::v_subb_co_u32 || + instr->opcode == aco_opcode::v_subbrev_co_u32) && + !(instr->operands[2].physReg() == vcc))); if (instr_needs_vop3) { /* if the first operand is a literal, we have to move it to a reg */ - if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) { + if (instr->operands.size() && instr->operands[0].isLiteral() && + program->chip_class < GFX10) { bool can_sgpr = true; /* check, if we have to move to vgpr */ for (const Operand& op : instr->operands) { @@ -2692,9 +2710,11 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc aco_ptr mov; if (can_sgpr) - mov.reset(create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)); + mov.reset(create_instruction(aco_opcode::s_mov_b32, + Format::SOP1, 1, 1)); else - mov.reset(create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)); + mov.reset(create_instruction(aco_opcode::v_mov_b32, + Format::VOP1, 1, 1)); mov->operands[0] = instr->operands[0]; mov->definitions[0] = Definition(tmp); mov->definitions[0].setFixed(reg); @@ -2709,7 +2729,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* change the instruction to VOP3 to enable an arbitrary register pair as dst */ aco_ptr tmp = std::move(instr); Format format = asVOP3(tmp->format); - instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + instr.reset(create_instruction( + tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin()); std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin()); } @@ -2752,4 +2773,4 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc program->progress = 
CompilationProgress::after_ra; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_reindex_ssa.cpp b/src/amd/compiler/aco_reindex_ssa.cpp index 9ad2faced21..47653f8b6d3 100644 --- a/src/amd/compiler/aco_reindex_ssa.cpp +++ b/src/amd/compiler/aco_reindex_ssa.cpp @@ -34,8 +34,8 @@ struct idx_ctx { std::vector renames; }; -inline -void reindex_defs(idx_ctx& ctx, aco_ptr& instr) +inline void +reindex_defs(idx_ctx& ctx, aco_ptr& instr) { for (Definition& def : instr->definitions) { if (!def.isTemp()) @@ -48,8 +48,8 @@ void reindex_defs(idx_ctx& ctx, aco_ptr& instr) } } -inline -void reindex_ops(idx_ctx& ctx, aco_ptr& instr) +inline void +reindex_ops(idx_ctx& ctx, aco_ptr& instr) { for (Operand& op : instr->operands) { if (!op.isTemp()) @@ -60,7 +60,8 @@ void reindex_ops(idx_ctx& ctx, aco_ptr& instr) } } -void reindex_program(idx_ctx& ctx, Program* program) +void +reindex_program(idx_ctx& ctx, Program* program) { ctx.renames.resize(program->peekAllocationId()); @@ -88,12 +89,13 @@ void reindex_program(idx_ctx& ctx, Program* program) /* update program members */ program->private_segment_buffer = Temp(ctx.renames[program->private_segment_buffer.id()], program->private_segment_buffer.regClass()); - program->scratch_offset = Temp(ctx.renames[program->scratch_offset.id()], - program->scratch_offset.regClass()); + program->scratch_offset = + Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass()); program->temp_rc = ctx.temp_rc; } -void update_live_out(idx_ctx& ctx, std::vector& live_out) +void +update_live_out(idx_ctx& ctx, std::vector& live_out) { for (IDSet& set : live_out) { IDSet new_set; @@ -105,7 +107,8 @@ void update_live_out(idx_ctx& ctx, std::vector& live_out) } /* end namespace */ -void reindex_ssa(Program* program) +void +reindex_ssa(Program* program) { idx_ctx ctx; reindex_program(ctx, program); @@ -113,7 +116,8 @@ void reindex_ssa(Program* program) program->allocationID = program->temp_rc.size(); } -void reindex_ssa(Program* program, std::vector& live_out) +void +reindex_ssa(Program* program, std::vector& live_out) { idx_ctx ctx; reindex_program(ctx, program); @@ -122,4 +126,4 @@ void reindex_ssa(Program* program, std::vector& live_out) program->allocationID = program->temp_rc.size(); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 59338b3d042..9a17a816d89 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -34,11 +34,11 @@ #define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35) #define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64) #define POS_EXP_WINDOW_SIZE 512 -#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4) -#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16) +#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4) +#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16) /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */ #define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 8) -#define POS_EXP_MAX_MOVES 512 +#define POS_EXP_MAX_MOVES 512 namespace aco { @@ -54,7 +54,7 @@ enum MoveResult { * or below a group of instruction that hardware can execute as a clause. 
*/ struct DownwardsCursor { - int source_idx; /* Current instruction to consider for moving */ + int source_idx; /* Current instruction to consider for moving */ int insert_idx_clause; /* First clause instruction */ int insert_idx; /* First instruction *after* the clause */ @@ -66,11 +66,9 @@ struct DownwardsCursor { RegisterDemand total_demand; DownwardsCursor(int current_idx, RegisterDemand initial_clause_demand) - : source_idx(current_idx - 1), - insert_idx_clause(current_idx), - insert_idx(current_idx + 1), - clause_demand(initial_clause_demand) { - } + : source_idx(current_idx - 1), insert_idx_clause(current_idx), insert_idx(current_idx + 1), + clause_demand(initial_clause_demand) + {} void verify_invariants(const RegisterDemand* register_demand); }; @@ -91,18 +89,16 @@ struct UpwardsCursor { insert_idx = -1; /* to be initialized later */ } - bool has_insert_idx() const { - return insert_idx != -1; - } + bool has_insert_idx() const { return insert_idx != -1; } void verify_invariants(const RegisterDemand* register_demand); }; struct MoveState { RegisterDemand max_registers; - Block *block; - Instruction *current; - RegisterDemand *register_demand; /* demand per instruction */ + Block* block; + Instruction* current; + RegisterDemand* register_demand; /* demand per instruction */ bool improved_rar; std::vector depends_on; @@ -143,19 +139,22 @@ struct sched_ctx { */ template -void move_element(T begin_it, size_t idx, size_t before) { - if (idx < before) { - auto begin = std::next(begin_it, idx); - auto end = std::next(begin_it, before); - std::rotate(begin, begin + 1, end); - } else if (idx > before) { - auto begin = std::next(begin_it, before); - auto end = std::next(begin_it, idx + 1); - std::rotate(begin, end - 1, end); - } +void +move_element(T begin_it, size_t idx, size_t before) +{ + if (idx < before) { + auto begin = std::next(begin_it, idx); + auto end = std::next(begin_it, before); + std::rotate(begin, begin + 1, end); + } else if (idx > before) { + auto begin = std::next(begin_it, before); + auto end = std::next(begin_it, idx + 1); + std::rotate(begin, end - 1, end); + } } -void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand) +void +DownwardsCursor::verify_invariants(const RegisterDemand* register_demand) { assert(source_idx < insert_idx_clause); assert(insert_idx_clause < insert_idx); @@ -175,7 +174,8 @@ void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand) #endif } -DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses) +DownwardsCursor +MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses) { improved_rar = improved_rar_; @@ -202,7 +202,8 @@ DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, b /* If add_to_clause is true, the current clause is extended by moving the * instruction at source_idx in front of the clause. Otherwise, the instruction * is moved past the end of the clause without extending it */ -MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause) +MoveResult +MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause) { aco_ptr& instr = block->instructions[cursor.source_idx]; @@ -211,7 +212,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause return move_fail_ssa; /* check if one of candidate's operands is killed by depending instruction */ - std::vector& RAR_deps = improved_rar ? (add_to_clause ? 
RAR_dependencies_clause : RAR_dependencies) : depends_on; + std::vector& RAR_deps = + improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on; for (const Operand& op : instr->operands) { if (op.isTemp() && RAR_deps[op.tempId()]) { // FIXME: account for difference in register pressure @@ -274,7 +276,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause return move_success; } -void MoveState::downwards_skip(DownwardsCursor& cursor) +void +MoveState::downwards_skip(DownwardsCursor& cursor) { aco_ptr& instr = block->instructions[cursor.source_idx]; @@ -292,7 +295,9 @@ void MoveState::downwards_skip(DownwardsCursor& cursor) cursor.verify_invariants(register_demand); } -void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) { +void +UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) +{ #ifndef NDEBUG if (!has_insert_idx()) { return; @@ -308,7 +313,8 @@ void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) { #endif } -UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_) +UpwardsCursor +MoveState::upwards_init(int source_idx, bool improved_rar_) { improved_rar = improved_rar_; @@ -323,7 +329,8 @@ UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_) return UpwardsCursor(source_idx); } -bool MoveState::upwards_check_deps(UpwardsCursor& cursor) +bool +MoveState::upwards_check_deps(UpwardsCursor& cursor) { aco_ptr& instr = block->instructions[cursor.source_idx]; for (const Operand& op : instr->operands) { @@ -333,13 +340,15 @@ bool MoveState::upwards_check_deps(UpwardsCursor& cursor) return true; } -void MoveState::upwards_update_insert_idx(UpwardsCursor& cursor) +void +MoveState::upwards_update_insert_idx(UpwardsCursor& cursor) { cursor.insert_idx = cursor.source_idx; cursor.total_demand = register_demand[cursor.insert_idx]; } -MoveResult MoveState::upwards_move(UpwardsCursor& cursor) +MoveResult +MoveState::upwards_move(UpwardsCursor& cursor) { assert(cursor.has_insert_idx()); @@ -355,13 +364,15 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor) return move_fail_rar; } - /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + /* check if register pressure is low enough: the diff is negative if register pressure is + * decreased */ const RegisterDemand candidate_diff = get_live_changes(instr); const RegisterDemand temp = get_temp_registers(instr); if (RegisterDemand(cursor.total_demand + candidate_diff).exceeds(max_registers)) return move_fail_pressure; const RegisterDemand temp2 = get_temp_registers(block->instructions[cursor.insert_idx - 1]); - const RegisterDemand new_demand = register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp; + const RegisterDemand new_demand = + register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp; if (new_demand.exceeds(max_registers)) return move_fail_pressure; @@ -385,7 +396,8 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor) return move_success; } -void MoveState::upwards_skip(UpwardsCursor& cursor) +void +MoveState::upwards_skip(UpwardsCursor& cursor) { if (cursor.has_insert_idx()) { aco_ptr& instr = block->instructions[cursor.source_idx]; @@ -405,30 +417,33 @@ void MoveState::upwards_skip(UpwardsCursor& cursor) cursor.verify_invariants(register_demand); } -bool is_gs_or_done_sendmsg(const Instruction *instr) +bool +is_gs_or_done_sendmsg(const Instruction* instr) { if (instr->opcode == 
aco_opcode::s_sendmsg) { uint16_t imm = instr->sopp().imm; - return (imm & sendmsg_id_mask) == _sendmsg_gs || - (imm & sendmsg_id_mask) == _sendmsg_gs_done; + return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done; } return false; } -bool is_done_sendmsg(const Instruction *instr) +bool +is_done_sendmsg(const Instruction* instr) { if (instr->opcode == aco_opcode::s_sendmsg) return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done; return false; } -memory_sync_info get_sync_info_with_hack(const Instruction* instr) +memory_sync_info +get_sync_info_with_hack(const Instruction* instr) { memory_sync_info sync = get_sync_info(instr); if (instr->isSMEM() && !instr->operands.empty() && instr->operands[0].bytes() == 16) { // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works sync.storage = (storage_class)(sync.storage | storage_buffer); - sync.semantics = (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder); + sync.semantics = + (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder); } return sync; } @@ -451,11 +466,13 @@ struct hazard_query { bool contains_sendmsg; bool uses_exec; memory_event_set mem_events; - unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */ + unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */ unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */ }; -void init_hazard_query(hazard_query *query) { +void +init_hazard_query(hazard_query* query) +{ query->contains_spill = false; query->contains_sendmsg = false; query->uses_exec = false; @@ -464,7 +481,8 @@ void init_hazard_query(hazard_query *query) { query->aliasing_storage_smem = 0; } -void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync) +void +add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync) { set->has_control_barrier |= is_done_sendmsg(instr); if (instr->opcode == aco_opcode::p_barrier) { @@ -494,7 +512,8 @@ void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_inf } } -void add_to_hazard_query(hazard_query *query, Instruction *instr) +void +add_to_hazard_query(hazard_query* query, Instruction* instr) { if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) query->contains_spill = true; @@ -507,7 +526,8 @@ void add_to_hazard_query(hazard_query *query, Instruction *instr) if (!(sync.semantics & semantic_can_reorder)) { unsigned storage = sync.storage; - /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias + /* images and buffer/global memory can alias */ // TODO: more precisely, buffer images and + // buffer/global memory can alias if (storage & (storage_buffer | storage_image)) storage |= storage_buffer | storage_image; if (instr->isSMEM()) @@ -531,7 +551,8 @@ enum HazardResult { hazard_fail_unreorderable, }; -HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards) +HazardResult +perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards) { /* don't schedule discards downwards */ if (!upwards && instr->opcode == aco_opcode::p_exit_early_if) @@ -549,10 +570,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool return hazard_fail_export; /* don't move non-reorderable instructions */ - if (instr->opcode == aco_opcode::s_memtime || - instr->opcode == 
aco_opcode::s_memrealtime || - instr->opcode == aco_opcode::s_setprio || - instr->opcode == aco_opcode::s_getreg_b32) + if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime || + instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32) return hazard_fail_unreorderable; memory_event_set instr_set; @@ -560,8 +579,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool memory_sync_info sync = get_sync_info_with_hack(instr); add_memory_event(&instr_set, instr, &sync); - memory_event_set *first = &instr_set; - memory_event_set *second = &query->mem_events; + memory_event_set* first = &instr_set; + memory_event_set* second = &query->mem_events; if (upwards) std::swap(first, second); @@ -571,7 +590,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire) return hazard_fail_barrier; if (((first->access_acquire || first->bar_acquire) && second->bar_classes) || - ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic))) + ((first->access_acquire | first->bar_acquire) & + (second->access_relaxed | second->access_atomic))) return hazard_fail_barrier; /* everything before barrier(release) happens before the atomics/control_barriers after * @@ -580,7 +600,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool if (first->bar_release && (second->has_control_barrier || second->access_atomic)) return hazard_fail_barrier; if ((first->bar_classes && (second->bar_release || second->access_release)) || - ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release))) + ((first->access_relaxed | first->access_atomic) & + (second->bar_release | second->access_release))) return hazard_fail_barrier; /* don't move memory barriers around other memory barriers */ @@ -589,14 +610,15 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool /* Don't move memory accesses to before control barriers. I don't think * this is necessary for the Vulkan memory model, but it might be for GLSL450. */ - unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared; - if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes)) + unsigned control_classes = + storage_buffer | storage_atomic_counter | storage_image | storage_shared; + if (first->has_control_barrier && + ((second->access_atomic | second->access_relaxed) & control_classes)) return hazard_fail_barrier; /* don't move memory loads/stores past potentially aliasing loads/stores */ - unsigned aliasing_storage = instr->isSMEM() ? - query->aliasing_storage_smem : - query->aliasing_storage; + unsigned aliasing_storage = + instr->isSMEM() ? 
query->aliasing_storage_smem : query->aliasing_storage; if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) { unsigned intersect = sync.storage & aliasing_storage; if (intersect & storage_shared) @@ -614,9 +636,9 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool return hazard_success; } -void schedule_SMEM(sched_ctx& ctx, Block* block, - std::vector& register_demand, - Instruction* current, int idx) +void +schedule_SMEM(sched_ctx& ctx, Block* block, std::vector& register_demand, + Instruction* current, int idx) { assert(idx != 0); int window_size = SMEM_WINDOW_SIZE; @@ -634,30 +656,37 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false); - for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size; + candidate_idx--) { assert(candidate_idx >= 0); assert(candidate_idx == cursor.source_idx); aco_ptr& candidate = block->instructions[candidate_idx]; /* break if we'd make the previous SMEM instruction stall */ - bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; + bool can_stall_prev_smem = + idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) break; /* break when encountering another MEM instruction, logical_start or barriers */ if (candidate->opcode == aco_opcode::p_logical_start) break; - /* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves to help create more vmem clauses */ - if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) || current->operands[0].size() == 4)) + /* only move VMEM instructions below descriptor loads. 
be more aggressive at higher num_waves + * to help create more vmem clauses */ + if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) || + current->operands[0].size() == 4)) break; /* don't move descriptor loads below buffer loads */ - if (candidate->format == Format::SMEM && current->operands[0].size() == 4 && candidate->operands[0].size() == 2) + if (candidate->format == Format::SMEM && current->operands[0].size() == 4 && + candidate->operands[0].size() == 2) break; bool can_move_down = true; HazardResult haz = perform_hazard_query(&hq, candidate.get(), false); - if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) + if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || + haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || + haz == hazard_fail_export) can_move_down = false; else if (haz != hazard_success) break; @@ -689,9 +718,10 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, bool found_dependency = false; /* second, check if we have instructions after current to move up */ - for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size; + candidate_idx++) { assert(candidate_idx == up_cursor.source_idx); - assert(candidate_idx < (int) block->instructions.size()); + assert(candidate_idx < (int)block->instructions.size()); aco_ptr& candidate = block->instructions[candidate_idx]; if (candidate->opcode == aco_opcode::p_logical_end) @@ -748,9 +778,9 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, ctx.last_SMEM_stall = 10 - ctx.num_waves - k; } -void schedule_VMEM(sched_ctx& ctx, Block* block, - std::vector& register_demand, - Instruction* current, int idx) +void +schedule_VMEM(sched_ctx& ctx, Block* block, std::vector& register_demand, + Instruction* current, int idx) { assert(idx != 0); int window_size = VMEM_WINDOW_SIZE; @@ -767,7 +797,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true); - for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size; + candidate_idx--) { assert(candidate_idx == cursor.source_idx); assert(candidate_idx >= 0); aco_ptr& candidate = block->instructions[candidate_idx]; @@ -778,7 +809,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, break; /* break if we'd make the previous SMEM instruction stall */ - bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; + bool can_stall_prev_smem = + idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) break; @@ -787,14 +819,15 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, int grab_dist = cursor.insert_idx_clause - candidate_idx; /* We can't easily tell how much this will decrease the def-to-use * distances, so just use how far it will be moved as a heuristic. 
*/ - part_of_clause = grab_dist < clause_max_grab_dist && - should_form_clause(current, candidate.get()); + part_of_clause = + grab_dist < clause_max_grab_dist && should_form_clause(current, candidate.get()); } /* if current depends on candidate, add additional dependencies and continue */ bool can_move_down = !is_vmem || part_of_clause; - HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false); + HazardResult haz = + perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) @@ -809,7 +842,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, continue; } - Instruction *candidate_ptr = candidate.get(); + Instruction* candidate_ptr = candidate.get(); MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause); if (res == move_fail_ssa || res == move_fail_rar) { add_to_hazard_query(&indep_hq, candidate.get()); @@ -832,9 +865,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, bool found_dependency = false; /* second, check if we have instructions after current to move up */ - for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size; + candidate_idx++) { assert(candidate_idx == up_cursor.source_idx); - assert(candidate_idx < (int) block->instructions.size()); + assert(candidate_idx < (int)block->instructions.size()); aco_ptr& candidate = block->instructions[candidate_idx]; bool is_vmem = candidate->isVMEM() || candidate->isFlatLike(); @@ -889,9 +923,9 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, } } -void schedule_position_export(sched_ctx& ctx, Block* block, - std::vector& register_demand, - Instruction* current, int idx) +void +schedule_position_export(sched_ctx& ctx, Block* block, std::vector& register_demand, + Instruction* current, int idx) { assert(idx != 0); int window_size = POS_EXP_WINDOW_SIZE; @@ -904,7 +938,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block, init_hazard_query(&hq); add_to_hazard_query(&hq, current); - for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size; + candidate_idx--) { assert(candidate_idx >= 0); aco_ptr& candidate = block->instructions[candidate_idx]; @@ -935,7 +970,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block, } } -void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars) +void +schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars) { ctx.last_SMEM_dep_idx = 0; ctx.last_SMEM_stall = INT16_MIN; @@ -950,7 +986,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v unsigned target = current->exp().dest; if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) { ctx.mv.current = current; - schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx); + schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, + idx); } } @@ -975,8 +1012,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v } } - -void schedule_program(Program *program, live& live_vars) +void +schedule_program(Program* program, live& live_vars) { 
/* don't use program->max_reg_demand because that is affected by max_waves_per_simd */ RegisterDemand demand; @@ -991,7 +1028,7 @@ void schedule_program(Program *program, live& live_vars) /* Allowing the scheduler to reduce the number of waves to as low as 5 * improves performance of Thrones of Britannia significantly and doesn't * seem to hurt anything else. */ - //TODO: account for possible uneven num_waves on GFX10+ + // TODO: account for possible uneven num_waves on GFX10+ unsigned wave_fac = program->dev.physical_vgprs / 256; if (program->num_waves <= 5 * wave_fac) ctx.num_waves = program->num_waves; @@ -1008,8 +1045,8 @@ void schedule_program(Program *program, live& live_vars) ctx.num_waves = std::max(ctx.num_waves / wave_fac, 1); assert(ctx.num_waves > 0); - ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2), - int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))}; + ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2), + int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))}; for (Block& block : program->blocks) schedule_block(ctx, program, &block, live_vars); @@ -1021,8 +1058,8 @@ void schedule_program(Program *program, live& live_vars) } update_vgpr_sgpr_demand(program, new_demand); - /* if enabled, this code asserts that register_demand is updated correctly */ - #if 0 +/* if enabled, this code asserts that register_demand is updated correctly */ +#if 0 int prev_num_waves = program->num_waves; const RegisterDemand prev_max_demand = program->max_reg_demand; @@ -1042,7 +1079,7 @@ void schedule_program(Program *program, live& live_vars) assert(program->max_reg_demand == prev_max_demand); assert(program->num_waves == prev_num_waves); - #endif +#endif } -} +} // namespace aco diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index c0f0471fff9..8996fae8f39 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -25,6 +25,7 @@ #include "aco_builder.h" #include "aco_ir.h" + #include "common/sid.h" #include @@ -44,7 +45,7 @@ namespace aco { namespace { struct remat_info { - Instruction *instr; + Instruction* instr; }; struct spill_ctx { @@ -62,15 +63,16 @@ struct spill_ctx { std::vector> affinities; std::vector is_reloaded; std::map remat; - std::map remat_used; + std::map remat_used; unsigned wave_size; spill_ctx(const RegisterDemand target_pressure_, Program* program_, std::vector> register_demand_) - : target_pressure(target_pressure_), program(program_), - register_demand(std::move(register_demand_)), renames(program->blocks.size()), - spills_entry(program->blocks.size()), spills_exit(program->blocks.size()), - processed(program->blocks.size(), false), wave_size(program->wave_size) {} + : target_pressure(target_pressure_), program(program_), + register_demand(std::move(register_demand_)), renames(program->blocks.size()), + spills_entry(program->blocks.size()), spills_exit(program->blocks.size()), + processed(program->blocks.size(), false), wave_size(program->wave_size) + {} void add_affinity(uint32_t first, uint32_t second) { @@ -93,7 +95,9 @@ struct spill_ctx { affinities[found_second].push_back(first); } else if (found_first != found_second) { /* merge second into first */ - affinities[found_first].insert(affinities[found_first].end(), affinities[found_second].begin(), affinities[found_second].end()); + affinities[found_first].insert(affinities[found_first].end(), + affinities[found_second].begin(), + 
affinities[found_second].end()); affinities.erase(std::next(affinities.begin(), found_second)); } else { assert(found_first == found_second); @@ -120,7 +124,8 @@ struct spill_ctx { uint32_t next_spill_id = 0; }; -int32_t get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) +int32_t +get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) { if (idx_a == -1) @@ -146,21 +151,23 @@ int32_t get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) return idx_a; } -void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& worklist) +void +next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& worklist) { Block* block = &ctx.program->blocks[block_idx]; std::map> next_uses = ctx.next_use_distances_end[block_idx]; - /* to compute the next use distance at the beginning of the block, we have to add the block's size */ - for (std::map>::iterator it = next_uses.begin(); it != next_uses.end(); ++it) + /* to compute the next use distance at the beginning of the block, we have to add the block's + * size */ + for (std::map>::iterator it = next_uses.begin(); + it != next_uses.end(); ++it) it->second.second = it->second.second + block->instructions.size(); int idx = block->instructions.size() - 1; while (idx >= 0) { aco_ptr& instr = block->instructions[idx]; - if (instr->opcode == aco_opcode::p_linear_phi || - instr->opcode == aco_opcode::p_phi) + if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi) break; for (const Definition& def : instr->definitions) { @@ -192,13 +199,14 @@ void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& } auto it = next_uses.find(instr->definitions[0].getTemp()); - std::pair distance = it == next_uses.end() ? std::make_pair(block_idx, 0u) : it->second; + std::pair distance = + it == next_uses.end() ? std::make_pair(block_idx, 0u) : it->second; for (unsigned i = 0; i < instr->operands.size(); i++) { - unsigned pred_idx = instr->opcode == aco_opcode::p_phi ? - block->logical_preds[i] : - block->linear_preds[i]; + unsigned pred_idx = + instr->opcode == aco_opcode::p_phi ? 
block->logical_preds[i] : block->linear_preds[i]; if (instr->operands[i].isTemp()) { - if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == ctx.next_use_distances_end[pred_idx].end() || + if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == + ctx.next_use_distances_end[pred_idx].end() || ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != distance) worklist.insert(pred_idx); ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = distance; @@ -217,19 +225,22 @@ void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& for (unsigned pred_idx : preds) { if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth) distance += 0xFFFF; - if (ctx.next_use_distances_end[pred_idx].find(temp) != ctx.next_use_distances_end[pred_idx].end()) { - dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, temp.is_linear()); + if (ctx.next_use_distances_end[pred_idx].find(temp) != + ctx.next_use_distances_end[pred_idx].end()) { + dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, + temp.is_linear()); distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance); } - if (ctx.next_use_distances_end[pred_idx][temp] != std::pair{dom, distance}) + if (ctx.next_use_distances_end[pred_idx][temp] != + std::pair{dom, distance}) worklist.insert(pred_idx); ctx.next_use_distances_end[pred_idx][temp] = {dom, distance}; } } - } -void compute_global_next_uses(spill_ctx& ctx) +void +compute_global_next_uses(spill_ctx& ctx) { ctx.next_use_distances_start.resize(ctx.program->blocks.size()); ctx.next_use_distances_end.resize(ctx.program->blocks.size()); @@ -245,12 +256,15 @@ void compute_global_next_uses(spill_ctx& ctx) } } -bool should_rematerialize(aco_ptr& instr) +bool +should_rematerialize(aco_ptr& instr) { /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */ - if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO && instr->format != Format::SOPK) + if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && + instr->format != Format::PSEUDO && instr->format != Format::SOPK) return false; - /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector/p_parallelcopy */ + /* TODO: pseudo-instruction rematerialization is only supported for + * p_create_vector/p_parallelcopy */ if (instr->isPseudo() && instr->opcode != aco_opcode::p_create_vector && instr->opcode != aco_opcode::p_parallelcopy) return false; @@ -270,24 +284,32 @@ bool should_rematerialize(aco_ptr& instr) return true; } -aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) +aco_ptr +do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) { std::map::iterator remat = ctx.remat.find(tmp); if (remat != ctx.remat.end()) { - Instruction *instr = remat->second.instr; - assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) && "unsupported"); - assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector || instr->opcode == aco_opcode::p_parallelcopy) && "unsupported"); + Instruction* instr = remat->second.instr; + assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) && + "unsupported"); + assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector || + instr->opcode == aco_opcode::p_parallelcopy) && + "unsupported"); 
assert(instr->definitions.size() == 1 && "unsupported"); aco_ptr res; if (instr->isVOP1()) { - res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + res.reset(create_instruction( + instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); } else if (instr->isSOP1()) { - res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + res.reset(create_instruction( + instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); } else if (instr->isPseudo()) { - res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + res.reset(create_instruction( + instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); } else if (instr->isSOPK()) { - res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + res.reset(create_instruction( + instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); res->sopk().imm = instr->sopk().imm; } for (unsigned i = 0; i < instr->operands.size(); i++) { @@ -301,7 +323,8 @@ aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t res->definitions[0] = Definition(new_name); return res; } else { - aco_ptr reload{create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)}; + aco_ptr reload{ + create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)}; reload->operands[0] = Operand(spill_id); reload->definitions[0] = Definition(new_name); ctx.is_reloaded[spill_id] = true; @@ -309,7 +332,8 @@ aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t } } -void get_rematerialize_info(spill_ctx& ctx) +void +get_rematerialize_info(spill_ctx& ctx) { for (Block& block : ctx.program->blocks) { bool logical = false; @@ -330,12 +354,14 @@ void get_rematerialize_info(spill_ctx& ctx) } } -std::vector> local_next_uses(spill_ctx& ctx, Block* block) +std::vector> +local_next_uses(spill_ctx& ctx, Block* block) { std::vector> local_next_uses(block->instructions.size()); std::map next_uses; - for (std::pair> pair : ctx.next_use_distances_end[block->index]) + for (std::pair> pair : + ctx.next_use_distances_end[block->index]) next_uses[pair.first] = pair.second.second + block->instructions.size(); for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { @@ -362,7 +388,8 @@ std::vector> local_next_uses(spill_ctx& ctx, Block* blo return local_next_uses; } -RegisterDemand get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx) +RegisterDemand +get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx) { if (idx == 0) { RegisterDemand demand = ctx.register_demand[block_idx][idx]; @@ -374,7 +401,8 @@ RegisterDemand get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned id } } -RegisterDemand get_live_in_demand(spill_ctx& ctx, unsigned block_idx) +RegisterDemand +get_live_in_demand(spill_ctx& ctx, unsigned block_idx) { unsigned idx = 0; RegisterDemand reg_pressure = RegisterDemand(); @@ -398,12 +426,14 @@ RegisterDemand get_live_in_demand(spill_ctx& ctx, unsigned block_idx) /* Consider register pressure from linear predecessors. This can affect * reg_pressure if the branch instructions define sgprs. 
*/ for (unsigned pred : block.linear_preds) - reg_pressure.sgpr = std::max(reg_pressure.sgpr, ctx.register_demand[pred].back().sgpr); + reg_pressure.sgpr = + std::max(reg_pressure.sgpr, ctx.register_demand[pred].back().sgpr); return reg_pressure; } -RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) +RegisterDemand +init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) { RegisterDemand spilled_registers; @@ -461,7 +491,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id Temp to_spill; for (std::pair> pair : next_use_distances) { if (pair.first.type() == type && - (pair.second.first >= loop_end || (ctx.remat.count(pair.first) && type == RegType::sgpr)) && + (pair.second.first >= loop_end || + (ctx.remat.count(pair.first) && type == RegType::sgpr)) && pair.second.second > distance && ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { to_spill = pair.first; @@ -478,7 +509,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id } uint32_t spill_id; - if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) { + if (ctx.spills_exit[block_idx - 1].find(to_spill) == + ctx.spills_exit[block_idx - 1].end()) { spill_id = ctx.allocate_spill_id(to_spill.regClass()); } else { spill_id = ctx.spills_exit[block_idx - 1][to_spill]; @@ -502,8 +534,7 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr; for (std::pair> pair : next_use_distances) { - if (pair.first.type() == type && - pair.second.second > distance && + if (pair.first.type() == type && pair.second.second > distance && ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { to_spill = pair.first; distance = pair.second.second; @@ -542,7 +573,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id } } - /* if register demand is still too high, we just keep all spilled live vars and process the block */ + /* if register demand is still too high, we just keep all spilled live vars + * and process the block */ if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) { pred_idx = block->linear_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { @@ -553,7 +585,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id } } } - if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr && block->logical_preds.size() == 1) { + if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr && + block->logical_preds.size() == 1) { pred_idx = block->logical_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { if (pair.first.type() == RegType::vgpr && @@ -572,17 +605,21 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id /* keep variables spilled on all incoming paths */ for (std::pair> pair : next_use_distances) { - std::vector& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; - /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload it. - * Otherwise, if any predecessor reloads it, ensure it's reloaded on all other predecessors. - * The idea is that it's better in practice to rematerialize redundantly than to create lots of phis. 
*/ - /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db doesn't seem to exercise this path much) */ + std::vector& preds = + pair.first.is_linear() ? block->linear_preds : block->logical_preds; + /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload + * it. Otherwise, if any predecessor reloads it, ensure it's reloaded on all other + * predecessors. The idea is that it's better in practice to rematerialize redundantly than to + * create lots of phis. */ + /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db + * doesn't seem to exercise this path much) */ bool remat = ctx.remat.count(pair.first); bool spill = !remat; uint32_t spill_id = 0; for (unsigned pred_idx : preds) { /* variable is not even live at the predecessor: probably from a phi */ - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) { + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == + ctx.next_use_distances_end[pred_idx].end()) { spill = false; break; } @@ -591,7 +628,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id spill = false; } else { partial_spills.insert(pair.first); - /* it might be that on one incoming path, the variable has a different spill_id, but add_couple_code() will take care of that. */ + /* it might be that on one incoming path, the variable has a different spill_id, but + * add_couple_code() will take care of that. */ spill_id = ctx.spills_exit[pred_idx][pair.first]; if (remat) spill = true; @@ -611,7 +649,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id if (!phi->definitions[0].isTemp()) continue; - std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? 
block->logical_preds : block->linear_preds; bool spill = true; for (unsigned i = 0; i < phi->operands.size(); i++) { @@ -621,13 +660,15 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id continue; } - if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end()) + if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == + ctx.spills_exit[preds[i]].end()) spill = false; else partial_spills.insert(phi->definitions[0].getTemp()); } if (spill) { - ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] = ctx.allocate_spill_id(phi->definitions[0].regClass()); + ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] = + ctx.allocate_spill_id(phi->definitions[0].regClass()); partial_spills.erase(phi->definitions[0].getTemp()); spilled_registers += phi->definitions[0].getTemp(); } @@ -664,7 +705,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id return spilled_registers; } -void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) +void +add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) { /* no coupling code necessary */ if (block->linear_preds.size() == 0) @@ -672,14 +714,16 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) std::vector> instructions; /* branch block: TODO take other branch into consideration */ - if (block->linear_preds.size() == 1 && !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) { + if (block->linear_preds.size() == 1 && + !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) { assert(ctx.processed[block->linear_preds[0]]); assert(ctx.register_demand[block_idx].size() == block->instructions.size()); std::vector reg_demand; unsigned insert_idx = 0; RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0); - for (std::pair> live : ctx.next_use_distances_start[block_idx]) { + for (std::pair> live : + ctx.next_use_distances_start[block_idx]) { const unsigned pred_idx = block->linear_preds[0]; if (!live.first.is_linear()) @@ -698,7 +742,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* variable is spilled at predecessor and live at current block: create reload instruction */ Temp new_name = ctx.program->allocateTmp(live.first.regClass()); - aco_ptr reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + aco_ptr reload = + do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); instructions.emplace_back(std::move(reload)); reg_demand.push_back(demand_before); ctx.renames[block_idx][live.first] = new_name; @@ -713,7 +758,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) } while (instructions.back()->opcode != aco_opcode::p_logical_start); unsigned pred_idx = block->logical_preds[0]; - for (std::pair> live : ctx.next_use_distances_start[block_idx]) { + for (std::pair> live : + ctx.next_use_distances_start[block_idx]) { if (live.first.is_linear()) continue; /* still spilled */ @@ -728,9 +774,11 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) continue; } - /* variable is spilled at predecessor and live at current block: create reload instruction */ + /* variable is spilled at predecessor and live at current block: + * create reload instruction */ Temp new_name = ctx.program->allocateTmp(live.first.regClass()); - aco_ptr reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + aco_ptr reload = + 
do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); instructions.emplace_back(std::move(reload)); reg_demand.emplace_back(reg_demand.back()); ctx.renames[block_idx][live.first] = new_name; @@ -739,12 +787,15 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* combine new reload instructions with original block */ if (!instructions.empty()) { - reg_demand.insert(reg_demand.end(), std::next(ctx.register_demand[block->index].begin(), insert_idx), + reg_demand.insert(reg_demand.end(), + std::next(ctx.register_demand[block->index].begin(), insert_idx), ctx.register_demand[block->index].end()); ctx.register_demand[block_idx] = std::move(reg_demand); instructions.insert(instructions.end(), - std::move_iterator>::iterator>(std::next(block->instructions.begin(), insert_idx)), - std::move_iterator>::iterator>(block->instructions.end())); + std::move_iterator>::iterator>( + std::next(block->instructions.begin(), insert_idx)), + std::move_iterator>::iterator>( + block->instructions.end())); block->instructions = std::move(instructions); } return; @@ -761,12 +812,14 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* if the phi is not spilled, add to instructions */ if (!phi->definitions[0].isTemp() || - ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()) { + ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == + ctx.spills_entry[block_idx].end()) { instructions.emplace_back(std::move(phi)); continue; } - std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()]; for (unsigned i = 0; i < phi->operands.size(); i++) { @@ -807,7 +860,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) ctx.add_interference(spill_id, pair.second); ctx.add_affinity(def_spill_id, spill_id); - aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + aco_ptr spill{ + create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; spill->operands[0] = spill_op; spill->operands[1] = Operand(spill_id); Block& pred = ctx.program->blocks[pred_idx]; @@ -815,7 +869,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) do { assert(idx != 0); idx--; - } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + } while (phi->opcode == aco_opcode::p_phi && + pred.instructions[idx]->opcode != aco_opcode::p_logical_end); std::vector>::iterator it = std::next(pred.instructions.begin(), idx); pred.instructions.insert(it, std::move(spill)); if (spill_op.isTemp()) @@ -829,7 +884,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* iterate all (other) spilled variables for which to spill at the predecessor */ // TODO: would be better to have them sorted: first vgprs and first with longest distance for (std::pair pair : ctx.spills_entry[block_idx]) { - std::vector preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; + std::vector preds = + pair.first.is_linear() ? 
block->linear_preds : block->logical_preds; for (unsigned pred_idx : preds) { /* variable is already spilled at predecessor */ @@ -841,7 +897,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) } /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */ - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == + ctx.next_use_distances_end[pred_idx].end()) continue; /* add interferences between spilled variable and predecessors exit spills */ @@ -860,7 +917,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) ctx.renames[pred_idx].erase(rename_it); } - aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + aco_ptr spill{ + create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; spill->operands[0] = Operand(var); spill->operands[1] = Operand(pair.second); Block& pred = ctx.program->blocks[pred_idx]; @@ -868,7 +926,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) do { assert(idx != 0); idx--; - } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + } while (pair.first.type() == RegType::vgpr && + pred.instructions[idx]->opcode != aco_opcode::p_logical_end); std::vector>::iterator it = std::next(pred.instructions.begin(), idx); pred.instructions.insert(it, std::move(spill)); ctx.spills_exit[pred.index][pair.first] = pair.second; @@ -878,17 +937,22 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* iterate phis for which operands to reload */ for (aco_ptr& phi : instructions) { assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi); - assert(!phi->definitions[0].isTemp() || ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()); + assert(!phi->definitions[0].isTemp() || + ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == + ctx.spills_entry[block_idx].end()); - std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? 
block->logical_preds : block->linear_preds; for (unsigned i = 0; i < phi->operands.size(); i++) { if (!phi->operands[i].isTemp()) continue; unsigned pred_idx = preds[i]; /* if the operand was reloaded, rename */ - if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) == ctx.spills_exit[pred_idx].end()) { - std::map::iterator it = ctx.renames[pred_idx].find(phi->operands[i].getTemp()); + if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) == + ctx.spills_exit[pred_idx].end()) { + std::map::iterator it = + ctx.renames[pred_idx].find(phi->operands[i].getTemp()); if (it != ctx.renames[pred_idx].end()) phi->operands[i].setTemp(it->second); /* prevent the definining instruction from being DCE'd if it could be rematerialized */ @@ -906,9 +970,11 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) do { assert(idx != 0); idx--; - } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + } while (phi->opcode == aco_opcode::p_phi && + pred.instructions[idx]->opcode != aco_opcode::p_logical_end); std::vector>::iterator it = std::next(pred.instructions.begin(), idx); - aco_ptr reload = do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]); + aco_ptr reload = + do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]); /* reload spilled exec mask directly to exec */ if (!phi->definitions[0].isTemp()) { @@ -927,16 +993,19 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* iterate live variables for which to reload */ // TODO: reload at current block if variable is spilled on all predecessors - for (std::pair> pair : ctx.next_use_distances_start[block_idx]) { + for (std::pair> pair : + ctx.next_use_distances_start[block_idx]) { /* skip spilled variables */ if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end()) continue; - std::vector preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; + std::vector preds = + pair.first.is_linear() ? block->linear_preds : block->logical_preds; /* variable is dead at predecessor, it must be from a phi */ bool is_dead = false; for (unsigned pred_idx : preds) { - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == + ctx.next_use_distances_end[pred_idx].end()) is_dead = true; } if (is_dead) @@ -953,10 +1022,12 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) do { assert(idx != 0); idx--; - } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + } while (pair.first.type() == RegType::vgpr && + pred.instructions[idx]->opcode != aco_opcode::p_logical_end); std::vector>::iterator it = std::next(pred.instructions.begin(), idx); - aco_ptr reload = do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]); + aco_ptr reload = + do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]); pred.instructions.insert(it, std::move(reload)); ctx.spills_exit[pred.index].erase(pair.first); @@ -986,7 +1057,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) if (!is_same) { /* the variable was renamed differently in the predecessors: we have to create a phi */ aco_opcode opcode = pair.first.is_linear() ? 
aco_opcode::p_linear_phi : aco_opcode::p_phi; - aco_ptr phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{ + create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; rename = ctx.program->allocateTmp(pair.first.regClass()); for (unsigned i = 0; i < phi->operands.size(); i++) { Temp tmp; @@ -1020,18 +1092,22 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) if (!ctx.processed[block_idx]) { assert(!(block->kind & block_kind_loop_header)); RegisterDemand demand_before = get_demand_before(ctx, block_idx, idx); - ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), ctx.register_demand[block->index].begin() + idx); - ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), instructions.size(), demand_before); + ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), + ctx.register_demand[block->index].begin() + idx); + ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), + instructions.size(), demand_before); } std::vector>::iterator start = std::next(block->instructions.begin(), idx); - instructions.insert(instructions.end(), std::move_iterator>::iterator>(start), - std::move_iterator>::iterator>(block->instructions.end())); + instructions.insert( + instructions.end(), std::move_iterator>::iterator>(start), + std::move_iterator>::iterator>(block->instructions.end())); block->instructions = std::move(instructions); } -void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, - std::map ¤t_spills, RegisterDemand spilled_registers) +void +process_block(spill_ctx& ctx, unsigned block_idx, Block* block, + std::map& current_spills, RegisterDemand spilled_registers) { assert(!ctx.processed[block_idx]); @@ -1099,7 +1175,8 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, if (((pair.second > distance && can_rematerialize == do_rematerialize) || (can_rematerialize && !do_rematerialize && pair.second > idx)) && current_spills.find(pair.first) == current_spills.end() && - ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) { + ctx.spills_exit[block_idx].find(pair.first) == + ctx.spills_exit[block_idx].end()) { to_spill = pair.first; distance = pair.second; do_rematerialize = can_rematerialize; @@ -1124,7 +1201,8 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, } /* add spill to new instructions */ - aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + aco_ptr spill{ + create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; spill->operands[0] = Operand(to_spill); spill->operands[1] = Operand(spill_id); instructions.emplace_back(std::move(spill)); @@ -1133,7 +1211,8 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, /* add reloads and instruction to new instructions */ for (std::pair> pair : reloads) { - aco_ptr reload = do_reload(ctx, pair.second.first, pair.first, pair.second.second); + aco_ptr reload = + do_reload(ctx, pair.second.first, pair.first, pair.second.second); instructions.emplace_back(std::move(reload)); } instructions.emplace_back(std::move(instr)); @@ -1144,7 +1223,8 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end()); } -void spill_block(spill_ctx& ctx, unsigned block_idx) +void +spill_block(spill_ctx& ctx, unsigned block_idx) { Block* block = &ctx.program->blocks[block_idx]; @@ 
-1152,7 +1232,8 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx); /* add interferences for spilled variables */ - for (auto it = ctx.spills_entry[block_idx].begin(); it != ctx.spills_entry[block_idx].end(); ++it) { + for (auto it = ctx.spills_entry[block_idx].begin(); it != ctx.spills_entry[block_idx].end(); + ++it) { for (auto it2 = std::next(it); it2 != ctx.spills_entry[block_idx].end(); ++it2) ctx.add_interference(it->second, it2->second); } @@ -1167,8 +1248,7 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) /* check conditions to process this block */ bool process = (block->register_demand - spilled_registers).exceeds(ctx.target_pressure) || - !ctx.renames[block_idx].empty() || - ctx.remat_used.size(); + !ctx.renames[block_idx].empty() || ctx.remat_used.size(); for (auto it = current_spills.begin(); !process && it != current_spills.end(); ++it) { if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx) @@ -1183,7 +1263,8 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) ctx.processed[block_idx] = true; /* check if the next block leaves the current loop */ - if (block->loop_nest_depth == 0 || ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth) + if (block->loop_nest_depth == 0 || + ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth) return; Block* loop_header = ctx.loop_header.top(); @@ -1206,7 +1287,8 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) aco_ptr& phi = *instr_it; if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) break; - /* no need to rename the loop header phis once again. this happened in add_coupling_code() */ + /* no need to rename the loop header phis once again. this happened in + * add_coupling_code() */ if (idx == loop_header->index) { instr_it++; continue; @@ -1240,7 +1322,7 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) op.setTemp(rename.second); /* we can stop with this block as soon as the variable is spilled */ if (instr->opcode == aco_opcode::p_spill) - renamed = true; + renamed = true; } } instr_it++; @@ -1252,9 +1334,10 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) ctx.loop_header.pop(); } -Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, - std::vector>& instructions, - unsigned offset, bool is_top_level) +Temp +load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, + std::vector>& instructions, unsigned offset, + bool is_top_level) { Builder bld(ctx.program); if (is_top_level) { @@ -1269,19 +1352,21 @@ Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, Temp private_segment_buffer = ctx.program->private_segment_buffer; if (ctx.program->stage != compute_cs) - private_segment_buffer = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(0u)); + private_segment_buffer = + bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(0u)); if (offset) - scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), scratch_offset, Operand(offset)); + scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), + scratch_offset, Operand(offset)); - uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) | - S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2); + uint32_t rsrc_conf = + S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 
3 : 2); if (ctx.program->chip_class >= GFX10) { rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - } else if (ctx.program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else if (ctx.program->chip_class <= GFX7) { + /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); } @@ -1289,14 +1374,13 @@ Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, if (ctx.program->chip_class <= GFX8) rsrc_conf |= S_008F0C_ELEMENT_SIZE(1); - return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - private_segment_buffer, Operand(-1u), + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(-1u), Operand(rsrc_conf)); } -void add_interferences(spill_ctx& ctx, std::vector& is_assigned, - std::vector& slots, std::vector& slots_used, - unsigned id) +void +add_interferences(spill_ctx& ctx, std::vector& is_assigned, std::vector& slots, + std::vector& slots_used, unsigned id) { for (unsigned other : ctx.interferences[id].second) { if (!is_assigned[other]) @@ -1308,8 +1392,9 @@ void add_interferences(spill_ctx& ctx, std::vector& is_assigned, } } -unsigned find_available_slot(std::vector& used, unsigned wave_size, - unsigned size, bool is_sgpr, unsigned *num_slots) +unsigned +find_available_slot(std::vector& used, unsigned wave_size, unsigned size, bool is_sgpr, + unsigned* num_slots) { unsigned wave_size_minus_one = wave_size - 1; unsigned slot = 0; @@ -1341,10 +1426,9 @@ unsigned find_available_slot(std::vector& used, unsigned wave_size, } } -void assign_spill_slots_helper(spill_ctx& ctx, RegType type, - std::vector& is_assigned, - std::vector& slots, - unsigned *num_slots) +void +assign_spill_slots_helper(spill_ctx& ctx, RegType type, std::vector& is_assigned, + std::vector& slots, unsigned* num_slots) { std::vector slots_used(*num_slots); @@ -1360,9 +1444,9 @@ void assign_spill_slots_helper(spill_ctx& ctx, RegType type, add_interferences(ctx, is_assigned, slots, slots_used, id); } - unsigned slot = find_available_slot(slots_used, ctx.wave_size, - ctx.interferences[vec[0]].first.size(), - type == RegType::sgpr, num_slots); + unsigned slot = + find_available_slot(slots_used, ctx.wave_size, ctx.interferences[vec[0]].first.size(), + type == RegType::sgpr, num_slots); for (unsigned id : vec) { assert(!is_assigned[id]); @@ -1381,9 +1465,9 @@ void assign_spill_slots_helper(spill_ctx& ctx, RegType type, add_interferences(ctx, is_assigned, slots, slots_used, id); - unsigned slot = find_available_slot(slots_used, ctx.wave_size, - ctx.interferences[id].first.size(), - type == RegType::sgpr, num_slots); + unsigned slot = + find_available_slot(slots_used, ctx.wave_size, ctx.interferences[id].first.size(), + type == RegType::sgpr, num_slots); slots[id] = slot; is_assigned[id] = true; @@ -1392,7 +1476,9 @@ void assign_spill_slots_helper(spill_ctx& ctx, RegType type, *num_slots = slots_used.size(); } -void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { +void +assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) +{ std::vector slots(ctx.interferences.size()); std::vector is_assigned(ctx.interferences.size()); @@ -1426,7 +1512,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { if (!is_assigned[vec[i]]) continue; 
assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]); - assert(ctx.interferences[vec[i]].first.type() == ctx.interferences[vec[j]].first.type()); + assert(ctx.interferences[vec[i]].first.type() == + ctx.interferences[vec[j]].first.type()); assert(slots[vec[i]] == slots[vec[j]]); } } @@ -1451,7 +1538,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } if (end_vgprs > 0) { - aco_ptr destr{create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)}; + aco_ptr destr{create_instruction( + aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)}; int k = 0; for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { if (reload_in_loop[i]) @@ -1505,17 +1593,25 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { /* spill vgpr */ ctx.program->config->spilled_vgprs += (*it)->operands[0].size(); uint32_t spill_slot = slots[spill_id]; - bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096; - unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; + bool add_offset_to_sgpr = + ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + + vgpr_spill_slots * 4 > + 4096; + unsigned base_offset = + add_offset_to_sgpr + ? 0 + : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; /* check if the scratch resource descriptor already exists */ if (scratch_rsrc == Temp()) { - unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; - scratch_rsrc = load_scratch_resource(ctx, scratch_offset, - last_top_level_block_idx == block.index ? - instructions : ctx.program->blocks[last_top_level_block_idx].instructions, - offset, - last_top_level_block_idx == block.index); + unsigned offset = + add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; + scratch_rsrc = load_scratch_resource( + ctx, scratch_offset, + last_top_level_block_idx == block.index + ? 
instructions + : ctx.program->blocks[last_top_level_block_idx].instructions, + offset, last_top_level_block_idx == block.index); } unsigned offset = base_offset + spill_slot * 4; @@ -1524,17 +1620,21 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { Temp temp = (*it)->operands[0].getTemp(); assert(temp.type() == RegType::vgpr && !temp.is_linear()); if (temp.size() > 1) { - Instruction* split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())}; + Instruction* split{create_instruction( + aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())}; split->operands[0] = Operand(temp); for (unsigned i = 0; i < temp.size(); i++) split->definitions[i] = bld.def(v1); bld.insert(split); for (unsigned i = 0; i < temp.size(); i++) { - Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true); + Instruction* instr = + bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, + split->definitions[i].getTemp(), offset + i * 4, false, true); instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { - Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true); + Instruction* instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, + temp, offset, false, true); instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { @@ -1546,7 +1646,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) { Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear()); vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr; - aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + aco_ptr create{create_instruction( + aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(linear_vgpr); /* find the right place to insert this definition */ if (last_top_level_block_idx == block.index) { @@ -1555,13 +1656,15 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } else { assert(last_top_level_block_idx < block.index); /* insert before the branch at last top level block */ - std::vector>& block_instrs = ctx.program->blocks[last_top_level_block_idx].instructions; + std::vector>& block_instrs = + ctx.program->blocks[last_top_level_block_idx].instructions; block_instrs.insert(std::prev(block_instrs.end()), std::move(create)); } } /* spill sgpr: just add the vgpr temp to operands */ - Pseudo_instruction* spill = create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0); + Pseudo_instruction* spill = + create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0); spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); spill->operands[1] = Operand(spill_slot % ctx.wave_size); spill->operands[2] = (*it)->operands[0]; @@ -1577,34 +1680,46 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) { /* reload vgpr */ uint32_t spill_slot = slots[spill_id]; - bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096; - unsigned base_offset = add_offset_to_sgpr ? 
0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; + bool add_offset_to_sgpr = + ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + + vgpr_spill_slots * 4 > + 4096; + unsigned base_offset = + add_offset_to_sgpr + ? 0 + : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; /* check if the scratch resource descriptor already exists */ if (scratch_rsrc == Temp()) { - unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; - scratch_rsrc = load_scratch_resource(ctx, scratch_offset, - last_top_level_block_idx == block.index ? - instructions : ctx.program->blocks[last_top_level_block_idx].instructions, - offset, - last_top_level_block_idx == block.index); + unsigned offset = + add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; + scratch_rsrc = load_scratch_resource( + ctx, scratch_offset, + last_top_level_block_idx == block.index + ? instructions + : ctx.program->blocks[last_top_level_block_idx].instructions, + offset, last_top_level_block_idx == block.index); } unsigned offset = base_offset + spill_slot * 4; aco_opcode opcode = aco_opcode::buffer_load_dword; Definition def = (*it)->definitions[0]; if (def.size() > 1) { - Instruction* vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)}; + Instruction* vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)}; vec->definitions[0] = def; for (unsigned i = 0; i < def.size(); i++) { Temp tmp = bld.tmp(v1); vec->operands[i] = Operand(tmp); - Instruction *instr = bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true); + Instruction* instr = + bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), + scratch_offset, offset + i * 4, false, true); instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); } bld.insert(vec); } else { - Instruction *instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true); + Instruction* instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), + scratch_offset, offset, false, true); instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { @@ -1615,7 +1730,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) { Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear()); vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr; - aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + aco_ptr create{create_instruction( + aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(linear_vgpr); /* find the right place to insert this definition */ if (last_top_level_block_idx == block.index) { @@ -1624,13 +1740,15 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } else { assert(last_top_level_block_idx < block.index); /* insert before the branch at last top level block */ - std::vector>& block_instrs = ctx.program->blocks[last_top_level_block_idx].instructions; + std::vector>& block_instrs = + ctx.program->blocks[last_top_level_block_idx].instructions; block_instrs.insert(std::prev(block_instrs.end()), std::move(create)); } } /* reload sgpr: just add the vgpr temp to operands */ - Pseudo_instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1); + Pseudo_instruction* reload = create_instruction( + 
aco_opcode::p_reload, Format::PSEUDO, 2, 1); reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); reload->operands[1] = Operand(spill_slot % ctx.wave_size); reload->definitions[0] = (*it)->definitions[0]; @@ -1639,13 +1757,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) { instructions.emplace_back(std::move(*it)); } - } block.instructions = std::move(instructions); } /* update required scratch memory */ - ctx.program->config->scratch_bytes_per_wave += align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024); + ctx.program->config->scratch_bytes_per_wave += + align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024); /* SSA elimination inserts copies for logical phis right before p_logical_end * So if a linear vgpr is used between that p_logical_end and the branch, @@ -1686,7 +1804,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { if (!vgprs.size()) continue; - aco_ptr destr{create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)}; + aco_ptr destr{create_instruction( + aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)}; int k = 0; for (Temp tmp : vgprs) { destr->operands[k++] = Operand(tmp); @@ -1701,8 +1820,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } /* end namespace */ - -void spill(Program* program, live& live_vars) +void +spill(Program* program, live& live_vars) { program->config->spilled_vgprs = 0; program->config->spilled_sgprs = 0; @@ -1758,5 +1877,4 @@ void spill(Program* program, live& live_vars) assert(program->num_waves > 0); } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp index dcb8b162b76..266af1e4893 100644 --- a/src/amd/compiler/aco_ssa_elimination.cpp +++ b/src/amd/compiler/aco_ssa_elimination.cpp @@ -37,7 +37,8 @@ struct phi_info_item { }; struct ssa_elimination_ctx { - /* The outer vectors should be indexed by block index. The inner vectors store phi information for each block. */ + /* The outer vectors should be indexed by block index. The inner vectors store phi information + * for each block. */ std::vector> logical_phi_info; std::vector> linear_phi_info; std::vector empty_blocks; @@ -45,14 +46,14 @@ struct ssa_elimination_ctx { Program* program; ssa_elimination_ctx(Program* program_) - : logical_phi_info(program_->blocks.size()) - , linear_phi_info(program_->blocks.size()) - , empty_blocks(program_->blocks.size(), true) - , blocks_incoming_exec_used(program_->blocks.size(), true) - , program(program_) {} + : logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()), + empty_blocks(program_->blocks.size(), true), + blocks_incoming_exec_used(program_->blocks.size(), true), program(program_) + {} }; -void collect_phi_info(ssa_elimination_ctx& ctx) +void +collect_phi_info(ssa_elimination_ctx& ctx) { for (Block& block : ctx.program->blocks) { for (aco_ptr& phi : block.instructions) { @@ -67,9 +68,11 @@ void collect_phi_info(ssa_elimination_ctx& ctx) assert(phi->definitions[0].size() == phi->operands[i].size()); - std::vector& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; uint32_t pred_idx = preds[i]; - auto& info_vec = phi->opcode == aco_opcode::p_phi ? 
ctx.logical_phi_info[pred_idx] : ctx.linear_phi_info[pred_idx]; + auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx] + : ctx.linear_phi_info[pred_idx]; info_vec.push_back({phi->definitions[0], phi->operands[i]}); ctx.empty_blocks[pred_idx] = false; } @@ -77,11 +80,12 @@ void collect_phi_info(ssa_elimination_ctx& ctx) } } -void insert_parallelcopies(ssa_elimination_ctx& ctx) +void +insert_parallelcopies(ssa_elimination_ctx& ctx) { /* insert the parallelcopies from logical phis before p_logical_end */ for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) { - auto &logical_phi_info = ctx.logical_phi_info[block_idx]; + auto& logical_phi_info = ctx.logical_phi_info[block_idx]; if (logical_phi_info.empty()) continue; @@ -93,10 +97,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx) } std::vector>::iterator it = std::next(block.instructions.begin(), idx); - aco_ptr pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, logical_phi_info.size(), logical_phi_info.size())}; + aco_ptr pc{ + create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, + logical_phi_info.size(), logical_phi_info.size())}; unsigned i = 0; - for (auto& phi_info : logical_phi_info) - { + for (auto& phi_info : logical_phi_info) { pc->definitions[i] = phi_info.def; pc->operands[i] = phi_info.op; i++; @@ -108,7 +113,7 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx) /* insert parallelcopies for the linear phis at the end of blocks just before the branch */ for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) { - auto &linear_phi_info = ctx.linear_phi_info[block_idx]; + auto& linear_phi_info = ctx.linear_phi_info[block_idx]; if (linear_phi_info.empty()) continue; @@ -116,10 +121,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx) std::vector>::iterator it = block.instructions.end(); --it; assert((*it)->isBranch()); - aco_ptr pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, linear_phi_info.size(), linear_phi_info.size())}; + aco_ptr pc{ + create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, + linear_phi_info.size(), linear_phi_info.size())}; unsigned i = 0; - for (auto& phi_info : linear_phi_info) - { + for (auto& phi_info : linear_phi_info) { pc->definitions[i] = phi_info.def; pc->operands[i] = phi_info.op; i++; @@ -130,38 +136,38 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx) } } -bool is_empty_block(Block* block, bool ignore_exec_writes) +bool +is_empty_block(Block* block, bool ignore_exec_writes) { /* check if this block is empty and the exec mask is not needed */ for (aco_ptr& instr : block->instructions) { switch (instr->opcode) { - case aco_opcode::p_linear_phi: - case aco_opcode::p_phi: - case aco_opcode::p_logical_start: - case aco_opcode::p_logical_end: - case aco_opcode::p_branch: + case aco_opcode::p_linear_phi: + case aco_opcode::p_phi: + case aco_opcode::p_logical_start: + case aco_opcode::p_logical_end: + case aco_opcode::p_branch: break; + case aco_opcode::p_parallelcopy: + for (unsigned i = 0; i < instr->definitions.size(); i++) { + if (ignore_exec_writes && instr->definitions[i].physReg() == exec) + continue; + if (instr->definitions[i].physReg() != instr->operands[i].physReg()) + return false; + } + break; + case aco_opcode::s_andn2_b64: + case aco_opcode::s_andn2_b32: + if (ignore_exec_writes && instr->definitions[0].physReg() == exec) break; - case aco_opcode::p_parallelcopy: - for (unsigned i = 0; i < instr->definitions.size(); i++) { - if 
(ignore_exec_writes && instr->definitions[i].physReg() == exec) - continue; - if (instr->definitions[i].physReg() != instr->operands[i].physReg()) - return false; - } - break; - case aco_opcode::s_andn2_b64: - case aco_opcode::s_andn2_b32: - if (ignore_exec_writes && instr->definitions[0].physReg() == exec) - break; - return false; - default: - return false; + return false; + default: return false; } } return true; } -void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) +void +try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) { /* check if the successor is another merge block which restores exec */ // TODO: divergent loops also restore exec @@ -179,7 +185,8 @@ void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) block->instructions.emplace_back(std::move(branch)); } -void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) +void +try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) { assert(block->linear_succs.size() == 2); /* only remove this block if the successor got removed as well */ @@ -193,7 +200,7 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) unsigned succ_idx = block->linear_succs[0]; assert(block->linear_preds.size() == 2); for (unsigned i = 0; i < 2; i++) { - Block *pred = &ctx.program->blocks[block->linear_preds[i]]; + Block* pred = &ctx.program->blocks[block->linear_preds[i]]; pred->linear_succs[0] = succ_idx; ctx.program->blocks[succ_idx].linear_preds[i] = pred->index; @@ -208,7 +215,8 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) block->linear_succs.clear(); } -void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) +void +try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) { if (!is_empty_block(block, false)) return; @@ -277,7 +285,8 @@ void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) block->linear_succs.clear(); } -bool instr_writes_exec(Instruction* instr) +bool +instr_writes_exec(Instruction* instr) { for (Definition& def : instr->definitions) if (def.physReg() == exec || def.physReg() == exec_hi) @@ -286,7 +295,8 @@ bool instr_writes_exec(Instruction* instr) return false; } -void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block) +void +eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block) { /* Check if any successor needs the outgoing exec mask from the current block. */ @@ -309,8 +319,9 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo exec_write_used = false; else /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */ - exec_write_used = std::any_of(block.linear_succs.begin(), block.linear_succs.end(), - [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; }); + exec_write_used = + std::any_of(block.linear_succs.begin(), block.linear_succs.end(), + [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; }); } /* Go through all instructions and eliminate useless exec writes. */ @@ -318,7 +329,8 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo for (int i = block.instructions.size() - 1; i >= 0; --i) { aco_ptr& instr = block.instructions[i]; - /* We already take information from phis into account before the loop, so let's just break on phis. */ + /* We already take information from phis into account before the loop, so let's just break on + * phis. 
*/ if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi) break; @@ -341,16 +353,15 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo } /* Remember if the current block needs an incoming exec mask from its predecessors. */ - ctx.blocks_incoming_exec_used[block.index] = exec_write_used; /* Cleanup: remove deleted instructions from the vector. */ - auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr); block.instructions.resize(new_end - block.instructions.begin()); } -void jump_threading(ssa_elimination_ctx& ctx) +void +jump_threading(ssa_elimination_ctx& ctx) { for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) { Block* block = &ctx.program->blocks[i]; @@ -367,8 +378,7 @@ void jump_threading(ssa_elimination_ctx& ctx) if (block->linear_succs.size() > 1) continue; - if (block->kind & block_kind_merge || - block->kind & block_kind_loop_exit) + if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit) try_remove_merge_block(ctx, block); if (block->linear_preds.size() == 1) @@ -378,8 +388,8 @@ void jump_threading(ssa_elimination_ctx& ctx) } /* end namespace */ - -void ssa_elimination(Program* program) +void +ssa_elimination(Program* program) { ssa_elimination_ctx ctx(program); @@ -391,6 +401,5 @@ void ssa_elimination(Program* program) /* insert parallelcopies from SSA elimination */ insert_parallelcopies(ctx); - -} } +} // namespace aco diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp index a8652de8f56..ce114e3f879 100644 --- a/src/amd/compiler/aco_statistics.cpp +++ b/src/amd/compiler/aco_statistics.cpp @@ -23,6 +23,7 @@ */ #include "aco_ir.h" + #include "util/crc32.h" #include @@ -33,7 +34,8 @@ namespace aco { /* sgpr_presched/vgpr_presched */ -void collect_presched_stats(Program *program) +void +collect_presched_stats(Program* program) { RegisterDemand presched_demand; for (Block& block : program->blocks) @@ -56,9 +58,9 @@ public: resource_count, }; - BlockCycleEstimator(Program *program_) : program(program_) {} + BlockCycleEstimator(Program* program_) : program(program_) {} - Program *program; + Program* program; int32_t cur_cycle = 0; int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0}; @@ -72,6 +74,7 @@ public: unsigned predict_cost(aco_ptr& instr); void add(aco_ptr& instr); void join(const BlockCycleEstimator& other); + private: unsigned get_waitcnt_cost(wait_imm imm); unsigned get_dependency_cost(aco_ptr& instr); @@ -81,8 +84,9 @@ private: }; struct wait_counter_info { - wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) : - vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} + wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) + : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) + {} unsigned vm; unsigned exp; @@ -100,107 +104,83 @@ struct perf_info { unsigned cost1; }; -static perf_info get_perf_info(Program *program, aco_ptr& instr) +static perf_info +get_perf_info(Program* program, aco_ptr& instr) { instr_class cls = instr_info.classes[(int)instr->opcode]; - #define WAIT(res) BlockCycleEstimator::res, 0 - #define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt +#define WAIT(res) BlockCycleEstimator::res, 0 +#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt if (program->chip_class >= GFX10) { /* fp64 might be incorrect */ switch (cls) { case instr_class::valu32: case instr_class::valu_convert32: - case instr_class::valu_fma: - return {5, WAIT_USE(valu, 1)}; - case 
instr_class::valu64: - return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)}; + case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)}; + case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)}; case instr_class::valu_quarter_rate32: return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)}; case instr_class::valu_transcendental32: return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)}; - case instr_class::valu_double: - return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; + case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; case instr_class::valu_double_add: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; case instr_class::valu_double_convert: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; case instr_class::valu_double_transcendental: return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; - case instr_class::salu: - return {2, WAIT_USE(scalar, 1)}; - case instr_class::smem: - return {0, WAIT_USE(scalar, 1)}; + case instr_class::salu: return {2, WAIT_USE(scalar, 1)}; + case instr_class::smem: return {0, WAIT_USE(scalar, 1)}; case instr_class::branch: - case instr_class::sendmsg: - return {0, WAIT_USE(branch_sendmsg, 1)}; + case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)}; case instr_class::ds: - return instr->ds().gds ? - perf_info{0, WAIT_USE(export_gds, 1)} : - perf_info{0, WAIT_USE(lds, 1)}; - case instr_class::exp: - return {0, WAIT_USE(export_gds, 1)}; - case instr_class::vmem: - return {0, WAIT_USE(vmem, 1)}; + return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)} + : perf_info{0, WAIT_USE(lds, 1)}; + case instr_class::exp: return {0, WAIT_USE(export_gds, 1)}; + case instr_class::vmem: return {0, WAIT_USE(vmem, 1)}; case instr_class::barrier: case instr_class::waitcnt: case instr_class::other: - default: - return {0}; + default: return {0}; } } else { switch (cls) { - case instr_class::valu32: - return {4, WAIT_USE(valu, 4)}; - case instr_class::valu_convert32: - return {16, WAIT_USE(valu, 16)}; - case instr_class::valu64: - return {8, WAIT_USE(valu, 8)}; - case instr_class::valu_quarter_rate32: - return {16, WAIT_USE(valu, 16)}; + case instr_class::valu32: return {4, WAIT_USE(valu, 4)}; + case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)}; + case instr_class::valu64: return {8, WAIT_USE(valu, 8)}; + case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)}; case instr_class::valu_fma: - return program->dev.has_fast_fma32 ? - perf_info{4, WAIT_USE(valu, 4)} : - perf_info{16, WAIT_USE(valu, 16)}; - case instr_class::valu_transcendental32: - return {16, WAIT_USE(valu, 16)}; - case instr_class::valu_double: - return {64, WAIT_USE(valu, 64)}; - case instr_class::valu_double_add: - return {32, WAIT_USE(valu, 32)}; - case instr_class::valu_double_convert: - return {16, WAIT_USE(valu, 16)}; - case instr_class::valu_double_transcendental: - return {64, WAIT_USE(valu, 64)}; - case instr_class::salu: - return {4, WAIT_USE(scalar, 4)}; - case instr_class::smem: - return {4, WAIT_USE(scalar, 4)}; + return program->dev.has_fast_fma32 ? 
perf_info{4, WAIT_USE(valu, 4)} + : perf_info{16, WAIT_USE(valu, 16)}; + case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)}; + case instr_class::valu_double: return {64, WAIT_USE(valu, 64)}; + case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)}; + case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)}; + case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)}; + case instr_class::salu: return {4, WAIT_USE(scalar, 4)}; + case instr_class::smem: return {4, WAIT_USE(scalar, 4)}; case instr_class::branch: return {8, WAIT_USE(branch_sendmsg, 8)}; return {4, WAIT_USE(branch_sendmsg, 4)}; case instr_class::ds: - return instr->ds().gds ? - perf_info{4, WAIT_USE(export_gds, 4)} : - perf_info{4, WAIT_USE(lds, 4)}; - case instr_class::exp: - return {16, WAIT_USE(export_gds, 16)}; - case instr_class::vmem: - return {4, WAIT_USE(vmem, 4)}; + return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)} + : perf_info{4, WAIT_USE(lds, 4)}; + case instr_class::exp: return {16, WAIT_USE(export_gds, 16)}; + case instr_class::vmem: return {4, WAIT_USE(vmem, 4)}; case instr_class::barrier: case instr_class::waitcnt: case instr_class::other: - default: - return {4}; + default: return {4}; } } - #undef WAIT_USE - #undef WAIT +#undef WAIT_USE +#undef WAIT } -void BlockCycleEstimator::use_resources(aco_ptr& instr) +void +BlockCycleEstimator::use_resources(aco_ptr& instr) { perf_info perf = get_perf_info(program, instr); @@ -215,7 +195,8 @@ void BlockCycleEstimator::use_resources(aco_ptr& instr) } } -int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr& instr) +int32_t +BlockCycleEstimator::cycles_until_res_available(aco_ptr& instr) { perf_info perf = get_perf_info(program, instr); @@ -228,7 +209,8 @@ int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr& in return cost; } -static wait_counter_info get_wait_counter_info(aco_ptr& instr) +static wait_counter_info +get_wait_counter_info(aco_ptr& instr) { /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance * depends a lot on the situation. */ @@ -252,8 +234,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr& instr) bool likely_desc_load = instr->operands[0].size() == 2; bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 
3 : 4); - bool const_offset = instr->operands[1].isConstant() && - (!soe || instr->operands.back().isConstant()); + bool const_offset = + instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant()); if (likely_desc_load || const_offset) return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */ @@ -273,7 +255,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr& instr) return wait_counter_info(0, 0, 0, 0); } -static wait_imm get_wait_imm(Program *program, aco_ptr& instr) +static wait_imm +get_wait_imm(Program* program, aco_ptr& instr) { if (instr->opcode == aco_opcode::s_endpgm) { return wait_imm(0, 0, 0, 0); @@ -297,7 +280,8 @@ static wait_imm get_wait_imm(Program *program, aco_ptr& instr) } } -unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr& instr) +unsigned +BlockCycleEstimator::get_dependency_cost(aco_ptr& instr) { int deps_available = cur_cycle; @@ -337,13 +321,15 @@ unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr& instr) return deps_available - cur_cycle; } -unsigned BlockCycleEstimator::predict_cost(aco_ptr& instr) +unsigned +BlockCycleEstimator::predict_cost(aco_ptr& instr) { int32_t dep = get_dependency_cost(instr); return dep + std::max(cycles_until_res_available(instr) - dep, 0); } -static bool is_vector(aco_opcode op) +static bool +is_vector(aco_opcode op) { switch (instr_info.classes[(int)op]) { case instr_class::valu32: @@ -358,14 +344,13 @@ static bool is_vector(aco_opcode op) case instr_class::exp: case instr_class::valu64: case instr_class::valu_quarter_rate32: - case instr_class::valu_transcendental32: - return true; - default: - return false; + case instr_class::valu_transcendental32: return true; + default: return false; } } -void BlockCycleEstimator::add(aco_ptr& instr) +void +BlockCycleEstimator::add(aco_ptr& instr) { perf_info perf = get_perf_info(program, instr); @@ -411,13 +396,14 @@ void BlockCycleEstimator::add(aco_ptr& instr) int32_t result_available = start + MAX2(perf.latency, latency); for (Definition& def : instr->definitions) { - int32_t *available = ®_available[def.physReg().reg()]; + int32_t* available = ®_available[def.physReg().reg()]; for (unsigned i = 0; i < def.size(); i++) available[i] = MAX2(available[i], result_available); } } -static void join_queue(std::deque& queue, const std::deque& pred, int cycle_diff) +static void +join_queue(std::deque& queue, const std::deque& pred, int cycle_diff) { for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++) queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff); @@ -425,7 +411,8 @@ static void join_queue(std::deque& queue, const std::deque& pr queue.push_front(pred[i] + cycle_diff); } -void BlockCycleEstimator::join(const BlockCycleEstimator& pred) +void +BlockCycleEstimator::join(const BlockCycleEstimator& pred) { assert(cur_cycle == 0); @@ -435,8 +422,7 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred) } for (unsigned i = 0; i < 512; i++) - reg_available[i] = MAX2(reg_available[i], - pred.reg_available[i] - pred.cur_cycle + cur_cycle); + reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle); join_queue(lgkm, pred.lgkm, -pred.cur_cycle); join_queue(exp, pred.exp, -pred.cur_cycle); @@ -445,11 +431,12 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred) } /* instructions/branches/vmem_clauses/smem_clauses/cycles */ -void collect_preasm_stats(Program *program) +void +collect_preasm_stats(Program* program) { for (Block& block : program->blocks) { - std::set 
vmem_clause; - std::set smem_clause; + std::set vmem_clause; + std::set smem_clause; program->statistics[statistic_instructions] += block.instructions.size(); @@ -462,7 +449,8 @@ void collect_preasm_stats(Program *program) if (instr->isVMEM() && !instr->operands.empty()) { if (std::none_of(vmem_clause.begin(), vmem_clause.end(), - [&](Instruction *other) {return should_form_clause(instr.get(), other);})) + [&](Instruction* other) + { return should_form_clause(instr.get(), other); })) program->statistics[statistic_vmem_clauses]++; vmem_clause.insert(instr.get()); } else { @@ -471,12 +459,13 @@ void collect_preasm_stats(Program *program) if (instr->isSMEM() && !instr->operands.empty()) { if (std::none_of(smem_clause.begin(), smem_clause.end(), - [&](Instruction *other) {return should_form_clause(instr.get(), other);})) + [&](Instruction* other) + { return should_form_clause(instr.get(), other); })) program->statistics[statistic_smem_clauses]++; smem_clause.insert(instr.get()); } else { smem_clause.clear(); - } + } } } @@ -514,8 +503,10 @@ void collect_preasm_stats(Program *program) iter *= pow(0.5, block.uniform_if_depth); iter *= pow(0.75, block.divergent_if_logical_depth); - bool divergent_if_linear_else = block.logical_preds.empty() && block.linear_preds.size() == 1 && block.linear_succs.size() == 1 && - program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert); + bool divergent_if_linear_else = + block.logical_preds.empty() && block.linear_preds.size() == 1 && + block.linear_succs.size() == 1 && + program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert); if (divergent_if_linear_else) iter *= 0.25; @@ -540,7 +531,8 @@ void collect_preasm_stats(Program *program) double max_utilization = 1.0; if (program->workgroup_size != UINT_MAX) - max_utilization = program->workgroup_size / (double)align(program->workgroup_size, program->wave_size); + max_utilization = + program->workgroup_size / (double)align(program->workgroup_size, program->wave_size); wave64_per_cycle *= max_utilization; program->statistics[statistic_latency] = round(latency); @@ -551,7 +543,8 @@ void collect_preasm_stats(Program *program) fprintf(stderr, "num_waves: %u\n", program->num_waves); fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]); - fprintf(stderr, "branch_sendmsg_usage: %f\n", usage[(int)BlockCycleEstimator::branch_sendmsg]); + fprintf(stderr, "branch_sendmsg_usage: %f\n", + usage[(int)BlockCycleEstimator::branch_sendmsg]); fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]); fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]); fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]); @@ -565,9 +558,10 @@ void collect_preasm_stats(Program *program) } } -void collect_postasm_stats(Program *program, const std::vector& code) +void +collect_postasm_stats(Program* program, const std::vector& code) { program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h index 9d24fb936db..88f52182a5f 100644 --- a/src/amd/compiler/aco_util.h +++ b/src/amd/compiler/aco_util.h @@ -35,207 +35,198 @@ namespace aco { /*! \brief Definition of a span object -* -* \details A "span" is an "array view" type for holding a view of contiguous -* data. The "span" object does not own the data itself. 
-*/ -template -class span { + * + * \details A "span" is an "array view" type for holding a view of contiguous + * data. The "span" object does not own the data itself. + */ +template class span { public: - using value_type = T; - using pointer = value_type*; - using const_pointer = const value_type*; - using reference = value_type&; - using const_reference = const value_type&; - using iterator = pointer; - using const_iterator = const_pointer; - using reverse_iterator = std::reverse_iterator; + using value_type = T; + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; - using size_type = uint16_t; - using difference_type = ptrdiff_t; + using size_type = uint16_t; + using difference_type = ptrdiff_t; /*! \brief Compiler generated default constructor - */ + */ constexpr span() = default; /*! \brief Constructor taking a pointer and the length of the span - * \param[in] data Pointer to the underlying data array - * \param[in] length The size of the span - */ - constexpr span(uint16_t offset_, const size_type length_) - : offset{ offset_ } , length{ length_ } {} + * \param[in] data Pointer to the underlying data array + * \param[in] length The size of the span + */ + constexpr span(uint16_t offset_, const size_type length_) : offset{offset_}, length{length_} {} /*! \brief Returns an iterator to the begin of the span - * \return data - */ - constexpr iterator begin() noexcept { - return (pointer)((uintptr_t)this + offset); - } + * \return data + */ + constexpr iterator begin() noexcept { return (pointer)((uintptr_t)this + offset); } /*! \brief Returns a const_iterator to the begin of the span - * \return data - */ - constexpr const_iterator begin() const noexcept { + * \return data + */ + constexpr const_iterator begin() const noexcept + { return (const_pointer)((uintptr_t)this + offset); } /*! \brief Returns an iterator to the end of the span - * \return data + length - */ - constexpr iterator end() noexcept { - return std::next(begin(), length); - } + * \return data + length + */ + constexpr iterator end() noexcept { return std::next(begin(), length); } /*! \brief Returns a const_iterator to the end of the span - * \return data + length - */ - constexpr const_iterator end() const noexcept { - return std::next(begin(), length); - } + * \return data + length + */ + constexpr const_iterator end() const noexcept { return std::next(begin(), length); } /*! \brief Returns a const_iterator to the begin of the span - * \return data - */ - constexpr const_iterator cbegin() const noexcept { - return begin(); - } + * \return data + */ + constexpr const_iterator cbegin() const noexcept { return begin(); } /*! \brief Returns a const_iterator to the end of the span - * \return data + length - */ - constexpr const_iterator cend() const noexcept { - return std::next(begin(), length); - } + * \return data + length + */ + constexpr const_iterator cend() const noexcept { return std::next(begin(), length); } /*! \brief Returns a reverse_iterator to the end of the span - * \return reverse_iterator(end()) - */ - constexpr reverse_iterator rbegin() noexcept { - return reverse_iterator(end()); - } + * \return reverse_iterator(end()) + */ + constexpr reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } /*! 
\brief Returns a const_reverse_iterator to the end of the span - * \return reverse_iterator(end()) - */ - constexpr const_reverse_iterator rbegin() const noexcept { + * \return reverse_iterator(end()) + */ + constexpr const_reverse_iterator rbegin() const noexcept + { return const_reverse_iterator(end()); } /*! \brief Returns a reverse_iterator to the begin of the span - * \return reverse_iterator(begin()) - */ - constexpr reverse_iterator rend() noexcept { - return reverse_iterator(begin()); - } + * \return reverse_iterator(begin()) + */ + constexpr reverse_iterator rend() noexcept { return reverse_iterator(begin()); } /*! \brief Returns a const_reverse_iterator to the begin of the span - * \return reverse_iterator(begin()) - */ - constexpr const_reverse_iterator rend() const noexcept { + * \return reverse_iterator(begin()) + */ + constexpr const_reverse_iterator rend() const noexcept + { return const_reverse_iterator(begin()); } /*! \brief Returns a const_reverse_iterator to the end of the span - * \return rbegin() - */ - constexpr const_reverse_iterator crbegin() const noexcept { + * \return rbegin() + */ + constexpr const_reverse_iterator crbegin() const noexcept + { return const_reverse_iterator(cend()); } /*! \brief Returns a const_reverse_iterator to the begin of the span - * \return rend() - */ - constexpr const_reverse_iterator crend() const noexcept { + * \return rend() + */ + constexpr const_reverse_iterator crend() const noexcept + { return const_reverse_iterator(cbegin()); } /*! \brief Unchecked access operator - * \param[in] index Index of the element we want to access - * \return *(std::next(data, index)) - */ - constexpr reference operator[](const size_type index) noexcept { + * \param[in] index Index of the element we want to access + * \return *(std::next(data, index)) + */ + constexpr reference operator[](const size_type index) noexcept + { assert(length > index); return *(std::next(begin(), index)); } /*! \brief Unchecked const access operator - * \param[in] index Index of the element we want to access - * \return *(std::next(data, index)) - */ - constexpr const_reference operator[](const size_type index) const noexcept { + * \param[in] index Index of the element we want to access + * \return *(std::next(data, index)) + */ + constexpr const_reference operator[](const size_type index) const noexcept + { assert(length > index); return *(std::next(begin(), index)); } /*! \brief Returns a reference to the last element of the span - * \return *(std::next(data, length - 1)) - */ - constexpr reference back() noexcept { + * \return *(std::next(data, length - 1)) + */ + constexpr reference back() noexcept + { assert(length > 0); return *(std::next(begin(), length - 1)); } /*! \brief Returns a const_reference to the last element of the span - * \return *(std::next(data, length - 1)) - */ - constexpr const_reference back() const noexcept { + * \return *(std::next(data, length - 1)) + */ + constexpr const_reference back() const noexcept + { assert(length > 0); return *(std::next(begin(), length - 1)); } /*! \brief Returns a reference to the first element of the span - * \return *begin() - */ - constexpr reference front() noexcept { + * \return *begin() + */ + constexpr reference front() noexcept + { assert(length > 0); return *begin(); } /*! 
\brief Returns a const_reference to the first element of the span - * \return *cbegin() - */ - constexpr const_reference front() const noexcept { + * \return *cbegin() + */ + constexpr const_reference front() const noexcept + { assert(length > 0); return *cbegin(); } /*! \brief Returns true if the span is empty - * \return length == 0 - */ - constexpr bool empty() const noexcept { - return length == 0; - } + * \return length == 0 + */ + constexpr bool empty() const noexcept { return length == 0; } /*! \brief Returns the size of the span - * \return length == 0 - */ - constexpr size_type size() const noexcept { - return length; - } + * \return length == 0 + */ + constexpr size_type size() const noexcept { return length; } /*! \brief Decreases the size of the span by 1 - */ - constexpr void pop_back() noexcept { + */ + constexpr void pop_back() noexcept + { assert(length > 0); --length; } /*! \brief Adds an element to the end of the span - */ - constexpr void push_back(const_reference val) noexcept { - *std::next(begin(), length++) = val; - } + */ + constexpr void push_back(const_reference val) noexcept { *std::next(begin(), length++) = val; } /*! \brief Clears the span - */ - constexpr void clear() noexcept { + */ + constexpr void clear() noexcept + { offset = 0; length = 0; } private: - uint16_t offset{ 0 }; //!> Byte offset from span to data - size_type length{ 0 }; //!> Size of the span + uint16_t offset{0}; //!> Byte offset from span to data + size_type length{0}; //!> Size of the span }; /* @@ -250,30 +241,32 @@ private: */ struct IDSet { struct Iterator { - const IDSet *set; + const IDSet* set; union { struct { - uint32_t bit:6; - uint32_t word:26; + uint32_t bit : 6; + uint32_t word : 26; }; uint32_t id; }; - Iterator& operator ++(); + Iterator& operator++(); - bool operator != (const Iterator& other) const; + bool operator!=(const Iterator& other) const; - uint32_t operator * () const; + uint32_t operator*() const; }; - size_t count(uint32_t id) const { + size_t count(uint32_t id) const + { if (id >= words.size() * 64) return 0; return words[id / 64u] & (1ull << (id % 64u)) ? 
1 : 0; } - Iterator find(uint32_t id) const { + Iterator find(uint32_t id) const + { if (!count(id)) return end(); @@ -284,7 +277,8 @@ struct IDSet { return it; } - std::pair insert(uint32_t id) { + std::pair insert(uint32_t id) + { if (words.size() * 64u <= id) words.resize(id / 64u + 1); @@ -302,7 +296,8 @@ struct IDSet { return std::make_pair(it, true); } - size_t erase(uint32_t id) { + size_t erase(uint32_t id) + { if (!count(id)) return 0; @@ -311,7 +306,8 @@ struct IDSet { return 1; } - Iterator cbegin() const { + Iterator cbegin() const + { Iterator it; it.set = this; for (size_t i = 0; i < words.size(); i++) { @@ -324,7 +320,8 @@ struct IDSet { return end(); } - Iterator cend() const { + Iterator cend() const + { Iterator it; it.set = this; it.word = words.size(); @@ -332,27 +329,21 @@ struct IDSet { return it; } - Iterator begin() const { - return cbegin(); - } + Iterator begin() const { return cbegin(); } - Iterator end() const { - return cend(); - } + Iterator end() const { return cend(); } - bool empty() const { - return bits_set == 0; - } + bool empty() const { return bits_set == 0; } - size_t size() const { - return bits_set; - } + size_t size() const { return bits_set; } std::vector words; uint32_t bits_set = 0; }; -inline IDSet::Iterator& IDSet::Iterator::operator ++() { +inline IDSet::Iterator& +IDSet::Iterator::operator++() +{ uint64_t m = set->words[word]; m &= ~((2ull << bit) - 1ull); if (!m) { @@ -374,12 +365,16 @@ inline IDSet::Iterator& IDSet::Iterator::operator ++() { return *this; } -inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const { +inline bool +IDSet::Iterator::operator!=(const IDSet::Iterator& other) const +{ assert(set == other.set); return id != other.id; } -inline uint32_t IDSet::Iterator::operator * () const { +inline uint32_t +IDSet::Iterator::operator*() const +{ return (word << 6) | bit; } diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 400d58e5765..af1393ba418 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -23,6 +23,7 @@ */ #include "aco_ir.h" + #include "util/memstream.h" #include @@ -32,11 +33,11 @@ namespace aco { -static void aco_log(Program *program, enum radv_compiler_debug_level level, - const char *prefix, const char *file, unsigned line, - const char *fmt, va_list args) +static void +aco_log(Program* program, enum radv_compiler_debug_level level, const char* prefix, + const char* file, unsigned line, const char* fmt, va_list args) { - char *msg; + char* msg; if (program->debug.shorten_messages) { msg = ralloc_vasprintf(NULL, fmt, args); @@ -55,38 +56,39 @@ static void aco_log(Program *program, enum radv_compiler_debug_level level, ralloc_free(msg); } -void _aco_perfwarn(Program *program, const char *file, unsigned line, - const char *fmt, ...) +void +_aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...) { va_list args; va_start(args, fmt); - aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN, - "ACO PERFWARN:\n", file, line, fmt, args); + aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN, "ACO PERFWARN:\n", file, line, fmt, args); va_end(args); } -void _aco_err(Program *program, const char *file, unsigned line, - const char *fmt, ...) +void +_aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...) 
{ va_list args; va_start(args, fmt); - aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR, - "ACO ERROR:\n", file, line, fmt, args); + aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR, "ACO ERROR:\n", file, line, fmt, args); va_end(args); } -bool validate_ir(Program* program) +bool +validate_ir(Program* program) { bool is_valid = true; - auto check = [&program, &is_valid](bool success, const char * msg, aco::Instruction * instr) -> void { + auto check = [&program, &is_valid](bool success, const char* msg, + aco::Instruction* instr) -> void + { if (!success) { - char *out; + char* out; size_t outsize; struct u_memstream mem; u_memstream_open(&mem, &out, &outsize); - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); fprintf(memf, "%s: ", msg); aco_print_instr(instr, memf); @@ -99,7 +101,9 @@ bool validate_ir(Program* program) } }; - auto check_block = [&program, &is_valid](bool success, const char * msg, aco::Block * block) -> void { + auto check_block = [&program, &is_valid](bool success, const char* msg, + aco::Block* block) -> void + { if (!success) { aco_err(program, "%s: BB%u", msg, block->index); is_valid = false; @@ -132,32 +136,32 @@ bool validate_ir(Program* program) base_format = Format::VINTRP; } } - check(base_format == instr_info.format[(int)instr->opcode], "Wrong base format for instruction", instr.get()); + check(base_format == instr_info.format[(int)instr->opcode], + "Wrong base format for instruction", instr.get()); /* check VOP3 modifiers */ if (instr->isVOP3() && instr->format != Format::VOP3) { - check(base_format == Format::VOP2 || - base_format == Format::VOP1 || - base_format == Format::VOPC || - base_format == Format::VINTRP, + check(base_format == Format::VOP2 || base_format == Format::VOP1 || + base_format == Format::VOPC || base_format == Format::VINTRP, "Format cannot have VOP3/VOP3B applied", instr.get()); } /* check SDWA */ if (instr->isSDWA()) { - check(base_format == Format::VOP2 || - base_format == Format::VOP1 || - base_format == Format::VOPC, + check(base_format == Format::VOP2 || base_format == Format::VOP1 || + base_format == Format::VOPC, "Format cannot have SDWA applied", instr.get()); check(program->chip_class >= GFX8, "SDWA is GFX8+ only", instr.get()); SDWA_instruction& sdwa = instr->sdwa(); - check(sdwa.omod == 0 || program->chip_class >= GFX9, "SDWA omod only supported on GFX9+", instr.get()); + check(sdwa.omod == 0 || program->chip_class >= GFX9, + "SDWA omod only supported on GFX9+", instr.get()); if (base_format == Format::VOPC) { - check(sdwa.clamp == false || program->chip_class == GFX8, "SDWA VOPC clamp only supported on GFX8", instr.get()); + check(sdwa.clamp == false || program->chip_class == GFX8, + "SDWA VOPC clamp only supported on GFX8", instr.get()); check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) || - program->chip_class >= GFX9, + program->chip_class >= GFX9, "SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get()); } @@ -171,8 +175,7 @@ bool validate_ir(Program* program) } const bool sdwa_opcodes = - instr->opcode != aco_opcode::v_fmac_f32 && - instr->opcode != aco_opcode::v_fmac_f16 && + instr->opcode != aco_opcode::v_fmac_f32 && instr->opcode != aco_opcode::v_fmac_f16 && instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 && instr->opcode != aco_opcode::v_fmamk_f16 && @@ -186,67 +189,75 @@ bool validate_ir(Program* program) const bool feature_mac = program->chip_class == GFX8 && - (instr->opcode == aco_opcode::v_mac_f32 && 
- instr->opcode == aco_opcode::v_mac_f16); + (instr->opcode == aco_opcode::v_mac_f32 && instr->opcode == aco_opcode::v_mac_f16); check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get()); if (instr->definitions[0].regClass().is_subdword()) - check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()), "Unexpected SDWA sel for sub-dword definition", instr.get()); + check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()), + "Unexpected SDWA sel for sub-dword definition", instr.get()); } /* check opsel */ if (instr->isVOP3()) { VOP3_instruction& vop3 = instr->vop3(); - check(vop3.opsel == 0 || program->chip_class >= GFX9, "Opsel is only supported on GFX9+", instr.get()); + check(vop3.opsel == 0 || program->chip_class >= GFX9, + "Opsel is only supported on GFX9+", instr.get()); for (unsigned i = 0; i < 3; i++) { if (i >= instr->operands.size() || - (instr->operands[i].hasRegClass() && instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed())) + (instr->operands[i].hasRegClass() && + instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed())) check((vop3.opsel & (1 << i)) == 0, "Unexpected opsel for operand", instr.get()); } if (instr->definitions[0].regClass().is_subdword() && !instr->definitions[0].isFixed()) - check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition", instr.get()); + check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition", + instr.get()); } /* check for undefs */ for (unsigned i = 0; i < instr->operands.size(); i++) { if (instr->operands[i].isUndefined()) { bool flat = instr->isFlatLike(); - bool can_be_undef = is_phi(instr) || instr->isEXP() || - instr->isReduction() || + bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() || instr->opcode == aco_opcode::p_create_vector || (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) || ((instr->isMUBUF() || instr->isMTBUF()) && i == 1); check(can_be_undef, "Undefs can only be used in certain operands", instr.get()); } else { - check(instr->operands[i].isFixed() || instr->operands[i].isTemp() || instr->operands[i].isConstant(), "Uninitialized Operand", instr.get()); + check(instr->operands[i].isFixed() || instr->operands[i].isTemp() || + instr->operands[i].isConstant(), + "Uninitialized Operand", instr.get()); } } /* check subdword definitions */ for (unsigned i = 0; i < instr->definitions.size(); i++) { if (instr->definitions[i].regClass().is_subdword()) - check(instr->isPseudo() || instr->definitions[i].bytes() <= 4, "Only Pseudo instructions can write subdword registers larger than 4 bytes", instr.get()); + check(instr->isPseudo() || instr->definitions[i].bytes() <= 4, + "Only Pseudo instructions can write subdword registers larger than 4 bytes", + instr.get()); } if (instr->isSALU() || instr->isVALU()) { /* check literals */ Operand literal(s1); - for (unsigned i = 0; i < instr->operands.size(); i++) - { + for (unsigned i = 0; i < instr->operands.size(); i++) { Operand op = instr->operands[i]; if (!op.isLiteral()) continue; check(!instr->isDPP() && !instr->isSDWA() && - (!instr->isVOP3() || program->chip_class >= GFX10) && - (!instr->isVOP3P() || program->chip_class >= GFX10), + (!instr->isVOP3() || program->chip_class >= GFX10) && + (!instr->isVOP3P() || program->chip_class >= GFX10), "Literal applied on wrong instruction format", instr.get()); - check(literal.isUndefined() || (literal.size() == op.size() && literal.constantValue() == 
op.constantValue()), "Only 1 Literal allowed", instr.get()); + check(literal.isUndefined() || (literal.size() == op.size() && + literal.constantValue() == op.constantValue()), + "Only 1 Literal allowed", instr.get()); literal = op; - check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2, "Wrong source position for Literal argument", instr.get()); + check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2, + "Wrong source position for Literal argument", instr.get()); } /* check num sgprs for VALU */ @@ -264,8 +275,7 @@ bool validate_ir(Program* program) else if (instr->isDPP()) scalar_mask = 0x0; - if (instr->isVOPC() || - instr->opcode == aco_opcode::v_readfirstlane_b32 || + if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64) { check(instr->definitions[0].getTemp().type() == RegType::sgpr, @@ -277,45 +287,42 @@ bool validate_ir(Program* program) unsigned num_sgprs = 0; unsigned sgpr[] = {0, 0}; - for (unsigned i = 0; i < instr->operands.size(); i++) - { + for (unsigned i = 0; i < instr->operands.size(); i++) { Operand op = instr->operands[i]; if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64) { - check(i != 1 || - (op.isTemp() && op.regClass().type() == RegType::sgpr) || - op.isConstant(), + check(i != 1 || (op.isTemp() && op.regClass().type() == RegType::sgpr) || + op.isConstant(), "Must be a SGPR or a constant", instr.get()); - check(i == 1 || - (op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4), + check(i == 1 || (op.isTemp() && op.regClass().type() == RegType::vgpr && + op.bytes() <= 4), "Wrong Operand type for VALU instruction", instr.get()); continue; } if (instr->opcode == aco_opcode::v_permlane16_b32 || instr->opcode == aco_opcode::v_permlanex16_b32) { - check(i != 0 || - (op.isTemp() && op.regClass().type() == RegType::vgpr), + check(i != 0 || (op.isTemp() && op.regClass().type() == RegType::vgpr), "Operand 0 of v_permlane must be VGPR", instr.get()); - check(i == 0 || - (op.isTemp() && op.regClass().type() == RegType::sgpr) || - op.isConstant(), - "Lane select operands of v_permlane must be SGPR or constant", instr.get()); + check(i == 0 || (op.isTemp() && op.regClass().type() == RegType::sgpr) || + op.isConstant(), + "Lane select operands of v_permlane must be SGPR or constant", + instr.get()); } if (instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) { - check(i != 2 || - (op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4), + check(i != 2 || (op.isTemp() && op.regClass().type() == RegType::vgpr && + op.bytes() <= 4), "Wrong Operand type for VALU instruction", instr.get()); - check(i == 2 || - (op.isTemp() && op.regClass().type() == RegType::sgpr) || - op.isConstant(), + check(i == 2 || (op.isTemp() && op.regClass().type() == RegType::sgpr) || + op.isConstant(), "Must be a SGPR or a constant", instr.get()); continue; } if (op.isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) { - check(scalar_mask & (1 << i), "Wrong source position for SGPR argument", instr.get()); + check(scalar_mask & (1 << i), "Wrong source position for SGPR argument", + instr.get()); if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) { if (num_sgprs < 2) @@ -324,19 +331,22 @@ bool validate_ir(Program* program) } if (op.isConstant() && 
!op.isLiteral()) - check(scalar_mask & (1 << i), "Wrong source position for constant argument", instr.get()); + check(scalar_mask & (1 << i), "Wrong source position for constant argument", + instr.get()); } - check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit, "Too many SGPRs/literals", instr.get()); + check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit, + "Too many SGPRs/literals", instr.get()); } if (instr->isSOP1() || instr->isSOP2()) { - check(instr->definitions[0].getTemp().type() == RegType::sgpr, "Wrong Definition type for SALU instruction", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::sgpr, + "Wrong Definition type for SALU instruction", instr.get()); for (const Operand& op : instr->operands) { - check(op.isConstant() || op.regClass().type() <= RegType::sgpr, - "Wrong Operand type for SALU instruction", instr.get()); + check(op.isConstant() || op.regClass().type() <= RegType::sgpr, + "Wrong Operand type for SALU instruction", instr.get()); + } } } - } switch (instr->format) { case Format::PSEUDO: { @@ -346,7 +356,8 @@ bool validate_ir(Program* program) check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get()); size += op.bytes(); } - check(size == instr->definitions[0].bytes(), "Definition size does not match operand sizes", instr.get()); + check(size == instr->definitions[0].bytes(), + "Definition size does not match operand sizes", instr.get()); if (instr->definitions[0].getTemp().type() == RegType::sgpr) { for (const Operand& op : instr->operands) { check(op.isConstant() || op.regClass().type() == RegType::sgpr, @@ -354,55 +365,75 @@ bool validate_ir(Program* program) } } } else if (instr->opcode == aco_opcode::p_extract_vector) { - check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(), "Wrong Operand types", instr.get()); - check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <= instr->operands[0].bytes(), "Index out of range", instr.get()); - check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->operands[0].regClass().type() == RegType::sgpr, + check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(), + "Wrong Operand types", instr.get()); + check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <= + instr->operands[0].bytes(), + "Index out of range", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::vgpr || + instr->operands[0].regClass().type() == RegType::sgpr, "Cannot extract SGPR value from VGPR vector", instr.get()); - check(program->chip_class >= GFX9 || !instr->definitions[0].regClass().is_subdword() || - instr->operands[0].regClass().type() == RegType::vgpr, "Cannot extract subdword from SGPR before GFX9+", instr.get()); + check(program->chip_class >= GFX9 || + !instr->definitions[0].regClass().is_subdword() || + instr->operands[0].regClass().type() == RegType::vgpr, + "Cannot extract subdword from SGPR before GFX9+", instr.get()); } else if (instr->opcode == aco_opcode::p_split_vector) { check(instr->operands[0].isTemp(), "Operand must be a temporary", instr.get()); unsigned size = 0; for (const Definition& def : instr->definitions) { size += def.bytes(); } - check(size == instr->operands[0].bytes(), "Operand size does not match definition sizes", instr.get()); + check(size == instr->operands[0].bytes(), + "Operand size does not match definition sizes", instr.get()); if (instr->operands[0].getTemp().type() == RegType::vgpr) { for (const Definition& def : 
instr->definitions) - check(def.regClass().type() == RegType::vgpr, "Wrong Definition type for VGPR split_vector", instr.get()); + check(def.regClass().type() == RegType::vgpr, + "Wrong Definition type for VGPR split_vector", instr.get()); } else { for (const Definition& def : instr->definitions) - check(program->chip_class >= GFX9 || !def.regClass().is_subdword(), "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get()); + check(program->chip_class >= GFX9 || !def.regClass().is_subdword(), + "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get()); } } else if (instr->opcode == aco_opcode::p_parallelcopy) { - check(instr->definitions.size() == instr->operands.size(), "Number of Operands does not match number of Definitions", instr.get()); + check(instr->definitions.size() == instr->operands.size(), + "Number of Operands does not match number of Definitions", instr.get()); for (unsigned i = 0; i < instr->operands.size(); i++) { - check(instr->definitions[i].bytes() == instr->operands[i].bytes(), "Operand and Definition size must match", instr.get()); + check(instr->definitions[i].bytes() == instr->operands[i].bytes(), + "Operand and Definition size must match", instr.get()); if (instr->operands[i].isTemp()) - check((instr->definitions[i].getTemp().type() == instr->operands[i].regClass().type()) || - (instr->definitions[i].getTemp().type() == RegType::vgpr && instr->operands[i].regClass().type() == RegType::sgpr), + check((instr->definitions[i].getTemp().type() == + instr->operands[i].regClass().type()) || + (instr->definitions[i].getTemp().type() == RegType::vgpr && + instr->operands[i].regClass().type() == RegType::sgpr), "Operand and Definition types do not match", instr.get()); } } else if (instr->opcode == aco_opcode::p_phi) { - check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get()); - check(instr->definitions[0].getTemp().type() == RegType::vgpr, "Logical Phi Definition must be vgpr", instr.get()); + check(instr->operands.size() == block.logical_preds.size(), + "Number of Operands does not match number of predecessors", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::vgpr, + "Logical Phi Definition must be vgpr", instr.get()); for (const Operand& op : instr->operands) - check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get()); + check(instr->definitions[0].size() == op.size(), + "Operand sizes must match Definition size", instr.get()); } else if (instr->opcode == aco_opcode::p_linear_phi) { for (const Operand& op : instr->operands) { - check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get()); - check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get()); + check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", + instr.get()); + check(instr->definitions[0].size() == op.size(), + "Operand sizes must match Definition size", instr.get()); } - check(instr->operands.size() == block.linear_preds.size(), "Number of Operands does not match number of predecessors", instr.get()); - } else if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_insert) { - check(instr->operands[0].isTemp(), - "Data operand must be temporary", instr.get()); + check(instr->operands.size() == block.linear_preds.size(), + "Number of Operands does not match number of predecessors", instr.get()); + } else if (instr->opcode == 
aco_opcode::p_extract ||
+                    instr->opcode == aco_opcode::p_insert) {
+            check(instr->operands[0].isTemp(), "Data operand must be temporary", instr.get());
             check(instr->operands[1].isConstant(), "Index must be constant", instr.get());
             if (instr->opcode == aco_opcode::p_extract)
-               check(instr->operands[3].isConstant(), "Sign-extend flag must be constant", instr.get());
+               check(instr->operands[3].isConstant(), "Sign-extend flag must be constant",
+                     instr.get());
 
             check(instr->definitions[0].getTemp().type() != RegType::sgpr ||
-                  instr->operands[0].getTemp().type() == RegType::sgpr,
+                     instr->operands[0].getTemp().type() == RegType::sgpr,
                   "Can't extract/insert VGPR to SGPR", instr.get());
 
             if (instr->operands[0].getTemp().type() == RegType::vgpr)
@@ -410,69 +441,106 @@ bool validate_ir(Program* program)
                      "Sizes of operand and definition must match", instr.get());
 
             if (instr->definitions[0].getTemp().type() == RegType::sgpr)
-               check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() && instr->definitions[1].physReg() == scc, "SGPR extract/insert needs a SCC definition", instr.get());
+               check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
+                        instr->definitions[1].physReg() == scc,
+                     "SGPR extract/insert needs a SCC definition", instr.get());
 
-            check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16), "Size must be 8 or 16", instr.get());
-            check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u, "Size must be smaller than source", instr.get());
+            check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16),
+                  "Size must be 8 or 16", instr.get());
+            check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u,
+                  "Size must be smaller than source", instr.get());
 
-            unsigned comp = instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
-            check(instr->operands[1].constantValue() < comp, "Index must be in-bounds", instr.get());
+            unsigned comp =
+               instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
+            check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
+                  instr.get());
          }
          break;
       }
       case Format::PSEUDO_REDUCTION: {
-         for (const Operand &op : instr->operands)
-            check(op.regClass().type() == RegType::vgpr, "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.", instr.get());
+         for (const Operand& op : instr->operands)
+            check(op.regClass().type() == RegType::vgpr,
+                  "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.",
+                  instr.get());
 
-         if (instr->opcode == aco_opcode::p_reduce && instr->reduction().cluster_size == program->wave_size)
-            check(instr->definitions[0].regClass().type() == RegType::sgpr || program->wave_size == 32, "The result of unclustered reductions must go into an SGPR.", instr.get());
+         if (instr->opcode == aco_opcode::p_reduce &&
+             instr->reduction().cluster_size == program->wave_size)
+            check(instr->definitions[0].regClass().type() == RegType::sgpr ||
+                     program->wave_size == 32,
+                  "The result of unclustered reductions must go into an SGPR.", instr.get());
          else
-            check(instr->definitions[0].regClass().type() == RegType::vgpr, "The result of scans and clustered reductions must go into a VGPR.", instr.get());
+            check(instr->definitions[0].regClass().type() == RegType::vgpr,
+                  "The result of scans and clustered reductions must go into a VGPR.",
+                  instr.get());
 
          break;
       }
       case Format::SMEM: {
         if (instr->operands.size() >= 1)
            check((instr->operands[0].isFixed() && !instr->operands[0].isConstant()) ||
-                 (instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr), "SMEM operands must be sgpr", instr.get());
+                    (instr->operands[0].isTemp() &&
+                     instr->operands[0].regClass().type() == RegType::sgpr),
+                 "SMEM operands must be sgpr", instr.get());
         if (instr->operands.size() >= 2)
-           check(instr->operands[1].isConstant() || (instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr),
+           check(instr->operands[1].isConstant() ||
+                    (instr->operands[1].isTemp() &&
+                     instr->operands[1].regClass().type() == RegType::sgpr),
                  "SMEM offset must be constant or sgpr", instr.get());
         if (!instr->definitions.empty())
-           check(instr->definitions[0].getTemp().type() == RegType::sgpr, "SMEM result must be sgpr", instr.get());
+           check(instr->definitions[0].getTemp().type() == RegType::sgpr,
+                 "SMEM result must be sgpr", instr.get());
         break;
       }
       case Format::MTBUF:
       case Format::MUBUF: {
-         check(instr->operands.size() > 1, "VMEM instructions must have at least one operand", instr.get());
-         check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr,
+         check(instr->operands.size() > 1, "VMEM instructions must have at least one operand",
+               instr.get());
+         check(instr->operands[1].hasRegClass() &&
+                  instr->operands[1].regClass().type() == RegType::vgpr,
                "VADDR must be in vgpr for VMEM instructions", instr.get());
-         check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr, "VMEM resource constant must be sgpr", instr.get());
-         check(instr->operands.size() < 4 || (instr->operands[3].isTemp() && instr->operands[3].regClass().type() == RegType::vgpr), "VMEM write data must be vgpr", instr.get());
+         check(
+            instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr,
+            "VMEM resource constant must be sgpr", instr.get());
+         check(instr->operands.size() < 4 ||
+                  (instr->operands[3].isTemp() &&
+                   instr->operands[3].regClass().type() == RegType::vgpr),
+               "VMEM write data must be vgpr", instr.get());
          break;
       }
       case Format::MIMG: {
-         check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get());
-         check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
+         check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands",
+               instr.get());
+         check(instr->operands[0].hasRegClass() &&
+                  (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
                "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
          if (instr->operands[1].hasRegClass())
-            check(instr->operands[1].regClass() == s4, "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
+            check(instr->operands[1].regClass() == s4,
+                  "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
          if (!instr->operands[2].isUndefined()) {
             bool is_cmpswap = instr->opcode == aco_opcode::image_atomic_cmpswap ||
                               instr->opcode == aco_opcode::image_atomic_fcmpswap;
-            check(instr->definitions.empty() || (instr->definitions[0].regClass() == instr->operands[2].regClass() || is_cmpswap),
-                  "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and TFE/LWE loads", instr.get());
+            check(instr->definitions.empty() ||
+                     (instr->definitions[0].regClass() == instr->operands[2].regClass() ||
+                      is_cmpswap),
+                  "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and "
+                  "TFE/LWE loads",
+                  instr.get());
          }
-         check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get());
+         check(instr->operands.size() == 4 || program->chip_class >= GFX10,
+               "NSA is only supported on GFX10+", instr.get());
          for (unsigned i = 3; i < instr->operands.size(); i++) {
             if (instr->operands.size() == 4) {
-               check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
+               check(instr->operands[i].hasRegClass() &&
+                        instr->operands[i].regClass().type() == RegType::vgpr,
                      "MIMG operands[3] (VADDR) must be VGPR", instr.get());
             } else {
-               check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get());
+               check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used",
+                     instr.get());
             }
          }
-         check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr),
+         check(instr->definitions.empty() ||
+                  (instr->definitions[0].isTemp() &&
+                   instr->definitions[0].regClass().type() == RegType::vgpr),
                "MIMG definitions[0] (VDATA) must be VGPR", instr.get());
          break;
       }
@@ -482,31 +550,38 @@ bool validate_ir(Program* program)
                  "Only VGPRs are valid DS instruction operands", instr.get());
          }
          if (!instr->definitions.empty())
-            check(instr->definitions[0].getTemp().type() == RegType::vgpr, "DS instruction must return VGPR", instr.get());
+            check(instr->definitions[0].getTemp().type() == RegType::vgpr,
+                  "DS instruction must return VGPR", instr.get());
         break;
       }
       case Format::EXP: {
         for (unsigned i = 0; i < 4; i++)
-            check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
+            check(instr->operands[i].hasRegClass() &&
+                     instr->operands[i].regClass().type() == RegType::vgpr,
                   "Only VGPRs are valid Export arguments", instr.get());
         break;
       }
       case Format::FLAT:
-         check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR", instr.get());
+         check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR",
+               instr.get());
         FALLTHROUGH;
       case Format::GLOBAL:
       case Format::SCRATCH: {
-         check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get());
-         check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::sgpr,
+         check(
+            instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr,
+            "FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get());
+         check(instr->operands[1].hasRegClass() &&
+                  instr->operands[1].regClass().type() == RegType::sgpr,
               "FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get());
         if (!instr->definitions.empty())
-            check(instr->definitions[0].getTemp().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
+            check(instr->definitions[0].getTemp().type() == RegType::vgpr,
+                  "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
         else
-            check(instr->operands[2].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
        break;
+            check(instr->operands[2].regClass().type() == RegType::vgpr,
+                  "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
        break;
      }
-      default:
-         break;
+      default: break;
      }
   }
 }
@@ -518,20 +593,26 @@ bool validate_ir(Program* program)
 
      /* predecessors/successors should be sorted */
     for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++)
-        check_block(block.linear_preds[j] < block.linear_preds[j + 1], "linear predecessors must be sorted", &block);
+        check_block(block.linear_preds[j] < block.linear_preds[j + 1],
+                    "linear predecessors must be sorted", &block);
     for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++)
-        check_block(block.logical_preds[j] < block.logical_preds[j + 1], "logical predecessors must be sorted", &block);
+        check_block(block.logical_preds[j] < block.logical_preds[j + 1],
+                    "logical predecessors must be sorted", &block);
     for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++)
-        check_block(block.linear_succs[j] < block.linear_succs[j + 1], "linear successors must be sorted", &block);
+        check_block(block.linear_succs[j] < block.linear_succs[j + 1],
+                    "linear successors must be sorted", &block);
     for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++)
-        check_block(block.logical_succs[j] < block.logical_succs[j + 1], "logical successors must be sorted", &block);
+        check_block(block.logical_succs[j] < block.logical_succs[j + 1],
+                    "logical successors must be sorted", &block);
 
     /* critical edges are not allowed */
     if (block.linear_preds.size() > 1) {
        for (unsigned pred : block.linear_preds)
-          check_block(program->blocks[pred].linear_succs.size() == 1, "linear critical edges are not allowed", &program->blocks[pred]);
+          check_block(program->blocks[pred].linear_succs.size() == 1,
+                      "linear critical edges are not allowed", &program->blocks[pred]);
        for (unsigned pred : block.logical_preds)
-          check_block(program->blocks[pred].logical_succs.size() == 1, "logical critical edges are not allowed", &program->blocks[pred]);
+          check_block(program->blocks[pred].logical_succs.size() == 1,
+                      "logical critical edges are not allowed", &program->blocks[pred]);
     }
  }
 
@@ -544,8 +625,8 @@ namespace {
 struct Location {
    Location() : block(NULL), instr(NULL) {}
 
-   Block *block;
-   Instruction *instr; //NULL if it's the block's live-in
+   Block* block;
+   Instruction* instr; // NULL if it's the block's live-in
 };
 
 struct Assignment {
@@ -554,18 +635,20 @@ struct Assignment {
    PhysReg reg;
 };
 
-bool ra_fail(Program *program, Location loc, Location loc2, const char *fmt, ...) {
+bool
+ra_fail(Program* program, Location loc, Location loc2, const char* fmt, ...)
+{
    va_list args;
    va_start(args, fmt);
    char msg[1024];
    vsprintf(msg, fmt, args);
    va_end(args);
 
-   char *out;
+   char* out;
    size_t outsize;
    struct u_memstream mem;
    u_memstream_open(&mem, &out, &outsize);
-   FILE *const memf = u_memstream_get(&mem);
+   FILE* const memf = u_memstream_get(&mem);
 
    fprintf(memf, "RA error found at instruction in BB%d:\n", loc.block->index);
    if (loc.instr) {
@@ -587,7 +670,8 @@ bool ra_fail(Program *program, Location loc, Location loc2, const char *fmt, ...
    return true;
 }
 
-bool validate_subdword_operand(chip_class chip, const aco_ptr& instr, unsigned index)
+bool
+validate_subdword_operand(chip_class chip, const aco_ptr& instr, unsigned index)
 {
    Operand op = instr->operands[index];
    unsigned byte = op.physReg().byte();
@@ -635,14 +719,14 @@ bool validate_subdword_operand(chip_class chip, const aco_ptr& inst
       if (byte == 2 && index == 2)
         return true;
      break;
-   default:
-      break;
+   default: break;
    }
 
    return byte == 0;
 }
 
-bool validate_subdword_definition(chip_class chip, const aco_ptr& instr)
+bool
+validate_subdword_definition(chip_class chip, const aco_ptr& instr)
 {
    Definition def = instr->definitions[0];
    unsigned byte = def.physReg().byte();
@@ -664,16 +748,15 @@ bool validate_subdword_definition(chip_class chip, const aco_ptr& i
    case aco_opcode::global_load_ubyte_d16_hi:
    case aco_opcode::global_load_short_d16_hi:
    case aco_opcode::ds_read_u8_d16_hi:
-   case aco_opcode::ds_read_u16_d16_hi:
-      return byte == 2;
-   default:
-      break;
+   case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
+   default: break;
    }
 
    return byte == 0;
 }
 
-unsigned get_subdword_bytes_written(Program *program, const aco_ptr& instr, unsigned index)
+unsigned
+get_subdword_bytes_written(Program* program, const aco_ptr& instr, unsigned index)
 {
    chip_class chip = program->chip_class;
    Definition def = instr->definitions[index];
@@ -703,8 +786,7 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr
    case aco_opcode::global_load_ubyte_d16_hi:
    case aco_opcode::global_load_short_d16_hi:
    case aco_opcode::ds_read_u8_d16_hi:
-   case aco_opcode::ds_read_u16_d16_hi:
-      return program->dev.sram_ecc_enabled ? 4 : 2;
+   case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
    case aco_opcode::v_mad_f16:
    case aco_opcode::v_mad_u16:
    case aco_opcode::v_mad_i16:
@@ -714,16 +796,18 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr
      if (chip >= GFX9)
         return 2;
      break;
-   default:
-      break;
+   default: break;
    }
 
-   return MAX2(chip >= GFX10 ? def.bytes() : 4, instr_info.definition_size[(int)instr->opcode] / 8u);
+   return MAX2(chip >= GFX10 ? def.bytes() : 4,
+               instr_info.definition_size[(int)instr->opcode] / 8u);
 }
 
 } /* end namespace */
 
-bool validate_ra(Program *program) {
+bool
+validate_ra(Program* program)
+{
    if (!(debug_flags & DEBUG_VALIDATE_RA))
       return false;
 
@@ -754,13 +838,21 @@ bool validate_ra(Program *program) {
             if (!op.isFixed())
               err |= ra_fail(program, loc, Location(), "Operand %d is not assigned a register", i);
            if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg())
-              err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i);
-           if ((op.getTemp().type() == RegType::vgpr && op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
-               (op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < sgpr_limit))
-              err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i);
+              err |=
+                 ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
+                         "Operand %d has an inconsistent register assignment with instruction", i);
+           if ((op.getTemp().type() == RegType::vgpr &&
+                op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
+               (op.getTemp().type() == RegType::sgpr &&
+                op.physReg() + op.size() > program->config->num_sgprs &&
+                op.physReg() < sgpr_limit))
+              err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
+                             "Operand %d has an out-of-bounds register assignment", i);
            if (op.physReg() == vcc && !program->needs_vcc)
-              err |= ra_fail(program, loc, Location(), "Operand %d fixed to vcc but needs_vcc=false", i);
-           if (op.regClass().is_subdword() && !validate_subdword_operand(program->chip_class, instr, i))
+              err |= ra_fail(program, loc, Location(),
+                             "Operand %d fixed to vcc but needs_vcc=false", i);
+           if (op.regClass().is_subdword() &&
+               !validate_subdword_operand(program->chip_class, instr, i))
              err |= ra_fail(program, loc, Location(), "Operand %d not aligned correctly", i);
            if (!assignments[op.tempId()].firstloc.block)
              assignments[op.tempId()].firstloc = loc;
@@ -773,15 +865,23 @@ bool validate_ra(Program *program) {
            if (!def.isTemp())
              continue;
            if (!def.isFixed())
-              err |= ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
+              err |=
+                 ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
           if (assignments[def.tempId()].defloc.block)
-              err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId());
-           if ((def.getTemp().type() == RegType::vgpr && def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
-               (def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < sgpr_limit))
-              err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i);
+              err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc,
+                             "Temporary %%%d also defined by instruction", def.tempId());
+           if ((def.getTemp().type() == RegType::vgpr &&
+                def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
+               (def.getTemp().type() == RegType::sgpr &&
+                def.physReg() + def.size() > program->config->num_sgprs &&
+                def.physReg() < sgpr_limit))
+              err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc,
+                             "Definition %d has an out-of-bounds register assignment", i);
           if (def.physReg() == vcc && !program->needs_vcc)
-              err |= ra_fail(program, loc, Location(), "Definition %d fixed to vcc but needs_vcc=false", i);
-           if (def.regClass().is_subdword() && !validate_subdword_definition(program->chip_class, instr))
+              err |= ra_fail(program, loc, Location(),
+                             "Definition %d fixed to vcc but needs_vcc=false", i);
+           if (def.regClass().is_subdword() &&
+               !validate_subdword_definition(program->chip_class, instr))
             err |= ra_fail(program, loc, Location(), "Definition %d not aligned correctly", i);
           if (!assignments[def.tempId()].firstloc.block)
             assignments[def.tempId()].firstloc = loc;
@@ -810,7 +910,9 @@ bool validate_ra(Program *program) {
          PhysReg reg = assignments.at(tmp.id()).reg;
          for (unsigned i = 0; i < tmp.bytes(); i++) {
             if (regs[reg.reg_b + i]) {
-               err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
+               err |= ra_fail(program, loc, Location(),
+                              "Assignment of element %d of %%%d already taken by %%%d in live-out",
+                              i, tmp.id(), regs[reg.reg_b + i]);
             }
             regs[reg.reg_b + i] = tmp.id();
          }
@@ -826,7 +928,10 @@ bool validate_ra(Program *program) {
          PhysReg reg = assignments.at(tmp.id()).reg;
          for (unsigned i = 0; i < tmp.bytes(); i++) {
             if (regs[reg.reg_b + i])
-               err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
+               err |= ra_fail(
+                  program, loc, Location(),
+                  "Assignment of element %d of %%%d already taken by %%%d in live-out", i,
+                  tmp.id(), regs[reg.reg_b + i]);
          }
          live.emplace(tmp);
       }
@@ -886,16 +991,23 @@ bool validate_ra(Program *program) {
            PhysReg reg = assignments.at(tmp.id()).reg;
            for (unsigned j = 0; j < tmp.bytes(); j++) {
               if (regs[reg.reg_b + j])
-                 err |= ra_fail(program, loc, assignments.at(regs[reg.reg_b + j]).defloc, "Assignment of element %d of %%%d already taken by %%%d from instruction", i, tmp.id(), regs[reg.reg_b + j]);
+                 err |= ra_fail(
+                    program, loc, assignments.at(regs[reg.reg_b + j]).defloc,
+                    "Assignment of element %d of %%%d already taken by %%%d from instruction", i,
+                    tmp.id(), regs[reg.reg_b + j]);
               regs[reg.reg_b + j] = tmp.id();
            }
            if (def.regClass().is_subdword() && def.bytes() < 4) {
              unsigned written = get_subdword_bytes_written(program, instr, i);
-             /* If written=4, the instruction still might write the upper half. In that case, it's the lower half that isn't preserved */
+             /* If written=4, the instruction still might write the upper half. In that case, it's
+              * the lower half that isn't preserved */
              for (unsigned j = reg.byte() & ~(written - 1); j < written; j++) {
                unsigned written_reg = reg.reg() * 4u + j;
                if (regs[written_reg] && regs[written_reg] != def.tempId())
-                  err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc, "Assignment of element %d of %%%d overwrites the full register taken by %%%d from instruction", i, tmp.id(), regs[written_reg]);
+                  err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc,
+                                 "Assignment of element %d of %%%d overwrites the full register "
+                                 "taken by %%%d from instruction",
+                                 i, tmp.id(), regs[written_reg]);
              }
            }
          }
@@ -924,4 +1036,4 @@ bool validate_ra(Program *program) {
    return err;
 }
 
-}
+} // namespace aco