diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h index 385eec9263a..e6ab9d94290 100644 --- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h +++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h @@ -248,6 +248,9 @@ void finish_program(isel_context* ctx); void _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr, const char* msg); +/* aco_select_nir_alu.cpp */ +void visit_alu_instr(isel_context* ctx, nir_alu_instr* instr); + } // namespace aco #endif /* ACO_INSTRUCTION_SELECTION_H */ diff --git a/src/amd/compiler/instruction_selection/aco_select_nir.cpp b/src/amd/compiler/instruction_selection/aco_select_nir.cpp index 50c5c995553..9a77600d213 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp @@ -25,17 +25,6 @@ namespace { void visit_cf_list(struct isel_context* ctx, struct exec_list* list); -Builder -create_alu_builder(isel_context* ctx, nir_alu_instr* instr) -{ - Builder bld(ctx->program, ctx->block); - bld.is_precise = instr->exact; - bld.is_sz_preserve = nir_alu_instr_is_signed_zero_preserve(instr); - bld.is_inf_preserve = nir_alu_instr_is_inf_preserve(instr); - bld.is_nan_preserve = nir_alu_instr_is_nan_preserve(instr); - return bld; -} - Temp emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero()) { @@ -199,3590 +188,6 @@ emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx)); } -enum sgpr_extract_mode { - sgpr_extract_sext, - sgpr_extract_zext, - sgpr_extract_undef, -}; - -Temp -extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode) -{ - Temp vec = get_ssa_temp(ctx, src->src.ssa); - unsigned src_size = src->src.ssa->bit_size; 
- unsigned swizzle = src->swizzle[0]; - - if (vec.size() > 1) { - assert(src_size == 16); - vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); - swizzle = swizzle & 1; - } - - Builder bld(ctx->program, ctx->block); - Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst; - - if (mode == sgpr_extract_undef && swizzle == 0) - bld.copy(Definition(tmp), vec); - else - bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec), - Operand::c32(swizzle), Operand::c32(src_size), - Operand::c32((mode == sgpr_extract_sext))); - - if (dst.regClass() == s2) - convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst); - - return dst; -} - -Temp -get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1) -{ - if (src.src.ssa->num_components == 1 && size == 1) - return get_ssa_temp(ctx, src.src.ssa); - - Temp vec = get_ssa_temp(ctx, src.src.ssa); - unsigned elem_size = src.src.ssa->bit_size / 8u; - bool identity_swizzle = true; - - for (unsigned i = 0; identity_swizzle && i < size; i++) { - if (src.swizzle[i] != i) - identity_swizzle = false; - } - if (identity_swizzle) - return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size)); - - assert(elem_size > 0); - assert(vec.bytes() % elem_size == 0); - - if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) { - assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16); - return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src, - sgpr_extract_undef); - } - - bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr; - if (as_uniform) - vec = as_vgpr(ctx, vec); - - RegClass elem_rc = elem_size < 4 ? 
RegClass(vec.type(), elem_size).as_subdword() - : RegClass(vec.type(), elem_size / 4); - if (size == 1) { - return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc); - } else { - assert(size <= 4); - std::array elems; - aco_ptr vec_instr{ - create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; - for (unsigned i = 0; i < size; ++i) { - elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc); - vec_instr->operands[i] = Operand{elems[i]}; - } - Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4)); - vec_instr->definitions[0] = Definition(dst); - ctx->block->instructions.emplace_back(std::move(vec_instr)); - ctx->allocated_vec.emplace(dst.id(), elems); - return as_uniform ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst; - } -} - -Temp -get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src) -{ - /* returns v2b or v1 for vop3p usage. - * The source expects exactly 2 16bit components - * which are within the same dword - */ - assert(src.src.ssa->bit_size == 16); - assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1); - - Temp tmp = get_ssa_temp(ctx, src.src.ssa); - if (tmp.size() == 1) - return tmp; - - /* the size is larger than 1 dword: check the swizzle */ - unsigned dword = src.swizzle[0] >> 1; - - /* extract a full dword if possible */ - if (tmp.bytes() >= (dword + 1) * 4) { - /* if the source is split into components, use p_create_vector */ - auto it = ctx->allocated_vec.find(tmp.id()); - if (it != ctx->allocated_vec.end()) { - unsigned index = dword << 1; - Builder bld(ctx->program, ctx->block); - if (it->second[index].regClass() == v2b) - return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index], - it->second[index + 1]); - } - return emit_extract_vector(ctx, tmp, dword, v1); - } else { - /* This must be a swizzled access to %a.zz where %a is v6b */ - assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0); - assert(tmp.regClass() == v6b && dword == 1); - return 
emit_extract_vector(ctx, tmp, dword * 2, v2b); - } -} - -uint32_t -get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx) -{ - nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]}; - return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config); -} - -void -emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, - bool writes_scc, uint8_t uses_ub = 0) -{ - Builder bld = create_alu_builder(ctx, instr); - bld.is_nuw = instr->no_unsigned_wrap; - - Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])), - Operand(get_alu_src(ctx, instr->src[1]))}; - u_foreach_bit (i, uses_ub) { - uint32_t src_ub = get_alu_src_ub(ctx, instr, i); - if (src_ub <= 0xffff) - operands[i].set16bit(true); - else if (src_ub <= 0xffffff) - operands[i].set24bit(true); - } - - if (writes_scc) - bld.sop2(op, Definition(dst), bld.def(s1, scc), operands[0], operands[1]); - else - bld.sop2(op, Definition(dst), operands[0], operands[1]); -} - -void -emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst, - bool commutative, bool swap_srcs = false, bool flush_denorms = false, - bool nuw = false, uint8_t uses_ub = 0) -{ - Builder bld = create_alu_builder(ctx, instr); - bld.is_nuw = nuw; - - Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])), - Operand(get_alu_src(ctx, instr->src[1]))}; - u_foreach_bit (i, uses_ub) { - uint32_t src_ub = get_alu_src_ub(ctx, instr, i); - if (src_ub <= 0xffff) - operands[i].set16bit(true); - else if (src_ub <= 0xffffff) - operands[i].set24bit(true); - } - - if (swap_srcs) - std::swap(operands[0], operands[1]); - - if (operands[1].isOfType(RegType::sgpr)) { - if (commutative && operands[0].isOfType(RegType::vgpr)) { - std::swap(operands[0], operands[1]); - } else { - operands[1] = bld.copy(bld.def(RegType::vgpr, operands[1].size()), operands[1]); - } - } - - if (flush_denorms && ctx->program->gfx_level < GFX9) { - 
assert(dst.size() == 1); - Temp tmp = bld.vop2(opc, bld.def(dst.regClass()), operands[0], operands[1]); - if (dst.bytes() == 2) - bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp); - else - bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp); - } else { - bld.vop2(opc, Definition(dst), operands[0], operands[1]); - } -} - -void -emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) -{ - Builder bld = create_alu_builder(ctx, instr); - - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - - if (src1.type() == RegType::sgpr) { - assert(src0.type() == RegType::vgpr); - std::swap(src0, src1); - } - - Temp src00 = bld.tmp(src0.type(), 1); - Temp src01 = bld.tmp(src0.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); - Temp src10 = bld.tmp(v1); - Temp src11 = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); - Temp lo = bld.vop2(op, bld.def(v1), src00, src10); - Temp hi = bld.vop2(op, bld.def(v1), src01, src11); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); -} - -void -emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, - bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false) -{ - assert(num_sources == 2 || num_sources == 3); - Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; - bool has_sgpr = false; - for (unsigned i = 0; i < num_sources; i++) { - src[i] = get_alu_src(ctx, instr->src[(swap_srcs && i < 2) ? 
1 - i : i]); - if (has_sgpr) - src[i] = as_vgpr(ctx, src[i]); - else - has_sgpr = src[i].type() == RegType::sgpr; - } - - Builder bld = create_alu_builder(ctx, instr); - if (flush_denorms && ctx->program->gfx_level < GFX9) { - Temp tmp; - if (num_sources == 3) - tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]); - else - tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]); - if (dst.size() == 1) - bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp); - else - bld.vop3(aco_opcode::v_mul_f64_e64, Definition(dst), Operand::c64(0x3FF0000000000000), - tmp); - } else if (num_sources == 3) { - bld.vop3(op, Definition(dst), src[0], src[1], src[2]); - } else { - bld.vop3(op, Definition(dst), src[0], src[1]); - } -} - -Builder::Result -emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, - bool swap_srcs = false) -{ - Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]); - Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]); - if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr) - src1 = as_vgpr(ctx, src1); - assert(instr->def.num_components == 2); - - /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ - unsigned opsel_lo = - (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1); - unsigned opsel_hi = - (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1); - - Builder bld = create_alu_builder(ctx, instr); - Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi); - emit_split_vector(ctx, dst, 2); - return res; -} - -void -emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp, - unsigned neg_lo = 0) -{ - Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; - bool has_sgpr = false; - for (unsigned i = 0; i < 3; i++) { - src[i] = get_alu_src(ctx, instr->src[i]); - if (has_sgpr) - src[i] = as_vgpr(ctx, src[i]); - 
else - has_sgpr = src[i].type() == RegType::sgpr; - } - - Builder bld = create_alu_builder(ctx, instr); - VALU_instruction& vop3p = - bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu(); - vop3p.clamp = clamp; - vop3p.neg_lo = neg_lo; -} - -void -emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) -{ - Builder bld = create_alu_builder(ctx, instr); - if (dst.type() == RegType::sgpr) - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0]))); - else - bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); -} - -void -emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) -{ - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - assert(src0.size() == src1.size()); - - aco_ptr vopc; - if (src1.type() == RegType::sgpr) { - if (src0.type() == RegType::vgpr) { - /* to swap the operands, we might also have to change the opcode */ - op = get_vcmp_swapped(op); - Temp t = src0; - src0 = src1; - src1 = t; - } else { - src1 = as_vgpr(ctx, src1); - } - } - - Builder bld = create_alu_builder(ctx, instr); - bld.vopc(op, Definition(dst), src0, src1); -} - -void -emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) -{ - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - Builder bld = create_alu_builder(ctx, instr); - - assert(dst.regClass() == bld.lm); - assert(src0.type() == RegType::sgpr); - assert(src1.type() == RegType::sgpr); - - /* Emit the SALU comparison instruction */ - Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1); - /* Turn the result into a per-lane bool */ - bool_to_vector_condition(ctx, cmp, dst); -} - -void -emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op, - aco_opcode v32_op, aco_opcode v64_op, aco_opcode s16_op = 
aco_opcode::num_opcodes, - aco_opcode s32_op = aco_opcode::num_opcodes, - aco_opcode s64_op = aco_opcode::num_opcodes) -{ - aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op - : instr->src[0].src.ssa->bit_size == 32 ? s32_op - : s16_op; - aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op - : instr->src[0].src.ssa->bit_size == 32 ? v32_op - : v16_op; - bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent || - get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr || - get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr; - aco_opcode op = use_valu ? v_op : s_op; - assert(op != aco_opcode::num_opcodes); - assert(dst.regClass() == ctx->program->lane_mask); - - if (use_valu) - emit_vopc_instruction(ctx, instr, op, dst); - else - emit_sopc_instruction(ctx, instr, op, dst); -} - -void -emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op, - Temp dst) -{ - Builder bld(ctx->program, ctx->block); - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - - assert(dst.regClass() == bld.lm); - assert(src0.regClass() == bld.lm); - assert(src1.regClass() == bld.lm); - - bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); -} - -void -emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst) -{ - Builder bld(ctx->program, ctx->block); - Temp cond = get_alu_src(ctx, instr->src[0]); - Temp then = get_alu_src(ctx, instr->src[1]); - Temp els = get_alu_src(ctx, instr->src[2]); - - assert(cond.regClass() == bld.lm); - - if (dst.type() == RegType::vgpr) { - aco_ptr bcsel; - if (dst.size() == 1) { - then = as_vgpr(ctx, then); - els = as_vgpr(ctx, els); - - bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond); - } else if (dst.size() == 2) { - select_vec2(ctx, dst, cond, then, els); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - return; - } - - if (instr->def.bit_size == 1) { - 
assert(dst.regClass() == bld.lm); - assert(then.regClass() == bld.lm); - assert(els.regClass() == bld.lm); - } - - if (!nir_src_is_divergent(&instr->src[0].src)) { /* uniform condition and values in sgpr */ - cond = bool_to_scalar_condition(ctx, cond); - - bool els_zero = - nir_src_is_const(instr->src[2].src) && nir_src_as_uint(instr->src[2].src) == 0; - - if (dst.regClass() == s1 && els_zero) { - /* Use s_mul_i32 because it doesn't require scc. */ - bld.sop2(aco_opcode::s_mul_i32, Definition(dst), then, cond); - } else if (dst.regClass() == s1 || dst.regClass() == s2) { - assert((then.regClass() == s1 || then.regClass() == s2) && - els.regClass() == then.regClass()); - assert(dst.size() == then.size()); - aco_opcode op = - dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; - bld.sop2(op, Definition(dst), then, els, bld.scc(cond)); - } else { - isel_err(&instr->instr, "Unimplemented uniform bcsel bit size"); - } - return; - } - - /* divergent boolean bcsel - * this implements bcsel on bools: dst = s0 ? 
s1 : s2 - * are going to be: dst = (s0 & s1) | (~s0 & s2) */ - assert(instr->def.bit_size == 1); - - if (cond.id() != then.id()) - then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then); - - if (cond.id() == els.id()) - bld.copy(Definition(dst), then); - else - bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then, - bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond)); -} - -void -emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode vop, - aco_opcode sop, uint32_t undo) -{ - if (ctx->block->fp_mode.denorm32 == 0) { - if (dst.regClass() == v1) - bld.vop1(vop, dst, val); - else if (ctx->options->gfx_level >= GFX12) - bld.vop3(sop, dst, val); - else - bld.pseudo(aco_opcode::p_as_uniform, dst, bld.vop1(vop, bld.def(v1), val)); - return; - } - - /* multiply by 16777216 to handle denormals */ - Temp scale, unscale; - if (val.regClass() == v1) { - val = as_vgpr(bld, val); - Temp is_denormal = bld.tmp(bld.lm); - VALU_instruction& valu = bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal), - val, Operand::c32(1u << 4)) - ->valu(); - valu.neg[0] = true; - valu.abs[0] = true; - scale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000), - bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), is_denormal); - unscale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000), - bld.copy(bld.def(s1), Operand::c32(undo)), is_denormal); - } else { - Temp abs = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), val, - bld.copy(bld.def(s1), Operand::c32(0x7fffffff))); - Temp denorm_cmp = bld.copy(bld.def(s1), Operand::c32(0x00800000)); - Temp is_denormal = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, denorm_cmp); - scale = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), - bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), Operand::c32(0x3f800000), - bld.scc(is_denormal)); - unscale = - 
bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(undo)), - Operand::c32(0x3f800000), bld.scc(is_denormal)); - } - - if (dst.regClass() == v1) { - Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), scale, as_vgpr(bld, val)); - scaled = bld.vop1(vop, bld.def(v1), scaled); - bld.vop2(aco_opcode::v_mul_f32, dst, unscale, scaled); - } else { - assert(ctx->options->gfx_level >= GFX11_5); - Temp scaled = bld.sop2(aco_opcode::s_mul_f32, bld.def(s1), scale, val); - if (ctx->options->gfx_level >= GFX12) - scaled = bld.vop3(sop, bld.def(s1), scaled); - else - scaled = bld.as_uniform(bld.vop1(vop, bld.def(v1), scaled)); - bld.sop2(aco_opcode::s_mul_f32, dst, unscale, scaled); - } -} - -void -emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val) -{ - emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, aco_opcode::v_s_rcp_f32, 0x4b800000u); -} - -void -emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val) -{ - emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, aco_opcode::v_s_rsq_f32, 0x45800000u); -} - -void -emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val) -{ - emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, aco_opcode::v_s_sqrt_f32, - 0x39800000u); -} - -void -emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val) -{ - emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, aco_opcode::v_s_log_f32, 0xc1c00000u); -} - -Temp -emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) -{ - if (ctx->options->gfx_level >= GFX7) - return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val); - - /* GFX6 doesn't support V_TRUNC_F64, lower it. */ - /* TODO: create more efficient code! */ - if (val.type() == RegType::sgpr) - val = as_vgpr(ctx, val); - - /* Split the input value. 
*/ - Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val); - - /* Extract the exponent and compute the unbiased value. */ - Temp exponent = - bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u)); - exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u)); - - /* Extract the fractional part. */ - Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u), - Operand::c32(0x000fffffu)); - fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent); - - Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), - fract_mask); - - Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1); - Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo); - fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp); - tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi); - fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp); - - /* Get the sign bit. */ - Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi); - - /* Decide the operation to apply depending on the unbiased exponent. 
*/ - Temp exp_lt0 = - bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero()); - Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, - bld.copy(bld.def(v1), Operand::zero()), exp_lt0); - Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0); - Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u)); - dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51); - dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51); - - return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi); -} - -Temp -emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) -{ - if (ctx->options->gfx_level >= GFX7) - return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); - - /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually - * lowered at NIR level for precision reasons). */ - Temp src0 = as_vgpr(ctx, val); - - Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u), - Operand::c32(0x3fefffffu)); - - Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0); - Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0); - Temp min = bld.vop3(aco_opcode::v_min_f64_e64, bld.def(v2), fract, min_val); - - Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0); - Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min); - - Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan); - Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan); - - Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1); - - Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, 
Definition(dst), src0, v); - add->valu().neg[1] = true; - - return add->definitions[0].getTemp(); -} - -Temp -uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) -{ - if (bld.program->gfx_level < GFX8) { - Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true); - return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1), - add.def(1).getTemp()); - } - - Builder::Result add(NULL); - if (bld.program->gfx_level >= GFX9) { - add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1); - } else { - add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1); - } - add->valu().clamp = 1; - return dst.getTemp(); -} - -Temp -usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) -{ - if (bld.program->gfx_level < GFX8) { - Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true); - return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u), - sub.def(1).getTemp()); - } - - Builder::Result sub(NULL); - if (bld.program->gfx_level >= GFX9) { - sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1); - } else { - sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1); - } - sub->valu().clamp = 1; - return dst.getTemp(); -} - -void -emit_vec2_f2f16(isel_context* ctx, nir_alu_instr* instr, Temp dst) -{ - Builder bld = create_alu_builder(ctx, instr); - Temp src = get_ssa_temp(ctx, instr->src[0].src.ssa); - RegClass rc = RegClass(src.regClass().type(), instr->src[0].src.ssa->bit_size / 32); - Temp src0 = emit_extract_vector(ctx, src, instr->src[0].swizzle[0], rc); - Temp src1 = emit_extract_vector(ctx, src, instr->src[0].swizzle[1], rc); - - if (dst.regClass() == s1) { - bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src0, src1); - } else { - src1 = as_vgpr(ctx, src1); - if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) - bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src0, src1); - else - 
bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1); - emit_split_vector(ctx, dst, 2); - } -} - -void -visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) -{ - Builder bld = create_alu_builder(ctx, instr); - Temp dst = get_ssa_temp(ctx, &instr->def); - switch (instr->op) { - case nir_op_vec2: - case nir_op_vec3: - case nir_op_vec4: - case nir_op_vec5: - case nir_op_vec8: - case nir_op_vec16: { - std::array elems; - unsigned num = instr->def.num_components; - for (unsigned i = 0; i < num; ++i) - elems[i] = get_alu_src(ctx, instr->src[i]); - - if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, - instr->def.num_components, 1)}; - RegClass elem_rc = RegClass::get(dst.type(), instr->def.bit_size / 8u); - for (unsigned i = 0; i < num; ++i) { - if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword()) - elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc); - - if (nir_src_is_undef(instr->src[i].src)) - vec->operands[i] = Operand{elem_rc}; - else - vec->operands[i] = Operand{elems[i]}; - } - vec->definitions[0] = Definition(dst); - ctx->block->instructions.emplace_back(std::move(vec)); - ctx->allocated_vec.emplace(dst.id(), elems); - } else { - bool use_s_pack = ctx->program->gfx_level >= GFX9; - Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1)); - - std::array packed; - uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {}; - bitarray32 undef_mask = UINT32_MAX; - for (unsigned i = 0; i < num; i++) { - unsigned packed_size = use_s_pack ? 
16 : 32; - unsigned idx = i * instr->def.bit_size / packed_size; - unsigned offset = i * instr->def.bit_size % packed_size; - if (nir_src_is_undef(instr->src[i].src)) - continue; - else - undef_mask[idx] = false; - - if (nir_src_is_const(instr->src[i].src)) { - const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset; - continue; - } - - if (offset != packed_size - instr->def.bit_size) - elems[i] = - bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask); - - if (offset) - elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i], - Operand::c32(offset)); - - if (packed[idx].id()) - packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i], - packed[idx]); - else - packed[idx] = elems[i]; - } - - if (use_s_pack) { - for (unsigned i = 0; i < dst.size(); i++) { - bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id(); - - if (packed[i * 2].id() && packed[i * 2 + 1].id()) - packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], - packed[i * 2 + 1]); - else if (packed[i * 2 + 1].id()) - packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), - Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]); - else if (packed[i * 2].id()) - packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], - Operand::c32(const_vals[i * 2 + 1])); - else - packed[i] = Temp(0, s1); /* Both constants, so reset the entry */ - - undef_mask[i] = undef_mask[i * 2] && undef_mask[i * 2 + 1]; - - if (same) - const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16); - else - const_vals[i] = 0; - } - } - - for (unsigned i = 0; i < dst.size(); i++) { - if (const_vals[i] && packed[i].id()) - packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), - Operand::c32(const_vals[i]), packed[i]); - else if (!packed[i].id() && !undef_mask[i]) - packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i])); - } - - if (dst.size() == 1 && 
packed[0].id()) - bld.copy(Definition(dst), packed[0]); - else { - aco_ptr vec{ - create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; - vec->definitions[0] = Definition(dst); - for (unsigned i = 0; i < dst.size(); ++i) - vec->operands[i] = Operand(packed[i]); - bld.insert(std::move(vec)); - } - } - break; - } - case nir_op_mov: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) { - /* use size() instead of bytes() for 8/16-bit */ - assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov"); - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); - } else { - assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov"); - bld.copy(Definition(dst), src); - } - break; - } - case nir_op_inot: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); - } else if (dst.regClass() == v2) { - Temp lo = bld.tmp(v1), hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); - lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo); - hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); - } else if (dst.type() == RegType::sgpr) { - aco_opcode opcode = dst.size() == 1 ? 
aco_opcode::s_not_b32 : aco_opcode::s_not_b64; - bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_iabs: { - if (dst.regClass() == v1 && instr->def.bit_size == 16) { - Temp src = get_alu_src_vop3p(ctx, instr->src[0]); - - unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1; - unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1; - - Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(), - src, opsel_lo, opsel_hi); - bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi); - emit_split_vector(ctx, dst, 2); - break; - } - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == s1) { - bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src); - } else if (dst.regClass() == v1) { - bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, - bld.vsub32(bld.def(v1), Operand::zero(), src)); - } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { - bld.vop3( - aco_opcode::v_max_i16_e64, Definition(dst), src, - bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src)); - } else if (dst.regClass() == v2b) { - src = as_vgpr(ctx, src); - bld.vop2(aco_opcode::v_max_i16, Definition(dst), src, - bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src)); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_isign: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == s1) { - Temp tmp = - bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1)); - bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u)); - } else if (dst.regClass() == s2) { - Temp neg = - bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u)); - Temp neqz; - if 
(ctx->program->gfx_level >= GFX8) - neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero()); - else - neqz = - bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero()) - .def(1) - .getTemp(); - /* SCC gets zero-extended to 64 bit */ - bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz)); - } else if (dst.regClass() == v1) { - bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u)); - } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) { - bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u)); - } else if (dst.regClass() == v2b) { - src = as_vgpr(ctx, src); - bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1), - bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src)); - } else if (dst.regClass() == v2) { - Temp upper = emit_extract_vector(ctx, src, 1, v1); - Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper); - Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src); - Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz); - upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_imax: { - if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst); - } else if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, 
aco_opcode::v_max_i32, dst, true); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_umax: { - if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst); - } else if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_imin: { - if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst); - } else if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_umin: { - if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst); - } else if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - 
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ior: { - if (instr->def.bit_size == 1) { - emit_boolean_logic(ctx, instr, Builder::s_or, dst); - } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); - } else if (dst.regClass() == v2) { - emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true); - } else if (dst.regClass() == s2) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_iand: { - if (instr->def.bit_size == 1) { - emit_boolean_logic(ctx, instr, Builder::s_and, dst); - } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); - } else if (dst.regClass() == v2) { - emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true); - } else if (dst.regClass() == s2) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ixor: { - if (instr->def.bit_size == 1) { - emit_boolean_logic(ctx, instr, Builder::s_xor, dst); - } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, 
dst, true); - } else if (dst.regClass() == v2) { - emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true); - } else if (dst.regClass() == s2) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ushr: { - if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true); - } else if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true); - } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) { - bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), - get_alu_src(ctx, instr->src[0])); - } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst); - } else if (dst.regClass() == s2) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ishl: { - if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true); - } else if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, 
instr, aco_opcode::v_pk_lshlrev_b16, dst, true); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, - false, 1); - } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) { - bld.vop3(aco_opcode::v_lshlrev_b64_e64, Definition(dst), get_alu_src(ctx, instr->src[1]), - get_alu_src(ctx, instr->src[0])); - } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1); - } else if (dst.regClass() == s2) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ishr: { - if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true); - } else if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true); - } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) { - bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]), - get_alu_src(ctx, instr->src[0])); - } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true); - } else if (dst.regClass() == s2) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_find_lsb: 
{ - Temp src = get_alu_src(ctx, instr->src[0]); - if (src.regClass() == s1) { - bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src); - } else if (src.regClass() == v1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst); - } else if (src.regClass() == s2) { - bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src); - } else if (src.regClass() == v2) { - Temp lo = bld.tmp(v1), hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); - lo = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), lo); - hi = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), hi); - hi = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32u), hi); - bld.vop2(aco_opcode::v_min_u32, Definition(dst), lo, hi); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ufind_msb: - case nir_op_ifind_msb: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (src.regClass() == s1 || src.regClass() == s2) { - aco_opcode op = src.regClass() == s2 - ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 - : aco_opcode::s_flbit_i32_i64) - : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 - : aco_opcode::s_flbit_i32); - Temp msb_rev = bld.sop1(op, bld.def(s1), src); - - Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), - Operand::c32(src.size() * 32u - 1u), msb_rev); - Temp msb = sub.def(0).getTemp(); - Temp carry = sub.def(1).getTemp(); - - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb, - bld.scc(carry)); - } else if (src.regClass() == v1) { - aco_opcode op = - instr->op == nir_op_ufind_msb ? 
aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; - Temp msb_rev = bld.tmp(v1); - emit_vop1_instruction(ctx, instr, op, msb_rev); - Temp msb = bld.tmp(v1); - Temp carry = - bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp(); - bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry); - } else if (src.regClass() == v2) { - aco_opcode op = - instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; - - Temp lo = bld.tmp(v1), hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); - - lo = bld.vop1(op, bld.def(v1), lo); - lo = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32), lo); - hi = bld.vop1(op, bld.def(v1), hi); - Temp msb_rev = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), lo, hi); - - Temp msb = bld.tmp(v1); - Temp carry = - bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp(); - bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ufind_msb_rev: - case nir_op_ifind_msb_rev: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (src.regClass() == s1) { - aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32 - : aco_opcode::s_flbit_i32; - bld.sop1(op, Definition(dst), src); - } else if (src.regClass() == v1) { - aco_opcode op = - instr->op == nir_op_ufind_msb_rev ? 
aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; - emit_vop1_instruction(ctx, instr, op, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_bitfield_reverse: { - if (dst.regClass() == s1) { - bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (dst.regClass() == v1) { - bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_iadd: { - if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true); - break; - } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst); - break; - } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true); - break; - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst); - break; - } else if (dst.regClass() == s2 && ctx->program->gfx_level >= GFX12) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u64, dst, false); - break; - } - - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.type() == RegType::vgpr && dst.bytes() <= 4) { - if (instr->no_unsigned_wrap) - bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1)); - else - bld.vadd32(Definition(dst), Operand(src0), Operand(src1)); - break; - } - - assert(src0.size() == 2 && src1.size() == 2); - Temp src00 = bld.tmp(src0.type(), 1); - Temp src01 = bld.tmp(dst.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); - Temp src10 = bld.tmp(src1.type(), 1); - Temp src11 = bld.tmp(dst.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); - - if 
(dst.regClass() == s2) { - Temp carry = bld.tmp(s1); - Temp dst0 = - bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); - Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, - bld.scc(carry)); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); - } else if (dst.regClass() == v2) { - Temp dst0 = bld.tmp(v1); - Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp(); - Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_uadd_sat: { - if (dst.regClass() == v1 && instr->def.bit_size == 16) { - Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst); - add_instr->valu().clamp = 1; - break; - } - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.regClass() == s1) { - Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); - bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp, - bld.scc(carry)); - break; - } else if (dst.regClass() == v2b) { - Instruction* add_instr; - if (ctx->program->gfx_level >= GFX10) { - add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr; - } else { - if (src1.type() == RegType::sgpr) - std::swap(src0, src1); - add_instr = - bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr; - } - add_instr->valu().clamp = 1; - break; - } else if (dst.regClass() == v1) { - uadd32_sat(bld, Definition(dst), src0, src1); - break; - } - - assert(src0.size() == 2 && src1.size() == 2); - - Temp src00 = bld.tmp(src0.type(), 1); - Temp src01 = bld.tmp(src0.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, 
Definition(src00), Definition(src01), src0); - Temp src10 = bld.tmp(src1.type(), 1); - Temp src11 = bld.tmp(src1.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); - - if (dst.regClass() == s2) { - Temp carry0 = bld.tmp(s1); - Temp carry1 = bld.tmp(s1); - - Temp no_sat0 = - bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10); - Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)), - src01, src11, bld.scc(carry0)); - - Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1); - - bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat, - bld.scc(carry1)); - } else if (dst.regClass() == v2) { - Temp no_sat0 = bld.tmp(v1); - Temp dst0 = bld.tmp(v1); - Temp dst1 = bld.tmp(v1); - - Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp(); - Temp carry1; - - if (ctx->program->gfx_level >= GFX8) { - carry1 = bld.tmp(bld.lm); - bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1), - as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0) - ->valu() - .clamp = 1; - } else { - Temp no_sat1 = bld.tmp(v1); - carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp(); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1), - carry1); - } - - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1), - carry1); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_iadd_sat: { - if (dst.regClass() == v1 && instr->def.bit_size == 16) { - Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst); - add_instr->valu().clamp = 1; - break; - } - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - if 
(dst.regClass() == s1) { - Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero()); - Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)), - Operand::c32(INT32_MAX), cond); - Temp overflow = bld.tmp(s1); - Temp add = - bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow)); - break; - } - - src1 = as_vgpr(ctx, src1); - - if (dst.regClass() == v2b) { - Instruction* add_instr = - bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr; - add_instr->valu().clamp = 1; - } else if (dst.regClass() == v1) { - Instruction* add_instr = - bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr; - add_instr->valu().clamp = 1; - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_uadd_carry: { - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.regClass() == s1) { - bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); - break; - } - if (dst.regClass() == v1) { - Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp(); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), - carry); - break; - } - - Temp src00 = bld.tmp(src0.type(), 1); - Temp src01 = bld.tmp(dst.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); - Temp src10 = bld.tmp(src1.type(), 1); - Temp src11 = bld.tmp(dst.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); - if (dst.regClass() == s2) { - Temp carry = bld.tmp(s1); - bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); - carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, - bld.scc(carry)) - .def(1) - 
.getTemp(); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero()); - } else if (dst.regClass() == v2) { - Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp(); - carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp(); - carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), - Operand::c32(1u), carry); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero()); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_isub: { - if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true); - break; - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst); - break; - } else if (dst.regClass() == s2 && ctx->program->gfx_level >= GFX12) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_u64, dst, false); - break; - } - - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.regClass() == v1) { - bld.vsub32(Definition(dst), src0, src1); - break; - } else if (dst.bytes() <= 2) { - if (ctx->program->gfx_level >= GFX10) - bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1); - else if (src1.type() == RegType::sgpr) - bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0)); - else if (ctx->program->gfx_level >= GFX8) - bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1)); - else - bld.vsub32(Definition(dst), src0, src1); - break; - } - - Temp src00 = bld.tmp(src0.type(), 1); - Temp src01 = bld.tmp(dst.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); - Temp src10 = bld.tmp(src1.type(), 1); - Temp src11 = bld.tmp(dst.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); - if (dst.regClass() == s2) { - 
Temp borrow = bld.tmp(s1); - Temp dst0 = - bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); - Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, - bld.scc(borrow)); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); - } else if (dst.regClass() == v2) { - Temp lower = bld.tmp(v1); - Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp(); - Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_usub_borrow: { - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.regClass() == s1) { - bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); - break; - } else if (dst.regClass() == v1) { - Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp(); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), - borrow); - break; - } - - Temp src00 = bld.tmp(src0.type(), 1); - Temp src01 = bld.tmp(dst.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); - Temp src10 = bld.tmp(src1.type(), 1); - Temp src11 = bld.tmp(dst.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); - if (dst.regClass() == s2) { - Temp borrow = bld.tmp(s1); - bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); - borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, - bld.scc(borrow)) - .def(1) - .getTemp(); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero()); - } else if (dst.regClass() == v2) { - Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp(); - 
borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp(); - borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), - Operand::c32(1u), borrow); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero()); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_usub_sat: { - if (dst.regClass() == v1 && instr->def.bit_size == 16) { - Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst); - sub_instr->valu().clamp = 1; - break; - } - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.regClass() == s1) { - Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); - bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry)); - break; - } else if (dst.regClass() == v2b) { - Instruction* sub_instr; - if (ctx->program->gfx_level >= GFX10) { - sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr; - } else { - aco_opcode op = aco_opcode::v_sub_u16; - if (src1.type() == RegType::sgpr) { - std::swap(src0, src1); - op = aco_opcode::v_subrev_u16; - } - sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr; - } - sub_instr->valu().clamp = 1; - break; - } else if (dst.regClass() == v1) { - usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1)); - break; - } - - assert(src0.size() == 2 && src1.size() == 2); - Temp src00 = bld.tmp(src0.type(), 1); - Temp src01 = bld.tmp(src0.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); - Temp src10 = bld.tmp(src1.type(), 1); - Temp src11 = bld.tmp(src1.type(), 1); - bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); - - if (dst.regClass() == s2) { - Temp carry0 = bld.tmp(s1); - Temp 
carry1 = bld.tmp(s1); - - Temp no_sat0 = - bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10); - Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)), - src01, src11, bld.scc(carry0)); - - Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1); - - bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat, - bld.scc(carry1)); - } else if (dst.regClass() == v2) { - Temp no_sat0 = bld.tmp(v1); - Temp dst0 = bld.tmp(v1); - Temp dst1 = bld.tmp(v1); - - Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp(); - Temp carry1; - - if (ctx->program->gfx_level >= GFX8) { - carry1 = bld.tmp(bld.lm); - bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1), - as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0) - ->valu() - .clamp = 1; - } else { - Temp no_sat1 = bld.tmp(v1); - carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp(); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u), - carry1); - } - - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u), - carry1); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_isub_sat: { - if (dst.regClass() == v1 && instr->def.bit_size == 16) { - Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst); - sub_instr->valu().clamp = 1; - break; - } - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.regClass() == s1) { - Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero()); - Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)), - Operand::c32(INT32_MAX), cond); - Temp overflow = bld.tmp(s1); - Temp 
sub = - bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow)); - break; - } - - src1 = as_vgpr(ctx, src1); - - if (dst.regClass() == v2b) { - Instruction* sub_instr = - bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr; - sub_instr->valu().clamp = 1; - } else if (dst.regClass() == v1) { - Instruction* sub_instr = - bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr; - sub_instr->valu().clamp = 1; - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_imul: { - if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst); - } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst); - } else if (dst.type() == RegType::vgpr) { - uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); - uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); - - if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { - bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff; - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, - true /* commutative */, false, false, nuw_16bit, 0x3); - } else if (nir_src_is_const(instr->src[0].src)) { - bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]), - nir_src_as_uint(instr->src[0].src), false); - } else if (nir_src_is_const(instr->src[1].src)) { - bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]), - nir_src_as_uint(instr->src[1].src), false); - } else { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst); - } - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, 
aco_opcode::s_mul_i32, dst, false); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_imul24_relaxed: { - if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_i32_i24, dst, true); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_umul24_relaxed: { - if (dst.regClass() == s1) { - Operand op1(get_alu_src(ctx, instr->src[0])); - Operand op2(get_alu_src(ctx, instr->src[1])); - op1.set24bit(true); - op2.set24bit(true); - bld.sop2(aco_opcode::s_mul_i32, Definition(dst), op1, op2); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true /* commutative */, - false, false, false, 0x3); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_umul_high: { - if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false); - } else if (dst.bytes() == 4) { - uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); - uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); - - Temp tmp = dst.regClass() == s1 ? 
bld.tmp(v1) : dst; - if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true); - } else { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp); - } - - if (dst.regClass() == s1) - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_imul_high: { - if (dst.regClass() == v1) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst); - } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false); - } else if (dst.regClass() == s1) { - Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]), - as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fmul: { - if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); - } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64_e64, dst); - } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f16, dst, false); - } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f32, dst, false); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fmulz: { - if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true); - } 
else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fadd: { - if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); - } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64_e64, dst); - } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f16, dst, false); - } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f32, dst, false); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fsub: { - if (dst.regClass() == v1 && instr->def.bit_size == 16) { - Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst); - VALU_instruction& sub = add->valu(); - sub.neg_lo[1] = true; - sub.neg_hi[1] = true; - break; - } - - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.regClass() == v2b) { - if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) - emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false); - else - emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true); - } else if (dst.regClass() == v1) { - if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) - emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); - else - emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true); - } else if (dst.regClass() == v2) { - Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), as_vgpr(ctx, src0), - as_vgpr(ctx, src1)); - add->valu().neg[1] = true; - } else if 
(dst.regClass() == s1 && instr->def.bit_size == 16) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f16, dst, false); - } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f32, dst, false); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ffma: { - if (dst.regClass() == v2b) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - assert(instr->def.num_components == 2); - - Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0])); - Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1])); - Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2])); - - /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ - unsigned opsel_lo = 0, opsel_hi = 0; - for (unsigned i = 0; i < 3; i++) { - opsel_lo |= (instr->src[i].swizzle[0] & 1) << i; - opsel_hi |= (instr->src[i].swizzle[1] & 1) << i; - } - - bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi); - emit_split_vector(ctx, dst, 2); - } else if (dst.regClass() == v1) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst, - ctx->block->fp_mode.must_flush_denorms32, 3); - } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3); - } else if (dst.regClass() == s1) { - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - Temp src2 = get_alu_src(ctx, instr->src[2]); - aco_opcode op = - instr->def.bit_size == 16 ? 
aco_opcode::s_fmac_f16 : aco_opcode::s_fmac_f32; - bld.sop2(op, Definition(dst), src0, src1, src2); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ffmaz: { - if (dst.regClass() == v1) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst, - ctx->block->fp_mode.must_flush_denorms32, 3); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fmax: { - if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true, false, - ctx->block->fp_mode.must_flush_denorms16_64); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, - ctx->block->fp_mode.must_flush_denorms32); - } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64_e64, dst, - ctx->block->fp_mode.must_flush_denorms16_64); - } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f16, dst, false); - } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f32, dst, false); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fmin: { - if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true, false, - ctx->block->fp_mode.must_flush_denorms16_64); - } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { - emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true); - } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, - ctx->block->fp_mode.must_flush_denorms32); - } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, 
aco_opcode::v_min_f64_e64, dst, - ctx->block->fp_mode.must_flush_denorms16_64); - } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f16, dst, false); - } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f32, dst, false); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_sdot_4x8_iadd: { - if (ctx->options->gfx_level >= GFX11) - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3); - else - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false); - break; - } - case nir_op_sdot_4x8_iadd_sat: { - if (ctx->options->gfx_level >= GFX11) - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3); - else - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true); - break; - } - case nir_op_sudot_4x8_iadd: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1); - break; - } - case nir_op_sudot_4x8_iadd_sat: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1); - break; - } - case nir_op_udot_4x8_uadd: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false); - break; - } - case nir_op_udot_4x8_uadd_sat: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true); - break; - } - case nir_op_sdot_2x16_iadd: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false); - break; - } - case nir_op_sdot_2x16_iadd_sat: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true); - break; - } - case nir_op_udot_2x16_uadd: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false); - break; - } - case nir_op_udot_2x16_uadd_sat: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true); - break; - } - case nir_op_bfdot2_bfadd: { - Temp src0 = as_vgpr(ctx, 
get_alu_src(ctx, instr->src[0], 2)); - Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1], 2)); - Temp src2 = get_alu_src(ctx, instr->src[2], 1); - - bld.vop3(aco_opcode::v_dot2_bf16_bf16, Definition(dst), src0, src1, src2); - break; - } - case nir_op_cube_amd: { - Temp in = get_alu_src(ctx, instr->src[0], 3); - Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), - emit_extract_vector(ctx, in, 2, v1)}; - Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]); - Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]); - Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]); - Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id); - break; - } - case nir_op_bcsel: { - emit_bcsel(ctx, instr, dst); - break; - } - case nir_op_frsq: { - if (instr->def.bit_size == 16) { - if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12) - bld.vop3(aco_opcode::v_s_rsq_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); - else - emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst); - } else if (instr->def.bit_size == 32) { - emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (instr->def.bit_size == 64) { - /* Lowered at NIR level for precision reasons. 
*/ - emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fneg: { - if (dst.regClass() == v1 && instr->def.bit_size == 16) { - Temp src = get_alu_src_vop3p(ctx, instr->src[0]); - Instruction* vop3p = - bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00), - instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); - vop3p->valu().neg_lo[0] = true; - vop3p->valu().neg_hi[0] = true; - emit_split_vector(ctx, dst, 2); - break; - } - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == v2b) { - bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src)); - } else if (dst.regClass() == v1) { - bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u), - as_vgpr(ctx, src)); - } else if (dst.regClass() == v2) { - if (ctx->block->fp_mode.must_flush_denorms16_64) - src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000), - as_vgpr(ctx, src)); - Temp upper = bld.tmp(v1), lower = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); - upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); - } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { - bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0xbc00u), src); - } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { - bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0xbf800000u), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fabs: { - if (dst.regClass() == v1 && instr->def.bit_size == 16) { - Temp src = get_alu_src_vop3p(ctx, instr->src[0]); - Instruction* vop3p = - bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src, - 
instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0) - .instr; - vop3p->valu().neg_lo[1] = true; - vop3p->valu().neg_hi[1] = true; - emit_split_vector(ctx, dst, 2); - break; - } - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == v2b) { - Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), - Operand::c16(0x3c00), as_vgpr(ctx, src)) - .instr; - mul->valu().abs[1] = true; - } else if (dst.regClass() == v1) { - Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst), - Operand::c32(0x3f800000u), as_vgpr(ctx, src)) - .instr; - mul->valu().abs[1] = true; - } else if (dst.regClass() == v2) { - if (ctx->block->fp_mode.must_flush_denorms16_64) - src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000), - as_vgpr(ctx, src)); - Temp upper = bld.tmp(v1), lower = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); - upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); - } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { - Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fff)); - if (ctx->block->fp_mode.denorm16_64 == fp_denorm_keep) { - bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src); - } else { - Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src); - bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp); - } - } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { - Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff)); - if (ctx->block->fp_mode.denorm32 == fp_denorm_keep) { - bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src); - } else { - Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src); - bld.sop2(aco_opcode::s_mul_f32, Definition(dst), 
Operand::c32(0x3f800000), tmp); - } - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fsat: { - if (dst.regClass() == v1 && instr->def.bit_size == 16) { - Temp src = get_alu_src_vop3p(ctx, instr->src[0]); - Instruction* vop3p = - bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00), - instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); - vop3p->valu().clamp = true; - emit_split_vector(ctx, dst, 2); - break; - } - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) { - bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00), - src); - } else if (dst.regClass() == v2b) { - bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), src) - ->valu() - .clamp = true; - } else if (dst.regClass() == v1) { - bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(), - Operand::c32(0x3f800000u), src); - /* apparently, it is not necessary to flush denorms if this instruction is used with these - * operands */ - // TODO: confirm that this holds under any circumstances - } else if (dst.regClass() == v2) { - Instruction* add = - bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src, Operand::zero()); - add->valu().clamp = true; - } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { - Temp low = bld.sop2(aco_opcode::s_max_f16, bld.def(s1), src, Operand::c16(0)); - bld.sop2(aco_opcode::s_min_f16, Definition(dst), low, Operand::c16(0x3C00)); - } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { - Temp low = bld.sop2(aco_opcode::s_max_f32, bld.def(s1), src, Operand::c32(0)); - bld.sop2(aco_opcode::s_min_f32, Definition(dst), low, Operand::c32(0x3f800000)); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_flog2: { - if (instr->def.bit_size == 16) { - if (dst.regClass() == s1 && ctx->program->gfx_level >= 
GFX12) - bld.vop3(aco_opcode::v_s_log_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); - else - emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst); - } else if (instr->def.bit_size == 32) { - emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_frcp: { - if (instr->def.bit_size == 16) { - if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12) - bld.vop3(aco_opcode::v_s_rcp_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); - else - emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst); - } else if (instr->def.bit_size == 32) { - emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (instr->def.bit_size == 64) { - /* Lowered at NIR level for precision reasons. */ - emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fexp2: { - if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX12) { - aco_opcode opcode = - instr->def.bit_size == 16 ? 
aco_opcode::v_s_exp_f16 : aco_opcode::v_s_exp_f32; - bld.vop3(opcode, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (instr->def.bit_size == 16) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst); - } else if (instr->def.bit_size == 32) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fsqrt: { - if (instr->def.bit_size == 16) { - if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12) - bld.vop3(aco_opcode::v_s_sqrt_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); - else - emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst); - } else if (instr->def.bit_size == 32) { - emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (instr->def.bit_size == 64) { - /* Lowered at NIR level for precision reasons. */ - emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ffract: { - if (dst.regClass() == v2b) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst); - } else if (dst.regClass() == v1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); - } else if (dst.regClass() == v2) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst); - } else if (dst.regClass() == s1) { - Temp src = get_alu_src(ctx, instr->src[0]); - aco_opcode op = - instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32; - Temp floor = bld.sop1(op, bld.def(s1), src); - op = instr->def.bit_size == 16 ? 
aco_opcode::s_sub_f16 : aco_opcode::s_sub_f32; - bld.sop2(op, Definition(dst), src, floor); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ffloor: { - if (dst.regClass() == v2b) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst); - } else if (dst.regClass() == v1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); - } else if (dst.regClass() == v2) { - Temp src = get_alu_src(ctx, instr->src[0]); - emit_floor_f64(ctx, bld, Definition(dst), src); - } else if (dst.regClass() == s1) { - Temp src = get_alu_src(ctx, instr->src[0]); - aco_opcode op = - instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32; - bld.sop1(op, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fceil: { - if (dst.regClass() == v2b) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst); - } else if (dst.regClass() == v1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); - } else if (dst.regClass() == v2) { - if (ctx->options->gfx_level >= GFX7) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst); - } else { - /* GFX6 doesn't support V_CEIL_F64, lower it. 
*/ - /* trunc = trunc(src0) - * if (src0 > 0.0 && src0 != trunc) - * trunc += 1.0 - */ - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0); - Temp tmp0 = - bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero()); - Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc); - Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1); - Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - bld.copy(bld.def(v1), Operand::zero()), - bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond); - add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), - bld.copy(bld.def(v1), Operand::zero()), add); - bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), trunc, add); - } - } else if (dst.regClass() == s1) { - Temp src = get_alu_src(ctx, instr->src[0]); - aco_opcode op = - instr->def.bit_size == 16 ? aco_opcode::s_ceil_f16 : aco_opcode::s_ceil_f32; - bld.sop1(op, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ftrunc: { - if (dst.regClass() == v2b) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst); - } else if (dst.regClass() == v1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); - } else if (dst.regClass() == v2) { - Temp src = get_alu_src(ctx, instr->src[0]); - emit_trunc_f64(ctx, bld, Definition(dst), src); - } else if (dst.regClass() == s1) { - Temp src = get_alu_src(ctx, instr->src[0]); - aco_opcode op = - instr->def.bit_size == 16 ? 
aco_opcode::s_trunc_f16 : aco_opcode::s_trunc_f32; - bld.sop1(op, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fround_even: { - if (dst.regClass() == v2b) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst); - } else if (dst.regClass() == v1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); - } else if (dst.regClass() == v2) { - if (ctx->options->gfx_level >= GFX7) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst); - } else { - /* GFX6 doesn't support V_RNDNE_F64, lower it. */ - Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1); - Temp src0 = get_alu_src(ctx, instr->src[0]); - bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0); - - Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), - bld.copy(bld.def(s1), Operand::c32(-2u))); - Temp bfi = - bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, - bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi)); - Temp tmp = - bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), src0, - bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi)); - Instruction* sub = - bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), tmp, - bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi)); - sub->valu().neg[1] = true; - tmp = sub->definitions[0].getTemp(); - - Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u), - Operand::c32(0x432fffffu)); - Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v); - vop3->valu().abs[0] = true; - Temp cond = vop3->definitions[0].getTemp(); - - Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp); - Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, - as_vgpr(ctx, src0_lo), cond); - Temp dst1 = 
bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, - as_vgpr(ctx, src0_hi), cond); - - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); - } - } else if (dst.regClass() == s1) { - Temp src = get_alu_src(ctx, instr->src[0]); - aco_opcode op = - instr->def.bit_size == 16 ? aco_opcode::s_rndne_f16 : aco_opcode::s_rndne_f32; - bld.sop1(op, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fsin_amd: - case nir_op_fcos_amd: { - if (instr->def.bit_size == 16 || instr->def.bit_size == 32) { - bool is_sin = instr->op == nir_op_fsin_amd; - aco_opcode opcode, fract; - RegClass rc; - if (instr->def.bit_size == 16) { - opcode = is_sin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; - fract = aco_opcode::v_fract_f16; - rc = v2b; - } else { - opcode = is_sin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; - fract = aco_opcode::v_fract_f32; - rc = v1; - } - - Temp src = get_alu_src(ctx, instr->src[0]); - /* before GFX9, v_sin and v_cos had a valid input domain of [-256, +256] */ - if (ctx->options->gfx_level < GFX9) - src = bld.vop1(fract, bld.def(rc), src); - - if (dst.regClass() == rc) { - bld.vop1(opcode, Definition(dst), src); - } else { - Temp tmp = bld.vop1(opcode, bld.def(rc), src); - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); - } - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ldexp: { - if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false); - } else if (dst.regClass() == v1) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst); - } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_frexp_sig: { - if (dst.regClass() == v2b) { - emit_vop1_instruction(ctx, instr, 
aco_opcode::v_frexp_mant_f16, dst); - } else if (dst.regClass() == v1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst); - } else if (dst.regClass() == v2) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_frexp_exp: { - if (instr->src[0].src.ssa->bit_size == 16) { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src); - tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero()); - convert_int(ctx, bld, tmp, 8, 32, true, dst); - } else if (instr->src[0].src.ssa->bit_size == 32) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst); - } else if (instr->src[0].src.ssa->bit_size == 64) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fsign: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == v2b) { - /* replace negative zero with positive zero */ - src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), as_vgpr(ctx, src)); - if (ctx->program->gfx_level >= GFX9) { - src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, - Operand::c16(1u)); - bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); - } else { - src = convert_int(ctx, bld, src, 16, 32, true); - src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, - Operand::c32(1u)); - bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); - } - } else if (dst.regClass() == v1) { - /* Legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers - * the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0. 
- */ - Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000)); - src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, as_vgpr(ctx, src)); - bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src, - Operand::c32(0xbf800000)); - } else if (dst.regClass() == v2) { - src = as_vgpr(ctx, src); - Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src); - Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u)); - Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, - emit_extract_vector(ctx, src, 1, v1), cond); - - cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src); - tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u)); - upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond); - - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper); - } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { - Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f16, bld.def(s1, scc), Operand::c16(0), src); - src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3c00), src, - bld.scc(cond)); - cond = bld.sopc(aco_opcode::s_cmp_ge_f16, bld.def(s1, scc), src, Operand::c16(0)); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbc00), - bld.scc(cond)); - } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { - Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f32, bld.def(s1, scc), Operand::c32(0), src); - src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3f800000), src, - bld.scc(cond)); - cond = bld.sopc(aco_opcode::s_cmp_ge_f32, bld.def(s1, scc), src, Operand::c32(0)); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbf800000), - bld.scc(cond)); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_f2f16: - case nir_op_f2f16_rtne: { - assert(instr->src[0].src.ssa->bit_size == 32); - if 
(instr->def.num_components == 2) { - /* Vectorizing f2f16 is only possible with rtz. */ - assert(instr->op != nir_op_f2f16_rtne); - assert(ctx->block->fp_mode.round16_64 == fp_round_tz || - !ctx->block->fp_mode.care_about_round16_64); - emit_vec2_f2f16(ctx, instr, dst); - break; - } - Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) { - /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to - * keep value numbering and the scheduler simpler. - */ - if (dst.regClass() == v2b) - bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, Definition(dst), src); - else - bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, Definition(dst), src); - } else { - if (dst.regClass() == v2b) - bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); - else - bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src); - } - break; - } - case nir_op_f2f16_rtz: { - assert(instr->src[0].src.ssa->bit_size == 32); - if (instr->def.num_components == 2) { - emit_vec2_f2f16(ctx, instr, dst); - break; - } - Temp src = get_alu_src(ctx, instr->src[0]); - if (ctx->block->fp_mode.round16_64 == fp_round_tz) { - if (dst.regClass() == v2b) - bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); - else - bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src); - } else if (dst.regClass() == s1) { - bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src, Operand::zero()); - } else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) { - bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero()); - } else { - bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src)); - } - break; - } - case nir_op_f2f32: { - if (dst.regClass() == s1) { - assert(instr->src[0].src.ssa->bit_size == 16); - Temp src = get_alu_src(ctx, instr->src[0]); - bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src); - } else if (instr->src[0].src.ssa->bit_size == 16) { - 
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst); - } else if (instr->src[0].src.ssa->bit_size == 64) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_f2f64: { - assert(instr->src[0].src.ssa->bit_size == 32); - Temp src = get_alu_src(ctx, instr->src[0]); - bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src); - break; - } - case nir_op_i2f16: { - Temp src = get_alu_src(ctx, instr->src[0]); - const unsigned input_size = instr->src[0].src.ssa->bit_size; - if (dst.regClass() == v2b) { - if (input_size <= 16) { - /* Expand integer to the size expected by the uint→float converter used below */ - unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32); - if (input_size != target_size) { - src = convert_int(ctx, bld, src, input_size, target_size, true); - } - } - - if (ctx->program->gfx_level >= GFX8 && input_size <= 16) { - bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); - } else { - /* Large 32bit inputs need to return +-inf/FLOAT_MAX. - * - * This is also the fallback-path taken on GFX7 and earlier, which - * do not support direct f16⟷i16 conversions. 
- */ - src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src); - bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); - } - } else if (dst.regClass() == s1) { - if (input_size <= 16) { - src = convert_int(ctx, bld, src, input_size, 32, true); - } - src = bld.sop1(aco_opcode::s_cvt_f32_i32, bld.def(s1), src); - bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_i2f32: { - assert(dst.size() == 1); - Temp src = get_alu_src(ctx, instr->src[0]); - const unsigned input_size = instr->src[0].src.ssa->bit_size; - if (input_size <= 32) { - if (input_size <= 16) { - /* Sign-extend to 32-bits */ - src = convert_int(ctx, bld, src, input_size, 32, true); - } - if (dst.regClass() == v1) - bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); - else - bld.sop1(aco_opcode::s_cvt_f32_i32, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_i2f64: { - if (instr->src[0].src.ssa->bit_size <= 32) { - Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size <= 16) - src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); - bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_u2f16: { - Temp src = get_alu_src(ctx, instr->src[0]); - const unsigned input_size = instr->src[0].src.ssa->bit_size; - if (dst.regClass() == v2b) { - if (input_size <= 16) { - /* Expand integer to the size expected by the uint→float converter used below */ - unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 
16 : 32); - if (input_size != target_size) { - src = convert_int(ctx, bld, src, input_size, target_size, false); - } - } - - if (ctx->program->gfx_level >= GFX8 && input_size <= 16) { - bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src); - } else { - /* Large 32bit inputs need to return inf/FLOAT_MAX. - * - * This is also the fallback-path taken on GFX7 and earlier, which - * do not support direct f16⟷u16 conversions. - */ - src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src); - bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); - } - } else if (dst.regClass() == s1) { - if (input_size <= 16) { - src = convert_int(ctx, bld, src, input_size, 32, false); - } - src = bld.sop1(aco_opcode::s_cvt_f32_u32, bld.def(s1), src); - bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_u2f32: { - assert(dst.size() == 1); - Temp src = get_alu_src(ctx, instr->src[0]); - const unsigned input_size = instr->src[0].src.ssa->bit_size; - if (input_size == 8 && dst.regClass() == v1) { - bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src); - } else if (input_size <= 32) { - if (input_size <= 16) - src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); - if (dst.regClass() == v1) - bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src); - else - bld.sop1(aco_opcode::s_cvt_f32_u32, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_u2f64: { - if (instr->src[0].src.ssa->bit_size <= 32) { - Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size <= 16) - src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); - bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_f2i8: - case nir_op_f2i16: { - 
if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 && - ctx->program->gfx_level >= GFX11_5) { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = bld.as_uniform(src); - if (instr->src[0].src.ssa->bit_size == 16) - tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp); - bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp); - } else if (instr->src[0].src.ssa->bit_size == 16) { - if (ctx->program->gfx_level >= GFX8) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst); - } else { - /* GFX7 and earlier do not support direct f16⟷i16 conversions */ - Temp tmp = bld.tmp(v1); - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp); - tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp); - tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false, - (dst.type() == RegType::sgpr) ? Temp() : dst); - if (dst.type() == RegType::sgpr) { - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); - } - } - } else if (instr->src[0].src.ssa->bit_size == 32) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); - } else { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); - } - break; - } - case nir_op_f2u8: - case nir_op_f2u16: { - if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 && - ctx->program->gfx_level >= GFX11_5) { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = bld.as_uniform(src); - if (instr->src[0].src.ssa->bit_size == 16) - tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp); - bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp); - } else if (instr->src[0].src.ssa->bit_size == 16) { - if (ctx->program->gfx_level >= GFX8) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst); - } else { - /* GFX7 and earlier do not support direct f16⟷u16 conversions */ - Temp tmp = bld.tmp(v1); - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp); - tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), 
tmp); - tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false, - (dst.type() == RegType::sgpr) ? Temp() : dst); - if (dst.type() == RegType::sgpr) { - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); - } - } - } else if (instr->src[0].src.ssa->bit_size == 32) { - if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX11) - bld.vop3(aco_opcode::p_v_cvt_pk_u8_f32, Definition(dst), - get_alu_src(ctx, instr->src[0])); - else - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); - } else { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); - } - break; - } - case nir_op_f2i32: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 && - ctx->program->gfx_level >= GFX11_5) { - Temp tmp = bld.as_uniform(src); - if (instr->src[0].src.ssa->bit_size == 16) - tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp); - bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp); - } else if (instr->src[0].src.ssa->bit_size == 16) { - Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); - if (dst.type() == RegType::vgpr) { - bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp); - } else { - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp)); - } - } else if (instr->src[0].src.ssa->bit_size == 32) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); - } else if (instr->src[0].src.ssa->bit_size == 64) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_f2u32: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 && - ctx->program->gfx_level >= GFX11_5) { - Temp tmp = bld.as_uniform(src); - if (instr->src[0].src.ssa->bit_size == 16) - tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, 
bld.def(s1), tmp); - bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp); - } else if (instr->src[0].src.ssa->bit_size == 16) { - Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); - if (dst.type() == RegType::vgpr) { - bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp); - } else { - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp)); - } - } else if (instr->src[0].src.ssa->bit_size == 32) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); - } else if (instr->src[0].src.ssa->bit_size == 64) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_b2f16: { - Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == bld.lm); - - if (dst.regClass() == s1) { - src = bool_to_scalar_condition(ctx, src); - bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src); - } else if (dst.regClass() == v2b) { - Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u)); - bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src); - } else { - unreachable("Wrong destination register class for nir_op_b2f16."); - } - break; - } - case nir_op_b2f32: { - Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == bld.lm); - - if (dst.regClass() == s1) { - src = bool_to_scalar_condition(ctx, src); - bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src); - } else if (dst.regClass() == v1) { - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), - Operand::c32(0x3f800000u), src); - } else { - unreachable("Wrong destination register class for nir_op_b2f32."); - } - break; - } - case nir_op_b2f64: { - Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == bld.lm); - - if (dst.regClass() == s2) { - src = bool_to_scalar_condition(ctx, src); - 
bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u), - Operand::zero(), bld.scc(src)); - } else if (dst.regClass() == v2) { - Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u)); - Temp upper = - bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper); - } else { - unreachable("Wrong destination register class for nir_op_b2f64."); - } - break; - } - case nir_op_i2i8: - case nir_op_i2i16: - case nir_op_i2i32: { - if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { - /* no need to do the extract in get_alu_src() */ - sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size - ? sgpr_extract_sext - : sgpr_extract_undef; - extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); - } else { - const unsigned input_bitsize = instr->src[0].src.ssa->bit_size; - const unsigned output_bitsize = instr->def.bit_size; - convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize, - output_bitsize > input_bitsize, dst); - } - break; - } - case nir_op_u2u8: - case nir_op_u2u16: - case nir_op_u2u32: { - if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { - /* no need to do the extract in get_alu_src() */ - sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size - ? 
sgpr_extract_zext - : sgpr_extract_undef; - extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); - } else { - convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, - instr->def.bit_size, false, dst); - } - break; - } - case nir_op_b2b32: - case nir_op_b2i8: - case nir_op_b2i16: - case nir_op_b2i32: { - Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == bld.lm); - - if (dst.regClass() == s1) { - bool_to_scalar_condition(ctx, src, dst); - } else if (dst.type() == RegType::vgpr) { - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), - src); - } else { - unreachable("Invalid register class for b2i32"); - } - break; - } - case nir_op_b2b1: { - Temp src = get_alu_src(ctx, instr->src[0]); - assert(dst.regClass() == bld.lm); - - if (src.type() == RegType::vgpr) { - assert(src.regClass() == v1 || src.regClass() == v2); - assert(dst.regClass() == bld.lm); - bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32, - Definition(dst), Operand::zero(), src); - } else { - assert(src.regClass() == s1 || src.regClass() == s2); - Temp tmp; - if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) { - tmp = - bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src) - .def(1) - .getTemp(); - } else { - tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32, - bld.scc(bld.def(s1)), Operand::zero(), src); - } - bool_to_vector_condition(ctx, tmp, dst); - } - break; - } - case nir_op_unpack_64_2x32: - case nir_op_unpack_32_2x16: - case nir_op_unpack_64_4x16: - case nir_op_unpack_32_4x8: - bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); - emit_split_vector( - ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2); - break; - case nir_op_pack_64_2x32_split: { - Operand src[2]; - RegClass elem_rc = dst.regClass() == s2 ? 
s1 : v1; - for (unsigned i = 0; i < 2; i++) { - if (nir_src_is_undef(instr->src[i].src)) - src[i] = Operand(elem_rc); - else - src[i] = Operand(get_alu_src(ctx, instr->src[i])); - } - - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src[0], src[1]); - break; - } - case nir_op_unpack_64_2x32_split_x: - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), - get_alu_src(ctx, instr->src[0])); - break; - case nir_op_unpack_64_2x32_split_y: - bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), - get_alu_src(ctx, instr->src[0])); - break; - case nir_op_unpack_32_2x16_split_x: - if (dst.type() == RegType::vgpr) { - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), - get_alu_src(ctx, instr->src[0])); - } else { - bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); - } - break; - case nir_op_unpack_32_2x16_split_y: - if (dst.type() == RegType::vgpr) { - bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), - get_alu_src(ctx, instr->src[0])); - } else { - bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), - get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u), - Operand::zero()); - } - break; - case nir_op_pack_32_2x16_split: { - Operand src0 = Operand(get_alu_src(ctx, instr->src[0])); - Operand src1 = Operand(get_alu_src(ctx, instr->src[1])); - if (dst.regClass() == v1) { - if (nir_src_is_undef(instr->src[0].src)) - src0 = Operand(v2b); - else - src0 = Operand(emit_extract_vector(ctx, src0.getTemp(), 0, v2b)); - - if (nir_src_is_undef(instr->src[1].src)) - src1 = Operand(v2b); - else - src1 = Operand(emit_extract_vector(ctx, src1.getTemp(), 0, v2b)); - - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); - } else if (nir_src_is_undef(instr->src[1].src)) { - bld.copy(Definition(dst), src0); - } else if (nir_src_is_undef(instr->src[0].src)) { - bld.pseudo(aco_opcode::p_insert, Definition(dst), 
bld.def(s1, scc), src1, Operand::c32(1), - Operand::c32(16)); - } else if (ctx->program->gfx_level >= GFX9) { - bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(dst), src0, src1); - } else { - src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, - Operand::c32(0xFFFFu)); - src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, - Operand::c32(16u)); - bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1); - } - break; - } - case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break; - case nir_op_pack_half_2x16_rtz_split: - case nir_op_pack_half_2x16_split: { - if (dst.regClass() == v1) { - if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) - emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst); - else - emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false); - } else if (dst.regClass() == s1) { - emit_sop2_instruction(ctx, instr, aco_opcode::s_cvt_pk_rtz_f16_f32, dst, false); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_pack_unorm_2x16: - case nir_op_pack_snorm_2x16: { - unsigned bit_size = instr->src[0].src.ssa->bit_size; - /* Only support 16 and 32bit. */ - assert(bit_size == 32 || bit_size == 16); - - RegClass src_rc = bit_size == 32 ? v1 : v2b; - Temp src = get_alu_src(ctx, instr->src[0], 2); - Temp src0 = emit_extract_vector(ctx, src, 0, src_rc); - Temp src1 = emit_extract_vector(ctx, src, 1, src_rc); - - /* Work around for pre-GFX9 GPU which don't have fp16 pknorm instruction. */ - if (bit_size == 16 && ctx->program->gfx_level < GFX9) { - src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0); - src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1); - bit_size = 32; - } - - aco_opcode opcode; - if (bit_size == 32) { - opcode = instr->op == nir_op_pack_unorm_2x16 ? 
aco_opcode::v_cvt_pknorm_u16_f32 - : aco_opcode::v_cvt_pknorm_i16_f32; - } else { - opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16 - : aco_opcode::v_cvt_pknorm_i16_f16; - } - bld.vop3(opcode, Definition(dst), src0, src1); - break; - } - case nir_op_pack_uint_2x16: - case nir_op_pack_sint_2x16: { - Temp src = get_alu_src(ctx, instr->src[0], 2); - Temp src0 = emit_extract_vector(ctx, src, 0, v1); - Temp src1 = emit_extract_vector(ctx, src, 1, v1); - aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32 - : aco_opcode::v_cvt_pk_i16_i32; - bld.vop3(opcode, Definition(dst), src0, src1); - break; - } - case nir_op_unpack_half_2x16_split_x: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == s1) { - bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src); - break; - } - if (src.regClass() == v1) - src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src); - if (dst.regClass() == v1) { - bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_unpack_half_2x16_split_y: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == s1) { - bld.sop1(aco_opcode::s_cvt_hi_f32_f16, Definition(dst), src); - break; - } - if (src.regClass() == s1) - src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src, - Operand::c32(1u), Operand::c32(16u), Operand::zero()); - else - src = - bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp(); - if (dst.regClass() == v1) { - bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_msad_4x8: { - assert(dst.regClass() == v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true); - break; - } - case nir_op_mqsad_4x8: { - 
assert(dst.regClass() == v4); - Temp ref = get_alu_src(ctx, instr->src[0]); - Temp src = get_alu_src(ctx, instr->src[1], 2); - Temp accum = get_alu_src(ctx, instr->src[2], 4); - bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src), as_vgpr(ctx, ref), - as_vgpr(ctx, accum)); - emit_split_vector(ctx, dst, 4); - break; - } - case nir_op_shfr: { - if (dst.regClass() == s1) { - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - - Temp amount; - if (nir_src_is_const(instr->src[2].src)) { - unsigned camount = nir_src_as_uint(instr->src[2].src) & 0x1f; - if (camount == 16 && ctx->program->gfx_level >= GFX11) { - bld.sop2(aco_opcode::s_pack_hl_b32_b16, Definition(dst), src1, src0); - break; - } - amount = bld.copy(bld.def(s1), Operand::c32(camount)); - } else if (get_alu_src_ub(ctx, instr, 2) >= 32) { - amount = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), - get_alu_src(ctx, instr->src[2]), Operand::c32(0x1f)); - } else { - amount = get_alu_src(ctx, instr->src[2]); - } - - Temp src = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), src1, src0); - - Temp res = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), src, amount); - bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), res, Operand::zero()); - } else if (dst.regClass() == v1) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbit_b32, dst, false, 3u); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_alignbyte_amd: { - if (dst.regClass() == v1) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbyte_b32, dst, false, 3u); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_fquantize2f16: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == v1) { - Temp f16; - if (ctx->block->fp_mode.round16_64 != fp_round_ne) - f16 = bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, bld.def(v2b), src); 
- else - f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src); - - if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) { - bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), f16); - break; - } - - Temp denorm_zero; - Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); - if (ctx->program->gfx_level >= GFX8) { - /* value is negative/positive denormal value/zero */ - Instruction* tmp0 = - bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, Operand::c32(0x30)); - tmp0->valu().abs[0] = true; - tmp0->valu().neg[0] = true; - denorm_zero = tmp0->definitions[0].getTemp(); - } else { - /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, - * so compare the result and flush to 0 if it's smaller. - */ - Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u)); - Instruction* tmp0 = - bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest); - tmp0->valu().abs[0] = true; - denorm_zero = tmp0->definitions[0].getTemp(); - } - if (nir_alu_instr_is_signed_zero_preserve(instr)) { - Temp copysign_0 = - bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src)); - bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), f32, copysign_0, denorm_zero); - } else { - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), f32, Operand::zero(), - denorm_zero); - } - } else if (dst.regClass() == s1) { - Temp f16; - if (ctx->block->fp_mode.round16_64 != fp_round_ne) - f16 = bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, bld.def(s1), src); - else - f16 = bld.sop1(aco_opcode::s_cvt_f16_f32, bld.def(s1), src); - - if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) { - bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), f16); - } else { - Temp f32 = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), f16); - Temp abs_mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff)); - Temp abs = - bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask); - Operand sign; - if 
(nir_alu_instr_is_signed_zero_preserve(instr)) { - sign = - bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask); - } else { - sign = Operand::c32(0); - } - Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u)); - Temp denorm_zero = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, smallest); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), sign, f32, bld.scc(denorm_zero)); - } - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_bfm: { - Temp bits = get_alu_src(ctx, instr->src[0]); - Temp offset = get_alu_src(ctx, instr->src[1]); - - if (dst.regClass() == s1) { - bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset); - } else if (dst.regClass() == v1) { - bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_bitfield_select: { - - /* dst = (insert & bitmask) | (base & ~bitmask) */ - if (dst.regClass() == s1) { - Temp bitmask = get_alu_src(ctx, instr->src[0]); - Temp insert = get_alu_src(ctx, instr->src[1]); - Temp base = get_alu_src(ctx, instr->src[2]); - aco_ptr sop2; - nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src); - nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src); - - if (const_bitmask && ctx->program->gfx_level >= GFX9 && - (const_bitmask->u32 == 0xffff || const_bitmask->u32 == 0xffff0000)) { - if (const_bitmask->u32 == 0xffff) { - bld.sop2(aco_opcode::s_pack_lh_b32_b16, Definition(dst), insert, base); - } else { - bld.sop2(aco_opcode::s_pack_lh_b32_b16, Definition(dst), base, insert); - } - break; - } - - Operand lhs; - if (const_insert && const_bitmask) { - lhs = Operand::c32(const_insert->u32 & const_bitmask->u32); - } else { - insert = - bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); - lhs = Operand(insert); - } - - Operand rhs; - 
nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src); - if (const_base && const_bitmask) { - rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32); - } else { - base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask); - rhs = Operand(base); - } - - bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs); - - } else if (dst.regClass() == v1) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_ubfe: - case nir_op_ibfe: { - if (dst.bytes() != 4) - unreachable("Unsupported BFE bit size"); - - if (dst.type() == RegType::sgpr) { - Temp base = get_alu_src(ctx, instr->src[0]); - - nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src); - nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src); - aco_opcode opcode = - instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32; - if (const_offset && const_bits) { - uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f); - bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract)); - break; - } - - Temp offset = get_alu_src(ctx, instr->src[1]); - Temp bits = get_alu_src(ctx, instr->src[2]); - - if (ctx->program->gfx_level >= GFX9) { - Operand bits_op = const_bits ? 
Operand::c32(const_bits->u32 & 0x1f) - : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), - bld.def(s1, scc), bits, Operand::c32(0x1fu)); - Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op); - bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract); - } else if (instr->op == nir_op_ubfe) { - Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset); - Temp masked = - bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask); - bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset); - } else { - Operand bits_op = const_bits - ? Operand::c32((const_bits->u32 & 0x1f) << 16) - : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), - bld.sop2(aco_opcode::s_and_b32, bld.def(s1), - bld.def(s1, scc), bits, Operand::c32(0x1fu)), - Operand::c32(16u)); - Operand offset_op = const_offset - ? Operand::c32(const_offset->u32 & 0x1fu) - : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), - offset, Operand::c32(0x1fu)); - - Temp extract = - bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op); - bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract); - } - - } else { - aco_opcode opcode = - instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32; - emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3); - } - break; - } - case nir_op_extract_u8: - case nir_op_extract_i8: - case nir_op_extract_u16: - case nir_op_extract_i16: { - bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8; - unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2; - uint32_t bits = comp == 4 ? 
8 : 16; - unsigned index = nir_src_as_uint(instr->src[1].src); - if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) { - assert(index == 0); - bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { - Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa); - unsigned swizzle = instr->src[0].swizzle[0]; - if (vec.size() > 1) { - vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); - swizzle = swizzle & 1; - } - index += swizzle * instr->def.bit_size / bits; - bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec), - Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed)); - } else if (dst.regClass() == s1) { - Temp src = get_alu_src(ctx, instr->src[0]); - bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(src), - Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed)); - } else if (dst.regClass() == s2) { - Temp src = get_alu_src(ctx, instr->src[0]); - aco_opcode op = is_signed ? 
aco_opcode::s_bfe_i64 : aco_opcode::s_bfe_u64; - Temp extract = bld.copy(bld.def(s1), Operand::c32((bits << 16) | (index * bits))); - bld.sop2(op, Definition(dst), bld.def(s1, scc), src, extract); - } else { - assert(dst.regClass().type() == RegType::vgpr); - Temp src = get_alu_src(ctx, instr->src[0]); - Definition def(dst); - - if (dst.bytes() == 8) { - src = emit_extract_vector(ctx, src, index / comp, v1); - index %= comp; - def = bld.def(v1); - } - - assert(def.bytes() <= 4); - src = emit_extract_vector(ctx, src, 0, def.regClass()); - bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index), - Operand::c32(bits), Operand::c32(is_signed)); - - if (dst.size() == 2) { - Temp lo = def.getTemp(); - Operand hi = Operand::zero(); - if (is_signed) - hi = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31), lo); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); - } - } - break; - } - case nir_op_insert_u8: - case nir_op_insert_u16: { - unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2; - uint32_t bits = comp == 4 ? 
8 : 16; - unsigned index = nir_src_as_uint(instr->src[1].src); - if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) { - assert(index == 0); - bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); - } else { - Temp src = get_alu_src(ctx, instr->src[0]); - Definition def(dst); - bool swap = false; - if (dst.bytes() == 8) { - src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1)); - swap = index >= comp; - index %= comp; - def = bld.def(src.type(), 1); - } - if (def.regClass() == s1) { - bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src), - Operand::c32(index), Operand::c32(bits)); - } else { - src = emit_extract_vector(ctx, src, 0, def.regClass()); - bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index), - Operand::c32(bits)); - } - if (dst.size() == 2 && swap) - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), - def.getTemp()); - else if (dst.size() == 2) - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(), - Operand::zero()); - } - break; - } - case nir_op_bit_count: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (src.regClass() == s1) { - bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src); - } else if (src.regClass() == v1) { - bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero()); - } else if (src.regClass() == v2) { - bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1), - bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), - emit_extract_vector(ctx, src, 0, v1), Operand::zero())); - } else if (src.regClass() == s2) { - bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - break; - } - case nir_op_flt: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, - aco_opcode::v_cmp_lt_f64, - ctx->program->gfx_level >= 
GFX11_5 ? aco_opcode::s_cmp_lt_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_fge: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, - aco_opcode::v_cmp_ge_f64, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_fltu: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_nge_f16, aco_opcode::v_cmp_nge_f32, - aco_opcode::v_cmp_nge_f64, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_fgeu: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_nlt_f16, aco_opcode::v_cmp_nlt_f32, - aco_opcode::v_cmp_nlt_f64, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_feq: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, - aco_opcode::v_cmp_eq_f64, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_fneu: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, - aco_opcode::v_cmp_neq_f64, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? 
aco_opcode::s_cmp_neq_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_fequ: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_nlg_f16, aco_opcode::v_cmp_nlg_f32, - aco_opcode::v_cmp_nlg_f64, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_fneo: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_lg_f16, aco_opcode::v_cmp_lg_f32, - aco_opcode::v_cmp_lg_f64, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_funord: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_u_f16, aco_opcode::v_cmp_u_f32, aco_opcode::v_cmp_u_f64, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_ford: { - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_o_f16, aco_opcode::v_cmp_o_f32, aco_opcode::v_cmp_o_f64, - ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f16 : aco_opcode::num_opcodes, - ctx->program->gfx_level >= GFX11_5 ? 
aco_opcode::s_cmp_o_f32 : aco_opcode::num_opcodes); - break; - } - case nir_op_ilt: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, - aco_opcode::v_cmp_lt_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_i32); - break; - } - case nir_op_ige: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, - aco_opcode::v_cmp_ge_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_i32); - break; - } - case nir_op_ieq: { - if (instr->src[0].src.ssa->bit_size == 1) - emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); - else - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, - aco_opcode::v_cmp_eq_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_eq_i32, - ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); - break; - } - case nir_op_ine: { - if (instr->src[0].src.ssa->bit_size == 1) - emit_boolean_logic(ctx, instr, Builder::s_xor, dst); - else - emit_comparison( - ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, - aco_opcode::v_cmp_lg_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lg_i32, - ctx->program->gfx_level >= GFX8 ? 
aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); - break; - } - case nir_op_ult: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, - aco_opcode::v_cmp_lt_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_u32); - break; - } - case nir_op_uge: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, - aco_opcode::v_cmp_ge_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_u32); - break; - } - case nir_op_bitz: - case nir_op_bitnz: { - assert(instr->src[0].src.ssa->bit_size != 1); - bool test0 = instr->op == nir_op_bitz; - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); - bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr; - if (!use_valu) { - aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64 - : aco_opcode::s_bitcmp1_b32; - if (test0) - op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64 - : aco_opcode::s_bitcmp0_b32; - emit_sopc_instruction(ctx, instr, op, dst); - break; - } - - /* We do not have a VALU version of s_bitcmp. - * But if the second source is constant, we can use - * v_cmp_class_f32's LUT to check the bit. - * The LUT only has 10 entries, so extract a higher byte if we have to. - * For sign bits comparision with 0 is better because v_cmp_class - * can't be inverted. - */ - if (nir_src_is_const(instr->src[1].src)) { - uint32_t bit = nir_alu_src_as_uint(instr->src[1]); - bit &= instr->src[0].src.ssa->bit_size - 1; - src0 = as_vgpr(ctx, src0); - - if (src0.regClass() == v2) { - src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1); - bit &= 31; - } - - if (bit == 31) { - bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst), - Operand::c32(0), src0); - break; - } - - if (bit == 15 && ctx->program->gfx_level >= GFX8) { - bld.vopc(test0 ? 
aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst), - Operand::c32(0), src0); - break; - } - - /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */ - const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11; - const unsigned max_bit = can_sdwa ? 0x8 : 0x9; - const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit; - if (use_opsel) { - src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1), - Operand::c32(16), Operand::c32(0)); - bit &= 0xf; - } - - /* If we can use sdwa the extract is free, while test0's s_not is not. */ - if (bit == 7 && test0 && can_sdwa) { - src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8), - Operand::c32(8), Operand::c32(1)); - bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst), - Operand::c32(0), src0); - break; - } - - if (bit > max_bit) { - src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8), - Operand::c32(8), Operand::c32(0)); - bit &= 0x7; - } - - /* denorm and snan/qnan inputs are preserved using all float control modes. */ - static const struct { - uint32_t fp32; - uint32_t fp16; - bool negate; - } float_lut[10] = { - {0x7f800001, 0x7c01, false}, /* snan */ - {~0u, ~0u, false}, /* qnan */ - {0xff800000, 0xfc00, false}, /* -inf */ - {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */ - {1, 1, true}, /* -denormal */ - {0, 0, true}, /* -0.0 */ - {0, 0, false}, /* +0.0 */ - {1, 1, false}, /* +denormal */ - {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */ - {0x7f800000, 0x7c00, false}, /* +inf */ - }; - - Temp tmp = test0 ? bld.tmp(bld.lm) : dst; - /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */ - const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) || - (ctx->program->gfx_level >= GFX11 && use_opsel); - const aco_opcode op = use_fp16 ? 
aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32; - const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32; - - VALU_instruction& res = - bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu(); - if (float_lut[bit].negate) { - res.format = asVOP3(res.format); - res.neg[0] = true; - } - - if (test0) - bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp); - - break; - } - - Temp res; - aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32; - if (instr->src[0].src.ssa->bit_size == 16) { - op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16; - if (ctx->program->gfx_level < GFX10) - res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1)); - else - res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1)); - - res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res); - } else if (instr->src[0].src.ssa->bit_size == 32) { - res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1)); - } else if (instr->src[0].src.ssa->bit_size == 64) { - if (ctx->program->gfx_level < GFX8) - res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1); - else - res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0); - - res = emit_extract_vector(ctx, res, 0, v1); - res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res); - } else { - isel_err(&instr->instr, "Unimplemented NIR instr bit size"); - } - bld.vopc(op, Definition(dst), Operand::c32(0), res); - break; - } - default: isel_err(&instr->instr, "Unknown NIR ALU instr"); - } -} - void visit_load_const(isel_context* ctx, nir_load_const_instr* instr) { diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp new file mode 100644 index 00000000000..aaeef4cb619 --- /dev/null +++ b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp @@ -0,0 
+1,3612 @@ +/* + * Copyright © 2018 Valve Corporation + * Copyright © 2018 Google + * + * SPDX-License-Identifier: MIT + */ + +#include "aco_builder.h" +#include "aco_instruction_selection.h" +#include "aco_ir.h" + +namespace aco { +namespace { + +static Builder +create_alu_builder(isel_context* ctx, nir_alu_instr* instr) +{ + Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; + bld.is_sz_preserve = nir_alu_instr_is_signed_zero_preserve(instr); + bld.is_inf_preserve = nir_alu_instr_is_inf_preserve(instr); + bld.is_nan_preserve = nir_alu_instr_is_nan_preserve(instr); + return bld; +} + +enum sgpr_extract_mode { + sgpr_extract_sext, + sgpr_extract_zext, + sgpr_extract_undef, +}; + +Temp +extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode) +{ + Temp vec = get_ssa_temp(ctx, src->src.ssa); + unsigned src_size = src->src.ssa->bit_size; + unsigned swizzle = src->swizzle[0]; + + if (vec.size() > 1) { + assert(src_size == 16); + vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); + swizzle = swizzle & 1; + } + + Builder bld(ctx->program, ctx->block); + Temp tmp = dst.regClass() == s2 ? 
bld.tmp(s1) : dst; + + if (mode == sgpr_extract_undef && swizzle == 0) + bld.copy(Definition(tmp), vec); + else + bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec), + Operand::c32(swizzle), Operand::c32(src_size), + Operand::c32((mode == sgpr_extract_sext))); + + if (dst.regClass() == s2) + convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst); + + return dst; +} + +Temp +get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1) +{ + if (src.src.ssa->num_components == 1 && size == 1) + return get_ssa_temp(ctx, src.src.ssa); + + Temp vec = get_ssa_temp(ctx, src.src.ssa); + unsigned elem_size = src.src.ssa->bit_size / 8u; + bool identity_swizzle = true; + + for (unsigned i = 0; identity_swizzle && i < size; i++) { + if (src.swizzle[i] != i) + identity_swizzle = false; + } + if (identity_swizzle) + return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size)); + + assert(elem_size > 0); + assert(vec.bytes() % elem_size == 0); + + if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) { + assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16); + return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src, + sgpr_extract_undef); + } + + bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr; + if (as_uniform) + vec = as_vgpr(ctx, vec); + + RegClass elem_rc = elem_size < 4 ? 
RegClass(vec.type(), elem_size).as_subdword() + : RegClass(vec.type(), elem_size / 4); + if (size == 1) { + return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc); + } else { + assert(size <= 4); + std::array elems; + aco_ptr vec_instr{ + create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; + for (unsigned i = 0; i < size; ++i) { + elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc); + vec_instr->operands[i] = Operand{elems[i]}; + } + Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4)); + vec_instr->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec_instr)); + ctx->allocated_vec.emplace(dst.id(), elems); + return as_uniform ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst; + } +} + +Temp +get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src) +{ + /* returns v2b or v1 for vop3p usage. + * The source expects exactly 2 16bit components + * which are within the same dword + */ + assert(src.src.ssa->bit_size == 16); + assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1); + + Temp tmp = get_ssa_temp(ctx, src.src.ssa); + if (tmp.size() == 1) + return tmp; + + /* the size is larger than 1 dword: check the swizzle */ + unsigned dword = src.swizzle[0] >> 1; + + /* extract a full dword if possible */ + if (tmp.bytes() >= (dword + 1) * 4) { + /* if the source is split into components, use p_create_vector */ + auto it = ctx->allocated_vec.find(tmp.id()); + if (it != ctx->allocated_vec.end()) { + unsigned index = dword << 1; + Builder bld(ctx->program, ctx->block); + if (it->second[index].regClass() == v2b) + return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index], + it->second[index + 1]); + } + return emit_extract_vector(ctx, tmp, dword, v1); + } else { + /* This must be a swizzled access to %a.zz where %a is v6b */ + assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0); + assert(tmp.regClass() == v6b && dword == 1); + return 
emit_extract_vector(ctx, tmp, dword * 2, v2b); + } +} + +uint32_t +get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx) +{ + nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]}; + return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config); +} + +void +emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool writes_scc, uint8_t uses_ub = 0) +{ + Builder bld = create_alu_builder(ctx, instr); + bld.is_nuw = instr->no_unsigned_wrap; + + Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])), + Operand(get_alu_src(ctx, instr->src[1]))}; + u_foreach_bit (i, uses_ub) { + uint32_t src_ub = get_alu_src_ub(ctx, instr, i); + if (src_ub <= 0xffff) + operands[i].set16bit(true); + else if (src_ub <= 0xffffff) + operands[i].set24bit(true); + } + + if (writes_scc) + bld.sop2(op, Definition(dst), bld.def(s1, scc), operands[0], operands[1]); + else + bld.sop2(op, Definition(dst), operands[0], operands[1]); +} + +void +emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst, + bool commutative, bool swap_srcs = false, bool flush_denorms = false, + bool nuw = false, uint8_t uses_ub = 0) +{ + Builder bld = create_alu_builder(ctx, instr); + bld.is_nuw = nuw; + + Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])), + Operand(get_alu_src(ctx, instr->src[1]))}; + u_foreach_bit (i, uses_ub) { + uint32_t src_ub = get_alu_src_ub(ctx, instr, i); + if (src_ub <= 0xffff) + operands[i].set16bit(true); + else if (src_ub <= 0xffffff) + operands[i].set24bit(true); + } + + if (swap_srcs) + std::swap(operands[0], operands[1]); + + if (operands[1].isOfType(RegType::sgpr)) { + if (commutative && operands[0].isOfType(RegType::vgpr)) { + std::swap(operands[0], operands[1]); + } else { + operands[1] = bld.copy(bld.def(RegType::vgpr, operands[1].size()), operands[1]); + } + } + + if (flush_denorms && ctx->program->gfx_level < GFX9) { + 
assert(dst.size() == 1); + Temp tmp = bld.vop2(opc, bld.def(dst.regClass()), operands[0], operands[1]); + if (dst.bytes() == 2) + bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp); + else + bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp); + } else { + bld.vop2(opc, Definition(dst), operands[0], operands[1]); + } +} + +void +emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) +{ + Builder bld = create_alu_builder(ctx, instr); + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + + if (src1.type() == RegType::sgpr) { + assert(src0.type() == RegType::vgpr); + std::swap(src0, src1); + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(src0.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(v1); + Temp src11 = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + Temp lo = bld.vop2(op, bld.def(v1), src00, src10); + Temp hi = bld.vop2(op, bld.def(v1), src01, src11); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); +} + +void +emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false) +{ + assert(num_sources == 2 || num_sources == 3); + Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; + bool has_sgpr = false; + for (unsigned i = 0; i < num_sources; i++) { + src[i] = get_alu_src(ctx, instr->src[(swap_srcs && i < 2) ? 
1 - i : i]); + if (has_sgpr) + src[i] = as_vgpr(ctx, src[i]); + else + has_sgpr = src[i].type() == RegType::sgpr; + } + + Builder bld = create_alu_builder(ctx, instr); + if (flush_denorms && ctx->program->gfx_level < GFX9) { + Temp tmp; + if (num_sources == 3) + tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]); + else + tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]); + if (dst.size() == 1) + bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp); + else + bld.vop3(aco_opcode::v_mul_f64_e64, Definition(dst), Operand::c64(0x3FF0000000000000), + tmp); + } else if (num_sources == 3) { + bld.vop3(op, Definition(dst), src[0], src[1], src[2]); + } else { + bld.vop3(op, Definition(dst), src[0], src[1]); + } +} + +Builder::Result +emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool swap_srcs = false) +{ + Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]); + Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]); + if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr) + src1 = as_vgpr(ctx, src1); + assert(instr->def.num_components == 2); + + /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ + unsigned opsel_lo = + (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1); + unsigned opsel_hi = + (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1); + + Builder bld = create_alu_builder(ctx, instr); + Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi); + emit_split_vector(ctx, dst, 2); + return res; +} + +void +emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp, + unsigned neg_lo = 0) +{ + Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; + bool has_sgpr = false; + for (unsigned i = 0; i < 3; i++) { + src[i] = get_alu_src(ctx, instr->src[i]); + if (has_sgpr) + src[i] = as_vgpr(ctx, src[i]); + 
else + has_sgpr = src[i].type() == RegType::sgpr; + } + + Builder bld = create_alu_builder(ctx, instr); + VALU_instruction& vop3p = + bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu(); + vop3p.clamp = clamp; + vop3p.neg_lo = neg_lo; +} + +void +emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) +{ + Builder bld = create_alu_builder(ctx, instr); + if (dst.type() == RegType::sgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0]))); + else + bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); +} + +void +emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) +{ + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + assert(src0.size() == src1.size()); + + aco_ptr vopc; + if (src1.type() == RegType::sgpr) { + if (src0.type() == RegType::vgpr) { + /* to swap the operands, we might also have to change the opcode */ + op = get_vcmp_swapped(op); + Temp t = src0; + src0 = src1; + src1 = t; + } else { + src1 = as_vgpr(ctx, src1); + } + } + + Builder bld = create_alu_builder(ctx, instr); + bld.vopc(op, Definition(dst), src0, src1); +} + +void +emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) +{ + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + Builder bld = create_alu_builder(ctx, instr); + + assert(dst.regClass() == bld.lm); + assert(src0.type() == RegType::sgpr); + assert(src1.type() == RegType::sgpr); + + /* Emit the SALU comparison instruction */ + Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1); + /* Turn the result into a per-lane bool */ + bool_to_vector_condition(ctx, cmp, dst); +} + +void +emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op, + aco_opcode v32_op, aco_opcode v64_op, aco_opcode s16_op = 
aco_opcode::num_opcodes, + aco_opcode s32_op = aco_opcode::num_opcodes, + aco_opcode s64_op = aco_opcode::num_opcodes) +{ + aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op + : instr->src[0].src.ssa->bit_size == 32 ? s32_op + : s16_op; + aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op + : instr->src[0].src.ssa->bit_size == 32 ? v32_op + : v16_op; + bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent || + get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr || + get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr; + aco_opcode op = use_valu ? v_op : s_op; + assert(op != aco_opcode::num_opcodes); + assert(dst.regClass() == ctx->program->lane_mask); + + if (use_valu) + emit_vopc_instruction(ctx, instr, op, dst); + else + emit_sopc_instruction(ctx, instr, op, dst); +} + +void +emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op, + Temp dst) +{ + Builder bld(ctx->program, ctx->block); + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + + assert(dst.regClass() == bld.lm); + assert(src0.regClass() == bld.lm); + assert(src1.regClass() == bld.lm); + + bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); +} + +void +emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + Temp cond = get_alu_src(ctx, instr->src[0]); + Temp then = get_alu_src(ctx, instr->src[1]); + Temp els = get_alu_src(ctx, instr->src[2]); + + assert(cond.regClass() == bld.lm); + + if (dst.type() == RegType::vgpr) { + aco_ptr bcsel; + if (dst.size() == 1) { + then = as_vgpr(ctx, then); + els = as_vgpr(ctx, els); + + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond); + } else if (dst.size() == 2) { + select_vec2(ctx, dst, cond, then, els); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + return; + } + + if (instr->def.bit_size == 1) { + 
assert(dst.regClass() == bld.lm); + assert(then.regClass() == bld.lm); + assert(els.regClass() == bld.lm); + } + + if (!nir_src_is_divergent(&instr->src[0].src)) { /* uniform condition and values in sgpr */ + cond = bool_to_scalar_condition(ctx, cond); + + bool els_zero = + nir_src_is_const(instr->src[2].src) && nir_src_as_uint(instr->src[2].src) == 0; + + if (dst.regClass() == s1 && els_zero) { + /* Use s_mul_i32 because it doesn't require scc. */ + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), then, cond); + } else if (dst.regClass() == s1 || dst.regClass() == s2) { + assert((then.regClass() == s1 || then.regClass() == s2) && + els.regClass() == then.regClass()); + assert(dst.size() == then.size()); + aco_opcode op = + dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; + bld.sop2(op, Definition(dst), then, els, bld.scc(cond)); + } else { + isel_err(&instr->instr, "Unimplemented uniform bcsel bit size"); + } + return; + } + + /* divergent boolean bcsel + * this implements bcsel on bools: dst = s0 ? 
s1 : s2 + * are going to be: dst = (s0 & s1) | (~s0 & s2) */ + assert(instr->def.bit_size == 1); + + if (cond.id() != then.id()) + then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then); + + if (cond.id() == els.id()) + bld.copy(Definition(dst), then); + else + bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then, + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond)); +} + +void +emit_vec2_f2f16(isel_context* ctx, nir_alu_instr* instr, Temp dst) +{ + Builder bld = create_alu_builder(ctx, instr); + Temp src = get_ssa_temp(ctx, instr->src[0].src.ssa); + RegClass rc = RegClass(src.regClass().type(), instr->src[0].src.ssa->bit_size / 32); + Temp src0 = emit_extract_vector(ctx, src, instr->src[0].swizzle[0], rc); + Temp src1 = emit_extract_vector(ctx, src, instr->src[0].swizzle[1], rc); + + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src0, src1); + } else { + src1 = as_vgpr(ctx, src1); + if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src0, src1); + else + bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1); + emit_split_vector(ctx, dst, 2); + } +} + +void +emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode vop, + aco_opcode sop, uint32_t undo) +{ + if (ctx->block->fp_mode.denorm32 == 0) { + if (dst.regClass() == v1) + bld.vop1(vop, dst, val); + else if (ctx->options->gfx_level >= GFX12) + bld.vop3(sop, dst, val); + else + bld.pseudo(aco_opcode::p_as_uniform, dst, bld.vop1(vop, bld.def(v1), val)); + return; + } + + /* multiply by 16777216 to handle denormals */ + Temp scale, unscale; + if (val.regClass() == v1) { + val = as_vgpr(ctx, val); + Temp is_denormal = bld.tmp(bld.lm); + VALU_instruction& valu = bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal), + val, Operand::c32(1u << 4)) + ->valu(); + valu.neg[0] = 
true; + valu.abs[0] = true; + scale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000), + bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), is_denormal); + unscale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000), + bld.copy(bld.def(s1), Operand::c32(undo)), is_denormal); + } else { + Temp abs = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), val, + bld.copy(bld.def(s1), Operand::c32(0x7fffffff))); + Temp denorm_cmp = bld.copy(bld.def(s1), Operand::c32(0x00800000)); + Temp is_denormal = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, denorm_cmp); + scale = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), + bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), Operand::c32(0x3f800000), + bld.scc(is_denormal)); + unscale = + bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(undo)), + Operand::c32(0x3f800000), bld.scc(is_denormal)); + } + + if (dst.regClass() == v1) { + Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), scale, as_vgpr(ctx, val)); + scaled = bld.vop1(vop, bld.def(v1), scaled); + bld.vop2(aco_opcode::v_mul_f32, dst, unscale, scaled); + } else { + assert(ctx->options->gfx_level >= GFX11_5); + Temp scaled = bld.sop2(aco_opcode::s_mul_f32, bld.def(s1), scale, val); + if (ctx->options->gfx_level >= GFX12) + scaled = bld.vop3(sop, bld.def(s1), scaled); + else + scaled = bld.as_uniform(bld.vop1(vop, bld.def(v1), scaled)); + bld.sop2(aco_opcode::s_mul_f32, dst, unscale, scaled); + } +} + +void +emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val) +{ + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, aco_opcode::v_s_rcp_f32, 0x4b800000u); +} + +void +emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val) +{ + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, aco_opcode::v_s_rsq_f32, 0x45800000u); +} + +void +emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val) +{ + 
emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, aco_opcode::v_s_sqrt_f32, + 0x39800000u); +} + +void +emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val) +{ + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, aco_opcode::v_s_log_f32, 0xc1c00000u); +} + +Temp +emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->options->gfx_level >= GFX7) + return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val); + + /* GFX6 doesn't support V_TRUNC_F64, lower it. */ + /* TODO: create more efficient code! */ + if (val.type() == RegType::sgpr) + val = as_vgpr(ctx, val); + + /* Split the input value. */ + Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val); + + /* Extract the exponent and compute the unbiased value. */ + Temp exponent = + bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u)); + exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u)); + + /* Extract the fractional part. */ + Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u), + Operand::c32(0x000fffffu)); + fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent); + + Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), + fract_mask); + + Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1); + Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo); + fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp); + tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi); + fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp); + + /* Get the sign bit. 
*/ + Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi); + + /* Decide the operation to apply depending on the unbiased exponent. */ + Temp exp_lt0 = + bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero()); + Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, + bld.copy(bld.def(v1), Operand::zero()), exp_lt0); + Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0); + Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u)); + dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51); + dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51); + + return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi); +} + +Temp +emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->options->gfx_level >= GFX7) + return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); + + /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually + * lowered at NIR level for precision reasons). 
*/ + Temp src0 = as_vgpr(ctx, val); + + Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u), + Operand::c32(0x3fefffffu)); + + Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0); + Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0); + Temp min = bld.vop3(aco_opcode::v_min_f64_e64, bld.def(v2), fract, min_val); + + Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0); + Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min); + + Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan); + Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan); + + Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1); + + Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src0, v); + add->valu().neg[1] = true; + + return add->definitions[0].getTemp(); +} + +Temp +uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) +{ + if (bld.program->gfx_level < GFX8) { + Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true); + return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1), + add.def(1).getTemp()); + } + + Builder::Result add(NULL); + if (bld.program->gfx_level >= GFX9) { + add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1); + } else { + add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1); + } + add->valu().clamp = 1; + return dst.getTemp(); +} + +Temp +usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) +{ + if (bld.program->gfx_level < GFX8) { + Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true); + return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u), + sub.def(1).getTemp()); + } + + 
Builder::Result sub(NULL); + if (bld.program->gfx_level >= GFX9) { + sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1); + } else { + sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1); + } + sub->valu().clamp = 1; + return dst.getTemp(); +} + +} // namespace + +void +visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) +{ + Builder bld = create_alu_builder(ctx, instr); + Temp dst = get_ssa_temp(ctx, &instr->def); + switch (instr->op) { + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + case nir_op_vec5: + case nir_op_vec8: + case nir_op_vec16: { + std::array elems; + unsigned num = instr->def.num_components; + for (unsigned i = 0; i < num; ++i) + elems[i] = get_alu_src(ctx, instr->src[i]); + + if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) { + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, + instr->def.num_components, 1)}; + RegClass elem_rc = RegClass::get(dst.type(), instr->def.bit_size / 8u); + for (unsigned i = 0; i < num; ++i) { + if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword()) + elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc); + + if (nir_src_is_undef(instr->src[i].src)) + vec->operands[i] = Operand{elem_rc}; + else + vec->operands[i] = Operand{elems[i]}; + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + } else { + bool use_s_pack = ctx->program->gfx_level >= GFX9; + Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1)); + + std::array packed; + uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {}; + bitarray32 undef_mask = UINT32_MAX; + for (unsigned i = 0; i < num; i++) { + unsigned packed_size = use_s_pack ? 
16 : 32; + unsigned idx = i * instr->def.bit_size / packed_size; + unsigned offset = i * instr->def.bit_size % packed_size; + if (nir_src_is_undef(instr->src[i].src)) + continue; + else + undef_mask[idx] = false; + + if (nir_src_is_const(instr->src[i].src)) { + const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset; + continue; + } + + if (offset != packed_size - instr->def.bit_size) + elems[i] = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask); + + if (offset) + elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i], + Operand::c32(offset)); + + if (packed[idx].id()) + packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i], + packed[idx]); + else + packed[idx] = elems[i]; + } + + if (use_s_pack) { + for (unsigned i = 0; i < dst.size(); i++) { + bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id(); + + if (packed[i * 2].id() && packed[i * 2 + 1].id()) + packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], + packed[i * 2 + 1]); + else if (packed[i * 2 + 1].id()) + packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), + Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]); + else if (packed[i * 2].id()) + packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], + Operand::c32(const_vals[i * 2 + 1])); + else + packed[i] = Temp(0, s1); /* Both constants, so reset the entry */ + + undef_mask[i] = undef_mask[i * 2] && undef_mask[i * 2 + 1]; + + if (same) + const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16); + else + const_vals[i] = 0; + } + } + + for (unsigned i = 0; i < dst.size(); i++) { + if (const_vals[i] && packed[i].id()) + packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), + Operand::c32(const_vals[i]), packed[i]); + else if (!packed[i].id() && !undef_mask[i]) + packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i])); + } + + if (dst.size() == 1 && 
packed[0].id()) + bld.copy(Definition(dst), packed[0]); + else { + aco_ptr vec{ + create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + vec->definitions[0] = Definition(dst); + for (unsigned i = 0; i < dst.size(); ++i) + vec->operands[i] = Operand(packed[i]); + bld.insert(std::move(vec)); + } + } + break; + } + case nir_op_mov: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) { + /* use size() instead of bytes() for 8/16-bit */ + assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov"); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + } else { + assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov"); + bld.copy(Definition(dst), src); + } + break; + } + case nir_op_inot: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); + } else if (dst.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo); + hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + } else if (dst.type() == RegType::sgpr) { + aco_opcode opcode = dst.size() == 1 ? 
aco_opcode::s_not_b32 : aco_opcode::s_not_b64; + bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_iabs: { + if (dst.regClass() == v1 && instr->def.bit_size == 16) { + Temp src = get_alu_src_vop3p(ctx, instr->src[0]); + + unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1; + unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1; + + Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(), + src, opsel_lo, opsel_hi); + bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi); + emit_split_vector(ctx, dst, 2); + break; + } + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src); + } else if (dst.regClass() == v1) { + bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, + bld.vsub32(bld.def(v1), Operand::zero(), src)); + } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { + bld.vop3( + aco_opcode::v_max_i16_e64, Definition(dst), src, + bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src)); + } else if (dst.regClass() == v2b) { + src = as_vgpr(ctx, src); + bld.vop2(aco_opcode::v_max_i16, Definition(dst), src, + bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src)); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_isign: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s1) { + Temp tmp = + bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1)); + bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u)); + } else if (dst.regClass() == s2) { + Temp neg = + bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u)); + Temp neqz; + if 
(ctx->program->gfx_level >= GFX8) + neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero()); + else + neqz = + bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero()) + .def(1) + .getTemp(); + /* SCC gets zero-extended to 64 bit */ + bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz)); + } else if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u)); + } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) { + bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u)); + } else if (dst.regClass() == v2b) { + src = as_vgpr(ctx, src); + bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1), + bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src)); + } else if (dst.regClass() == v2) { + Temp upper = emit_extract_vector(ctx, src, 1, v1); + Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper); + Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src); + Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_imax: { + if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst); + } else if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, 
aco_opcode::v_max_i32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_umax: { + if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst); + } else if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_imin: { + if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst); + } else if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_umin: { + if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst); + } else if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + 
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ior: { + if (instr->def.bit_size == 1) { + emit_boolean_logic(ctx, instr, Builder::s_or, dst); + } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_iand: { + if (instr->def.bit_size == 1) { + emit_boolean_logic(ctx, instr, Builder::s_and, dst); + } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ixor: { + if (instr->def.bit_size == 1) { + emit_boolean_logic(ctx, instr, Builder::s_xor, dst); + } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, 
dst, true); + } else if (dst.regClass() == v2) { + emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ushr: { + if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true); + } else if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true); + } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) { + bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), + get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v2) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ishl: { + if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true); + } else if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, 
instr, aco_opcode::v_pk_lshlrev_b16, dst, true); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, + false, 1); + } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) { + bld.vop3(aco_opcode::v_lshlrev_b64_e64, Definition(dst), get_alu_src(ctx, instr->src[1]), + get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v2) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ishr: { + if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true); + } else if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true); + } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) { + bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]), + get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v2) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_find_lsb: 
{ + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src); + } else if (src.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst); + } else if (src.regClass() == s2) { + bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src); + } else if (src.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), lo); + hi = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), hi); + hi = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32u), hi); + bld.vop2(aco_opcode::v_min_u32, Definition(dst), lo, hi); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ufind_msb: + case nir_op_ifind_msb: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1 || src.regClass() == s2) { + aco_opcode op = src.regClass() == s2 + ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 + : aco_opcode::s_flbit_i32_i64) + : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 + : aco_opcode::s_flbit_i32); + Temp msb_rev = bld.sop1(op, bld.def(s1), src); + + Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), + Operand::c32(src.size() * 32u - 1u), msb_rev); + Temp msb = sub.def(0).getTemp(); + Temp carry = sub.def(1).getTemp(); + + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb, + bld.scc(carry)); + } else if (src.regClass() == v1) { + aco_opcode op = + instr->op == nir_op_ufind_msb ? 
aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; + Temp msb_rev = bld.tmp(v1); + emit_vop1_instruction(ctx, instr, op, msb_rev); + Temp msb = bld.tmp(v1); + Temp carry = + bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp(); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry); + } else if (src.regClass() == v2) { + aco_opcode op = + instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; + + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + + lo = bld.vop1(op, bld.def(v1), lo); + lo = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32), lo); + hi = bld.vop1(op, bld.def(v1), hi); + Temp msb_rev = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), lo, hi); + + Temp msb = bld.tmp(v1); + Temp carry = + bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp(); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ufind_msb_rev: + case nir_op_ifind_msb_rev: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1) { + aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32 + : aco_opcode::s_flbit_i32; + bld.sop1(op, Definition(dst), src); + } else if (src.regClass() == v1) { + aco_opcode op = + instr->op == nir_op_ufind_msb_rev ? 
aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; + emit_vop1_instruction(ctx, instr, op, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_bitfield_reverse: { + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_iadd: { + if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true); + break; + } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst); + break; + } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true); + break; + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst); + break; + } else if (dst.regClass() == s2 && ctx->program->gfx_level >= GFX12) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u64, dst, false); + break; + } + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.type() == RegType::vgpr && dst.bytes() <= 4) { + if (instr->no_unsigned_wrap) + bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1)); + else + bld.vadd32(Definition(dst), Operand(src0), Operand(src1)); + break; + } + + assert(src0.size() == 2 && src1.size() == 2); + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + + if 
(dst.regClass() == s2) { + Temp carry = bld.tmp(s1); + Temp dst0 = + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, + bld.scc(carry)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else if (dst.regClass() == v2) { + Temp dst0 = bld.tmp(v1); + Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp(); + Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_uadd_sat: { + if (dst.regClass() == v1 && instr->def.bit_size == 16) { + Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst); + add_instr->valu().clamp = 1; + break; + } + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); + bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp, + bld.scc(carry)); + break; + } else if (dst.regClass() == v2b) { + Instruction* add_instr; + if (ctx->program->gfx_level >= GFX10) { + add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr; + } else { + if (src1.type() == RegType::sgpr) + std::swap(src0, src1); + add_instr = + bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr; + } + add_instr->valu().clamp = 1; + break; + } else if (dst.regClass() == v1) { + uadd32_sat(bld, Definition(dst), src0, src1); + break; + } + + assert(src0.size() == 2 && src1.size() == 2); + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(src0.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, 
Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(src1.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + + if (dst.regClass() == s2) { + Temp carry0 = bld.tmp(s1); + Temp carry1 = bld.tmp(s1); + + Temp no_sat0 = + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10); + Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)), + src01, src11, bld.scc(carry0)); + + Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1); + + bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat, + bld.scc(carry1)); + } else if (dst.regClass() == v2) { + Temp no_sat0 = bld.tmp(v1); + Temp dst0 = bld.tmp(v1); + Temp dst1 = bld.tmp(v1); + + Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp(); + Temp carry1; + + if (ctx->program->gfx_level >= GFX8) { + carry1 = bld.tmp(bld.lm); + bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1), + as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0) + ->valu() + .clamp = 1; + } else { + Temp no_sat1 = bld.tmp(v1); + carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1), + carry1); + } + + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1), + carry1); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_iadd_sat: { + if (dst.regClass() == v1 && instr->def.bit_size == 16) { + Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst); + add_instr->valu().clamp = 1; + break; + } + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if 
(dst.regClass() == s1) { + Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero()); + Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)), + Operand::c32(INT32_MAX), cond); + Temp overflow = bld.tmp(s1); + Temp add = + bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow)); + break; + } + + src1 = as_vgpr(ctx, src1); + + if (dst.regClass() == v2b) { + Instruction* add_instr = + bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr; + add_instr->valu().clamp = 1; + } else if (dst.regClass() == v1) { + Instruction* add_instr = + bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr; + add_instr->valu().clamp = 1; + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_uadd_carry: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); + break; + } + if (dst.regClass() == v1) { + Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), + carry); + break; + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + if (dst.regClass() == s2) { + Temp carry = bld.tmp(s1); + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, + bld.scc(carry)) + .def(1) + 
.getTemp(); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero()); + } else if (dst.regClass() == v2) { + Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp(); + carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp(); + carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), + Operand::c32(1u), carry); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero()); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_isub: { + if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true); + break; + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst); + break; + } else if (dst.regClass() == s2 && ctx->program->gfx_level >= GFX12) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_u64, dst, false); + break; + } + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == v1) { + bld.vsub32(Definition(dst), src0, src1); + break; + } else if (dst.bytes() <= 2) { + if (ctx->program->gfx_level >= GFX10) + bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1); + else if (src1.type() == RegType::sgpr) + bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0)); + else if (ctx->program->gfx_level >= GFX8) + bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1)); + else + bld.vsub32(Definition(dst), src0, src1); + break; + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + if (dst.regClass() == s2) { + 
Temp borrow = bld.tmp(s1); + Temp dst0 = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, + bld.scc(borrow)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else if (dst.regClass() == v2) { + Temp lower = bld.tmp(v1); + Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp(); + Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_usub_borrow: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); + break; + } else if (dst.regClass() == v1) { + Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), + borrow); + break; + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + if (dst.regClass() == s2) { + Temp borrow = bld.tmp(s1); + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); + borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, + bld.scc(borrow)) + .def(1) + .getTemp(); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero()); + } else if (dst.regClass() == v2) { + Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp(); + 
borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp(); + borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), + Operand::c32(1u), borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero()); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_usub_sat: { + if (dst.regClass() == v1 && instr->def.bit_size == 16) { + Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst); + sub_instr->valu().clamp = 1; + break; + } + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); + bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry)); + break; + } else if (dst.regClass() == v2b) { + Instruction* sub_instr; + if (ctx->program->gfx_level >= GFX10) { + sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr; + } else { + aco_opcode op = aco_opcode::v_sub_u16; + if (src1.type() == RegType::sgpr) { + std::swap(src0, src1); + op = aco_opcode::v_subrev_u16; + } + sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr; + } + sub_instr->valu().clamp = 1; + break; + } else if (dst.regClass() == v1) { + usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1)); + break; + } + + assert(src0.size() == 2 && src1.size() == 2); + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(src0.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(src1.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + + if (dst.regClass() == s2) { + Temp carry0 = bld.tmp(s1); + Temp 
carry1 = bld.tmp(s1); + + Temp no_sat0 = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10); + Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)), + src01, src11, bld.scc(carry0)); + + Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1); + + bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat, + bld.scc(carry1)); + } else if (dst.regClass() == v2) { + Temp no_sat0 = bld.tmp(v1); + Temp dst0 = bld.tmp(v1); + Temp dst1 = bld.tmp(v1); + + Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp(); + Temp carry1; + + if (ctx->program->gfx_level >= GFX8) { + carry1 = bld.tmp(bld.lm); + bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1), + as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0) + ->valu() + .clamp = 1; + } else { + Temp no_sat1 = bld.tmp(v1); + carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u), + carry1); + } + + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u), + carry1); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_isub_sat: { + if (dst.regClass() == v1 && instr->def.bit_size == 16) { + Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst); + sub_instr->valu().clamp = 1; + break; + } + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero()); + Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)), + Operand::c32(INT32_MAX), cond); + Temp overflow = bld.tmp(s1); + Temp 
sub = + bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow)); + break; + } + + src1 = as_vgpr(ctx, src1); + + if (dst.regClass() == v2b) { + Instruction* sub_instr = + bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr; + sub_instr->valu().clamp = 1; + } else if (dst.regClass() == v1) { + Instruction* sub_instr = + bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr; + sub_instr->valu().clamp = 1; + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_imul: { + if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst); + } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst); + } else if (dst.type() == RegType::vgpr) { + uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); + uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); + + if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { + bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff; + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, + true /* commutative */, false, false, nuw_16bit, 0x3); + } else if (nir_src_is_const(instr->src[0].src)) { + bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]), + nir_src_as_uint(instr->src[0].src), false); + } else if (nir_src_is_const(instr->src[1].src)) { + bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]), + nir_src_as_uint(instr->src[1].src), false); + } else { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst); + } + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, 
aco_opcode::s_mul_i32, dst, false); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_imul24_relaxed: { + if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_i32_i24, dst, true); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_umul24_relaxed: { + if (dst.regClass() == s1) { + Operand op1(get_alu_src(ctx, instr->src[0])); + Operand op2(get_alu_src(ctx, instr->src[1])); + op1.set24bit(true); + op2.set24bit(true); + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), op1, op2); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true /* commutative */, + false, false, false, 0x3); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_umul_high: { + if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false); + } else if (dst.bytes() == 4) { + uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); + uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); + + Temp tmp = dst.regClass() == s1 ? 
bld.tmp(v1) : dst; + if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true); + } else { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp); + } + + if (dst.regClass() == s1) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_imul_high: { + if (dst.regClass() == v1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst); + } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false); + } else if (dst.regClass() == s1) { + Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fmul: { + if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64_e64, dst); + } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f16, dst, false); + } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f32, dst, false); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fmulz: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true); + } 
else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fadd: { + if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64_e64, dst); + } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f16, dst, false); + } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f32, dst, false); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fsub: { + if (dst.regClass() == v1 && instr->def.bit_size == 16) { + Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst); + VALU_instruction& sub = add->valu(); + sub.neg_lo[1] = true; + sub.neg_hi[1] = true; + break; + } + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == v2b) { + if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) + emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false); + else + emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true); + } else if (dst.regClass() == v1) { + if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) + emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); + else + emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true); + } else if (dst.regClass() == v2) { + Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), as_vgpr(ctx, src0), + as_vgpr(ctx, src1)); + add->valu().neg[1] = true; + } else if 
(dst.regClass() == s1 && instr->def.bit_size == 16) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f16, dst, false); + } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f32, dst, false); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ffma: { + if (dst.regClass() == v2b) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + assert(instr->def.num_components == 2); + + Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0])); + Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1])); + Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2])); + + /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ + unsigned opsel_lo = 0, opsel_hi = 0; + for (unsigned i = 0; i < 3; i++) { + opsel_lo |= (instr->src[i].swizzle[0] & 1) << i; + opsel_hi |= (instr->src[i].swizzle[1] & 1) << i; + } + + bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi); + emit_split_vector(ctx, dst, 2); + } else if (dst.regClass() == v1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst, + ctx->block->fp_mode.must_flush_denorms32, 3); + } else if (dst.regClass() == v2) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3); + } else if (dst.regClass() == s1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + Temp src2 = get_alu_src(ctx, instr->src[2]); + aco_opcode op = + instr->def.bit_size == 16 ? 
aco_opcode::s_fmac_f16 : aco_opcode::s_fmac_f32; + bld.sop2(op, Definition(dst), src0, src1, src2); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ffmaz: { + if (dst.regClass() == v1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst, + ctx->block->fp_mode.must_flush_denorms32, 3); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fmax: { + if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true, false, + ctx->block->fp_mode.must_flush_denorms16_64); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, + ctx->block->fp_mode.must_flush_denorms32); + } else if (dst.regClass() == v2) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64_e64, dst, + ctx->block->fp_mode.must_flush_denorms16_64); + } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f16, dst, false); + } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f32, dst, false); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fmin: { + if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true, false, + ctx->block->fp_mode.must_flush_denorms16_64); + } else if (dst.regClass() == v1 && instr->def.bit_size == 16) { + emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, + ctx->block->fp_mode.must_flush_denorms32); + } else if (dst.regClass() == v2) { + emit_vop3a_instruction(ctx, instr, 
aco_opcode::v_min_f64_e64, dst, + ctx->block->fp_mode.must_flush_denorms16_64); + } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f16, dst, false); + } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f32, dst, false); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_sdot_4x8_iadd: { + if (ctx->options->gfx_level >= GFX11) + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3); + else + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false); + break; + } + case nir_op_sdot_4x8_iadd_sat: { + if (ctx->options->gfx_level >= GFX11) + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3); + else + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true); + break; + } + case nir_op_sudot_4x8_iadd: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1); + break; + } + case nir_op_sudot_4x8_iadd_sat: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1); + break; + } + case nir_op_udot_4x8_uadd: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false); + break; + } + case nir_op_udot_4x8_uadd_sat: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true); + break; + } + case nir_op_sdot_2x16_iadd: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false); + break; + } + case nir_op_sdot_2x16_iadd_sat: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true); + break; + } + case nir_op_udot_2x16_uadd: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false); + break; + } + case nir_op_udot_2x16_uadd_sat: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true); + break; + } + case nir_op_bfdot2_bfadd: { + Temp src0 = as_vgpr(ctx, 
get_alu_src(ctx, instr->src[0], 2)); + Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1], 2)); + Temp src2 = get_alu_src(ctx, instr->src[2], 1); + + bld.vop3(aco_opcode::v_dot2_bf16_bf16, Definition(dst), src0, src1, src2); + break; + } + case nir_op_cube_amd: { + Temp in = get_alu_src(ctx, instr->src[0], 3); + Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), + emit_extract_vector(ctx, in, 2, v1)}; + Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]); + Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]); + Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]); + Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id); + break; + } + case nir_op_bcsel: { + emit_bcsel(ctx, instr, dst); + break; + } + case nir_op_frsq: { + if (instr->def.bit_size == 16) { + if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12) + bld.vop3(aco_opcode::v_s_rsq_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); + else + emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst); + } else if (instr->def.bit_size == 32) { + emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (instr->def.bit_size == 64) { + /* Lowered at NIR level for precision reasons. 
*/ + emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fneg: { + if (dst.regClass() == v1 && instr->def.bit_size == 16) { + Temp src = get_alu_src_vop3p(ctx, instr->src[0]); + Instruction* vop3p = + bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00), + instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); + vop3p->valu().neg_lo[0] = true; + vop3p->valu().neg_hi[0] = true; + emit_split_vector(ctx, dst, 2); + break; + } + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src)); + } else if (dst.regClass() == v1) { + bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u), + as_vgpr(ctx, src)); + } else if (dst.regClass() == v2) { + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000), + as_vgpr(ctx, src)); + Temp upper = bld.tmp(v1), lower = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { + bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0xbc00u), src); + } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { + bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0xbf800000u), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fabs: { + if (dst.regClass() == v1 && instr->def.bit_size == 16) { + Temp src = get_alu_src_vop3p(ctx, instr->src[0]); + Instruction* vop3p = + bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src, + 
instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0) + .instr; + vop3p->valu().neg_lo[1] = true; + vop3p->valu().neg_hi[1] = true; + emit_split_vector(ctx, dst, 2); + break; + } + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), + Operand::c16(0x3c00), as_vgpr(ctx, src)) + .instr; + mul->valu().abs[1] = true; + } else if (dst.regClass() == v1) { + Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst), + Operand::c32(0x3f800000u), as_vgpr(ctx, src)) + .instr; + mul->valu().abs[1] = true; + } else if (dst.regClass() == v2) { + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000), + as_vgpr(ctx, src)); + Temp upper = bld.tmp(v1), lower = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { + Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fff)); + if (ctx->block->fp_mode.denorm16_64 == fp_denorm_keep) { + bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src); + } else { + Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src); + bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp); + } + } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { + Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff)); + if (ctx->block->fp_mode.denorm32 == fp_denorm_keep) { + bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src); + } else { + Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src); + bld.sop2(aco_opcode::s_mul_f32, Definition(dst), 
Operand::c32(0x3f800000), tmp); + } + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fsat: { + if (dst.regClass() == v1 && instr->def.bit_size == 16) { + Temp src = get_alu_src_vop3p(ctx, instr->src[0]); + Instruction* vop3p = + bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00), + instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); + vop3p->valu().clamp = true; + emit_split_vector(ctx, dst, 2); + break; + } + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) { + bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00), + src); + } else if (dst.regClass() == v2b) { + bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), src) + ->valu() + .clamp = true; + } else if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(), + Operand::c32(0x3f800000u), src); + /* apparently, it is not necessary to flush denorms if this instruction is used with these + * operands */ + // TODO: confirm that this holds under any circumstances + } else if (dst.regClass() == v2) { + Instruction* add = + bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src, Operand::zero()); + add->valu().clamp = true; + } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { + Temp low = bld.sop2(aco_opcode::s_max_f16, bld.def(s1), src, Operand::c16(0)); + bld.sop2(aco_opcode::s_min_f16, Definition(dst), low, Operand::c16(0x3C00)); + } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { + Temp low = bld.sop2(aco_opcode::s_max_f32, bld.def(s1), src, Operand::c32(0)); + bld.sop2(aco_opcode::s_min_f32, Definition(dst), low, Operand::c32(0x3f800000)); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_flog2: { + if (instr->def.bit_size == 16) { + if (dst.regClass() == s1 && ctx->program->gfx_level >= 
GFX12) + bld.vop3(aco_opcode::v_s_log_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); + else + emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst); + } else if (instr->def.bit_size == 32) { + emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_frcp: { + if (instr->def.bit_size == 16) { + if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12) + bld.vop3(aco_opcode::v_s_rcp_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); + else + emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst); + } else if (instr->def.bit_size == 32) { + emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (instr->def.bit_size == 64) { + /* Lowered at NIR level for precision reasons. */ + emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fexp2: { + if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX12) { + aco_opcode opcode = + instr->def.bit_size == 16 ? 
aco_opcode::v_s_exp_f16 : aco_opcode::v_s_exp_f32; + bld.vop3(opcode, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (instr->def.bit_size == 16) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst); + } else if (instr->def.bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fsqrt: { + if (instr->def.bit_size == 16) { + if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12) + bld.vop3(aco_opcode::v_s_sqrt_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); + else + emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst); + } else if (instr->def.bit_size == 32) { + emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (instr->def.bit_size == 64) { + /* Lowered at NIR level for precision reasons. */ + emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ffract: { + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst); + } else if (dst.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); + } else if (dst.regClass() == v2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst); + } else if (dst.regClass() == s1) { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_opcode op = + instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32; + Temp floor = bld.sop1(op, bld.def(s1), src); + op = instr->def.bit_size == 16 ? 
aco_opcode::s_sub_f16 : aco_opcode::s_sub_f32; + bld.sop2(op, Definition(dst), src, floor); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ffloor: { + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst); + } else if (dst.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); + } else if (dst.regClass() == v2) { + Temp src = get_alu_src(ctx, instr->src[0]); + emit_floor_f64(ctx, bld, Definition(dst), src); + } else if (dst.regClass() == s1) { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_opcode op = + instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32; + bld.sop1(op, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fceil: { + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst); + } else if (dst.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); + } else if (dst.regClass() == v2) { + if (ctx->options->gfx_level >= GFX7) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst); + } else { + /* GFX6 doesn't support V_CEIL_F64, lower it. 
*/ + /* trunc = trunc(src0) + * if (src0 > 0.0 && src0 != trunc) + * trunc += 1.0 + */ + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0); + Temp tmp0 = + bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero()); + Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc); + Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1); + Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + bld.copy(bld.def(v1), Operand::zero()), + bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond); + add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), + bld.copy(bld.def(v1), Operand::zero()), add); + bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), trunc, add); + } + } else if (dst.regClass() == s1) { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_opcode op = + instr->def.bit_size == 16 ? aco_opcode::s_ceil_f16 : aco_opcode::s_ceil_f32; + bld.sop1(op, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ftrunc: { + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst); + } else if (dst.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); + } else if (dst.regClass() == v2) { + Temp src = get_alu_src(ctx, instr->src[0]); + emit_trunc_f64(ctx, bld, Definition(dst), src); + } else if (dst.regClass() == s1) { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_opcode op = + instr->def.bit_size == 16 ? 
aco_opcode::s_trunc_f16 : aco_opcode::s_trunc_f32; + bld.sop1(op, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fround_even: { + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst); + } else if (dst.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); + } else if (dst.regClass() == v2) { + if (ctx->options->gfx_level >= GFX7) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst); + } else { + /* GFX6 doesn't support V_RNDNE_F64, lower it. */ + Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1); + Temp src0 = get_alu_src(ctx, instr->src[0]); + bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0); + + Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), + bld.copy(bld.def(s1), Operand::c32(-2u))); + Temp bfi = + bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, + bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi)); + Temp tmp = + bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), src0, + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi)); + Instruction* sub = + bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), tmp, + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi)); + sub->valu().neg[1] = true; + tmp = sub->definitions[0].getTemp(); + + Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u), + Operand::c32(0x432fffffu)); + Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v); + vop3->valu().abs[0] = true; + Temp cond = vop3->definitions[0].getTemp(); + + Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp); + Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, + as_vgpr(ctx, src0_lo), cond); + Temp dst1 = 
bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, + as_vgpr(ctx, src0_hi), cond); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } + } else if (dst.regClass() == s1) { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_opcode op = + instr->def.bit_size == 16 ? aco_opcode::s_rndne_f16 : aco_opcode::s_rndne_f32; + bld.sop1(op, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fsin_amd: + case nir_op_fcos_amd: { + if (instr->def.bit_size == 16 || instr->def.bit_size == 32) { + bool is_sin = instr->op == nir_op_fsin_amd; + aco_opcode opcode, fract; + RegClass rc; + if (instr->def.bit_size == 16) { + opcode = is_sin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; + fract = aco_opcode::v_fract_f16; + rc = v2b; + } else { + opcode = is_sin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; + fract = aco_opcode::v_fract_f32; + rc = v1; + } + + Temp src = get_alu_src(ctx, instr->src[0]); + /* before GFX9, v_sin and v_cos had a valid input domain of [-256, +256] */ + if (ctx->options->gfx_level < GFX9) + src = bld.vop1(fract, bld.def(rc), src); + + if (dst.regClass() == rc) { + bld.vop1(opcode, Definition(dst), src); + } else { + Temp tmp = bld.vop1(opcode, bld.def(rc), src); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ldexp: { + if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false); + } else if (dst.regClass() == v1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst); + } else if (dst.regClass() == v2) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_frexp_sig: { + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, 
aco_opcode::v_frexp_mant_f16, dst); + } else if (dst.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst); + } else if (dst.regClass() == v2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_frexp_exp: { + if (instr->src[0].src.ssa->bit_size == 16) { + Temp src = get_alu_src(ctx, instr->src[0]); + Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src); + tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero()); + convert_int(ctx, bld, tmp, 8, 32, true, dst); + } else if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst); + } else if (instr->src[0].src.ssa->bit_size == 64) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fsign: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + /* replace negative zero with positive zero */ + src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), as_vgpr(ctx, src)); + if (ctx->program->gfx_level >= GFX9) { + src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, + Operand::c16(1u)); + bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); + } else { + src = convert_int(ctx, bld, src, 16, 32, true); + src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, + Operand::c32(1u)); + bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); + } + } else if (dst.regClass() == v1) { + /* Legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers + * the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0. 
+ */ + Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000)); + src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, as_vgpr(ctx, src)); + bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src, + Operand::c32(0xbf800000)); + } else if (dst.regClass() == v2) { + src = as_vgpr(ctx, src); + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src); + Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u)); + Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, + emit_extract_vector(ctx, src, 1, v1), cond); + + cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src); + tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u)); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper); + } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { + Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f16, bld.def(s1, scc), Operand::c16(0), src); + src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3c00), src, + bld.scc(cond)); + cond = bld.sopc(aco_opcode::s_cmp_ge_f16, bld.def(s1, scc), src, Operand::c16(0)); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbc00), + bld.scc(cond)); + } else if (dst.regClass() == s1 && instr->def.bit_size == 32) { + Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f32, bld.def(s1, scc), Operand::c32(0), src); + src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3f800000), src, + bld.scc(cond)); + cond = bld.sopc(aco_opcode::s_cmp_ge_f32, bld.def(s1, scc), src, Operand::c32(0)); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbf800000), + bld.scc(cond)); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_f2f16: + case nir_op_f2f16_rtne: { + assert(instr->src[0].src.ssa->bit_size == 32); + if 
(instr->def.num_components == 2) { + /* Vectorizing f2f16 is only possible with rtz. */ + assert(instr->op != nir_op_f2f16_rtne); + assert(ctx->block->fp_mode.round16_64 == fp_round_tz || + !ctx->block->fp_mode.care_about_round16_64); + emit_vec2_f2f16(ctx, instr, dst); + break; + } + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) { + /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to + * keep value numbering and the scheduler simpler. + */ + if (dst.regClass() == v2b) + bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, Definition(dst), src); + else + bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, Definition(dst), src); + } else { + if (dst.regClass() == v2b) + bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); + else + bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src); + } + break; + } + case nir_op_f2f16_rtz: { + assert(instr->src[0].src.ssa->bit_size == 32); + if (instr->def.num_components == 2) { + emit_vec2_f2f16(ctx, instr, dst); + break; + } + Temp src = get_alu_src(ctx, instr->src[0]); + if (ctx->block->fp_mode.round16_64 == fp_round_tz) { + if (dst.regClass() == v2b) + bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); + else + bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src); + } else if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src, Operand::zero()); + } else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) { + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero()); + } else { + bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src)); + } + break; + } + case nir_op_f2f32: { + if (dst.regClass() == s1) { + assert(instr->src[0].src.ssa->bit_size == 16); + Temp src = get_alu_src(ctx, instr->src[0]); + bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src); + } else if (instr->src[0].src.ssa->bit_size == 16) { + 
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst); + } else if (instr->src[0].src.ssa->bit_size == 64) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_f2f64: { + assert(instr->src[0].src.ssa->bit_size == 32); + Temp src = get_alu_src(ctx, instr->src[0]); + bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src); + break; + } + case nir_op_i2f16: { + Temp src = get_alu_src(ctx, instr->src[0]); + const unsigned input_size = instr->src[0].src.ssa->bit_size; + if (dst.regClass() == v2b) { + if (input_size <= 16) { + /* Expand integer to the size expected by the uint→float converter used below */ + unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32); + if (input_size != target_size) { + src = convert_int(ctx, bld, src, input_size, target_size, true); + } + } + + if (ctx->program->gfx_level >= GFX8 && input_size <= 16) { + bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); + } else { + /* Large 32bit inputs need to return +-inf/FLOAT_MAX. + * + * This is also the fallback-path taken on GFX7 and earlier, which + * do not support direct f16⟷i16 conversions. 
+ */ + src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src); + bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); + } + } else if (dst.regClass() == s1) { + if (input_size <= 16) { + src = convert_int(ctx, bld, src, input_size, 32, true); + } + src = bld.sop1(aco_opcode::s_cvt_f32_i32, bld.def(s1), src); + bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_i2f32: { + assert(dst.size() == 1); + Temp src = get_alu_src(ctx, instr->src[0]); + const unsigned input_size = instr->src[0].src.ssa->bit_size; + if (input_size <= 32) { + if (input_size <= 16) { + /* Sign-extend to 32-bits */ + src = convert_int(ctx, bld, src, input_size, 32, true); + } + if (dst.regClass() == v1) + bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); + else + bld.sop1(aco_opcode::s_cvt_f32_i32, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_i2f64: { + if (instr->src[0].src.ssa->bit_size <= 32) { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size <= 16) + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); + bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_u2f16: { + Temp src = get_alu_src(ctx, instr->src[0]); + const unsigned input_size = instr->src[0].src.ssa->bit_size; + if (dst.regClass() == v2b) { + if (input_size <= 16) { + /* Expand integer to the size expected by the uint→float converter used below */ + unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 
16 : 32); + if (input_size != target_size) { + src = convert_int(ctx, bld, src, input_size, target_size, false); + } + } + + if (ctx->program->gfx_level >= GFX8 && input_size <= 16) { + bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src); + } else { + /* Large 32bit inputs need to return inf/FLOAT_MAX. + * + * This is also the fallback-path taken on GFX7 and earlier, which + * do not support direct f16⟷u16 conversions. + */ + src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src); + bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); + } + } else if (dst.regClass() == s1) { + if (input_size <= 16) { + src = convert_int(ctx, bld, src, input_size, 32, false); + } + src = bld.sop1(aco_opcode::s_cvt_f32_u32, bld.def(s1), src); + bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_u2f32: { + assert(dst.size() == 1); + Temp src = get_alu_src(ctx, instr->src[0]); + const unsigned input_size = instr->src[0].src.ssa->bit_size; + if (input_size == 8 && dst.regClass() == v1) { + bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src); + } else if (input_size <= 32) { + if (input_size <= 16) + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); + if (dst.regClass() == v1) + bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src); + else + bld.sop1(aco_opcode::s_cvt_f32_u32, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_u2f64: { + if (instr->src[0].src.ssa->bit_size <= 32) { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size <= 16) + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); + bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_f2i8: + case nir_op_f2i16: { + 
if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 && + ctx->program->gfx_level >= GFX11_5) { + Temp src = get_alu_src(ctx, instr->src[0]); + Temp tmp = bld.as_uniform(src); + if (instr->src[0].src.ssa->bit_size == 16) + tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp); + bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp); + } else if (instr->src[0].src.ssa->bit_size == 16) { + if (ctx->program->gfx_level >= GFX8) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst); + } else { + /* GFX7 and earlier do not support direct f16⟷i16 conversions */ + Temp tmp = bld.tmp(v1); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp); + tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp); + tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false, + (dst.type() == RegType::sgpr) ? Temp() : dst); + if (dst.type() == RegType::sgpr) { + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } + } + } else if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); + } else { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); + } + break; + } + case nir_op_f2u8: + case nir_op_f2u16: { + if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 && + ctx->program->gfx_level >= GFX11_5) { + Temp src = get_alu_src(ctx, instr->src[0]); + Temp tmp = bld.as_uniform(src); + if (instr->src[0].src.ssa->bit_size == 16) + tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp); + bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp); + } else if (instr->src[0].src.ssa->bit_size == 16) { + if (ctx->program->gfx_level >= GFX8) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst); + } else { + /* GFX7 and earlier do not support direct f16⟷u16 conversions */ + Temp tmp = bld.tmp(v1); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp); + tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), 
tmp); + tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false, + (dst.type() == RegType::sgpr) ? Temp() : dst); + if (dst.type() == RegType::sgpr) { + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } + } + } else if (instr->src[0].src.ssa->bit_size == 32) { + if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX11) + bld.vop3(aco_opcode::p_v_cvt_pk_u8_f32, Definition(dst), + get_alu_src(ctx, instr->src[0])); + else + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); + } else { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); + } + break; + } + case nir_op_f2i32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 && + ctx->program->gfx_level >= GFX11_5) { + Temp tmp = bld.as_uniform(src); + if (instr->src[0].src.ssa->bit_size == 16) + tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp); + bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp); + } else if (instr->src[0].src.ssa->bit_size == 16) { + Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); + if (dst.type() == RegType::vgpr) { + bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp); + } else { + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp)); + } + } else if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); + } else if (instr->src[0].src.ssa->bit_size == 64) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_f2u32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 && + ctx->program->gfx_level >= GFX11_5) { + Temp tmp = bld.as_uniform(src); + if (instr->src[0].src.ssa->bit_size == 16) + tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, 
bld.def(s1), tmp); + bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp); + } else if (instr->src[0].src.ssa->bit_size == 16) { + Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); + if (dst.type() == RegType::vgpr) { + bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp); + } else { + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp)); + } + } else if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); + } else if (instr->src[0].src.ssa->bit_size == 64) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_b2f16: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(src.regClass() == bld.lm); + + if (dst.regClass() == s1) { + src = bool_to_scalar_condition(ctx, src); + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src); + } else if (dst.regClass() == v2b) { + Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u)); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src); + } else { + unreachable("Wrong destination register class for nir_op_b2f16."); + } + break; + } + case nir_op_b2f32: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(src.regClass() == bld.lm); + + if (dst.regClass() == s1) { + src = bool_to_scalar_condition(ctx, src); + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src); + } else if (dst.regClass() == v1) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), + Operand::c32(0x3f800000u), src); + } else { + unreachable("Wrong destination register class for nir_op_b2f32."); + } + break; + } + case nir_op_b2f64: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(src.regClass() == bld.lm); + + if (dst.regClass() == s2) { + src = bool_to_scalar_condition(ctx, src); + 
bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u), + Operand::zero(), bld.scc(src)); + } else if (dst.regClass() == v2) { + Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u)); + Temp upper = + bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper); + } else { + unreachable("Wrong destination register class for nir_op_b2f64."); + } + break; + } + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: { + if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { + /* no need to do the extract in get_alu_src() */ + sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size + ? sgpr_extract_sext + : sgpr_extract_undef; + extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); + } else { + const unsigned input_bitsize = instr->src[0].src.ssa->bit_size; + const unsigned output_bitsize = instr->def.bit_size; + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize, + output_bitsize > input_bitsize, dst); + } + break; + } + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: { + if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { + /* no need to do the extract in get_alu_src() */ + sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size + ? 
sgpr_extract_zext + : sgpr_extract_undef; + extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); + } else { + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, + instr->def.bit_size, false, dst); + } + break; + } + case nir_op_b2b32: + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(src.regClass() == bld.lm); + + if (dst.regClass() == s1) { + bool_to_scalar_condition(ctx, src, dst); + } else if (dst.type() == RegType::vgpr) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), + src); + } else { + unreachable("Invalid register class for b2i32"); + } + break; + } + case nir_op_b2b1: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(dst.regClass() == bld.lm); + + if (src.type() == RegType::vgpr) { + assert(src.regClass() == v1 || src.regClass() == v2); + assert(dst.regClass() == bld.lm); + bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32, + Definition(dst), Operand::zero(), src); + } else { + assert(src.regClass() == s1 || src.regClass() == s2); + Temp tmp; + if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) { + tmp = + bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src) + .def(1) + .getTemp(); + } else { + tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32, + bld.scc(bld.def(s1)), Operand::zero(), src); + } + bool_to_vector_condition(ctx, tmp, dst); + } + break; + } + case nir_op_unpack_64_2x32: + case nir_op_unpack_32_2x16: + case nir_op_unpack_64_4x16: + case nir_op_unpack_32_4x8: + bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); + emit_split_vector( + ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2); + break; + case nir_op_pack_64_2x32_split: { + Operand src[2]; + RegClass elem_rc = dst.regClass() == s2 ? 
s1 : v1; + for (unsigned i = 0; i < 2; i++) { + if (nir_src_is_undef(instr->src[i].src)) + src[i] = Operand(elem_rc); + else + src[i] = Operand(get_alu_src(ctx, instr->src[i])); + } + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src[0], src[1]); + break; + } + case nir_op_unpack_64_2x32_split_x: + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), + get_alu_src(ctx, instr->src[0])); + break; + case nir_op_unpack_64_2x32_split_y: + bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), + get_alu_src(ctx, instr->src[0])); + break; + case nir_op_unpack_32_2x16_split_x: + if (dst.type() == RegType::vgpr) { + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), + get_alu_src(ctx, instr->src[0])); + } else { + bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); + } + break; + case nir_op_unpack_32_2x16_split_y: + if (dst.type() == RegType::vgpr) { + bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), + get_alu_src(ctx, instr->src[0])); + } else { + bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), + get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u), + Operand::zero()); + } + break; + case nir_op_pack_32_2x16_split: { + Operand src0 = Operand(get_alu_src(ctx, instr->src[0])); + Operand src1 = Operand(get_alu_src(ctx, instr->src[1])); + if (dst.regClass() == v1) { + if (nir_src_is_undef(instr->src[0].src)) + src0 = Operand(v2b); + else + src0 = Operand(emit_extract_vector(ctx, src0.getTemp(), 0, v2b)); + + if (nir_src_is_undef(instr->src[1].src)) + src1 = Operand(v2b); + else + src1 = Operand(emit_extract_vector(ctx, src1.getTemp(), 0, v2b)); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); + } else if (nir_src_is_undef(instr->src[1].src)) { + bld.copy(Definition(dst), src0); + } else if (nir_src_is_undef(instr->src[0].src)) { + bld.pseudo(aco_opcode::p_insert, Definition(dst), 
bld.def(s1, scc), src1, Operand::c32(1), + Operand::c32(16)); + } else if (ctx->program->gfx_level >= GFX9) { + bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(dst), src0, src1); + } else { + src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, + Operand::c32(0xFFFFu)); + src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, + Operand::c32(16u)); + bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1); + } + break; + } + case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break; + case nir_op_pack_half_2x16_rtz_split: + case nir_op_pack_half_2x16_split: { + if (dst.regClass() == v1) { + if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) + emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst); + else + emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_cvt_pk_rtz_f16_f32, dst, false); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_pack_unorm_2x16: + case nir_op_pack_snorm_2x16: { + unsigned bit_size = instr->src[0].src.ssa->bit_size; + /* Only support 16 and 32bit. */ + assert(bit_size == 32 || bit_size == 16); + + RegClass src_rc = bit_size == 32 ? v1 : v2b; + Temp src = get_alu_src(ctx, instr->src[0], 2); + Temp src0 = emit_extract_vector(ctx, src, 0, src_rc); + Temp src1 = emit_extract_vector(ctx, src, 1, src_rc); + + /* Work around for pre-GFX9 GPU which don't have fp16 pknorm instruction. */ + if (bit_size == 16 && ctx->program->gfx_level < GFX9) { + src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0); + src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1); + bit_size = 32; + } + + aco_opcode opcode; + if (bit_size == 32) { + opcode = instr->op == nir_op_pack_unorm_2x16 ? 
aco_opcode::v_cvt_pknorm_u16_f32 + : aco_opcode::v_cvt_pknorm_i16_f32; + } else { + opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16 + : aco_opcode::v_cvt_pknorm_i16_f16; + } + bld.vop3(opcode, Definition(dst), src0, src1); + break; + } + case nir_op_pack_uint_2x16: + case nir_op_pack_sint_2x16: { + Temp src = get_alu_src(ctx, instr->src[0], 2); + Temp src0 = emit_extract_vector(ctx, src, 0, v1); + Temp src1 = emit_extract_vector(ctx, src, 1, v1); + aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32 + : aco_opcode::v_cvt_pk_i16_i32; + bld.vop3(opcode, Definition(dst), src0, src1); + break; + } + case nir_op_unpack_half_2x16_split_x: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src); + break; + } + if (src.regClass() == v1) + src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src); + if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_unpack_half_2x16_split_y: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_cvt_hi_f32_f16, Definition(dst), src); + break; + } + if (src.regClass() == s1) + src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src, + Operand::c32(1u), Operand::c32(16u), Operand::zero()); + else + src = + bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp(); + if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_msad_4x8: { + assert(dst.regClass() == v1); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true); + break; + } + case nir_op_mqsad_4x8: { + 
assert(dst.regClass() == v4); + Temp ref = get_alu_src(ctx, instr->src[0]); + Temp src = get_alu_src(ctx, instr->src[1], 2); + Temp accum = get_alu_src(ctx, instr->src[2], 4); + bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src), as_vgpr(ctx, ref), + as_vgpr(ctx, accum)); + emit_split_vector(ctx, dst, 4); + break; + } + case nir_op_shfr: { + if (dst.regClass() == s1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + + Temp amount; + if (nir_src_is_const(instr->src[2].src)) { + unsigned camount = nir_src_as_uint(instr->src[2].src) & 0x1f; + if (camount == 16 && ctx->program->gfx_level >= GFX11) { + bld.sop2(aco_opcode::s_pack_hl_b32_b16, Definition(dst), src1, src0); + break; + } + amount = bld.copy(bld.def(s1), Operand::c32(camount)); + } else if (get_alu_src_ub(ctx, instr, 2) >= 32) { + amount = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), + get_alu_src(ctx, instr->src[2]), Operand::c32(0x1f)); + } else { + amount = get_alu_src(ctx, instr->src[2]); + } + + Temp src = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), src1, src0); + + Temp res = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), src, amount); + bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), res, Operand::zero()); + } else if (dst.regClass() == v1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbit_b32, dst, false, 3u); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_alignbyte_amd: { + if (dst.regClass() == v1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbyte_b32, dst, false, 3u); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_fquantize2f16: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v1) { + Temp f16; + if (ctx->block->fp_mode.round16_64 != fp_round_ne) + f16 = bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, bld.def(v2b), src); 
+ else + f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src); + + if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) { + bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), f16); + break; + } + + Temp denorm_zero; + Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); + if (ctx->program->gfx_level >= GFX8) { + /* value is negative/positive denormal value/zero */ + Instruction* tmp0 = + bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, Operand::c32(0x30)); + tmp0->valu().abs[0] = true; + tmp0->valu().neg[0] = true; + denorm_zero = tmp0->definitions[0].getTemp(); + } else { + /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, + * so compare the result and flush to 0 if it's smaller. + */ + Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u)); + Instruction* tmp0 = + bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest); + tmp0->valu().abs[0] = true; + denorm_zero = tmp0->definitions[0].getTemp(); + } + if (nir_alu_instr_is_signed_zero_preserve(instr)) { + Temp copysign_0 = + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), f32, copysign_0, denorm_zero); + } else { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), f32, Operand::zero(), + denorm_zero); + } + } else if (dst.regClass() == s1) { + Temp f16; + if (ctx->block->fp_mode.round16_64 != fp_round_ne) + f16 = bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, bld.def(s1), src); + else + f16 = bld.sop1(aco_opcode::s_cvt_f16_f32, bld.def(s1), src); + + if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) { + bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), f16); + } else { + Temp f32 = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), f16); + Temp abs_mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff)); + Temp abs = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask); + Operand sign; + if 
(nir_alu_instr_is_signed_zero_preserve(instr)) { + sign = + bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask); + } else { + sign = Operand::c32(0); + } + Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u)); + Temp denorm_zero = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, smallest); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), sign, f32, bld.scc(denorm_zero)); + } + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_bfm: { + Temp bits = get_alu_src(ctx, instr->src[0]); + Temp offset = get_alu_src(ctx, instr->src[1]); + + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset); + } else if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_bitfield_select: { + + /* dst = (insert & bitmask) | (base & ~bitmask) */ + if (dst.regClass() == s1) { + Temp bitmask = get_alu_src(ctx, instr->src[0]); + Temp insert = get_alu_src(ctx, instr->src[1]); + Temp base = get_alu_src(ctx, instr->src[2]); + aco_ptr<Instruction> sop2; + nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src); + nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src); + + if (const_bitmask && ctx->program->gfx_level >= GFX9 && + (const_bitmask->u32 == 0xffff || const_bitmask->u32 == 0xffff0000)) { + if (const_bitmask->u32 == 0xffff) { + bld.sop2(aco_opcode::s_pack_lh_b32_b16, Definition(dst), insert, base); + } else { + bld.sop2(aco_opcode::s_pack_lh_b32_b16, Definition(dst), base, insert); + } + break; + } + + Operand lhs; + if (const_insert && const_bitmask) { + lhs = Operand::c32(const_insert->u32 & const_bitmask->u32); + } else { + insert = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); + lhs = Operand(insert); + } + + Operand rhs; + 
nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src); + if (const_base && const_bitmask) { + rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32); + } else { + base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask); + rhs = Operand(base); + } + + bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs); + + } else if (dst.regClass() == v1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_ubfe: + case nir_op_ibfe: { + if (dst.bytes() != 4) + unreachable("Unsupported BFE bit size"); + + if (dst.type() == RegType::sgpr) { + Temp base = get_alu_src(ctx, instr->src[0]); + + nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src); + nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src); + aco_opcode opcode = + instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32; + if (const_offset && const_bits) { + uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f); + bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract)); + break; + } + + Temp offset = get_alu_src(ctx, instr->src[1]); + Temp bits = get_alu_src(ctx, instr->src[2]); + + if (ctx->program->gfx_level >= GFX9) { + Operand bits_op = const_bits ? 
Operand::c32(const_bits->u32 & 0x1f) + : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), + bld.def(s1, scc), bits, Operand::c32(0x1fu)); + Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op); + bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract); + } else if (instr->op == nir_op_ubfe) { + Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset); + Temp masked = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask); + bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset); + } else { + Operand bits_op = const_bits + ? Operand::c32((const_bits->u32 & 0x1f) << 16) + : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), + bld.def(s1, scc), bits, Operand::c32(0x1fu)), + Operand::c32(16u)); + Operand offset_op = const_offset + ? Operand::c32(const_offset->u32 & 0x1fu) + : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), + offset, Operand::c32(0x1fu)); + + Temp extract = + bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op); + bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract); + } + + } else { + aco_opcode opcode = + instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32; + emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3); + } + break; + } + case nir_op_extract_u8: + case nir_op_extract_i8: + case nir_op_extract_u16: + case nir_op_extract_i16: { + bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8; + unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2; + uint32_t bits = comp == 4 ? 
8 : 16; + unsigned index = nir_src_as_uint(instr->src[1].src); + if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) { + assert(index == 0); + bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == s1 && instr->def.bit_size == 16) { + Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa); + unsigned swizzle = instr->src[0].swizzle[0]; + if (vec.size() > 1) { + vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); + swizzle = swizzle & 1; + } + index += swizzle * instr->def.bit_size / bits; + bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec), + Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed)); + } else if (dst.regClass() == s1) { + Temp src = get_alu_src(ctx, instr->src[0]); + bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(src), + Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed)); + } else if (dst.regClass() == s2) { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_opcode op = is_signed ? 
aco_opcode::s_bfe_i64 : aco_opcode::s_bfe_u64; + Temp extract = bld.copy(bld.def(s1), Operand::c32((bits << 16) | (index * bits))); + bld.sop2(op, Definition(dst), bld.def(s1, scc), src, extract); + } else { + assert(dst.regClass().type() == RegType::vgpr); + Temp src = get_alu_src(ctx, instr->src[0]); + Definition def(dst); + + if (dst.bytes() == 8) { + src = emit_extract_vector(ctx, src, index / comp, v1); + index %= comp; + def = bld.def(v1); + } + + assert(def.bytes() <= 4); + src = emit_extract_vector(ctx, src, 0, def.regClass()); + bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index), + Operand::c32(bits), Operand::c32(is_signed)); + + if (dst.size() == 2) { + Temp lo = def.getTemp(); + Operand hi = Operand::zero(); + if (is_signed) + hi = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31), lo); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + } + } + break; + } + case nir_op_insert_u8: + case nir_op_insert_u16: { + unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2; + uint32_t bits = comp == 4 ? 
8 : 16; + unsigned index = nir_src_as_uint(instr->src[1].src); + if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) { + assert(index == 0); + bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + Temp src = get_alu_src(ctx, instr->src[0]); + Definition def(dst); + bool swap = false; + if (dst.bytes() == 8) { + src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1)); + swap = index >= comp; + index %= comp; + def = bld.def(src.type(), 1); + } + if (def.regClass() == s1) { + bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src), + Operand::c32(index), Operand::c32(bits)); + } else { + src = emit_extract_vector(ctx, src, 0, def.regClass()); + bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index), + Operand::c32(bits)); + } + if (dst.size() == 2 && swap) + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), + def.getTemp()); + else if (dst.size() == 2) + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(), + Operand::zero()); + } + break; + } + case nir_op_bit_count: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src); + } else if (src.regClass() == v1) { + bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero()); + } else if (src.regClass() == v2) { + bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1), + bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), + emit_extract_vector(ctx, src, 0, v1), Operand::zero())); + } else if (src.regClass() == s2) { + bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } + case nir_op_flt: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, + aco_opcode::v_cmp_lt_f64, + ctx->program->gfx_level >= 
GFX11_5 ? aco_opcode::s_cmp_lt_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_fge: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, + aco_opcode::v_cmp_ge_f64, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_fltu: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_nge_f16, aco_opcode::v_cmp_nge_f32, + aco_opcode::v_cmp_nge_f64, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_fgeu: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_nlt_f16, aco_opcode::v_cmp_nlt_f32, + aco_opcode::v_cmp_nlt_f64, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_feq: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, + aco_opcode::v_cmp_eq_f64, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_fneu: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, + aco_opcode::v_cmp_neq_f64, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? 
aco_opcode::s_cmp_neq_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_fequ: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_nlg_f16, aco_opcode::v_cmp_nlg_f32, + aco_opcode::v_cmp_nlg_f64, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_fneo: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_lg_f16, aco_opcode::v_cmp_lg_f32, + aco_opcode::v_cmp_lg_f64, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_funord: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_u_f16, aco_opcode::v_cmp_u_f32, aco_opcode::v_cmp_u_f64, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_ford: { + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_o_f16, aco_opcode::v_cmp_o_f32, aco_opcode::v_cmp_o_f64, + ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f16 : aco_opcode::num_opcodes, + ctx->program->gfx_level >= GFX11_5 ? 
aco_opcode::s_cmp_o_f32 : aco_opcode::num_opcodes); + break; + } + case nir_op_ilt: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, + aco_opcode::v_cmp_lt_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_i32); + break; + } + case nir_op_ige: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, + aco_opcode::v_cmp_ge_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_i32); + break; + } + case nir_op_ieq: { + if (instr->src[0].src.ssa->bit_size == 1) + emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); + else + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, + aco_opcode::v_cmp_eq_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_eq_i32, + ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); + break; + } + case nir_op_ine: { + if (instr->src[0].src.ssa->bit_size == 1) + emit_boolean_logic(ctx, instr, Builder::s_xor, dst); + else + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, + aco_opcode::v_cmp_lg_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lg_i32, + ctx->program->gfx_level >= GFX8 ? 
aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); + break; + } + case nir_op_ult: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, + aco_opcode::v_cmp_lt_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_u32); + break; + } + case nir_op_uge: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, + aco_opcode::v_cmp_ge_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_u32); + break; + } + case nir_op_bitz: + case nir_op_bitnz: { + assert(instr->src[0].src.ssa->bit_size != 1); + bool test0 = instr->op == nir_op_bitz; + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr; + if (!use_valu) { + aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64 + : aco_opcode::s_bitcmp1_b32; + if (test0) + op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64 + : aco_opcode::s_bitcmp0_b32; + emit_sopc_instruction(ctx, instr, op, dst); + break; + } + + /* We do not have a VALU version of s_bitcmp. + * But if the second source is constant, we can use + * v_cmp_class_f32's LUT to check the bit. + * The LUT only has 10 entries, so extract a higher byte if we have to. + * For sign bits comparision with 0 is better because v_cmp_class + * can't be inverted. + */ + if (nir_src_is_const(instr->src[1].src)) { + uint32_t bit = nir_alu_src_as_uint(instr->src[1]); + bit &= instr->src[0].src.ssa->bit_size - 1; + src0 = as_vgpr(ctx, src0); + + if (src0.regClass() == v2) { + src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1); + bit &= 31; + } + + if (bit == 31) { + bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst), + Operand::c32(0), src0); + break; + } + + if (bit == 15 && ctx->program->gfx_level >= GFX8) { + bld.vopc(test0 ? 
aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst), + Operand::c32(0), src0); + break; + } + + /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */ + const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11; + const unsigned max_bit = can_sdwa ? 0x8 : 0x9; + const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit; + if (use_opsel) { + src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1), + Operand::c32(16), Operand::c32(0)); + bit &= 0xf; + } + + /* If we can use sdwa the extract is free, while test0's s_not is not. */ + if (bit == 7 && test0 && can_sdwa) { + src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8), + Operand::c32(8), Operand::c32(1)); + bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst), + Operand::c32(0), src0); + break; + } + + if (bit > max_bit) { + src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8), + Operand::c32(8), Operand::c32(0)); + bit &= 0x7; + } + + /* denorm and snan/qnan inputs are preserved using all float control modes. */ + static const struct { + uint32_t fp32; + uint32_t fp16; + bool negate; + } float_lut[10] = { + {0x7f800001, 0x7c01, false}, /* snan */ + {~0u, ~0u, false}, /* qnan */ + {0xff800000, 0xfc00, false}, /* -inf */ + {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */ + {1, 1, true}, /* -denormal */ + {0, 0, true}, /* -0.0 */ + {0, 0, false}, /* +0.0 */ + {1, 1, false}, /* +denormal */ + {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */ + {0x7f800000, 0x7c00, false}, /* +inf */ + }; + + Temp tmp = test0 ? bld.tmp(bld.lm) : dst; + /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */ + const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) || + (ctx->program->gfx_level >= GFX11 && use_opsel); + const aco_opcode op = use_fp16 ? 
aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32; + const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32; + + VALU_instruction& res = + bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu(); + if (float_lut[bit].negate) { + res.format = asVOP3(res.format); + res.neg[0] = true; + } + + if (test0) + bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp); + + break; + } + + Temp res; + aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32; + if (instr->src[0].src.ssa->bit_size == 16) { + op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16; + if (ctx->program->gfx_level < GFX10) + res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1)); + else + res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1)); + + res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res); + } else if (instr->src[0].src.ssa->bit_size == 32) { + res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1)); + } else if (instr->src[0].src.ssa->bit_size == 64) { + if (ctx->program->gfx_level < GFX8) + res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1); + else + res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0); + + res = emit_extract_vector(ctx, res, 0, v1); + res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res); + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + bld.vopc(op, Definition(dst), Operand::c32(0), res); + break; + } + default: isel_err(&instr->instr, "Unknown NIR ALU instr"); + } +} + +} // namespace aco diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build index 6a974b7825b..307fedd9530 100644 --- a/src/amd/compiler/meson.build +++ b/src/amd/compiler/meson.build @@ -35,6 +35,7 @@ libaco_files = files( 'instruction_selection/aco_isel_cfg.cpp', 'instruction_selection/aco_isel_helpers.cpp', 
'instruction_selection/aco_isel_setup.cpp', + 'instruction_selection/aco_select_nir_alu.cpp', 'instruction_selection/aco_select_nir.cpp', 'instruction_selection/aco_select_ps_epilog.cpp', 'instruction_selection/aco_select_ps_prolog.cpp',