ir3: lower vectorized NIR instructions

Use the new repeat group builders to lower vectorized NIR instructions.
Run a NIR vectorization pass right before lowering.

Support for repeated instructions is added over a number of different
commits. Here's how they all tie together:

ir3 is a scalar architecture and as such most instructions cannot be
vectorized. However, many instructions support the (rptN) modifier, which
allows us to mimic vector instructions. Whenever an instruction has the
(rptN) modifier set, it executes N more times, incrementing its
destination register for each repetition. Additionally, source registers
with the (r) flag set are incremented as well.

For example:

(rpt1)add.f r0.x, (r)r1.x, r2.x

is the same as:

add.f r0.x, r1.x, r2.x
add.f r0.y, r1.y, r2.x

The main benefit of using repeated instructions is a reduction in code
size. Since every iteration is still executed as a scalar instruction,
there's no direct benefit in terms of runtime. The only exception seems
to be for 3-source instructions pre-a7xx: if one of the sources is
constant (i.e., without the (r) flag), a repeated instruction executes
faster than the equivalent expanded sequence. Presumably, this is
because the ALU only has 2 register read ports. I have not been able to
measure this difference on a7xx though.

Support for repeated instructions consists of two parts. First, we need
to make sure NIR is (mostly) vectorized when translating to ir3. I have
not been able to find a way to keep NIR vectorized all the way and still
generate decent code. Therefore, I have taken the approach of
vectorizing the (scalarized) NIR right before translating it to ir3.
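
Concretely, the new step lives in ir3_context_init() (see the
ir3_context.c hunk below) and roughly boils down to the following; the
extra cleanup passes only run when something was actually vectorized:

bool vectorized = false;
NIR_PASS(vectorized, ctx->s, nir_opt_vectorize, ir3_nir_vectorize_filter,
         NULL);
if (vectorized) {
   /* vectorization can leave dead or undef-only scalar defs behind */
   NIR_PASS_V(ctx->s, nir_opt_undef);
   NIR_PASS_V(ctx->s, nir_copy_prop);
   NIR_PASS_V(ctx->s, nir_opt_dce);
}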

Second, ir3 needs to be adapted to ingest vectorized NIR and translate
it to repeated instructions. To this end, I have introduced the concept
of "repeat groups" to ir3. A repeat group is a group of instructions
that were produced from a vectorized NIR operation and linked together.
They are, however, still separate scalar instructions until quite late.
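
The builders pass such a group around as a small by-value struct. Its
definition is not part of this diff, but judging from its usage below
(dst.rpts[i], ARRAY_SIZE(src[0].rpts)) it is essentially an array of
per-component scalar instructions, along the lines of this sketch (the
array size is only a placeholder, not the real value):

struct ir3_instruction_rpt {
   /* one scalar instruction per vector component */
   struct ir3_instruction *rpts[16];
};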

More concretely:
1. Instruction emission: for every vectorized NIR operation, emit
   separate scalar instructions for its components and link them
   together in a repeat group. For every instruction builder ir3_X, a
   new repeat builder ir3_X_rpt has been added to facilitate this.
2. Optimization passes: for now, repeat groups are completely ignored by
   optimizations.
3. Pre-RA: clean up repeat groups that can never be merged into an
   actual rptN instruction (e.g., because their instructions are not
   consecutive anymore). This ensures no useless merge sets will be
   created in the next step.
4. RA: create merge sets for the sources and defs of the instructions in
   repeat groups. This way, RA will try to allocate consecutive
   registers for them. This is not forced, though, because we prefer
   splitting up repeat groups over creating movs to reorder registers.
5. Post-RA: create actual rptN instructions for repeat groups where the
   allocated registers allow it.

The idea behind step 2 is that we prefer any potential optimizations to
take precedence over creating rptN instructions, as the latter only
yield a code size benefit. However, it might be interesting to
investigate if we could make some optimizations repeat aware. For
example, the scheduler could try to schedule instructions of a repeat
group together.
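
As an illustration of step 1, lowering a vectorized fadd now goes
through a single repeat builder call (taken from the emit_alu() hunk
below). dst and src[i] are ir3_instruction_rpt values and the 0
arguments are per-source flags; the builder emits dst_sz scalar add.f
instructions and links them into one repeat group:

case nir_op_fadd:
   dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
   break;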

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28341>

@ -39,6 +39,83 @@
#include "ir3.h"
#include "ir3_context.h"
static struct ir3_instruction_rpt
rpt_instr(struct ir3_instruction *instr, unsigned nrpt)
{
struct ir3_instruction_rpt dst = {{0}};
for (unsigned i = 0; i < nrpt; ++i)
dst.rpts[i] = instr;
return dst;
}
static void
cp_instrs(struct ir3_instruction *dst[], struct ir3_instruction *instrs[],
unsigned n)
{
for (unsigned i = 0; i < n; ++i)
dst[i] = instrs[i];
}
static struct ir3_instruction_rpt
create_immed_rpt(struct ir3_block *block, unsigned nrpt, unsigned val)
{
return rpt_instr(create_immed(block, val), nrpt);
}
static struct ir3_instruction_rpt
create_immed_shared_rpt(struct ir3_block *block, unsigned nrpt, uint32_t val,
bool shared)
{
return rpt_instr(create_immed_shared(block, val, shared), nrpt);
}
static struct ir3_instruction_rpt
create_immed_typed_rpt(struct ir3_block *block, unsigned nrpt, unsigned val,
type_t type)
{
return rpt_instr(create_immed_typed(block, val, type), nrpt);
}
static inline struct ir3_instruction_rpt
create_immed_typed_shared_rpt(struct ir3_block *block, unsigned nrpt,
uint32_t val, type_t type, bool shared)
{
return rpt_instr(create_immed_typed_shared(block, val, type, shared), nrpt);
}
static void
set_instr_flags(struct ir3_instruction *instrs[], unsigned n,
ir3_instruction_flags flags)
{
for (unsigned i = 0; i < n; ++i)
instrs[i]->flags |= flags;
}
static void
set_cat1_round(struct ir3_instruction *instrs[], unsigned n, round_t round)
{
for (unsigned i = 0; i < n; ++i)
instrs[i]->cat1.round = round;
}
static void
set_cat2_condition(struct ir3_instruction *instrs[], unsigned n,
unsigned condition)
{
for (unsigned i = 0; i < n; ++i)
instrs[i]->cat2.condition = condition;
}
static void
set_dst_flags(struct ir3_instruction *instrs[], unsigned n,
ir3_register_flags flags)
{
for (unsigned i = 0; i < n; ++i)
instrs[i]->dsts[0]->flags |= flags;
}
void
ir3_handle_nonuniform(struct ir3_instruction *instr,
nir_intrinsic_instr *intrin)
@ -135,9 +212,9 @@ create_driver_param_indirect(struct ir3_context *ctx, enum ir3_driver_param dp,
* alu/sfu instructions:
*/
static struct ir3_instruction *
create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
unsigned src_bitsize, nir_op op)
static struct ir3_instruction_rpt
create_cov(struct ir3_context *ctx, unsigned nrpt,
struct ir3_instruction_rpt src, unsigned src_bitsize, nir_op op)
{
type_t src_type, dst_type;
@ -277,9 +354,11 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
* is used to achieve the result.
*/
if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_U32) {
struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
struct ir3_instruction *cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
cov->dsts[0]->flags |= type_flags(dst_type);
struct ir3_instruction_rpt mask =
create_immed_typed_rpt(ctx->block, nrpt, 0xff, TYPE_U8);
struct ir3_instruction_rpt cov =
ir3_AND_B_rpt(ctx->block, nrpt, src, 0, mask, 0);
set_dst_flags(cov.rpts, nrpt, type_flags(dst_type));
return cov;
}
@ -291,15 +370,16 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
assert(op == nir_op_u2f16 || op == nir_op_i2f16 ||
op == nir_op_u2f32 || op == nir_op_i2f32);
struct ir3_instruction *cov;
struct ir3_instruction_rpt cov;
if (op == nir_op_u2f16 || op == nir_op_u2f32) {
struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
cov->dsts[0]->flags |= IR3_REG_HALF;
cov = ir3_COV(ctx->block, cov, TYPE_U16, dst_type);
struct ir3_instruction_rpt mask =
create_immed_typed_rpt(ctx->block, nrpt, 0xff, TYPE_U8);
cov = ir3_AND_B_rpt(ctx->block, nrpt, src, 0, mask, 0);
set_dst_flags(cov.rpts, nrpt, IR3_REG_HALF);
cov = ir3_COV_rpt(ctx->block, nrpt, cov, TYPE_U16, dst_type);
} else {
cov = ir3_COV(ctx->block, src, TYPE_U8, TYPE_S16);
cov = ir3_COV(ctx->block, cov, TYPE_S16, dst_type);
cov = ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U8, TYPE_S16);
cov = ir3_COV_rpt(ctx->block, nrpt, cov, TYPE_S16, dst_type);
}
return cov;
}
@ -312,17 +392,19 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
assert(op == nir_op_f2u8 || op == nir_op_f2i8);
type_t intermediate_type = op == nir_op_f2u8 ? TYPE_U16 : TYPE_S16;
struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, intermediate_type);
cov = ir3_COV(ctx->block, cov, intermediate_type, TYPE_U8);
struct ir3_instruction_rpt cov =
ir3_COV_rpt(ctx->block, nrpt, src, src_type, intermediate_type);
cov = ir3_COV_rpt(ctx->block, nrpt, cov, intermediate_type, TYPE_U8);
return cov;
}
struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
struct ir3_instruction_rpt cov =
ir3_COV_rpt(ctx->block, nrpt, src, src_type, dst_type);
if (op == nir_op_f2f16_rtne) {
cov->cat1.round = ROUND_EVEN;
set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
} else if (op == nir_op_f2f16_rtz) {
cov->cat1.round = ROUND_ZERO;
set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
} else if (dst_type == TYPE_F16 || dst_type == TYPE_F32) {
unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
nir_alu_type type =
@ -330,23 +412,23 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
nir_rounding_mode rounding_mode =
nir_get_rounding_mode_from_float_controls(execution_mode, type);
if (rounding_mode == nir_rounding_mode_rtne)
cov->cat1.round = ROUND_EVEN;
set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
else if (rounding_mode == nir_rounding_mode_rtz)
cov->cat1.round = ROUND_ZERO;
set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
}
return cov;
}
/* For shift instructions NIR always has shift amount as 32 bit integer */
static struct ir3_instruction *
resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
unsigned bs)
static struct ir3_instruction_rpt
resize_shift_amount(struct ir3_context *ctx, unsigned nrpt,
struct ir3_instruction_rpt src, unsigned bs)
{
if (bs == 16)
return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
return ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U32, TYPE_U16);
else if (bs == 8)
return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U8);
return ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U32, TYPE_U8);
else
return src;
}
@ -452,18 +534,52 @@ emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
}
}
static bool
all_sat_compatible(struct ir3_instruction *instrs[], unsigned n)
{
for (unsigned i = 0; i < n; i++) {
if (!is_sat_compatible(instrs[i]->opc))
return false;
}
return true;
}
/* Is src the only use of its def, taking components into account. */
static bool
is_unique_use(nir_src *src)
{
nir_def *def = src->ssa;
if (list_is_singular(&def->uses))
return true;
nir_component_mask_t src_read_mask = nir_src_components_read(src);
nir_foreach_use (use, def) {
if (use == src)
continue;
if (nir_src_components_read(use) & src_read_mask)
return false;
}
return true;
}
static void
emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
{
const nir_op_info *info = &nir_op_infos[alu->op];
struct ir3_instruction **dst, *src[info->num_inputs];
struct ir3_instruction_rpt dst, src[info->num_inputs];
unsigned bs[info->num_inputs]; /* bit size */
struct ir3_block *b = ctx->block;
unsigned dst_sz, wrmask;
unsigned dst_sz;
type_t dst_type = type_uint_size(alu->def.bit_size);
dst_sz = alu->def.num_components;
wrmask = (1 << dst_sz) - 1;
assert(dst_sz == 1 || ir3_supports_vectorized_nir_op(alu->op));
assert(dst_sz <= ARRAY_SIZE(src[0].rpts));
bool use_shared = !alu->def.divergent &&
ctx->compiler->has_scalar_alu &&
@ -477,7 +593,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
/* not supported in HW, we have to fall back to normal registers */
alu->op != nir_op_ffma;
dst = ir3_get_def(ctx, &alu->def, dst_sz);
struct ir3_instruction **def = ir3_get_def(ctx, &alu->def, dst_sz);
/* Vectors are special in that they have non-scalarized writemasks,
* and just take the first swizzle channel for each argument in
@ -486,53 +602,39 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
(alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) ||
(alu->op == nir_op_vec16)) {
struct ir3_instruction_rpt src;
assert(info->num_inputs <= ARRAY_SIZE(src.rpts));
for (int i = 0; i < info->num_inputs; i++) {
nir_alu_src *asrc = &alu->src[i];
src[i] = ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
if (!src[i])
src[i] = create_immed_typed_shared(ctx->block, 0, dst_type, use_shared);
dst[i] = ir3_MOV(b, src[i], dst_type);
src.rpts[i] =
ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
compile_assert(ctx, src.rpts[i]);
}
dst = ir3_MOV_rpt(b, dst_sz, src, dst_type);
cp_instrs(def, dst.rpts, dst_sz);
ir3_put_def(ctx, &alu->def);
return;
}
/* We also get mov's with more than one component for mov's so
* handle those specially:
*/
if (alu->op == nir_op_mov) {
nir_alu_src *asrc = &alu->src[0];
struct ir3_instruction *const *src0 =
for (int i = 0; i < info->num_inputs; i++) {
nir_alu_src *asrc = &alu->src[i];
struct ir3_instruction *const *input_src =
ir3_get_src_shared(ctx, &asrc->src, use_shared);
for (unsigned i = 0; i < dst_sz; i++) {
if (wrmask & (1 << i)) {
dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
} else {
dst[i] = NULL;
}
}
ir3_put_def(ctx, &alu->def);
return;
}
/* General case: We can just grab the one used channel per src. */
assert(alu->def.num_components == 1);
for (int i = 0; i < info->num_inputs; i++) {
nir_alu_src *asrc = &alu->src[i];
src[i] = ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
bs[i] = nir_src_bit_size(asrc->src);
compile_assert(ctx, src[i]);
for (unsigned rpt = 0; rpt < dst_sz; rpt++) {
src[i].rpts[rpt] = input_src[asrc->swizzle[rpt]];
compile_assert(ctx, src[i].rpts[rpt]);
}
}
switch (alu->op) {
case nir_op_mov:
dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
break;
case nir_op_f2f32:
case nir_op_f2f16_rtne:
case nir_op_f2f16_rtz:
@ -558,11 +660,12 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
case nir_op_b2i8:
case nir_op_b2i16:
case nir_op_b2i32:
dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
dst = create_cov(ctx, dst_sz, src[0], bs[0], alu->op);
break;
case nir_op_fquantize2f16:
dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
dst = create_cov(ctx, dst_sz,
create_cov(ctx, dst_sz, src[0], 32, nir_op_f2f16_rtne),
16, nir_op_f2f32);
break;
@ -574,7 +677,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
*
* A negate can turn those into a 1 or 0 for us.
*/
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
break;
case nir_op_b2b32:
@ -583,20 +686,20 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
*
* A negate can turn those into a ~0 for us.
*/
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
break;
case nir_op_fneg:
dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FNEG);
break;
case nir_op_fabs:
dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FABS);
break;
case nir_op_fmax:
dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_fmin:
dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
dst = ir3_MIN_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_fsat:
/* if there is just a single use of the src, and it supports
@ -604,222 +707,235 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
* src instruction and create a mov. This is easier for cp
* to eliminate.
*/
if (is_sat_compatible(src[0]->opc) &&
(list_length(&alu->src[0].src.ssa->uses) == 1)) {
src[0]->flags |= IR3_INSTR_SAT;
dst[0] = ir3_MOV(b, src[0], dst_type);
if (all_sat_compatible(src[0].rpts, dst_sz) &&
is_unique_use(&alu->src[0].src)) {
set_instr_flags(src[0].rpts, dst_sz, IR3_INSTR_SAT);
dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
} else {
/* otherwise generate a max.f that saturates.. blob does
* similar (generating a cat2 mov using max.f)
*/
dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[0], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
}
break;
case nir_op_fmul:
dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
dst = ir3_MUL_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_fadd:
dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_fsub:
dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], IR3_REG_FNEG);
break;
case nir_op_ffma:
dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
break;
case nir_op_flt:
dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_LT;
dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
break;
case nir_op_fge:
dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_GE;
dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
break;
case nir_op_feq:
dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_EQ;
dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
break;
case nir_op_fneu:
dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_NE;
dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
break;
case nir_op_fceil:
dst[0] = ir3_CEIL_F(b, src[0], 0);
dst = ir3_CEIL_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_ffloor:
dst[0] = ir3_FLOOR_F(b, src[0], 0);
dst = ir3_FLOOR_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_ftrunc:
dst[0] = ir3_TRUNC_F(b, src[0], 0);
dst = ir3_TRUNC_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fround_even:
dst[0] = ir3_RNDNE_F(b, src[0], 0);
dst = ir3_RNDNE_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fsign:
dst[0] = ir3_SIGN_F(b, src[0], 0);
dst = ir3_SIGN_F_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fsin:
dst[0] = ir3_SIN(b, src[0], 0);
dst = ir3_SIN_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fcos:
dst[0] = ir3_COS(b, src[0], 0);
dst = ir3_COS_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_frsq:
dst[0] = ir3_RSQ(b, src[0], 0);
dst = ir3_RSQ_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_frcp:
dst[0] = ir3_RCP(b, src[0], 0);
assert(dst_sz == 1);
dst.rpts[0] = ir3_RCP(b, src[0].rpts[0], 0);
break;
case nir_op_flog2:
dst[0] = ir3_LOG2(b, src[0], 0);
dst = ir3_LOG2_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fexp2:
dst[0] = ir3_EXP2(b, src[0], 0);
dst = ir3_EXP2_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_fsqrt:
dst[0] = ir3_SQRT(b, src[0], 0);
dst = ir3_SQRT_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_iabs:
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SABS);
break;
case nir_op_iadd:
dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ihadd:
dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
dst[0]->dsts[0]->flags |= IR3_REG_EI;
dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
break;
case nir_op_uhadd:
dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
dst[0]->dsts[0]->flags |= IR3_REG_EI;
dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
break;
case nir_op_iand:
dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
dst = ir3_AND_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_imax:
dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
dst = ir3_MAX_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_umax:
dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
dst = ir3_MAX_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_imin:
dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
dst = ir3_MIN_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_umin:
dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
dst = ir3_MIN_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_umul_low:
dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
dst = ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_imadsh_mix16:
if (use_shared) {
struct ir3_instruction *sixteen = create_immed_shared(b, 16, true);
struct ir3_instruction *src1 = ir3_SHR_B(b, src[1], 0, sixteen, 0);
struct ir3_instruction *mul = ir3_MULL_U(b, src[0], 0, src1, 0);
dst[0] = ir3_ADD_U(b, ir3_SHL_B(b, mul, 0, sixteen, 0), 0, src[2], 0);
struct ir3_instruction_rpt sixteen =
create_immed_shared_rpt(b, dst_sz, 16, true);
struct ir3_instruction_rpt src1 =
ir3_SHR_B_rpt(b, dst_sz, src[1], 0, sixteen, 0);
struct ir3_instruction_rpt mul =
ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src1, 0);
dst = ir3_ADD_U_rpt(b, dst_sz,
ir3_SHL_B_rpt(b, dst_sz, mul, 0, sixteen, 0), 0,
src[2], 0);
} else {
dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
dst = ir3_MADSH_M16_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
}
break;
case nir_op_imad24_ir3:
if (use_shared) {
dst[0] = ir3_ADD_U(b, ir3_MUL_U24(b, src[0], 0, src[1], 0), 0, src[2], 0);
dst = ir3_ADD_U_rpt(b, dst_sz,
ir3_MUL_U24_rpt(b, dst_sz, src[0], 0, src[1], 0),
0, src[2], 0);
} else {
dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
dst = ir3_MAD_S24_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
}
break;
case nir_op_imul:
compile_assert(ctx, alu->def.bit_size == 8 || alu->def.bit_size == 16);
dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_imul24:
dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ineg:
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
break;
case nir_op_inot:
if (bs[0] == 1) {
struct ir3_instruction *one =
create_immed_typed_shared(ctx->block, 1, ctx->compiler->bool_type,
use_shared);
dst[0] = ir3_SUB_U(b, one, 0, src[0], 0);
struct ir3_instruction_rpt one = create_immed_typed_shared_rpt(
ctx->block, dst_sz, 1, ctx->compiler->bool_type, use_shared);
dst = ir3_SUB_U_rpt(b, dst_sz, one, 0, src[0], 0);
} else {
dst[0] = ir3_NOT_B(b, src[0], 0);
dst = ir3_NOT_B_rpt(ctx->block, dst_sz, src[0], 0);
}
break;
case nir_op_ior:
dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
dst = ir3_OR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ishl:
dst[0] =
ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
dst = ir3_SHL_B_rpt(ctx->block, dst_sz, src[0], 0,
resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
break;
case nir_op_ishr:
dst[0] =
ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
dst = ir3_ASHR_B_rpt(ctx->block, dst_sz, src[0], 0,
resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
break;
case nir_op_isub:
dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ixor:
dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
dst = ir3_XOR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
break;
case nir_op_ushr:
dst[0] =
ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
dst = ir3_SHR_B_rpt(ctx->block, dst_sz, src[0], 0,
resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
break;
case nir_op_ilt:
dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_LT;
dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
break;
case nir_op_ige:
dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_GE;
dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
break;
case nir_op_ieq:
dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_EQ;
dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
break;
case nir_op_ine:
dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_NE;
dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
break;
case nir_op_ult:
dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_LT;
dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
break;
case nir_op_uge:
dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
dst[0]->cat2.condition = IR3_COND_GE;
dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
break;
case nir_op_bcsel: {
struct ir3_instruction *cond = ir3_get_cond_for_nonzero_compare(src[0]);
struct ir3_instruction_rpt conds;
compile_assert(ctx, bs[1] == bs[2]);
/* TODO: repeat the covs when possible. */
for (unsigned rpt = 0; rpt < dst_sz; ++rpt) {
struct ir3_instruction *cond =
ir3_get_cond_for_nonzero_compare(src[0].rpts[rpt]);
/* The condition's size has to match the other two arguments' size, so
* convert down if necessary.
*
* Single hashtable is fine, because the conversion will either be
* 16->32 or 32->16, but never both
*/
if (is_half(src[1]) != is_half(cond)) {
struct hash_entry *prev_entry =
_mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
if (is_half(src[1].rpts[rpt]) != is_half(cond)) {
struct hash_entry *prev_entry = _mesa_hash_table_search(
ctx->sel_cond_conversions, src[0].rpts[rpt]);
if (prev_entry) {
cond = prev_entry->data;
} else {
if (is_half(cond)) {
if (bs[0] == 8) {
/* Zero-extension of an 8-bit value has to be done through masking,
* as in create_cov.
/* Zero-extension of an 8-bit value has to be done through
* masking, as in create_cov.
*/
struct ir3_instruction *mask = create_immed_typed(b, 0xff, TYPE_U8);
struct ir3_instruction *mask =
create_immed_typed(b, 0xff, TYPE_U8);
cond = ir3_AND_B(b, cond, 0, mask, 0);
} else {
cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
@ -827,86 +943,98 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
} else {
cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
}
_mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
_mesa_hash_table_insert(ctx->sel_cond_conversions,
src[0].rpts[rpt], cond);
}
}
conds.rpts[rpt] = cond;
}
if (is_half(src[1])) {
dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
} else {
dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
}
if (is_half(src[1].rpts[0]))
dst = ir3_SEL_B16_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
else
dst = ir3_SEL_B32_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
break;
}
case nir_op_bit_count: {
if (ctx->compiler->gen < 5 || (src[0]->dsts[0]->flags & IR3_REG_HALF)) {
dst[0] = ir3_CBITS_B(b, src[0], 0);
if (ctx->compiler->gen < 5 ||
(src[0].rpts[0]->dsts[0]->flags & IR3_REG_HALF)) {
dst = ir3_CBITS_B_rpt(b, dst_sz, src[0], 0);
break;
}
// We need to do this 16b at a time on a5xx+a6xx. Once half-precision
// support is in place, this should probably move to a NIR lowering pass:
struct ir3_instruction *hi, *lo;
struct ir3_instruction_rpt hi, lo;
hi = ir3_COV(b,
ir3_SHR_B(b, src[0], 0, create_immed_shared(b, 16, use_shared), 0),
hi = ir3_COV_rpt(
b, dst_sz,
ir3_SHR_B_rpt(b, dst_sz, src[0], 0,
create_immed_shared_rpt(b, dst_sz, 16, use_shared), 0),
TYPE_U32, TYPE_U16);
lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
lo = ir3_COV_rpt(b, dst_sz, src[0], TYPE_U32, TYPE_U16);
hi = ir3_CBITS_B(b, hi, 0);
lo = ir3_CBITS_B(b, lo, 0);
hi = ir3_CBITS_B_rpt(b, dst_sz, hi, 0);
lo = ir3_CBITS_B_rpt(b, dst_sz, lo, 0);
// TODO maybe the builders should default to making dst half-precision
// if the src's were half precision, to make this less awkward.. otoh
// we should probably just do this lowering in NIR.
hi->dsts[0]->flags |= IR3_REG_HALF;
lo->dsts[0]->flags |= IR3_REG_HALF;
set_dst_flags(hi.rpts, dst_sz, IR3_REG_HALF);
set_dst_flags(lo.rpts, dst_sz, IR3_REG_HALF);
dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
dst[0]->dsts[0]->flags |= IR3_REG_HALF;
dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
dst = ir3_ADD_S_rpt(b, dst_sz, hi, 0, lo, 0);
set_dst_flags(dst.rpts, dst_sz, IR3_REG_HALF);
dst = ir3_COV_rpt(b, dst_sz, dst, TYPE_U16, TYPE_U32);
break;
}
case nir_op_ifind_msb: {
struct ir3_instruction *cmp;
dst[0] = ir3_CLZ_S(b, src[0], 0);
cmp = ir3_CMPS_S(b, dst[0], 0, create_immed_shared(b, 0, use_shared), 0);
cmp->cat2.condition = IR3_COND_GE;
dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed_shared(b, 31, use_shared), 0,
dst[0], 0),
0, cmp, 0, dst[0], 0);
struct ir3_instruction_rpt cmp;
dst = ir3_CLZ_S_rpt(b, dst_sz, src[0], 0);
cmp =
ir3_CMPS_S_rpt(b, dst_sz, dst, 0,
create_immed_shared_rpt(b, dst_sz, 0, use_shared), 0);
set_cat2_condition(cmp.rpts, dst_sz, IR3_COND_GE);
dst = ir3_SEL_B32_rpt(
b, dst_sz,
ir3_SUB_U_rpt(b, dst_sz,
create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
dst, 0),
0, cmp, 0, dst, 0);
break;
}
case nir_op_ufind_msb:
dst[0] = ir3_CLZ_B(b, src[0], 0);
dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed_shared(b, 31, use_shared), 0,
dst[0], 0),
0, src[0], 0, dst[0], 0);
dst = ir3_CLZ_B_rpt(b, dst_sz, src[0], 0);
dst = ir3_SEL_B32_rpt(
b, dst_sz,
ir3_SUB_U_rpt(b, dst_sz,
create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
dst, 0),
0, src[0], 0, dst, 0);
break;
case nir_op_find_lsb:
dst[0] = ir3_BFREV_B(b, src[0], 0);
dst[0] = ir3_CLZ_B(b, dst[0], 0);
dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
dst = ir3_CLZ_B_rpt(b, dst_sz, dst, 0);
break;
case nir_op_bitfield_reverse:
dst[0] = ir3_BFREV_B(b, src[0], 0);
dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
break;
case nir_op_uadd_sat:
dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
break;
case nir_op_iadd_sat:
dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
break;
case nir_op_usub_sat:
dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
break;
case nir_op_isub_sat:
dst[0] = ir3_SUB_S(b, src[0], 0, src[1], 0);
dst[0]->flags |= IR3_INSTR_SAT;
dst = ir3_SUB_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
break;
case nir_op_udot_4x8_uadd:
@ -915,10 +1043,15 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
case nir_op_sdot_4x8_iadd_sat:
case nir_op_sudot_4x8_iadd:
case nir_op_sudot_4x8_iadd_sat: {
assert(dst_sz == 1);
struct ir3_instruction *src_rpt0[] = {src[0].rpts[0], src[1].rpts[0],
src[2].rpts[0]};
if (ctx->compiler->has_dp4acc) {
emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst, src);
emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst.rpts, src_rpt0);
} else if (ctx->compiler->has_dp2acc) {
emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst, src);
emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst.rpts, src_rpt0);
} else {
ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
nir_op_infos[alu->op].name);
@ -935,12 +1068,12 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
assert(alu->def.bit_size == 1 || alu->op == nir_op_b2b32);
assert(dst_sz == 1);
} else {
/* 1-bit values stored in 32-bit registers are only valid for certain
* ALU ops.
*/
switch (alu->op) {
case nir_op_mov:
case nir_op_iand:
case nir_op_ior:
case nir_op_ixor:
@ -952,6 +1085,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
}
}
cp_instrs(def, dst.rpts, dst_sz);
ir3_put_def(ctx, &alu->def);
}
@ -2365,6 +2499,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
struct ir3_block *b = ctx->block;
unsigned dest_components = nir_intrinsic_dest_components(intr);
int idx;
bool create_rpt = false;
if (info->has_dest) {
dst = ir3_get_def(ctx, &intr->def, dest_components);
@ -2443,6 +2578,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
b, idx + i,
intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32);
}
create_rpt = true;
} else {
src = ctx->compiler->has_scalar_alu ?
ir3_get_src_maybe_shared(ctx, &intr->src[0]) :
@ -2708,6 +2844,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
unsigned n = idx * 4 + i;
dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
}
create_rpt = true;
break;
case nir_intrinsic_load_front_face:
if (!ctx->frag_face) {
@ -2749,16 +2886,19 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_num_workgroups:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_workgroup_size:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_subgroup_size: {
assert(ctx->so->type == MESA_SHADER_COMPUTE ||
@ -2784,11 +2924,13 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_OUTER_LEVEL_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_tess_level_inner_default:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_INNER_LEVEL_X + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_frag_invocation_count:
dst[0] = create_driver_param(ctx, IR3_DP_FS_FRAG_INVOCATION_COUNT);
@ -2803,6 +2945,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, param + 4 * view + i);
}
create_rpt = true;
} else {
struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
for (int i = 0; i < dest_components; i++) {
@ -3131,8 +3274,11 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
break;
}
if (info->has_dest)
if (info->has_dest) {
if (create_rpt)
ir3_instr_create_rpt(dst, dest_components);
ir3_put_def(ctx, &intr->def);
}
}
static void
@ -4042,16 +4188,17 @@ static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
* instead of adding an explicit not.b/sub.u instruction.
*/
static struct ir3_instruction *
get_branch_condition(struct ir3_context *ctx, nir_src *src, bool *inv)
get_branch_condition(struct ir3_context *ctx, nir_src *src, unsigned comp,
bool *inv)
{
struct ir3_instruction *condition = ir3_get_src(ctx, src)[0];
struct ir3_instruction *condition = ir3_get_src(ctx, src)[comp];
if (src->ssa->parent_instr->type == nir_instr_type_alu) {
nir_alu_instr *nir_cond = nir_instr_as_alu(src->ssa->parent_instr);
if (nir_cond->op == nir_op_inot) {
struct ir3_instruction *inv_cond =
get_branch_condition(ctx, &nir_cond->src[0].src, inv);
struct ir3_instruction *inv_cond = get_branch_condition(
ctx, &nir_cond->src[0].src, nir_cond->src[0].swizzle[comp], inv);
*inv = !*inv;
return inv_cond;
}
@ -4087,10 +4234,10 @@ fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
return NULL;
bool inv1, inv2;
struct ir3_instruction *cond1 =
get_branch_condition(ctx, &alu_cond->src[0].src, &inv1);
struct ir3_instruction *cond2 =
get_branch_condition(ctx, &alu_cond->src[1].src, &inv2);
struct ir3_instruction *cond1 = get_branch_condition(
ctx, &alu_cond->src[0].src, alu_cond->src[0].swizzle[0], &inv1);
struct ir3_instruction *cond2 = get_branch_condition(
ctx, &alu_cond->src[1].src, alu_cond->src[1].swizzle[0], &inv2);
struct ir3_instruction *branch;
if (alu_cond->op == nir_op_iand) {
@ -4224,7 +4371,7 @@ emit_predicated_branch(struct ir3_context *ctx, nir_if *nif)
bool inv;
struct ir3_instruction *condition =
get_branch_condition(ctx, &nif->condition, &inv);
get_branch_condition(ctx, &nif->condition, 0, &inv);
struct ir3_instruction *pred, *pred_inv;
if (!inv) {
@ -4253,7 +4400,8 @@ emit_conditional_branch(struct ir3_context *ctx, nir_if *nif)
return predicated;
bool inv1;
struct ir3_instruction *cond1 = get_branch_condition(ctx, nir_cond, &inv1);
struct ir3_instruction *cond1 =
get_branch_condition(ctx, nir_cond, 0, &inv1);
struct ir3_instruction *branch =
ir3_BR(ctx->block, cond1, IR3_REG_PREDICATE);
branch->cat0.inv1 = inv1;

@ -121,9 +121,17 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
if ((so->type == MESA_SHADER_FRAGMENT) && compiler->has_fs_tex_prefetch)
NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
NIR_PASS(progress, ctx->s, nir_convert_to_lcssa, true, true);
bool vectorized = false;
NIR_PASS(vectorized, ctx->s, nir_opt_vectorize, ir3_nir_vectorize_filter,
NULL);
NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true);
if (vectorized) {
NIR_PASS_V(ctx->s, nir_opt_undef);
NIR_PASS_V(ctx->s, nir_copy_prop);
NIR_PASS_V(ctx->s, nir_opt_dce);
}
NIR_PASS(progress, ctx->s, nir_convert_to_lcssa, true, true);
/* This has to go at the absolute end to make sure that all SSA defs are
* correctly marked.

@ -61,6 +61,9 @@ void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
unsigned topology);
void ir3_nir_lower_gs(nir_shader *shader);
bool ir3_supports_vectorized_nir_op(nir_op op);
uint8_t ir3_nir_vectorize_filter(const nir_instr *instr, const void *data);
/*
* 64b related lowering:
*/

@ -5,6 +5,52 @@
#include "ir3_nir.h"
bool
ir3_supports_vectorized_nir_op(nir_op op)
{
switch (op) {
/* TODO: emitted as absneg which can often be folded away (e.g., into
* (neg)). This seems to often fail when repeated.
*/
case nir_op_b2b1:
/* dsx/dsy don't seem to support repeat. */
case nir_op_fddx:
case nir_op_fddx_coarse:
case nir_op_fddx_fine:
case nir_op_fddy:
case nir_op_fddy_coarse:
case nir_op_fddy_fine:
/* dp2acc/dp4acc don't seem to support repeat. */
case nir_op_udot_4x8_uadd:
case nir_op_udot_4x8_uadd_sat:
case nir_op_sudot_4x8_iadd:
case nir_op_sudot_4x8_iadd_sat:
/* Among SFU instructions, only rcp doesn't seem to support repeat. */
case nir_op_frcp:
return false;
default:
return true;
}
}
uint8_t
ir3_nir_vectorize_filter(const nir_instr *instr, const void *data)
{
if (instr->type != nir_instr_type_alu)
return 0;
struct nir_alu_instr *alu = nir_instr_as_alu(instr);
if (!ir3_supports_vectorized_nir_op(alu->op))
return 0;
return 4;
}
static void
rpt_list_split(struct list_head *list, struct list_head *at)
{