diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 7eb8ca36209..0a1d937ad7d 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -335,7 +335,14 @@ typedef enum ir3_instruction_flags { * before register assignment is done: */ IR3_INSTR_MARK = BIT(15), - IR3_INSTR_UNUSED = BIT(16), + + /* Used by shared register allocation when creating spill/reload instructions + * to inform validation that this is created by RA. This also may be set on + * an instruction where a spill has been folded into it. + */ + IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK, + + IR3_INSTR_UNUSED = BIT(17), } ir3_instruction_flags; struct ir3_instruction { diff --git a/src/freedreno/ir3/ir3_merge_regs.c b/src/freedreno/ir3/ir3_merge_regs.c index 1cdaad67b0e..7de6aaa4b18 100644 --- a/src/freedreno/ir3/ir3_merge_regs.c +++ b/src/freedreno/ir3/ir3_merge_regs.c @@ -377,6 +377,8 @@ static void aggressive_coalesce_split(struct ir3_liveness *live, struct ir3_instruction *split) { + if (!(split->dsts[0]->flags & IR3_REG_SSA)) + return; try_merge_defs(live, split->srcs[0]->def, split->dsts[0], split->split.off * reg_elem_size(split->dsts[0])); } @@ -409,6 +411,10 @@ create_parallel_copy(struct ir3_block *block) if (phi->opc != OPC_META_PHI) break; + /* Avoid phis we've already colored */ + if (!(phi->dsts[0]->flags & IR3_REG_SSA)) + continue; + /* Avoid undef */ if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && !phi->srcs[pred_idx]->def) @@ -430,6 +436,8 @@ create_parallel_copy(struct ir3_block *block) foreach_instr (phi, &succ->instr_list) { if (phi->opc != OPC_META_PHI) break; + if (!(phi->dsts[0]->flags & IR3_REG_SSA)) + continue; if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && !phi->srcs[pred_idx]->def) continue; @@ -456,6 +464,8 @@ create_parallel_copy(struct ir3_block *block) foreach_instr (phi, &succ->instr_list) { if (phi->opc != OPC_META_PHI) break; + if (!(phi->dsts[0]->flags & IR3_REG_SSA)) + continue; if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && !phi->srcs[pred_idx]->def) continue; diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 070f67ba7ba..3b1b424ae22 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -193,6 +193,8 @@ void ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) { + assert(interval->inserted); + if (interval->parent) { rb_tree_remove(&interval->parent->children, &interval->node); } else { @@ -684,6 +686,8 @@ ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file, struct ra_interval *interval) { assert(!interval->interval.parent); + /* shared live splitting is not allowed! 
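+ * (Shared live ranges are never split; when shared register pressure is too
+ * high, or shared vectors exist, ir3_ra_shared() runs first and spills shared
+ * values to normal registers instead, so no shared interval should ever need
+ * to be shuffled here.)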
*/ + assert(!(interval->interval.reg->flags & IR3_REG_SHARED)); /* Check if we've already moved this reg before */ unsigned pcopy_index; @@ -1665,6 +1669,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr) struct ir3_register *dst = instr->dsts[0]; struct ir3_register *src = instr->srcs[0]; + if (!(dst->flags & IR3_REG_SSA)) + return; + if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) { handle_normal_instr(ctx, instr); return; @@ -1683,6 +1690,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr) static void handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr) { + if (!(instr->dsts[0]->flags & IR3_REG_SSA)) + return; + struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set; unsigned dst_offset = instr->dsts[0]->merge_set_offset; @@ -1798,7 +1808,8 @@ handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr) static void handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr) { - if (instr->dsts[0]->num == INVALID_REG) + if (instr->dsts[0]->num == INVALID_REG || + !(instr->dsts[0]->flags & IR3_REG_SSA)) return; struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); @@ -1829,6 +1840,9 @@ handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr) static void assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr) { + if (!(instr->dsts[0]->flags & IR3_REG_SSA)) + return; + struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); @@ -1973,6 +1987,9 @@ handle_live_out(struct ra_ctx *ctx, struct ir3_register *def) static void handle_phi(struct ra_ctx *ctx, struct ir3_register *def) { + if (!(def->flags & IR3_REG_SSA)) + return; + struct ra_file *file = ra_get_file(ctx, def); struct ra_interval *interval = &ctx->intervals[def->name]; @@ -1999,6 +2016,9 @@ handle_phi(struct ra_ctx *ctx, struct ir3_register *def) static void assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) { + if (!(phi->dsts[0]->flags & IR3_REG_SSA)) + return; + struct ra_file *file = ra_get_file(ctx, phi->dsts[0]); struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name]; assert(!interval->interval.parent); @@ -2085,15 +2105,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval) { physreg_t physreg = ra_interval_get_physreg(interval); - bool shared = interval->interval.reg->flags & IR3_REG_SHARED; - struct ir3_block **predecessors = - shared ? ctx->block->physical_predecessors : ctx->block->predecessors; - unsigned predecessors_count = shared - ? ctx->block->physical_predecessors_count - : ctx->block->predecessors_count; - - for (unsigned i = 0; i < predecessors_count; i++) { - struct ir3_block *pred = predecessors[i]; + for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { + struct ir3_block *pred = ctx->block->predecessors[i]; struct ra_block_state *pred_state = &ctx->blocks[pred->index]; if (!pred_state->visited) @@ -2101,28 +2114,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval) physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg); if (pred_reg != physreg) { + assert(!(interval->interval.reg->flags & IR3_REG_SHARED)); insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg); - - /* This is a bit tricky, but when visiting the destination of a - * physical-only edge, we have two predecessors (the if and the - * header block) and both have multiple successors. 
We pick the - * register for all live-ins from the normal edge, which should - * guarantee that there's no need for shuffling things around in - * the normal predecessor as long as there are no phi nodes, but - * we still may need to insert fixup code in the physical - * predecessor (i.e. the last block of the if) and that has - * another successor (the block after the if) so we need to update - * the renames state for when we process the other successor. This - * crucially depends on the other successor getting processed - * after this. - * - * For normal (non-physical) edges we disallow critical edges so - * that hacks like this aren't necessary. - */ - if (!pred_state->renames) - pred_state->renames = _mesa_pointer_hash_table_create(ctx); - _mesa_hash_table_insert(pred_state->renames, interval->interval.reg, - (void *)(uintptr_t)physreg); } } } @@ -2561,6 +2554,18 @@ ir3_ra(struct ir3_shader_variant *v) ir3_merge_regs(live, v->ir); + bool has_shared_vectors = false; + foreach_block (block, &v->ir->block_list) { + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + if ((dst->flags & IR3_REG_SHARED) && reg_elems(dst) > 1) { + has_shared_vectors = true; + break; + } + } + } + } + struct ir3_pressure max_pressure; ir3_calc_pressure(v, live, &max_pressure); d("max pressure:"); @@ -2590,10 +2595,17 @@ ir3_ra(struct ir3_shader_variant *v) if (ir3_shader_debug & IR3_DBG_SPILLALL) calc_min_limit_pressure(v, live, &limit_pressure); - if (max_pressure.shared > limit_pressure.shared) { - /* TODO shared reg -> normal reg spilling */ - d("shared max pressure exceeded!"); - goto fail; + if (max_pressure.shared > limit_pressure.shared || has_shared_vectors) { + ir3_ra_shared(v, live); + + /* Recalculate liveness and register pressure now that additional values + * have been added. + */ + ralloc_free(live); + live = ir3_calc_liveness(ctx, v->ir); + ir3_calc_pressure(v, live, &max_pressure); + + ir3_debug_print(v->ir, "AFTER: shared register allocation"); } bool spilled = false; @@ -2629,7 +2641,7 @@ ir3_ra(struct ir3_shader_variant *v) foreach_block (block, &v->ir->block_list) handle_block(ctx, block); - ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count); + ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count, false); /* Strip array-ness and SSA-ness at the end, because various helpers still * need to work even on definitions that have already been assigned. For diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index c6837aaae21..1c561a57d76 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -168,8 +168,10 @@ bool ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v, bool ir3_lower_spill(struct ir3 *ir); +void ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness *live); + void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, - unsigned half_size, unsigned block_count); + unsigned half_size, unsigned block_count, bool shared_ra); void ir3_lower_copies(struct ir3_shader_variant *v); diff --git a/src/freedreno/ir3/ir3_ra_validate.c b/src/freedreno/ir3/ir3_ra_validate.c index aab26760ab9..3d19e2b7431 100644 --- a/src/freedreno/ir3/ir3_ra_validate.c +++ b/src/freedreno/ir3/ir3_ra_validate.c @@ -92,13 +92,25 @@ struct reaching_state { struct ra_val_ctx { struct ir3_instruction *current_instr; + /* The current state of the dataflow analysis for the instruction we're + * processing. + */ struct reaching_state reaching; + + /* The state at the end of each basic block. 
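+ * Successor blocks start their propagation from this state when the analysis
+ * is carried across the CFG.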
*/ struct reaching_state *block_reaching; unsigned block_count; + /* When validating shared RA, we have to take spill/reload instructions into + * account. This saves an array of reg_state for the source of each spill + * instruction, to be restored at the corresponding reload(s). + */ + struct hash_table *spill_reaching; + unsigned full_size, half_size; bool merged_regs; + bool shared_ra; bool failed; }; @@ -130,6 +142,28 @@ get_file_size(struct ra_val_ctx *ctx, struct ir3_register *reg) return ctx->half_size; } +static struct reg_state * +get_spill_state(struct ra_val_ctx *ctx, struct ir3_register *dst) +{ + struct hash_entry *entry = _mesa_hash_table_search(ctx->spill_reaching, dst); + if (entry) + return entry->data; + else + return NULL; +} + +static struct reg_state * +get_or_create_spill_state(struct ra_val_ctx *ctx, struct ir3_register *dst) +{ + struct reg_state *state = get_spill_state(ctx, dst); + if (state) + return state; + + state = rzalloc_array(ctx, struct reg_state, reg_size(dst)); + _mesa_hash_table_insert(ctx->spill_reaching, dst, state); + return state; +} + /* Validate simple things, like the registers being in-bounds. This way we * don't have to worry about out-of-bounds accesses later. */ @@ -139,6 +173,8 @@ validate_simple(struct ra_val_ctx *ctx, struct ir3_instruction *instr) { ctx->current_instr = instr; ra_foreach_dst (dst, instr) { + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) + continue; unsigned dst_max = ra_reg_get_physreg(dst) + reg_size(dst); validate_assert(ctx, dst_max <= get_file_size(ctx, dst)); if (dst->tied) @@ -146,6 +182,8 @@ validate_simple(struct ra_val_ctx *ctx, struct ir3_instruction *instr) } ra_foreach_src (src, instr) { + if (ctx->shared_ra && !(src->flags & IR3_REG_SHARED)) + continue; unsigned src_max = ra_reg_get_physreg(src) + reg_size(src); validate_assert(ctx, src_max <= get_file_size(ctx, src)); } @@ -219,6 +257,24 @@ static void propagate_normal_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr) { ra_foreach_dst (dst, instr) { + /* Process destinations from scalar ALU instructions that were demoted to + * normal ALU instructions. For these we must treat the instruction as a + * spill of itself and set the propagate state to itself. See + * try_demote_instructions(). 
+ */ + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + if (instr->flags & IR3_INSTR_SHARED_SPILL) { + struct reg_state *state = get_or_create_spill_state(ctx, dst); + for (unsigned i = 0; i < reg_size(dst); i++) { + state[i] = (struct reg_state){ + .def = dst, + .offset = i, + }; + } + } + continue; + } + struct file_state *file = ra_val_get_file(ctx, dst); physreg_t physreg = ra_reg_get_physreg(dst); for (unsigned i = 0; i < reg_size(dst); i++) { @@ -239,6 +295,16 @@ propagate_split(struct ra_val_ctx *ctx, struct ir3_instruction *split) physreg_t src_physreg = ra_reg_get_physreg(src); struct file_state *file = ra_val_get_file(ctx, dst); + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + struct reg_state *src_state = get_spill_state(ctx, src->def); + if (src_state) { + struct reg_state *dst_state = get_or_create_spill_state(ctx, dst); + memcpy(dst_state, &src_state[split->split.off * reg_elem_size(src)], + reg_size(dst) * sizeof(struct reg_state)); + } + return; + } + unsigned offset = split->split.off * reg_elem_size(src); for (unsigned i = 0; i < reg_elem_size(src); i++) { file->regs[dst_physreg + i] = file->regs[src_physreg + offset + i]; @@ -249,30 +315,50 @@ static void propagate_collect(struct ra_val_ctx *ctx, struct ir3_instruction *collect) { struct ir3_register *dst = collect->dsts[0]; - physreg_t dst_physreg = ra_reg_get_physreg(dst); - struct file_state *file = ra_val_get_file(ctx, dst); - unsigned size = reg_size(dst); - struct reg_state srcs[size]; - for (unsigned i = 0; i < collect->srcs_count; i++) { - struct ir3_register *src = collect->srcs[i]; - unsigned dst_offset = i * reg_elem_size(dst); - for (unsigned j = 0; j < reg_elem_size(dst); j++) { - if (!ra_reg_is_src(src)) { - srcs[dst_offset + j] = (struct reg_state){ - .def = dst, - .offset = dst_offset + j, - }; - } else { - physreg_t src_physreg = ra_reg_get_physreg(src); - srcs[dst_offset + j] = file->regs[src_physreg + j]; + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + struct reg_state *dst_state = NULL; + + for (unsigned i = 0; i < collect->srcs_count; i++) { + struct ir3_register *src = collect->srcs[i]; + unsigned dst_offset = i * reg_elem_size(dst); + + if (ra_reg_is_src(src)) { + struct reg_state *src_state = get_spill_state(ctx, src->def); + if (src_state) { + if (!dst_state) + dst_state = get_or_create_spill_state(ctx, dst); + memcpy(&dst_state[dst_offset], src_state, + reg_size(src) * sizeof(struct reg_state)); + } } } - } + } else { + struct file_state *file = ra_val_get_file(ctx, dst); + physreg_t dst_physreg = ra_reg_get_physreg(dst); + struct reg_state srcs[size]; - for (unsigned i = 0; i < size; i++) - file->regs[dst_physreg + i] = srcs[i]; + for (unsigned i = 0; i < collect->srcs_count; i++) { + struct ir3_register *src = collect->srcs[i]; + unsigned dst_offset = i * reg_elem_size(dst); + + for (unsigned j = 0; j < reg_elem_size(dst); j++) { + if (!ra_reg_is_src(src)) { + srcs[dst_offset + j] = (struct reg_state){ + .def = dst, + .offset = dst_offset + j, + }; + } else { + physreg_t src_physreg = ra_reg_get_physreg(src); + srcs[dst_offset + j] = file->regs[src_physreg + j]; + } + } + } + + for (unsigned i = 0; i < size; i++) + file->regs[dst_physreg + i] = srcs[i]; + } } static void @@ -291,15 +377,25 @@ propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy) struct ir3_register *src = pcopy->srcs[i]; struct file_state *file = ra_val_get_file(ctx, dst); - for (unsigned j = 0; j < reg_size(dst); j++) { - if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) 
{ - srcs[offset + j] = (struct reg_state){ - .def = dst, - .offset = j, - }; - } else { - physreg_t src_physreg = ra_reg_get_physreg(src); - srcs[offset + j] = file->regs[src_physreg + j]; + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + if (ra_reg_is_src(src)) { + struct reg_state *src_state = get_spill_state(ctx, src->def); + if (src_state) { + struct reg_state *dst_state = get_or_create_spill_state(ctx, dst); + memcpy(dst_state, src_state, reg_size(dst) * sizeof(struct reg_state)); + } + } + } else { + for (unsigned j = 0; j < reg_size(dst); j++) { + if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) { + srcs[offset + j] = (struct reg_state){ + .def = dst, + .offset = j, + }; + } else { + physreg_t src_physreg = ra_reg_get_physreg(src); + srcs[offset + j] = file->regs[src_physreg + j]; + } } } @@ -310,6 +406,12 @@ propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy) offset = 0; for (unsigned i = 0; i < pcopy->dsts_count; i++) { struct ir3_register *dst = pcopy->dsts[i]; + + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + offset += reg_size(dst); + continue; + } + physreg_t dst_physreg = ra_reg_get_physreg(dst); struct file_state *file = ra_val_get_file(ctx, dst); @@ -321,6 +423,23 @@ propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy) assert(offset == size); } +static void +propagate_spill(struct ra_val_ctx *ctx, struct ir3_instruction *instr) +{ + if (instr->srcs[0]->flags & IR3_REG_SHARED) { /* spill */ + struct reg_state *state = get_or_create_spill_state(ctx, instr->dsts[0]); + physreg_t src_physreg = ra_reg_get_physreg(instr->srcs[0]); + memcpy(state, &ctx->reaching.shared.regs[src_physreg], + reg_size(instr->srcs[0]) * sizeof(struct reg_state)); + } else { /* reload */ + struct reg_state *state = get_spill_state(ctx, instr->srcs[0]->def); + assert(state); + physreg_t dst_physreg = ra_reg_get_physreg(instr->dsts[0]); + memcpy(&ctx->reaching.shared.regs[dst_physreg], state, + reg_size(instr->dsts[0]) * sizeof(struct reg_state)); + } +} + static void propagate_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr) { @@ -330,6 +449,13 @@ propagate_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr) propagate_collect(ctx, instr); else if (instr->opc == OPC_META_PARALLEL_COPY) propagate_parallelcopy(ctx, instr); + else if (ctx->shared_ra && instr->opc == OPC_MOV && + /* Moves from immed/const with IR3_INSTR_SHARED_SPILL were demoted + * from scalar ALU, see try_demote_instruction(). 
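+ * Such movs only write a non-shared register from an immediate or constant,
+ * so there is no spill state to transfer and they are handled as normal
+ * instructions below.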
+ */ + !(instr->srcs[0]->flags & (IR3_REG_IMMED | IR3_REG_CONST)) && + (instr->flags & IR3_INSTR_SHARED_SPILL)) + propagate_spill(ctx, instr); else propagate_normal_instr(ctx, instr); } @@ -439,6 +565,8 @@ static void check_reaching_src(struct ra_val_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { + if (ctx->shared_ra && !(src->flags & IR3_REG_SHARED)) + return; struct file_state *file = ra_val_get_file(ctx, src); physreg_t physreg = ra_reg_get_physreg(src); for (unsigned i = 0; i < reg_size(src); i++) { @@ -541,7 +669,7 @@ check_reaching_defs(struct ra_val_ctx *ctx, struct ir3 *ir) void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, - unsigned half_size, unsigned block_count) + unsigned half_size, unsigned block_count, bool shared_ra) { #ifdef NDEBUG #define VALIDATE 0 @@ -557,6 +685,9 @@ ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, ctx->full_size = full_size; ctx->half_size = half_size; ctx->block_count = block_count; + ctx->shared_ra = shared_ra; + if (ctx->shared_ra) + ctx->spill_reaching = _mesa_pointer_hash_table_create(ctx); foreach_block (block, &v->ir->block_list) { foreach_instr (instr, &block->instr_list) { diff --git a/src/freedreno/ir3/ir3_shared_ra.c b/src/freedreno/ir3/ir3_shared_ra.c new file mode 100644 index 00000000000..d00198b2a3c --- /dev/null +++ b/src/freedreno/ir3/ir3_shared_ra.c @@ -0,0 +1,1415 @@ +/* + * Copyright (C) 2021 Valve Corporation + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_shader.h" + +#include "util/u_math.h" + +/* Allocating shared registers can pose a challenge, because their live + * intervals use the physical CFG which has extra edges inserted that are + * pretty much always critical edges. This causes problems with phi nodes, + * because copies for phi nodes have to happen "along the edge," and similarly + * causes problems when reunifying values that have had their live range split. + * Problematic phi nodes should be relatively rare, so we ban them for now. 
The solution we choose for live-range splitting is to integrate spilling and + * register allocation and spill to vector registers rather than split a live + * range, which negates some of the advantages of SSA-based RA, but it isn't as + * bad as it seems because the conditions needed (vector shared registers, which + * only movmsk currently produces, or fixed registers which we don't do) are + * relatively rare. Spilling is also much cheaper than spilling vector registers + * to private memory. + */ + +struct ra_interval { + struct ir3_reg_interval interval; + + struct rb_node physreg_node; + physreg_t physreg_start, physreg_end; + + /* Where the shared register is spilled to. If there were no uses when it's + * spilled it could be the original defining instruction. + */ + struct ir3_register *spill_def; + + /* Whether this contains a source of the current instruction that can't be + * spilled. + */ + bool src; + + bool needs_reload; +}; + +struct ra_block_state { + bool visited; + + /* For blocks whose successors are visited first (i.e. loop backedges), which + * values should be live at the end. + */ + BITSET_WORD *live_out; +}; + +struct ra_ctx { + struct ir3_reg_ctx reg_ctx; + + BITSET_DECLARE(available, RA_MAX_FILE_SIZE); + + struct rb_tree physreg_intervals; + + struct ra_interval *intervals; + + struct ir3_liveness *live; + + struct hash_table *pcopy_src_map; + + struct ra_block_state *blocks; + + unsigned start; +}; + +static struct ra_interval * +ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval) +{ + return rb_node_data(struct ra_interval, interval, interval); +} + +static struct ra_interval * +rb_node_to_interval(struct rb_node *node) +{ + return rb_node_data(struct ra_interval, node, physreg_node); +} + +static const struct ra_interval * +rb_node_to_interval_const(const struct rb_node *node) +{ + return rb_node_data(struct ra_interval, node, physreg_node); +} + +static struct ra_interval * +ra_interval_next(struct ra_interval *interval) +{ + struct rb_node *next = rb_node_next(&interval->physreg_node); + return next ? rb_node_to_interval(next) : NULL; +} + +static struct ra_interval * +ra_interval_next_or_null(struct ra_interval *interval) +{ + return interval ? ra_interval_next(interval) : NULL; +} + +static int +ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_interval *a = rb_node_to_interval_const(_a); + const struct ra_interval *b = rb_node_to_interval_const(_b); + return b->physreg_start - a->physreg_start; +} + +static int +ra_interval_cmp(const struct rb_node *node, const void *data) +{ + physreg_t reg = *(const physreg_t *)data; + const struct ra_interval *interval = rb_node_to_interval_const(node); + if (interval->physreg_start > reg) + return -1; + else if (interval->physreg_end <= reg) + return 1; + else + return 0; +} + +static struct ra_ctx * +ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx) +{ + return rb_node_data(struct ra_ctx, ctx, reg_ctx); +} + +static struct ra_interval * +ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg) +{ + struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp); + return node ? rb_node_to_interval(node) : NULL; +} + +/* Get the interval covering the reg, or the closest to the right if it + * doesn't exist.
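+ * For example, with intervals [2,4) and [6,8) in the tree, a search for
+ * physreg 4 or 5 returns the [6,8) interval and a search for 8 returns NULL.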
+ */ +static struct ra_interval * +ra_interval_search_right(struct rb_tree *tree, physreg_t reg) +{ + struct ra_interval *interval = ra_interval_search_sloppy(tree, reg); + if (!interval) { + return NULL; + } else if (interval->physreg_end > reg) { + return interval; + } else { + /* There is no interval covering reg, and ra_file_search_sloppy() + * returned the closest range to the left, so the next interval to the + * right should be the closest to the right. + */ + return ra_interval_next_or_null(interval); + } +} + +static struct ra_interval * +ra_ctx_search_right(struct ra_ctx *ctx, physreg_t reg) +{ + return ra_interval_search_right(&ctx->physreg_intervals, reg); +} + +static void +interval_add(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval) +{ + struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); + struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx); + + /* We can assume in this case that physreg_start/physreg_end is already + * initialized. + */ + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_CLEAR(ctx->available, i); + } + + rb_tree_insert(&ctx->physreg_intervals, &interval->physreg_node, + ra_interval_insert_cmp); +} + +static void +interval_delete(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval) +{ + struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); + struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx); + + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_SET(ctx->available, i); + } + + rb_tree_remove(&ctx->physreg_intervals, &interval->physreg_node); +} + +static void +interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent, + struct ir3_reg_interval *_child) +{ + struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent); + struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child); + + child->physreg_start = + parent->physreg_start + (child->interval.reg->interval_start - + parent->interval.reg->interval_start); + child->physreg_end = + child->physreg_start + + (child->interval.reg->interval_end - child->interval.reg->interval_start); + + interval_add(ctx, _child); +} + +static void +ra_ctx_init(struct ra_ctx *ctx) +{ + ctx->reg_ctx.interval_add = interval_add; + ctx->reg_ctx.interval_delete = interval_delete; + ctx->reg_ctx.interval_readd = interval_readd; +} + +static void +ra_ctx_reset_block(struct ra_ctx *ctx) +{ + for (unsigned i = 0; i < RA_SHARED_SIZE; i++) { + BITSET_SET(ctx->available, i); + } + + rb_tree_init(&ctx->reg_ctx.intervals); + rb_tree_init(&ctx->physreg_intervals); +} + +static void +ra_interval_init(struct ra_interval *interval, struct ir3_register *reg) +{ + ir3_reg_interval_init(&interval->interval, reg); +} + +static physreg_t +ra_interval_get_physreg(const struct ra_interval *interval) +{ + unsigned child_start = interval->interval.reg->interval_start; + + while (interval->interval.parent) { + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); + } + + return interval->physreg_start + + (child_start - interval->interval.reg->interval_start); +} + +static unsigned +ra_interval_get_num(const struct ra_interval *interval) +{ + return ra_physreg_to_num(ra_interval_get_physreg(interval), + interval->interval.reg->flags); +} + +static void +ra_interval_dump(struct log_stream *stream, struct ra_interval *interval) +{ + mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start); + + ir3_reg_interval_dump(stream, &interval->interval); +} + 
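+/* The intervals above are ordered by physreg_start, so walking every interval
+ * that overlaps [start, start + size) looks like:
+ *
+ *    for (struct ra_interval *interval = ra_ctx_search_right(ctx, start);
+ *         interval && interval->physreg_start < start + size;
+ *         interval = ra_interval_next_or_null(interval))
+ *       ...;
+ *
+ * which is the pattern free_space() and find_best_spill_reg() below rely on.
+ */
+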
+static void +ra_ctx_dump(struct ra_ctx *ctx) +{ + struct log_stream *stream = mesa_log_streami(); + + mesa_log_stream_printf(stream, "shared:\n"); + rb_tree_foreach (struct ra_interval, interval, &ctx->physreg_intervals, + physreg_node) { + ra_interval_dump(stream, interval); + } + + unsigned start, end; + mesa_log_stream_printf(stream, "available:\n"); + BITSET_FOREACH_RANGE (start, end, ctx->available, RA_SHARED_SIZE) { + mesa_log_stream_printf(stream, "%u-%u ", start, end); + } + mesa_log_stream_printf(stream, "\n"); + mesa_log_stream_printf(stream, "start: %u\n", ctx->start); +} + +static bool +get_reg_specified(struct ra_ctx *ctx, struct ir3_register *reg, physreg_t physreg) +{ + for (unsigned i = 0; i < reg_size(reg); i++) { + if (!BITSET_TEST(ctx->available, physreg + i)) + return false; + } + + return true; +} + +static unsigned +reg_file_size(struct ir3_register *reg) +{ + return RA_SHARED_SIZE; +} + +static physreg_t +find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size, + unsigned align) +{ + unsigned file_size = reg_file_size(dst); + + /* This can happen if we create a very large merge set. Just bail out in that + * case. + */ + if (size > file_size) + return (physreg_t) ~0; + + unsigned start = ALIGN(ctx->start, align) % (file_size - size + align); + unsigned candidate = start; + do { + bool is_available = true; + for (unsigned i = 0; i < size; i++) { + if (!BITSET_TEST(ctx->available, candidate + i)) { + is_available = false; + break; + } + } + + if (is_available) { + ctx->start = (candidate + size) % file_size; + return candidate; + } + + candidate += align; + if (candidate + size > file_size) + candidate = 0; + } while (candidate != start); + + return (physreg_t)~0; +} + +static physreg_t +find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg, + unsigned size, unsigned align) +{ + unsigned file_size = reg_file_size(reg); + unsigned min_cost = UINT_MAX; + + unsigned start = ALIGN(ctx->start, align) % (file_size - size + align); + physreg_t candidate = start; + physreg_t best_reg = (physreg_t)~0; + do { + unsigned cost = 0; + + /* Iterate through intervals we'd need to spill to use this reg. */ + for (struct ra_interval *interval = ra_ctx_search_right(ctx, candidate); + interval && interval->physreg_start < candidate + size; + interval = ra_interval_next_or_null(interval)) { + /* We can't spill sources of the current instruction when reloading + * sources. + */ + if (interval->src) { + cost = UINT_MAX; + break; + } + + /* We prefer spilling intervals that already have been spilled, so we + * don't have to emit another mov. 
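+ * Only intervals without a spill_def add their size to the cost; evicting an
+ * already-spilled interval is free because its value already lives in a
+ * normal register.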
+ */ + if (!interval->spill_def) + cost += (interval->physreg_end - interval->physreg_start); + } + + if (cost < min_cost) { + min_cost = cost; + best_reg = candidate; + } + + candidate += align; + if (candidate + size > file_size) + candidate = 0; + } while (candidate != start); + + return best_reg; +} + +static struct ir3_register * +split(struct ir3_register *def, unsigned offset, struct ir3_instruction *before) +{ + if (reg_elems(def) == 1) { + assert(offset == 0); + return def; + } + + struct ir3_instruction *split = + ir3_instr_create(before->block, OPC_META_SPLIT, 1, 1); + split->split.off = offset; + struct ir3_register *dst = __ssa_dst(split); + struct ir3_register *src = + ir3_src_create(split, INVALID_REG, def->flags & (IR3_REG_HALF | IR3_REG_SSA)); + src->wrmask = def->wrmask; + src->def = def; + ir3_instr_move_after(split, before); + return dst; +} + +static struct ir3_register * +extract(struct ir3_register *parent_def, unsigned offset, unsigned elems, + struct ir3_instruction *before) +{ + if (offset == 0 && elems == reg_elems(parent_def)) + return parent_def; + + if (elems == 1) + return split(parent_def, offset, before); + + struct ir3_instruction *collect = + ir3_instr_create(before->block, OPC_META_COLLECT, 1, elems); + struct ir3_register *dst = __ssa_dst(collect); + dst->flags |= parent_def->flags & IR3_REG_HALF; + dst->wrmask = MASK(elems); + + ir3_instr_move_after(collect, before); + + for (unsigned i = 0; i < elems; i++) { + ir3_src_create(collect, INVALID_REG, + parent_def->flags & (IR3_REG_HALF | IR3_REG_SSA))->def = + split(parent_def, offset + i, before); + } + + return dst; +} + +static void +spill_interval_children(struct ra_interval *interval, + struct ir3_instruction *before) +{ + rb_tree_foreach (struct ra_interval, child, &interval->interval.children, + interval.node) { + if (!child->spill_def) { + child->spill_def = extract(interval->spill_def, + (child->interval.reg->interval_start - + interval->interval.reg->interval_start) / + reg_elem_size(interval->interval.reg), + reg_elems(child->interval.reg), before); + } + spill_interval_children(child, before); + } +} + +static void +spill_interval(struct ra_ctx *ctx, struct ra_interval *interval) +{ + struct ir3_instruction *before = interval->interval.reg->instr; + + d("spilling ssa_%u:%u", before->serialno, interval->interval.reg->name); + + if (!interval->spill_def) { + /* If this is a phi node or input, we need to insert the demotion to a + * regular register after the last phi or input in the block. + */ + if (before->opc == OPC_META_PHI || + before->opc == OPC_META_INPUT) { + struct ir3_block *block = before->block; + struct ir3_instruction *last_phi_input = NULL; + foreach_instr_from (instr, before, &block->instr_list) { + if (instr->opc != before->opc) + break; + last_phi_input = instr; + } + before = last_phi_input; + } + + struct ir3_instruction *mov = ir3_instr_create(before->block, OPC_MOV, 1, 1); + mov->flags |= IR3_INSTR_SHARED_SPILL; + struct ir3_register *dst = __ssa_dst(mov); + dst->flags |= (interval->interval.reg->flags & IR3_REG_HALF); + dst->wrmask = interval->interval.reg->wrmask; + mov->repeat = reg_elems(dst) - 1; + ir3_src_create(mov, interval->interval.reg->num, + IR3_REG_SHARED | (mov->repeat ? IR3_REG_R : 0) | + (interval->interval.reg->flags & IR3_REG_HALF))->wrmask = + interval->interval.reg->wrmask; + mov->cat1.src_type = mov->cat1.dst_type = + (interval->interval.reg->flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; + + ir3_instr_move_after(mov, before); + interval->spill_def = dst; + } + + spill_interval_children(interval, interval->spill_def->instr); + + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); +} + +/* Try to demote a scalar ALU instruction to a normal ALU instruction, using the + * spilled sources. We have to take into account restrictions on the number of + * shared sources that only exist for normal ALU instructions. + */ +static bool +try_demote_instruction(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + /* First, check restrictions. */ + switch (opc_cat(instr->opc)) { + case 1: + if (!(instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED))) + return false; + break; + case 2: { + /* We need one source to either be demotable or an immediate. */ + if (instr->srcs_count > 1) { + struct ra_interval *src0_interval = + (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL; + struct ra_interval *src1_interval = + (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL; + if (!(src0_interval && src0_interval->spill_def) && + !(src1_interval && src1_interval->spill_def) && + !(instr->srcs[0]->flags & IR3_REG_IMMED) && + !(instr->srcs[1]->flags & IR3_REG_IMMED)) + return false; + } + break; + } + case 3: { + struct ra_interval *src0_interval = + (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL; + struct ra_interval *src1_interval = + (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL; + + /* src1 cannot be shared */ + if (src1_interval && !src1_interval->spill_def) { + /* Try to swap src0 and src1, similar to what copy prop does. */ + if (!is_mad(instr->opc)) + return false; + + if ((src0_interval && src0_interval->spill_def) || + (instr->srcs[0]->flags & IR3_REG_IMMED)) { + struct ir3_register *src0 = instr->srcs[0]; + instr->srcs[0] = instr->srcs[1]; + instr->srcs[1] = src0; + } else { + return false; + } + } + break; + } + case 4: { + assert(instr->srcs[0]->flags & IR3_REG_SSA); + struct ra_interval *src_interval = &ctx->intervals[instr->srcs[0]->def->name]; + if (!src_interval->spill_def) + return false; + break; + } + + default: + return false; + } + + d("demoting instruction"); + + /* If the instruction is already not a scalar ALU instruction, we should've + * skipped reloading and just demoted sources directly, so we should never + * get here. + */ + assert(instr->dsts[0]->flags & IR3_REG_SHARED); + + /* Now we actually demote the instruction */ + ra_foreach_src (src, instr) { + assert(src->flags & IR3_REG_SHARED); + struct ra_interval *interval = &ctx->intervals[src->def->name]; + if (interval->spill_def) { + src->def = interval->spill_def; + src->flags &= ~IR3_REG_SHARED; + interval->needs_reload = false; + if (interval->interval.inserted) + ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); + while (interval->interval.parent) + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); + interval->src = false; + } + } + + struct ra_interval *dst_interval = &ctx->intervals[instr->dsts[0]->name]; + instr->dsts[0]->flags &= ~IR3_REG_SHARED; + ra_interval_init(dst_interval, instr->dsts[0]); + dst_interval->spill_def = instr->dsts[0]; + + instr->flags |= IR3_INSTR_SHARED_SPILL; + + return true; +} + +/* Free up [start, start + size) by spilling live intervals.
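+ * Every interval still overlapping the range is spilled with spill_interval(),
+ * which also removes it from the file, leaving the registers free for the
+ * caller to use.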
+ */ +static void +free_space(struct ra_ctx *ctx, physreg_t start, unsigned size) +{ + struct ra_interval *interval = ra_ctx_search_right(ctx, start); + while (interval && interval->physreg_start < start + size) { + struct ra_interval *next = ra_interval_next_or_null(interval); + spill_interval(ctx, interval); + interval = next; + } +} + +static physreg_t +get_reg(struct ra_ctx *ctx, struct ir3_register *reg, bool src) +{ + if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) { + physreg_t preferred_reg = + reg->merge_set->preferred_reg + reg->merge_set_offset; + if (preferred_reg < reg_file_size(reg) && + preferred_reg % reg_elem_size(reg) == 0 && + get_reg_specified(ctx, reg, preferred_reg)) + return preferred_reg; + } + + /* If this register is a subset of a merge set which we have not picked a + * register for, first try to allocate enough space for the entire merge + * set. + */ + unsigned size = reg_size(reg); + if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 && + size < reg->merge_set->size) { + physreg_t best_reg = find_best_gap(ctx, reg, reg->merge_set->size, + reg->merge_set->alignment); + if (best_reg != (physreg_t)~0u) { + best_reg += reg->merge_set_offset; + return best_reg; + } + } + + /* For ALU and SFU instructions, if the src reg is avail to pick, use it. + * Because this doesn't introduce unnecessary dependencies, and it + * potentially avoids needing (ss) syncs for write after read hazards for + * SFU instructions: + */ + if (!src && (is_sfu(reg->instr) || is_alu(reg->instr))) { + for (unsigned i = 0; i < reg->instr->srcs_count; i++) { + struct ir3_register *src = reg->instr->srcs[i]; + if (!ra_reg_is_src(src)) + continue; + if ((src->flags & IR3_REG_SHARED) && reg_size(src) >= size) { + struct ra_interval *src_interval = &ctx->intervals[src->def->name]; + physreg_t src_physreg = ra_interval_get_physreg(src_interval); + if (src_physreg % reg_elem_size(reg) == 0 && + src_physreg + size <= reg_file_size(reg) && + get_reg_specified(ctx, reg, src_physreg)) + return src_physreg; + } + } + } + + return find_best_gap(ctx, reg, size, reg_elem_size(reg)); +} + +/* The reload process is split in two, first we allocate a register to reload to + * for all sources that need a reload and then we actually execute the reload. + * This is to allow us to demote shared ALU instructions to non-shared whenever + * we would otherwise need to spill to reload, without leaving dangling unused + * reload mov's from previously processed sources. So, for example, we could + * need to reload both sources of an add, but after reloading the first source + * we realize that we would need to spill to reload the second source and we + * should demote the add instead, which means cancelling the first reload. 
+ */ +static void +reload_src(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + struct ir3_register *reg = src->def; + struct ra_interval *interval = &ctx->intervals[reg->name]; + unsigned size = reg_size(reg); + + physreg_t best_reg = get_reg(ctx, reg, true); + + if (best_reg == (physreg_t)~0u) { + if (try_demote_instruction(ctx, instr)) + return; + + best_reg = find_best_spill_reg(ctx, reg, size, reg_elem_size(reg)); + assert(best_reg != (physreg_t)~0u); + + free_space(ctx, best_reg, size); + } + + d("reload src %u physreg %u", reg->name, best_reg); + interval->physreg_start = best_reg; + interval->physreg_end = best_reg + size; + interval->needs_reload = true; + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + interval->src = true; +} + +static void +reload_interval(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_block *block, struct ra_interval *interval) +{ + struct ir3_register *def = interval->interval.reg; + struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); + mov->flags |= IR3_INSTR_SHARED_SPILL; + unsigned flags = IR3_REG_SHARED | (def->flags & IR3_REG_HALF); + ir3_dst_create(mov, ra_physreg_to_num(interval->physreg_start, flags), + flags)->wrmask = def->wrmask; + mov->repeat = reg_elems(def) - 1; + struct ir3_register *mov_src = + ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | (def->flags & IR3_REG_HALF) | + (mov->repeat ? IR3_REG_R : 0)); + assert(interval->spill_def); + mov_src->def = interval->spill_def; + mov_src->wrmask = def->wrmask; + mov->cat1.src_type = mov->cat1.dst_type = + (def->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + + if (instr) + ir3_instr_move_before(mov, instr); +} + +static void +reload_src_finalize(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + struct ir3_register *reg = src->def; + struct ra_interval *interval = &ctx->intervals[reg->name]; + + if (!interval->needs_reload) + return; + + reload_interval(ctx, instr, instr->block, interval); + + interval->needs_reload = false; +} + +static bool +can_demote_src(struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_SCAN_MACRO: + case OPC_META_COLLECT: + return false; + case OPC_MOV: + /* non-shared -> shared floating-point conversions don't work */ + return (!(instr->dsts[0]->flags & IR3_REG_SHARED) || + (full_type(instr->cat1.src_type) != TYPE_F32 && + full_type(instr->cat1.dst_type) != TYPE_F32)); + default: + return (!is_alu(instr) && !is_sfu(instr)) || + !(instr->dsts[0]->flags & IR3_REG_SHARED); + } +} + +/* Ensure that this source is never spilled while reloading other sources. + */ +static void +mark_src(struct ra_ctx *ctx, struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[src->def->name]; + + if (interval->interval.inserted) { + while (interval->interval.parent) + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); + + interval->src = true; + } +} + +static void +ensure_src_live(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[src->def->name]; + + if (!interval->interval.inserted) { + /* In some cases we cannot demote shared reg sources to non-shared regs, + * then we have to reload it. 
+ */ + assert(interval->spill_def); + if (!can_demote_src(instr)) { + reload_src(ctx, instr, src); + } else { + if (instr->opc == OPC_META_PARALLEL_COPY) { + /* Stash away the original def to use later in case we actually have + * to insert a reload. + */ + _mesa_hash_table_insert(ctx->pcopy_src_map, src, src->def); + } + src->def = interval->spill_def; + src->flags &= ~IR3_REG_SHARED; + } + } +} + +static void +assign_src(struct ra_ctx *ctx, struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[src->def->name]; + assert(interval->interval.inserted); + src->num = ra_physreg_to_num(ra_interval_get_physreg(interval), src->flags); + + if ((src->flags & IR3_REG_FIRST_KILL) && + !interval->interval.parent && + rb_tree_is_empty(&interval->interval.children)) + ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); + + while (interval->interval.parent) + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); + + interval->src = false; +} + +static void +handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *dst) +{ + if (!(dst->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[dst->name]; + ra_interval_init(interval, dst); + interval->spill_def = NULL; + + if (dst->tied) { + struct ir3_register *tied_def = dst->tied->def; + struct ra_interval *tied_interval = &ctx->intervals[tied_def->name]; + if ((dst->tied->flags & IR3_REG_KILL) && + !tied_interval->interval.parent && + rb_tree_is_empty(&tied_interval->interval.children)) { + dst->num = dst->tied->num; + interval->physreg_start = tied_interval->physreg_start; + interval->physreg_end = tied_interval->physreg_end; + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + return; + } + } + + physreg_t physreg = get_reg(ctx, dst, false); + if (physreg == (physreg_t) ~0u) { + if (try_demote_instruction(ctx, instr)) + return; + + unsigned size = reg_size(dst); + physreg = find_best_spill_reg(ctx, dst, size, reg_elem_size(dst)); + assert(physreg != (physreg_t)~0u); + free_space(ctx, physreg, size); + } + + interval->physreg_start = physreg; + interval->physreg_end = physreg + reg_size(dst); + dst->num = ra_physreg_to_num(physreg, dst->flags); + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + d("insert dst %u physreg %u", dst->name, physreg); + + if (dst->tied) { + struct ir3_instruction *mov = ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, 1, 1); + unsigned flags = IR3_REG_SHARED | (dst->flags & IR3_REG_HALF); + ir3_dst_create(mov, dst->num, flags)->wrmask = dst->wrmask; + ir3_src_create(mov, dst->tied->num, flags)->wrmask = dst->wrmask; + mov->cat1.src_type = mov->cat1.dst_type = + (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;; + ir3_instr_move_before(mov, instr); + dst->tied->num = dst->num; + } +} + +static void +handle_src_late(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[src->def->name]; + reload_src_finalize(ctx, instr, src); + + /* Remove killed sources that have to be killed late due to being merged with + * other defs. 
+ */ + if (!(src->flags & IR3_REG_KILL)) + return; + + if (interval->interval.inserted) + ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); +} + +static void +handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + ra_foreach_src (src, instr) + mark_src(ctx, src); + + ra_foreach_src (src, instr) + ensure_src_live(ctx, instr, src); + + ra_foreach_src_rev (src, instr) + assign_src(ctx, src); + + ra_foreach_dst (dst, instr) + handle_dst(ctx, instr, dst); + + ra_foreach_src (src, instr) + handle_src_late(ctx, instr, src); +} + +static void +handle_split(struct ra_ctx *ctx, struct ir3_instruction *split) +{ + struct ir3_register *src = split->srcs[0]; + struct ir3_register *dst = split->dsts[0]; + + if (!(dst->flags & IR3_REG_SHARED)) + return; + + if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) { + handle_normal_instr(ctx, split); + return; + } + + struct ra_interval *src_interval = &ctx->intervals[src->def->name]; + struct ra_interval *dst_interval = &ctx->intervals[dst->name]; + + ra_interval_init(dst_interval, dst); + dst_interval->spill_def = NULL; + + if (src_interval->spill_def) { + struct ir3_instruction *spill_split = + ir3_instr_create(split->block, OPC_META_SPLIT, 1, 1); + struct ir3_register *dst = __ssa_dst(spill_split); + ir3_src_create(spill_split, INVALID_REG, IR3_REG_SSA)->def = + src_interval->spill_def; + spill_split->split.off = split->split.off; + ir3_instr_move_after(spill_split, split); + dst_interval->spill_def = dst; + return; + } + + dst_interval->physreg_start = + src_interval->physreg_start + dst->merge_set_offset - + src->def->merge_set_offset; + dst_interval->physreg_end = dst_interval->physreg_start + reg_size(dst); + ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval); + src->num = ra_interval_get_num(src_interval); + dst->num = ra_interval_get_num(dst_interval); + d("insert dst %u physreg %u", dst->name, dst_interval->physreg_start); + + if (src->flags & IR3_REG_KILL) + ir3_reg_interval_remove(&ctx->reg_ctx, &src_interval->interval); +} + +static void +handle_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) +{ + struct ir3_register *dst = phi->dsts[0]; + + if (!(dst->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *dst_interval = &ctx->intervals[dst->name]; + ra_interval_init(dst_interval, dst); + + /* In some rare cases, it's possible to have a phi node with a physical-only + * source. Here's a contrived example: + * + * loop { + * if non-uniform { + * if uniform { + * x_1 = ...; + * continue; + * } + * x_2 = ...; + * } else { + * break; + * } + * // continue block + * x_3 = phi(x_1, x_2) + * } + * + * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all + * threads that stay in the loop take the same branch to the continue block, + * however execution may fall through from the assignment to x_2 to the + * break statement because the outer if is non-uniform, and then it will fall + * through again to the continue block, so if x_3 is to be in a shared reg + * then the phi needs an extra source pointing to the break statement, which + * itself needs a phi node: + * + * loop { + * if non-uniform { + * if uniform { + * x_1 = ...; + * continue; + * } + * x_2 = ...; + * } else { + * x_4 = phi(undef, x_2) + * break; + * } + * // continue block + * x_3 = phi(x_1, x_2, x_4) + * } + */ + + /* phi nodes are special because we cannot spill them normally, instead we + * have to spill the parallel copies that their sources point to and make the + * entire phi not shared anymore. 
+ */ + + physreg_t physreg = get_reg(ctx, dst, false); + if (physreg == (physreg_t) ~0u) { + d("spilling phi destination"); + dst->flags &= ~IR3_REG_SHARED; + dst_interval->spill_def = dst; + phi->flags |= IR3_INSTR_SHARED_SPILL; + + foreach_src (src, phi) { + src->flags &= ~IR3_REG_SHARED; + if (src->def) + src->def->flags &= ~IR3_REG_SHARED; + } + + return; + } + + dst->num = ra_physreg_to_num(physreg, dst->flags); + dst_interval->spill_def = NULL; + dst_interval->physreg_start = physreg; + dst_interval->physreg_end = physreg + reg_size(dst); + ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval); + + ra_foreach_src_n (src, i, phi) { + /* We assume that any phis with non-logical sources aren't promoted. */ + assert(i < phi->block->predecessors_count); + src->num = dst->num; + src->def->num = dst->num; + } +} + +static void +handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *pcopy) +{ + /* For parallel copies, we only handle the source. The destination is handled + * later when processing phi nodes. + */ + + ra_foreach_src (src, pcopy) + mark_src(ctx, src); + + ra_foreach_src (src, pcopy) + ensure_src_live(ctx, pcopy, src); + + ra_foreach_src_rev (src, pcopy) + assign_src(ctx, src); + + ra_foreach_src (src, pcopy) + handle_src_late(ctx, pcopy, src); +} + +static void +handle_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + instr->flags &= ~IR3_INSTR_SHARED_SPILL; + + switch (instr->opc) { + case OPC_META_SPLIT: + handle_split(ctx, instr); + break; + case OPC_META_PHI: + handle_phi(ctx, instr); + break; + case OPC_META_PARALLEL_COPY: + handle_pcopy(ctx, instr); + break; + default: + handle_normal_instr(ctx, instr); + } +} + +/* In case we define a value outside a loop, use it inside the loop, then spill + * it afterwards inside the same loop, we could lose the value so we have to + * reload it. We have to reload it after any parallel copy instruction, when the + * live shared registers equal the live-in of the backedge. lower_pcopy() will + * then move any non-shared parallel copies down past the reload. 
+ */ +static void +reload_live_outs(struct ra_ctx *ctx, struct ir3_block *block) +{ + struct ra_block_state *state = &ctx->blocks[block->index]; + unsigned name; + BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + + struct ra_interval *interval = &ctx->intervals[name]; + if (!interval->interval.inserted) { + d("reloading %d at end of backedge", reg->name); + reload_interval(ctx, NULL, block, interval); + } + } +} + +static void +record_pred_live_out(struct ra_ctx *ctx, + struct ra_interval *interval, + struct ir3_block *pred) +{ + struct ra_block_state *state = &ctx->blocks[pred->index]; + + struct ir3_register *def = interval->interval.reg; + BITSET_SET(state->live_out, def->name); + + rb_tree_foreach (struct ra_interval, child, + &interval->interval.children, interval.node) { + record_pred_live_out(ctx, child, pred); + } +} + +static void +record_pred_live_outs(struct ra_ctx *ctx, struct ir3_block *block) +{ + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_block_state *state = &ctx->blocks[pred->index]; + if (state->visited) + continue; + + state->live_out = rzalloc_array(NULL, BITSET_WORD, + BITSET_WORDS(ctx->live->definitions_count)); + + + rb_tree_foreach (struct ra_interval, interval, + &ctx->reg_ctx.intervals, interval.node) { + record_pred_live_out(ctx, interval, pred); + } + } +} + +static void +handle_block(struct ra_ctx *ctx, struct ir3_block *block) +{ + ra_ctx_reset_block(ctx); + + unsigned name; + BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *def = ctx->live->definitions[name]; + struct ra_interval *interval = &ctx->intervals[name]; + + /* Non-shared definitions may still be definitions we spilled by demoting + * them, so we still need to initialize the interval. But we shouldn't + * make these intervals live. + */ + ra_interval_init(interval, def); + + if ((def->flags & IR3_REG_SHARED) && !interval->spill_def) { + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + } + } + + if (RA_DEBUG) { + d("after live-in block %u:\n", block->index); + ra_ctx_dump(ctx); + } + + if (block->predecessors_count > 1) + record_pred_live_outs(ctx, block); + + foreach_instr (instr, &block->instr_list) { + di(instr, "processing"); + + handle_instr(ctx, instr); + + if (RA_DEBUG) + ra_ctx_dump(ctx); + } + + if (block->successors[0]) { + struct ra_block_state *state = &ctx->blocks[block->successors[0]->index]; + + if (state->visited) { + assert(!block->successors[1]); + + reload_live_outs(ctx, block); + } + } + + ctx->blocks[block->index].visited = true; +} + +static void +lower_pcopy(struct ir3 *ir, struct ra_ctx *ctx) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + /* At this point due to spilling there may be parallel copies from + * shared to non-shared registers and vice versa. Lowering these after + * RA may produce cycles involving shared and non-shared registers, + * which would need to be resolved by swapping a shared and non-shared + * register which is something we can't handle. However by lowering + * these to moves now, we can make sure that cycles only involve + * non-shared registers. 
To avoid illegally moving a shared register + * read or write across the parallel copy, which may have other + * conflicting reads/writes if there's a cycle, we need to move copies + * from non-shared to shared below the shared copies, and we need to + * move copies from shared to non-shared above them. So, we have the + * following order: + * + * 1. shared->non-shared copies (spills) + * 2. shared->shared copies (one parallel copy as there may be cycles) + * 3. non-shared->shared copies (reloads) + * 4. non-shared->non-shared copies + * + * We split out the non-shared->non-shared copies as a separate step. + */ + if (instr->opc == OPC_META_PARALLEL_COPY) { + for (unsigned i = 0; i < instr->srcs_count; i++) { + if ((instr->srcs[i]->flags & IR3_REG_SHARED) && + !(instr->dsts[i]->flags & IR3_REG_SHARED)) { + /* shared->non-shared. Create a spill move and rewrite the + * source to be the destination of the move (so that the + * original shared->non-shared copy becomes a + * non-shared->non-shared copy). + */ + struct ir3_instruction *mov = + ir3_instr_create(block, OPC_MOV, 1, 1); + mov->flags |= IR3_INSTR_SHARED_SPILL; + struct ir3_register *dst = + ir3_dst_create(mov, INVALID_REG, instr->dsts[i]->flags); + dst->wrmask = instr->dsts[i]->wrmask; + dst->instr = mov; + mov->repeat = reg_elems(mov->dsts[0]) - 1; + struct ir3_register *src = + ir3_src_create(mov, instr->srcs[i]->num, + instr->srcs[i]->flags | + (mov->repeat ? IR3_REG_R : 0)); + src->wrmask = instr->srcs[i]->wrmask; + mov->cat1.dst_type = mov->cat1.src_type = + (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + instr->srcs[i]->flags = mov->dsts[0]->flags; + instr->srcs[i]->def = mov->dsts[0]; + ir3_instr_move_before(mov, instr); + } + } + + for (unsigned i = 0; i < instr->dsts_count;) { + if ((instr->dsts[i]->flags & IR3_REG_SHARED) && + (instr->srcs[i]->flags & IR3_REG_SSA) && + !(instr->srcs[i]->flags & IR3_REG_SHARED)) { + /* non-shared->shared. Create a reload move. + */ + struct ir3_instruction *mov = + ir3_instr_create(block, OPC_MOV, 1, 1); + mov->flags |= IR3_INSTR_SHARED_SPILL; + struct ir3_register *dst = + ir3_dst_create(mov, instr->dsts[i]->num, + instr->dsts[i]->flags); + dst->instr = mov; + dst->wrmask = instr->dsts[i]->wrmask; + mov->repeat = reg_elems(mov->dsts[0]) - 1; + struct ir3_register *src = + ir3_src_create(mov, INVALID_REG, instr->srcs[i]->flags | + (mov->repeat ? IR3_REG_R : 0)); + src->def = instr->srcs[i]->def; + src->wrmask = instr->srcs[i]->wrmask; + mov->cat1.dst_type = mov->cat1.src_type = + (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + + /* When we spill a parallel copy source, we lose the + * information of where it originally points to since we make + * it point to the spill def. If we later decide not to also + * spill the phi associated with it, we have to restore it + * here using the stashed original source so that RA + * validation can check that we did the correct thing. + * + * Because SSA-ness goes away after validation, this is really + * just about validation. 
+                   */
+                  struct ir3_block *succ = block->successors[0];
+                  unsigned pred_idx = ir3_block_get_pred_index(succ, block);
+                  foreach_instr (phi, &succ->instr_list) {
+                     if (phi->opc != OPC_META_PHI)
+                        break;
+
+                     if (phi->srcs[pred_idx]->def == instr->dsts[i]) {
+                        struct ir3_register *def =
+                           _mesa_hash_table_search(ctx->pcopy_src_map,
+                                                   instr->srcs[i])->data;
+                        phi->srcs[pred_idx]->def = def;
+                        break;
+                     }
+                  }
+
+                  instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
+                  instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
+                  instr->srcs_count--;
+                  instr->dsts_count--;
+                  ir3_instr_move_after(mov, instr);
+                  continue;
+               }
+
+               i++;
+            }
+
+            /* Move any non-shared copies to a separate parallel copy
+             * instruction right at the end of the block, after any reloads.
+             * At this point all copies should be {shared,immediate}->shared
+             * or {non-shared,immediate}->non-shared.
+             */
+            unsigned non_shared_copies = 0;
+            for (unsigned i = 0; i < instr->dsts_count; i++) {
+               if (!(instr->dsts[i]->flags & IR3_REG_SHARED))
+                  non_shared_copies++;
+            }
+
+            if (non_shared_copies != 0) {
+               struct ir3_instruction *pcopy =
+                  ir3_instr_create(block, OPC_META_PARALLEL_COPY,
+                                   non_shared_copies, non_shared_copies);
+
+               unsigned j = 0;
+               for (unsigned i = 0; i < instr->dsts_count;) {
+                  if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) {
+                     pcopy->dsts[j] = instr->dsts[i];
+                     pcopy->srcs[j] = instr->srcs[i];
+                     pcopy->dsts[j]->instr = pcopy;
+                     instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
+                     instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
+                     instr->srcs_count--;
+                     instr->dsts_count--;
+                     j++;
+                     continue;
+                  }
+                  i++;
+               }
+
+               pcopy->srcs_count = pcopy->dsts_count = j;
+               if (instr->dsts_count == 0)
+                  list_del(&instr->node);
+            }
+         }
+      }
+   }
+}
+
+static void
+finalize(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         for (unsigned i = 0; i < instr->dsts_count; i++) {
+            if (instr->dsts[i]->flags & IR3_REG_SHARED) {
+               instr->dsts[i]->flags &= ~IR3_REG_SSA;
+            }
+         }
+
+         for (unsigned i = 0; i < instr->srcs_count; i++) {
+            if (instr->srcs[i]->flags & IR3_REG_SHARED) {
+               instr->srcs[i]->flags &= ~IR3_REG_SSA;
+               instr->srcs[i]->def = NULL;
+            }
+         }
+      }
+   }
+}
+
+void
+ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness *live)
+{
+   struct ra_ctx ctx;
+
+   ra_ctx_init(&ctx);
+   ctx.intervals = rzalloc_array(NULL, struct ra_interval,
+                                 live->definitions_count);
+   ctx.blocks = rzalloc_array(NULL, struct ra_block_state,
+                              live->block_count);
+   ctx.start = 0;
+   ctx.live = live;
+   ctx.pcopy_src_map = _mesa_pointer_hash_table_create(NULL);
+
+   foreach_block (block, &v->ir->block_list) {
+      handle_block(&ctx, block);
+   }
+
+   lower_pcopy(v->ir, &ctx);
+
+   for (unsigned i = 0; i < live->block_count; i++) {
+      if (ctx.blocks[i].live_out)
+         ralloc_free(ctx.blocks[i].live_out);
+   }
+
+   ralloc_free(ctx.intervals);
+   ralloc_free(ctx.pcopy_src_map);
+   ralloc_free(ctx.blocks);
+
+   ir3_ra_validate(v, RA_FULL_SIZE, RA_HALF_SIZE, live->block_count, true);
+   finalize(v->ir);
+}
+
diff --git a/src/freedreno/ir3/ir3_spill.c b/src/freedreno/ir3/ir3_spill.c
index 475c132f6fa..0ea80bca337 100644
--- a/src/freedreno/ir3/ir3_spill.c
+++ b/src/freedreno/ir3/ir3_spill.c
@@ -1193,20 +1193,23 @@ is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
 static void
 handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
 {
-   foreach_dst (dst, pcopy) {
+   ra_foreach_dst (dst, pcopy) {
       struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
       ra_spill_interval_init(dst_interval, dst);
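+      /* Only SSA dsts reach this point: ra_foreach_dst skips non-SSA dsts,
+       * i.e. shared copies that already had registers assigned by the
+       * shared-register pass.
+       */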
   }
 
   foreach_src_n (src, i, pcopy) {
-      d("processing src %u", i);
       struct ir3_register *dst = pcopy->dsts[i];
+      if (!(dst->flags & IR3_REG_SSA))
+         continue;
+
+      d("processing src %u", i);
 
       /* Skip the intermediate copy for cases where the source is merged with
        * the destination. Crucially this means that we also don't reload/spill
        * it if it's been spilled, because it shares the same spill slot.
        */
-      if (src->def && src->def->merge_set &&
+      if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
           src->def->merge_set == dst->merge_set &&
           src->def->merge_set_offset == dst->merge_set_offset) {
          struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
@@ -1221,7 +1224,7 @@ handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
          dst_interval->cant_spill = false;
          dst_interval->dst = src_interval->dst;
       }
-   } else if (src->def) {
+   } else if (src->flags & IR3_REG_SSA) {
       struct ra_spill_interval *temp_interval = create_temp_interval(ctx, dst);
       struct ir3_register *temp = temp_interval->interval.reg;
@@ -1251,15 +1254,17 @@ handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
 
   foreach_src_n (src, i, pcopy) {
       struct ir3_register *dst = pcopy->dsts[i];
+      if (!(dst->flags & IR3_REG_SSA))
+         continue;
 
-      if (src->def && src->def->merge_set &&
+      if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
           src->def->merge_set == dst->merge_set &&
           src->def->merge_set_offset == dst->merge_set_offset)
          continue;
 
       struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
 
-      if (!src->def) {
+      if (!(src->flags & IR3_REG_SSA)) {
          dst_interval->cant_spill = true;
          ra_spill_ctx_insert(ctx, dst_interval);
          limit(ctx, pcopy);
@@ -1292,6 +1297,9 @@ handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
 static void
 handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
+   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
+      return;
+
    init_dst(ctx, instr->dsts[0]);
    insert_dst(ctx, instr->dsts[0]);
    finish_dst(ctx, instr->dsts[0]);
@@ -1300,6 +1308,9 @@ handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 static void
 remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
+   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
+      return;
+
    if (instr->opc == OPC_META_TEX_PREFETCH) {
       ra_foreach_src (src, instr)
          remove_src(ctx, instr, src);
@@ -1623,6 +1634,9 @@ static void
 rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
             struct ir3_block *block)
 {
+   if (!(phi->dsts[0]->flags & IR3_REG_SSA))
+      return;
+
    if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
       phi->flags |= IR3_INSTR_UNUSED;
       return;
@@ -1977,8 +1991,25 @@ cleanup_dead(struct ir3 *ir)
 {
    foreach_block (block, &ir->block_list) {
       foreach_instr_safe (instr, &block->instr_list) {
-         if (instr->flags & IR3_INSTR_UNUSED)
-            list_delinit(&instr->node);
+         if (instr->flags & IR3_INSTR_UNUSED) {
+            if (instr->opc == OPC_META_PARALLEL_COPY) {
+               /* There may be non-SSA shared copies; we need to preserve
+                * these.
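+                *
+                * (These have already been assigned physical registers by the
+                * shared-register pass, so the parallel copy is not dead even
+                * when all of its SSA dsts are unused.)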
+                */
+               for (unsigned i = 0; i < instr->dsts_count;) {
+                  if (instr->dsts[i]->flags & IR3_REG_SSA) {
+                     instr->dsts[i] = instr->dsts[--instr->dsts_count];
+                     instr->srcs[i] = instr->srcs[--instr->srcs_count];
+                  } else {
+                     i++;
+                  }
+               }
+
+               if (instr->dsts_count == 0)
+                  list_delinit(&instr->node);
+            } else {
+               list_delinit(&instr->node);
+            }
+         }
       }
    }
 }
diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c
index 1c74d711e0c..2b45559cab8 100644
--- a/src/freedreno/ir3/ir3_validate.c
+++ b/src/freedreno/ir3/ir3_validate.c
@@ -84,6 +84,9 @@ validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
    validate_assert(ctx, src->wrmask == reg->wrmask);
    validate_assert(ctx, reg_class_flags(src) == reg_class_flags(reg));
 
+   if (src->flags & IR3_REG_CONST)
+      validate_assert(ctx, !(src->flags & IR3_REG_SHARED));
+
    if (reg->tied) {
       validate_assert(ctx, reg->tied->tied == reg);
       bool found = false;
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
index 954b0b88fdb..e7ba1d8fe5f 100644
--- a/src/freedreno/ir3/meson.build
+++ b/src/freedreno/ir3/meson.build
@@ -112,6 +112,7 @@ libfreedreno_ir3_files = files(
   'ir3_sched.c',
   'ir3_shader.c',
   'ir3_shader.h',
+  'ir3_shared_ra.c',
   'ir3_spill.c',
   'ir3_validate.c',
 )
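Reviewer note: a minimal sketch of how the new entry point is expected to be driven. Only ir3_ra_shared() itself is defined by this patch; the surrounding helper names (ir3_calc_liveness and its mem_ctx argument) are assumptions for illustration.

/* Illustrative only -- not part of this patch. */
static void
run_shared_ra_sketch(struct ir3_shader_variant *v)
{
   /* Liveness feeds the shared pass; the mem_ctx argument here is an
    * assumption about the helper's signature.
    */
   struct ir3_liveness *live = ir3_calc_liveness(NULL, v->ir);

   /* Color shared registers first. ir3_ra_shared() internally lowers any
    * shared<->non-shared parallel-copy entries to plain moves (lower_pcopy),
    * so the later non-shared RA never sees cross-file copy cycles.
    */
   ir3_ra_shared(v, live);

   ralloc_free(live);
}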