From fa22b0901af548d5e1433ad4cdbda314182137c5 Mon Sep 17 00:00:00 2001
From: Connor Abbott 
Date: Thu, 9 Feb 2023 13:06:30 +0100
Subject: [PATCH] ir3/ra: Add specialized shared register RA/spilling

There are two problems with shared register allocation at the moment:

1. We weren't modelling physical edges correctly, and once we do, the
   current hack in RA for handling them won't work correctly. This means
   live-range splitting doesn't work. I've tried various strategies, but
   none of them fixes this.
2. Spilling of shared registers to non-shared registers isn't
   implemented.

Spilling of shared regs is significantly simpler than spilling
non-shared regs, because (1) spilling and unspilling are much cheaper,
just a single mov, and (2) we can swap "stack slots" (actually
non-shared regs), so all the complexity of parallel copy handling isn't
necessary. This means that it's much easier to integrate RA and
spilling, while still using the tree-scan framework, so that we can
spill instead of splitting live ranges. The other issue, phi nodes with
physical edges, we can handle by spilling those phis earlier. For this
to work, we need to accurately insert physical edges based on
divergence analysis (which later commits will accomplish), or else
every phi node would involve physical edges.

This commit adds a shared register allocation pass which is a
severely-cut-down version of RA and spilling. Everything to do with
live range splitting is cut from RA, everything to do with parallel
copy handling is cut from spilling, and we simply always spill as soon
as we encounter a case where it's necessary. This could be improved,
especially the spilling strategy, but for now it keeps the pass simple
and cuts down on code duplication. Unfortunately there is still some
boilerplate shared with regular RA, which seems unavoidable.

The new RA requires us to recompute liveness information, which is
fairly expensive, so we keep the old RA's ability to handle shared
registers and only use the new RA when it may be required: either
something potentially requiring live-range splitting, or shared
register pressure that exceeds the limit.

Part-of: 
---
 src/freedreno/ir3/ir3.h             |    9 +-
 src/freedreno/ir3/ir3_merge_regs.c  |   10 +
 src/freedreno/ir3/ir3_ra.c          |   84 +-
 src/freedreno/ir3/ir3_ra.h          |    4 +-
 src/freedreno/ir3/ir3_ra_validate.c |  189 +++-
 src/freedreno/ir3/ir3_shared_ra.c   | 1415 +++++++++++++++++++++++++++
 src/freedreno/ir3/ir3_spill.c       |   47 +-
 src/freedreno/ir3/ir3_validate.c    |    3 +
 src/freedreno/ir3/meson.build       |    1 +
 9 files changed, 1687 insertions(+), 75 deletions(-)
 create mode 100644 src/freedreno/ir3/ir3_shared_ra.c

diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 7eb8ca36209..0a1d937ad7d 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -335,7 +335,14 @@ typedef enum ir3_instruction_flags {
    * before register assignment is done:
    */
   IR3_INSTR_MARK = BIT(15),
-   IR3_INSTR_UNUSED = BIT(16),
+
+   /* Used by shared register allocation when creating spill/reload instructions
+    * to inform validation that this is created by RA. This may also be set on
+    * an instruction where a spill has been folded into it.
+ */ + IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK, + + IR3_INSTR_UNUSED = BIT(17), } ir3_instruction_flags; struct ir3_instruction { diff --git a/src/freedreno/ir3/ir3_merge_regs.c b/src/freedreno/ir3/ir3_merge_regs.c index 1cdaad67b0e..7de6aaa4b18 100644 --- a/src/freedreno/ir3/ir3_merge_regs.c +++ b/src/freedreno/ir3/ir3_merge_regs.c @@ -377,6 +377,8 @@ static void aggressive_coalesce_split(struct ir3_liveness *live, struct ir3_instruction *split) { + if (!(split->dsts[0]->flags & IR3_REG_SSA)) + return; try_merge_defs(live, split->srcs[0]->def, split->dsts[0], split->split.off * reg_elem_size(split->dsts[0])); } @@ -409,6 +411,10 @@ create_parallel_copy(struct ir3_block *block) if (phi->opc != OPC_META_PHI) break; + /* Avoid phis we've already colored */ + if (!(phi->dsts[0]->flags & IR3_REG_SSA)) + continue; + /* Avoid undef */ if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && !phi->srcs[pred_idx]->def) @@ -430,6 +436,8 @@ create_parallel_copy(struct ir3_block *block) foreach_instr (phi, &succ->instr_list) { if (phi->opc != OPC_META_PHI) break; + if (!(phi->dsts[0]->flags & IR3_REG_SSA)) + continue; if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && !phi->srcs[pred_idx]->def) continue; @@ -456,6 +464,8 @@ create_parallel_copy(struct ir3_block *block) foreach_instr (phi, &succ->instr_list) { if (phi->opc != OPC_META_PHI) break; + if (!(phi->dsts[0]->flags & IR3_REG_SSA)) + continue; if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && !phi->srcs[pred_idx]->def) continue; diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 070f67ba7ba..3b1b424ae22 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -193,6 +193,8 @@ void ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) { + assert(interval->inserted); + if (interval->parent) { rb_tree_remove(&interval->parent->children, &interval->node); } else { @@ -684,6 +686,8 @@ ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file, struct ra_interval *interval) { assert(!interval->interval.parent); + /* shared live splitting is not allowed! 
*/ + assert(!(interval->interval.reg->flags & IR3_REG_SHARED)); /* Check if we've already moved this reg before */ unsigned pcopy_index; @@ -1665,6 +1669,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr) struct ir3_register *dst = instr->dsts[0]; struct ir3_register *src = instr->srcs[0]; + if (!(dst->flags & IR3_REG_SSA)) + return; + if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) { handle_normal_instr(ctx, instr); return; @@ -1683,6 +1690,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr) static void handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr) { + if (!(instr->dsts[0]->flags & IR3_REG_SSA)) + return; + struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set; unsigned dst_offset = instr->dsts[0]->merge_set_offset; @@ -1798,7 +1808,8 @@ handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr) static void handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr) { - if (instr->dsts[0]->num == INVALID_REG) + if (instr->dsts[0]->num == INVALID_REG || + !(instr->dsts[0]->flags & IR3_REG_SSA)) return; struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); @@ -1829,6 +1840,9 @@ handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr) static void assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr) { + if (!(instr->dsts[0]->flags & IR3_REG_SSA)) + return; + struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); @@ -1973,6 +1987,9 @@ handle_live_out(struct ra_ctx *ctx, struct ir3_register *def) static void handle_phi(struct ra_ctx *ctx, struct ir3_register *def) { + if (!(def->flags & IR3_REG_SSA)) + return; + struct ra_file *file = ra_get_file(ctx, def); struct ra_interval *interval = &ctx->intervals[def->name]; @@ -1999,6 +2016,9 @@ handle_phi(struct ra_ctx *ctx, struct ir3_register *def) static void assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) { + if (!(phi->dsts[0]->flags & IR3_REG_SSA)) + return; + struct ra_file *file = ra_get_file(ctx, phi->dsts[0]); struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name]; assert(!interval->interval.parent); @@ -2085,15 +2105,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval) { physreg_t physreg = ra_interval_get_physreg(interval); - bool shared = interval->interval.reg->flags & IR3_REG_SHARED; - struct ir3_block **predecessors = - shared ? ctx->block->physical_predecessors : ctx->block->predecessors; - unsigned predecessors_count = shared - ? ctx->block->physical_predecessors_count - : ctx->block->predecessors_count; - - for (unsigned i = 0; i < predecessors_count; i++) { - struct ir3_block *pred = predecessors[i]; + for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { + struct ir3_block *pred = ctx->block->predecessors[i]; struct ra_block_state *pred_state = &ctx->blocks[pred->index]; if (!pred_state->visited) @@ -2101,28 +2114,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval) physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg); if (pred_reg != physreg) { + assert(!(interval->interval.reg->flags & IR3_REG_SHARED)); insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg); - - /* This is a bit tricky, but when visiting the destination of a - * physical-only edge, we have two predecessors (the if and the - * header block) and both have multiple successors. 
We pick the - * register for all live-ins from the normal edge, which should - * guarantee that there's no need for shuffling things around in - * the normal predecessor as long as there are no phi nodes, but - * we still may need to insert fixup code in the physical - * predecessor (i.e. the last block of the if) and that has - * another successor (the block after the if) so we need to update - * the renames state for when we process the other successor. This - * crucially depends on the other successor getting processed - * after this. - * - * For normal (non-physical) edges we disallow critical edges so - * that hacks like this aren't necessary. - */ - if (!pred_state->renames) - pred_state->renames = _mesa_pointer_hash_table_create(ctx); - _mesa_hash_table_insert(pred_state->renames, interval->interval.reg, - (void *)(uintptr_t)physreg); } } } @@ -2561,6 +2554,18 @@ ir3_ra(struct ir3_shader_variant *v) ir3_merge_regs(live, v->ir); + bool has_shared_vectors = false; + foreach_block (block, &v->ir->block_list) { + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + if ((dst->flags & IR3_REG_SHARED) && reg_elems(dst) > 1) { + has_shared_vectors = true; + break; + } + } + } + } + struct ir3_pressure max_pressure; ir3_calc_pressure(v, live, &max_pressure); d("max pressure:"); @@ -2590,10 +2595,17 @@ ir3_ra(struct ir3_shader_variant *v) if (ir3_shader_debug & IR3_DBG_SPILLALL) calc_min_limit_pressure(v, live, &limit_pressure); - if (max_pressure.shared > limit_pressure.shared) { - /* TODO shared reg -> normal reg spilling */ - d("shared max pressure exceeded!"); - goto fail; + if (max_pressure.shared > limit_pressure.shared || has_shared_vectors) { + ir3_ra_shared(v, live); + + /* Recalculate liveness and register pressure now that additional values + * have been added. + */ + ralloc_free(live); + live = ir3_calc_liveness(ctx, v->ir); + ir3_calc_pressure(v, live, &max_pressure); + + ir3_debug_print(v->ir, "AFTER: shared register allocation"); } bool spilled = false; @@ -2629,7 +2641,7 @@ ir3_ra(struct ir3_shader_variant *v) foreach_block (block, &v->ir->block_list) handle_block(ctx, block); - ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count); + ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count, false); /* Strip array-ness and SSA-ness at the end, because various helpers still * need to work even on definitions that have already been assigned. For diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index c6837aaae21..1c561a57d76 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -168,8 +168,10 @@ bool ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v, bool ir3_lower_spill(struct ir3 *ir); +void ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness *live); + void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, - unsigned half_size, unsigned block_count); + unsigned half_size, unsigned block_count, bool shared_ra); void ir3_lower_copies(struct ir3_shader_variant *v); diff --git a/src/freedreno/ir3/ir3_ra_validate.c b/src/freedreno/ir3/ir3_ra_validate.c index aab26760ab9..3d19e2b7431 100644 --- a/src/freedreno/ir3/ir3_ra_validate.c +++ b/src/freedreno/ir3/ir3_ra_validate.c @@ -92,13 +92,25 @@ struct reaching_state { struct ra_val_ctx { struct ir3_instruction *current_instr; + /* The current state of the dataflow analysis for the instruction we're + * processing. + */ struct reaching_state reaching; + + /* The state at the end of each basic block. 
*/ struct reaching_state *block_reaching; unsigned block_count; + /* When validating shared RA, we have to take spill/reload instructions into + * account. This saves an array of reg_state for the source of each spill + * instruction, to be restored at the corresponding reload(s). + */ + struct hash_table *spill_reaching; + unsigned full_size, half_size; bool merged_regs; + bool shared_ra; bool failed; }; @@ -130,6 +142,28 @@ get_file_size(struct ra_val_ctx *ctx, struct ir3_register *reg) return ctx->half_size; } +static struct reg_state * +get_spill_state(struct ra_val_ctx *ctx, struct ir3_register *dst) +{ + struct hash_entry *entry = _mesa_hash_table_search(ctx->spill_reaching, dst); + if (entry) + return entry->data; + else + return NULL; +} + +static struct reg_state * +get_or_create_spill_state(struct ra_val_ctx *ctx, struct ir3_register *dst) +{ + struct reg_state *state = get_spill_state(ctx, dst); + if (state) + return state; + + state = rzalloc_array(ctx, struct reg_state, reg_size(dst)); + _mesa_hash_table_insert(ctx->spill_reaching, dst, state); + return state; +} + /* Validate simple things, like the registers being in-bounds. This way we * don't have to worry about out-of-bounds accesses later. */ @@ -139,6 +173,8 @@ validate_simple(struct ra_val_ctx *ctx, struct ir3_instruction *instr) { ctx->current_instr = instr; ra_foreach_dst (dst, instr) { + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) + continue; unsigned dst_max = ra_reg_get_physreg(dst) + reg_size(dst); validate_assert(ctx, dst_max <= get_file_size(ctx, dst)); if (dst->tied) @@ -146,6 +182,8 @@ validate_simple(struct ra_val_ctx *ctx, struct ir3_instruction *instr) } ra_foreach_src (src, instr) { + if (ctx->shared_ra && !(src->flags & IR3_REG_SHARED)) + continue; unsigned src_max = ra_reg_get_physreg(src) + reg_size(src); validate_assert(ctx, src_max <= get_file_size(ctx, src)); } @@ -219,6 +257,24 @@ static void propagate_normal_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr) { ra_foreach_dst (dst, instr) { + /* Process destinations from scalar ALU instructions that were demoted to + * normal ALU instructions. For these we must treat the instruction as a + * spill of itself and set the propagate state to itself. See + * try_demote_instructions(). 
+ */ + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + if (instr->flags & IR3_INSTR_SHARED_SPILL) { + struct reg_state *state = get_or_create_spill_state(ctx, dst); + for (unsigned i = 0; i < reg_size(dst); i++) { + state[i] = (struct reg_state){ + .def = dst, + .offset = i, + }; + } + } + continue; + } + struct file_state *file = ra_val_get_file(ctx, dst); physreg_t physreg = ra_reg_get_physreg(dst); for (unsigned i = 0; i < reg_size(dst); i++) { @@ -239,6 +295,16 @@ propagate_split(struct ra_val_ctx *ctx, struct ir3_instruction *split) physreg_t src_physreg = ra_reg_get_physreg(src); struct file_state *file = ra_val_get_file(ctx, dst); + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + struct reg_state *src_state = get_spill_state(ctx, src->def); + if (src_state) { + struct reg_state *dst_state = get_or_create_spill_state(ctx, dst); + memcpy(dst_state, &src_state[split->split.off * reg_elem_size(src)], + reg_size(dst) * sizeof(struct reg_state)); + } + return; + } + unsigned offset = split->split.off * reg_elem_size(src); for (unsigned i = 0; i < reg_elem_size(src); i++) { file->regs[dst_physreg + i] = file->regs[src_physreg + offset + i]; @@ -249,30 +315,50 @@ static void propagate_collect(struct ra_val_ctx *ctx, struct ir3_instruction *collect) { struct ir3_register *dst = collect->dsts[0]; - physreg_t dst_physreg = ra_reg_get_physreg(dst); - struct file_state *file = ra_val_get_file(ctx, dst); - unsigned size = reg_size(dst); - struct reg_state srcs[size]; - for (unsigned i = 0; i < collect->srcs_count; i++) { - struct ir3_register *src = collect->srcs[i]; - unsigned dst_offset = i * reg_elem_size(dst); - for (unsigned j = 0; j < reg_elem_size(dst); j++) { - if (!ra_reg_is_src(src)) { - srcs[dst_offset + j] = (struct reg_state){ - .def = dst, - .offset = dst_offset + j, - }; - } else { - physreg_t src_physreg = ra_reg_get_physreg(src); - srcs[dst_offset + j] = file->regs[src_physreg + j]; + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + struct reg_state *dst_state = NULL; + + for (unsigned i = 0; i < collect->srcs_count; i++) { + struct ir3_register *src = collect->srcs[i]; + unsigned dst_offset = i * reg_elem_size(dst); + + if (ra_reg_is_src(src)) { + struct reg_state *src_state = get_spill_state(ctx, src->def); + if (src_state) { + if (!dst_state) + dst_state = get_or_create_spill_state(ctx, dst); + memcpy(&dst_state[dst_offset], src_state, + reg_size(src) * sizeof(struct reg_state)); + } } } - } + } else { + struct file_state *file = ra_val_get_file(ctx, dst); + physreg_t dst_physreg = ra_reg_get_physreg(dst); + struct reg_state srcs[size]; - for (unsigned i = 0; i < size; i++) - file->regs[dst_physreg + i] = srcs[i]; + for (unsigned i = 0; i < collect->srcs_count; i++) { + struct ir3_register *src = collect->srcs[i]; + unsigned dst_offset = i * reg_elem_size(dst); + + for (unsigned j = 0; j < reg_elem_size(dst); j++) { + if (!ra_reg_is_src(src)) { + srcs[dst_offset + j] = (struct reg_state){ + .def = dst, + .offset = dst_offset + j, + }; + } else { + physreg_t src_physreg = ra_reg_get_physreg(src); + srcs[dst_offset + j] = file->regs[src_physreg + j]; + } + } + } + + for (unsigned i = 0; i < size; i++) + file->regs[dst_physreg + i] = srcs[i]; + } } static void @@ -291,15 +377,25 @@ propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy) struct ir3_register *src = pcopy->srcs[i]; struct file_state *file = ra_val_get_file(ctx, dst); - for (unsigned j = 0; j < reg_size(dst); j++) { - if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) 
{ - srcs[offset + j] = (struct reg_state){ - .def = dst, - .offset = j, - }; - } else { - physreg_t src_physreg = ra_reg_get_physreg(src); - srcs[offset + j] = file->regs[src_physreg + j]; + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + if (ra_reg_is_src(src)) { + struct reg_state *src_state = get_spill_state(ctx, src->def); + if (src_state) { + struct reg_state *dst_state = get_or_create_spill_state(ctx, dst); + memcpy(dst_state, src_state, reg_size(dst) * sizeof(struct reg_state)); + } + } + } else { + for (unsigned j = 0; j < reg_size(dst); j++) { + if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) { + srcs[offset + j] = (struct reg_state){ + .def = dst, + .offset = j, + }; + } else { + physreg_t src_physreg = ra_reg_get_physreg(src); + srcs[offset + j] = file->regs[src_physreg + j]; + } } } @@ -310,6 +406,12 @@ propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy) offset = 0; for (unsigned i = 0; i < pcopy->dsts_count; i++) { struct ir3_register *dst = pcopy->dsts[i]; + + if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) { + offset += reg_size(dst); + continue; + } + physreg_t dst_physreg = ra_reg_get_physreg(dst); struct file_state *file = ra_val_get_file(ctx, dst); @@ -321,6 +423,23 @@ propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy) assert(offset == size); } +static void +propagate_spill(struct ra_val_ctx *ctx, struct ir3_instruction *instr) +{ + if (instr->srcs[0]->flags & IR3_REG_SHARED) { /* spill */ + struct reg_state *state = get_or_create_spill_state(ctx, instr->dsts[0]); + physreg_t src_physreg = ra_reg_get_physreg(instr->srcs[0]); + memcpy(state, &ctx->reaching.shared.regs[src_physreg], + reg_size(instr->srcs[0]) * sizeof(struct reg_state)); + } else { /* reload */ + struct reg_state *state = get_spill_state(ctx, instr->srcs[0]->def); + assert(state); + physreg_t dst_physreg = ra_reg_get_physreg(instr->dsts[0]); + memcpy(&ctx->reaching.shared.regs[dst_physreg], state, + reg_size(instr->dsts[0]) * sizeof(struct reg_state)); + } +} + static void propagate_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr) { @@ -330,6 +449,13 @@ propagate_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr) propagate_collect(ctx, instr); else if (instr->opc == OPC_META_PARALLEL_COPY) propagate_parallelcopy(ctx, instr); + else if (ctx->shared_ra && instr->opc == OPC_MOV && + /* Moves from immed/const with IR3_INSTR_SHARED_SPILL were demoted + * from scalar ALU, see try_demote_instruction(). 
+ */ + !(instr->srcs[0]->flags & (IR3_REG_IMMED | IR3_REG_CONST)) && + (instr->flags & IR3_INSTR_SHARED_SPILL)) + propagate_spill(ctx, instr); else propagate_normal_instr(ctx, instr); } @@ -439,6 +565,8 @@ static void check_reaching_src(struct ra_val_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { + if (ctx->shared_ra && !(src->flags & IR3_REG_SHARED)) + return; struct file_state *file = ra_val_get_file(ctx, src); physreg_t physreg = ra_reg_get_physreg(src); for (unsigned i = 0; i < reg_size(src); i++) { @@ -541,7 +669,7 @@ check_reaching_defs(struct ra_val_ctx *ctx, struct ir3 *ir) void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, - unsigned half_size, unsigned block_count) + unsigned half_size, unsigned block_count, bool shared_ra) { #ifdef NDEBUG #define VALIDATE 0 @@ -557,6 +685,9 @@ ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, ctx->full_size = full_size; ctx->half_size = half_size; ctx->block_count = block_count; + ctx->shared_ra = shared_ra; + if (ctx->shared_ra) + ctx->spill_reaching = _mesa_pointer_hash_table_create(ctx); foreach_block (block, &v->ir->block_list) { foreach_instr (instr, &block->instr_list) { diff --git a/src/freedreno/ir3/ir3_shared_ra.c b/src/freedreno/ir3/ir3_shared_ra.c new file mode 100644 index 00000000000..d00198b2a3c --- /dev/null +++ b/src/freedreno/ir3/ir3_shared_ra.c @@ -0,0 +1,1415 @@ +/* + * Copyright (C) 2021 Valve Corporation + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_shader.h" + +#include "util/u_math.h" + +/* Allocating shared registers can pose a challenge, because their live + * intervals use the physical CFG which has extra edges inserted that are + * pretty much always critical edges. This causes problems with phi nodes, + * because copies for phi nodes have to happen "along the edge," and similarly + * causes problems when reunifying values that have had their live range split. + * Problematic phi nodes should be relatively rare, so we ban them for now. 
+ * The solution we choose for live-range splitting is to integrate spilling and + * register allcoation and spill to vector registers rather than split a live + * range, which negates some of the advantages of SSA-based RA, but it isn't as + * bad as it seems because the conditions needed (vector shared registers, which + * only movmsk currently produces, or fixed registers which we don't do) are + * relatively rare. Spilling is also much cheaper than spilling vector registers + * to private memory. + */ + +struct ra_interval { + struct ir3_reg_interval interval; + + struct rb_node physreg_node; + physreg_t physreg_start, physreg_end; + + /* Where the shared register is spilled to. If there were no uses when it's + * spilled it could be the original defining instruction. + */ + struct ir3_register *spill_def; + + /* Whether this contains a source of the current instruction that can't be + * spilled. + */ + bool src; + + bool needs_reload; +}; + +struct ra_block_state { + bool visited; + + /* For blocks whose successors are visited first (i.e. loop backedges), which + * values should be live at the end. + */ + BITSET_WORD *live_out; +}; + +struct ra_ctx { + struct ir3_reg_ctx reg_ctx; + + BITSET_DECLARE(available, RA_MAX_FILE_SIZE); + + struct rb_tree physreg_intervals; + + struct ra_interval *intervals; + + struct ir3_liveness *live; + + struct hash_table *pcopy_src_map; + + struct ra_block_state *blocks; + + unsigned start; +}; + +static struct ra_interval * +ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval) +{ + return rb_node_data(struct ra_interval, interval, interval); +} + +static struct ra_interval * +rb_node_to_interval(struct rb_node *node) +{ + return rb_node_data(struct ra_interval, node, physreg_node); +} + +static const struct ra_interval * +rb_node_to_interval_const(const struct rb_node *node) +{ + return rb_node_data(struct ra_interval, node, physreg_node); +} + +static struct ra_interval * +ra_interval_next(struct ra_interval *interval) +{ + struct rb_node *next = rb_node_next(&interval->physreg_node); + return next ? rb_node_to_interval(next) : NULL; +} + +static struct ra_interval * +ra_interval_next_or_null(struct ra_interval *interval) +{ + return interval ? ra_interval_next(interval) : NULL; +} + +static int +ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_interval *a = rb_node_to_interval_const(_a); + const struct ra_interval *b = rb_node_to_interval_const(_b); + return b->physreg_start - a->physreg_start; +} + +static int +ra_interval_cmp(const struct rb_node *node, const void *data) +{ + physreg_t reg = *(const physreg_t *)data; + const struct ra_interval *interval = rb_node_to_interval_const(node); + if (interval->physreg_start > reg) + return -1; + else if (interval->physreg_end <= reg) + return 1; + else + return 0; +} + +static struct ra_ctx * +ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx) +{ + return rb_node_data(struct ra_ctx, ctx, reg_ctx); +} + +static struct ra_interval * +ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg) +{ + struct rb_node *node = rb_tree_search_sloppy(tree, ®, ra_interval_cmp); + return node ? rb_node_to_interval(node) : NULL; +} + +/* Get the interval covering the reg, or the closest to the right if it + * doesn't exist. 
+ */ +static struct ra_interval * +ra_interval_search_right(struct rb_tree *tree, physreg_t reg) +{ + struct ra_interval *interval = ra_interval_search_sloppy(tree, reg); + if (!interval) { + return NULL; + } else if (interval->physreg_end > reg) { + return interval; + } else { + /* There is no interval covering reg, and ra_file_search_sloppy() + * returned the closest range to the left, so the next interval to the + * right should be the closest to the right. + */ + return ra_interval_next_or_null(interval); + } +} + +static struct ra_interval * +ra_ctx_search_right(struct ra_ctx *ctx, physreg_t reg) +{ + return ra_interval_search_right(&ctx->physreg_intervals, reg); +} + +static void +interval_add(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval) +{ + struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); + struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx); + + /* We can assume in this case that physreg_start/physreg_end is already + * initialized. + */ + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_CLEAR(ctx->available, i); + } + + rb_tree_insert(&ctx->physreg_intervals, &interval->physreg_node, + ra_interval_insert_cmp); +} + +static void +interval_delete(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval) +{ + struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); + struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx); + + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_SET(ctx->available, i); + } + + rb_tree_remove(&ctx->physreg_intervals, &interval->physreg_node); +} + +static void +interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent, + struct ir3_reg_interval *_child) +{ + struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent); + struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child); + + child->physreg_start = + parent->physreg_start + (child->interval.reg->interval_start - + parent->interval.reg->interval_start); + child->physreg_end = + child->physreg_start + + (child->interval.reg->interval_end - child->interval.reg->interval_start); + + interval_add(ctx, _child); +} + +static void +ra_ctx_init(struct ra_ctx *ctx) +{ + ctx->reg_ctx.interval_add = interval_add; + ctx->reg_ctx.interval_delete = interval_delete; + ctx->reg_ctx.interval_readd = interval_readd; +} + +static void +ra_ctx_reset_block(struct ra_ctx *ctx) +{ + for (unsigned i = 0; i < RA_SHARED_SIZE; i++) { + BITSET_SET(ctx->available, i); + } + + rb_tree_init(&ctx->reg_ctx.intervals); + rb_tree_init(&ctx->physreg_intervals); +} + +static void +ra_interval_init(struct ra_interval *interval, struct ir3_register *reg) +{ + ir3_reg_interval_init(&interval->interval, reg); +} + +static physreg_t +ra_interval_get_physreg(const struct ra_interval *interval) +{ + unsigned child_start = interval->interval.reg->interval_start; + + while (interval->interval.parent) { + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); + } + + return interval->physreg_start + + (child_start - interval->interval.reg->interval_start); +} + +static unsigned +ra_interval_get_num(const struct ra_interval *interval) +{ + return ra_physreg_to_num(ra_interval_get_physreg(interval), + interval->interval.reg->flags); +} + +static void +ra_interval_dump(struct log_stream *stream, struct ra_interval *interval) +{ + mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start); + + ir3_reg_interval_dump(stream, &interval->interval); +} + 
+static void +ra_ctx_dump(struct ra_ctx *ctx) +{ + struct log_stream *stream = mesa_log_streami(); + + mesa_log_stream_printf(stream, "shared:\n"); + rb_tree_foreach (struct ra_interval, interval, &ctx->physreg_intervals, + physreg_node) { + ra_interval_dump(stream, interval); + } + + unsigned start, end; + mesa_log_stream_printf(stream, "available:\n"); + BITSET_FOREACH_RANGE (start, end, ctx->available, RA_SHARED_SIZE) { + mesa_log_stream_printf(stream, "%u-%u ", start, end); + } + mesa_log_stream_printf(stream, "\n"); + mesa_log_stream_printf(stream, "start: %u\n", ctx->start); +} + +static bool +get_reg_specified(struct ra_ctx *ctx, struct ir3_register *reg, physreg_t physreg) +{ + for (unsigned i = 0; i < reg_size(reg); i++) { + if (!BITSET_TEST(ctx->available, physreg + i)) + return false; + } + + return true; +} + +static unsigned +reg_file_size(struct ir3_register *reg) +{ + return RA_SHARED_SIZE; +} + +static physreg_t +find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size, + unsigned align) +{ + unsigned file_size = reg_file_size(dst); + + /* This can happen if we create a very large merge set. Just bail out in that + * case. + */ + if (size > file_size) + return (physreg_t) ~0; + + unsigned start = ALIGN(ctx->start, align) % (file_size - size + align); + unsigned candidate = start; + do { + bool is_available = true; + for (unsigned i = 0; i < size; i++) { + if (!BITSET_TEST(ctx->available, candidate + i)) { + is_available = false; + break; + } + } + + if (is_available) { + ctx->start = (candidate + size) % file_size; + return candidate; + } + + candidate += align; + if (candidate + size > file_size) + candidate = 0; + } while (candidate != start); + + return (physreg_t)~0; +} + +static physreg_t +find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg, + unsigned size, unsigned align) +{ + unsigned file_size = reg_file_size(reg); + unsigned min_cost = UINT_MAX; + + unsigned start = ALIGN(ctx->start, align) % (file_size - size + align); + physreg_t candidate = start; + physreg_t best_reg = (physreg_t)~0; + do { + unsigned cost = 0; + + /* Iterate through intervals we'd need to spill to use this reg. */ + for (struct ra_interval *interval = ra_ctx_search_right(ctx, candidate); + interval && interval->physreg_start < candidate + size; + interval = ra_interval_next_or_null(interval)) { + /* We can't spill sources of the current instruction when reloading + * sources. + */ + if (interval->src) { + cost = UINT_MAX; + break; + } + + /* We prefer spilling intervals that already have been spilled, so we + * don't have to emit another mov. 
+ */ + if (!interval->spill_def) + cost += (interval->physreg_end - interval->physreg_start); + } + + if (cost < min_cost) { + min_cost = cost; + best_reg = candidate; + } + + candidate += align; + if (candidate + size > file_size) + candidate = 0; + } while (candidate != start); + + return best_reg; +} + +static struct ir3_register * +split(struct ir3_register *def, unsigned offset, struct ir3_instruction *before) +{ + if (reg_elems(def) == 1) { + assert(offset == 0); + return def; + } + + struct ir3_instruction *split = + ir3_instr_create(before->block, OPC_META_SPLIT, 1, 1); + split->split.off = offset; + struct ir3_register *dst = __ssa_dst(split); + struct ir3_register *src = + ir3_src_create(split, INVALID_REG, def->flags & (IR3_REG_HALF | IR3_REG_SSA)); + src->wrmask = def->wrmask; + src->def = def; + ir3_instr_move_after(split, before); + return dst; +} + +static struct ir3_register * +extract(struct ir3_register *parent_def, unsigned offset, unsigned elems, + struct ir3_instruction *before) +{ + if (offset == 0 && elems == reg_elems(parent_def)) + return parent_def; + + if (elems == 1) + return split(parent_def, offset, before); + + struct ir3_instruction *collect = + ir3_instr_create(before->block, OPC_META_COLLECT, 1, elems); + struct ir3_register *dst = __ssa_dst(collect); + dst->flags |= parent_def->flags & IR3_REG_HALF; + dst->wrmask = MASK(elems); + + ir3_instr_move_after(collect, before); + + for (unsigned i = 0; i < elems; i++) { + ir3_src_create(collect, INVALID_REG, + parent_def->flags & (IR3_REG_HALF | IR3_REG_SSA))->def = + split(parent_def, offset + i, before); + } + + return dst; +} + +static void +spill_interval_children(struct ra_interval *interval, + struct ir3_instruction *before) +{ + rb_tree_foreach (struct ra_interval, child, &interval->interval.children, + interval.node) { + if (!child->spill_def) { + child->spill_def = extract(interval->spill_def, + (child->interval.reg->interval_start - + interval->interval.reg->interval_start) / + reg_elem_size(interval->interval.reg), + reg_elems(child->interval.reg), before); + } + spill_interval_children(child, before); + } +} + +static void +spill_interval(struct ra_ctx *ctx, struct ra_interval *interval) +{ + struct ir3_instruction *before = interval->interval.reg->instr; + + d("spilling ssa_%u:%u", before->serialno, interval->interval.reg->name); + + if (!interval->spill_def) { + /* If this is a phi node or input, we need to insert the demotion to a + * regular register after the last phi or input in the block. + */ + if (before->opc == OPC_META_PHI || + before->opc == OPC_META_INPUT) { + struct ir3_block *block = before->block; + struct ir3_instruction *last_phi_input = NULL; + foreach_instr_from (instr, before, &block->instr_list) { + if (instr->opc != before->opc) + break; + last_phi_input = instr; + } + before = last_phi_input; + } + + struct ir3_instruction *mov = ir3_instr_create(before->block, OPC_MOV, 1, 1); + mov->flags |= IR3_INSTR_SHARED_SPILL; + struct ir3_register *dst = __ssa_dst(mov); + dst->flags |= (interval->interval.reg->flags & IR3_REG_HALF); + dst->wrmask = interval->interval.reg->wrmask; + mov->repeat = reg_elems(dst) - 1; + ir3_src_create(mov, interval->interval.reg->num, + IR3_REG_SHARED | (mov->repeat ? IR3_REG_R : 0) | + (interval->interval.reg->flags & IR3_REG_HALF))->wrmask = + interval->interval.reg->wrmask; + mov->cat1.src_type = mov->cat1.dst_type = + (interval->interval.reg->flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; + + ir3_instr_move_after(mov, before); + interval->spill_def = dst; + } + + spill_interval_children(interval, interval->spill_def->instr); + + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); +} + +/* Try to demote a scalar ALU instruction to a normal ALU instruction, using the + * spilled sources. We have to take into account restrictions on the number of + * shared sources that only exist for normal ALU instructions. + */ +static bool +try_demote_instruction(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + /* First, check restrictions. */ + switch (opc_cat(instr->opc)) { + case 1: + if (!(instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED))) + return false; + break; + case 2: { + /* We need one source to either be demotable or an immediate. */ + if (instr->srcs_count > 1) { + struct ra_interval *src0_interval = + (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL; + struct ra_interval *src1_interval = + (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL; + if (!(src0_interval && src0_interval->spill_def) && + !(src1_interval && src1_interval->spill_def) && + !(instr->srcs[0]->flags & IR3_REG_IMMED) && + !(instr->srcs[1]->flags & IR3_REG_IMMED)) + return false; + } + break; + } + case 3: { + struct ra_interval *src0_interval = + (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL; + struct ra_interval *src1_interval = + (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL; + + /* src1 cannot be shared */ + if (src1_interval && !src1_interval->spill_def) { + /* Try to swap src0 and src1, similar to what copy prop does. */ + if (!is_mad(instr->opc)) + return false; + + if ((src0_interval && src0_interval->spill_def) || + (instr->srcs[0]->flags & IR3_REG_IMMED)) { + struct ir3_register *src0 = instr->srcs[0]; + instr->srcs[0] = instr->srcs[1]; + instr->srcs[1] = src0; + } else { + return false; + } + } + break; + } + case 4: { + assert(instr->srcs[0]->flags & IR3_REG_SSA); + struct ra_interval *src_interval = &ctx->intervals[instr->srcs[0]->def->name]; + if (!src_interval->spill_def) + return false; + break; + } + + default: + return false; + } + + d("demoting instruction"); + + /* If the instruction is already not a scalar ALU instruction, we should've + * skipped reloading and just demoted sources directly, so we should never + * get here. + */ + assert(instr->dsts[0]->flags & IR3_REG_SHARED); + + /* Now we actually demote the instruction */ + ra_foreach_src (src, instr) { + assert(src->flags & IR3_REG_SHARED); + struct ra_interval *interval = &ctx->intervals[src->def->name]; + if (interval->spill_def) { + src->def = interval->spill_def; + src->flags &= ~IR3_REG_SHARED; + interval->needs_reload = false; + if (interval->interval.inserted) + ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); + while (interval->interval.parent) + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); + interval->src = false; + } + } + + struct ra_interval *dst_interval = &ctx->intervals[instr->dsts[0]->name]; + instr->dsts[0]->flags &= ~IR3_REG_SHARED; + ra_interval_init(dst_interval, instr->dsts[0]); + dst_interval->spill_def = instr->dsts[0]; + + instr->flags |= IR3_INSTR_SHARED_SPILL; + + return true; +} + +/* Free up [start, start + size) by spilling live intervals. 
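+ *
+ * Note that "spilling" a shared interval here just means copying it to a
+ * non-shared SSA value with a single mov if it hasn't been spilled already
+ * (see spill_interval()); nothing is written to memory.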
+ */ +static void +free_space(struct ra_ctx *ctx, physreg_t start, unsigned size) +{ + struct ra_interval *interval = ra_ctx_search_right(ctx, start); + while (interval && interval->physreg_start < start + size) { + struct ra_interval *next = ra_interval_next_or_null(interval); + spill_interval(ctx, interval); + interval = next; + } +} + +static physreg_t +get_reg(struct ra_ctx *ctx, struct ir3_register *reg, bool src) +{ + if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) { + physreg_t preferred_reg = + reg->merge_set->preferred_reg + reg->merge_set_offset; + if (preferred_reg < reg_file_size(reg) && + preferred_reg % reg_elem_size(reg) == 0 && + get_reg_specified(ctx, reg, preferred_reg)) + return preferred_reg; + } + + /* If this register is a subset of a merge set which we have not picked a + * register for, first try to allocate enough space for the entire merge + * set. + */ + unsigned size = reg_size(reg); + if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 && + size < reg->merge_set->size) { + physreg_t best_reg = find_best_gap(ctx, reg, reg->merge_set->size, + reg->merge_set->alignment); + if (best_reg != (physreg_t)~0u) { + best_reg += reg->merge_set_offset; + return best_reg; + } + } + + /* For ALU and SFU instructions, if the src reg is avail to pick, use it. + * Because this doesn't introduce unnecessary dependencies, and it + * potentially avoids needing (ss) syncs for write after read hazards for + * SFU instructions: + */ + if (!src && (is_sfu(reg->instr) || is_alu(reg->instr))) { + for (unsigned i = 0; i < reg->instr->srcs_count; i++) { + struct ir3_register *src = reg->instr->srcs[i]; + if (!ra_reg_is_src(src)) + continue; + if ((src->flags & IR3_REG_SHARED) && reg_size(src) >= size) { + struct ra_interval *src_interval = &ctx->intervals[src->def->name]; + physreg_t src_physreg = ra_interval_get_physreg(src_interval); + if (src_physreg % reg_elem_size(reg) == 0 && + src_physreg + size <= reg_file_size(reg) && + get_reg_specified(ctx, reg, src_physreg)) + return src_physreg; + } + } + } + + return find_best_gap(ctx, reg, size, reg_elem_size(reg)); +} + +/* The reload process is split in two, first we allocate a register to reload to + * for all sources that need a reload and then we actually execute the reload. + * This is to allow us to demote shared ALU instructions to non-shared whenever + * we would otherwise need to spill to reload, without leaving dangling unused + * reload mov's from previously processed sources. So, for example, we could + * need to reload both sources of an add, but after reloading the first source + * we realize that we would need to spill to reload the second source and we + * should demote the add instead, which means cancelling the first reload. 
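+ *
+ * Concretely, reload_src() below only picks a destination register and sets
+ * needs_reload on the interval; the reload mov itself is emitted later by
+ * reload_src_finalize(), called from handle_src_late(), once we know the
+ * instruction won't be demoted. try_demote_instruction() clears needs_reload
+ * on the sources it demotes, which cancels those pending reloads.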
+ */ +static void +reload_src(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + struct ir3_register *reg = src->def; + struct ra_interval *interval = &ctx->intervals[reg->name]; + unsigned size = reg_size(reg); + + physreg_t best_reg = get_reg(ctx, reg, true); + + if (best_reg == (physreg_t)~0u) { + if (try_demote_instruction(ctx, instr)) + return; + + best_reg = find_best_spill_reg(ctx, reg, size, reg_elem_size(reg)); + assert(best_reg != (physreg_t)~0u); + + free_space(ctx, best_reg, size); + } + + d("reload src %u physreg %u", reg->name, best_reg); + interval->physreg_start = best_reg; + interval->physreg_end = best_reg + size; + interval->needs_reload = true; + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + interval->src = true; +} + +static void +reload_interval(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_block *block, struct ra_interval *interval) +{ + struct ir3_register *def = interval->interval.reg; + struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); + mov->flags |= IR3_INSTR_SHARED_SPILL; + unsigned flags = IR3_REG_SHARED | (def->flags & IR3_REG_HALF); + ir3_dst_create(mov, ra_physreg_to_num(interval->physreg_start, flags), + flags)->wrmask = def->wrmask; + mov->repeat = reg_elems(def) - 1; + struct ir3_register *mov_src = + ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | (def->flags & IR3_REG_HALF) | + (mov->repeat ? IR3_REG_R : 0)); + assert(interval->spill_def); + mov_src->def = interval->spill_def; + mov_src->wrmask = def->wrmask; + mov->cat1.src_type = mov->cat1.dst_type = + (def->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + + if (instr) + ir3_instr_move_before(mov, instr); +} + +static void +reload_src_finalize(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + struct ir3_register *reg = src->def; + struct ra_interval *interval = &ctx->intervals[reg->name]; + + if (!interval->needs_reload) + return; + + reload_interval(ctx, instr, instr->block, interval); + + interval->needs_reload = false; +} + +static bool +can_demote_src(struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_SCAN_MACRO: + case OPC_META_COLLECT: + return false; + case OPC_MOV: + /* non-shared -> shared floating-point conversions don't work */ + return (!(instr->dsts[0]->flags & IR3_REG_SHARED) || + (full_type(instr->cat1.src_type) != TYPE_F32 && + full_type(instr->cat1.dst_type) != TYPE_F32)); + default: + return (!is_alu(instr) && !is_sfu(instr)) || + !(instr->dsts[0]->flags & IR3_REG_SHARED); + } +} + +/* Ensure that this source is never spilled while reloading other sources. + */ +static void +mark_src(struct ra_ctx *ctx, struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[src->def->name]; + + if (interval->interval.inserted) { + while (interval->interval.parent) + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); + + interval->src = true; + } +} + +static void +ensure_src_live(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[src->def->name]; + + if (!interval->interval.inserted) { + /* In some cases we cannot demote shared reg sources to non-shared regs, + * then we have to reload it. 
+ */ + assert(interval->spill_def); + if (!can_demote_src(instr)) { + reload_src(ctx, instr, src); + } else { + if (instr->opc == OPC_META_PARALLEL_COPY) { + /* Stash away the original def to use later in case we actually have + * to insert a reload. + */ + _mesa_hash_table_insert(ctx->pcopy_src_map, src, src->def); + } + src->def = interval->spill_def; + src->flags &= ~IR3_REG_SHARED; + } + } +} + +static void +assign_src(struct ra_ctx *ctx, struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[src->def->name]; + assert(interval->interval.inserted); + src->num = ra_physreg_to_num(ra_interval_get_physreg(interval), src->flags); + + if ((src->flags & IR3_REG_FIRST_KILL) && + !interval->interval.parent && + rb_tree_is_empty(&interval->interval.children)) + ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); + + while (interval->interval.parent) + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); + + interval->src = false; +} + +static void +handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *dst) +{ + if (!(dst->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[dst->name]; + ra_interval_init(interval, dst); + interval->spill_def = NULL; + + if (dst->tied) { + struct ir3_register *tied_def = dst->tied->def; + struct ra_interval *tied_interval = &ctx->intervals[tied_def->name]; + if ((dst->tied->flags & IR3_REG_KILL) && + !tied_interval->interval.parent && + rb_tree_is_empty(&tied_interval->interval.children)) { + dst->num = dst->tied->num; + interval->physreg_start = tied_interval->physreg_start; + interval->physreg_end = tied_interval->physreg_end; + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + return; + } + } + + physreg_t physreg = get_reg(ctx, dst, false); + if (physreg == (physreg_t) ~0u) { + if (try_demote_instruction(ctx, instr)) + return; + + unsigned size = reg_size(dst); + physreg = find_best_spill_reg(ctx, dst, size, reg_elem_size(dst)); + assert(physreg != (physreg_t)~0u); + free_space(ctx, physreg, size); + } + + interval->physreg_start = physreg; + interval->physreg_end = physreg + reg_size(dst); + dst->num = ra_physreg_to_num(physreg, dst->flags); + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + d("insert dst %u physreg %u", dst->name, physreg); + + if (dst->tied) { + struct ir3_instruction *mov = ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, 1, 1); + unsigned flags = IR3_REG_SHARED | (dst->flags & IR3_REG_HALF); + ir3_dst_create(mov, dst->num, flags)->wrmask = dst->wrmask; + ir3_src_create(mov, dst->tied->num, flags)->wrmask = dst->wrmask; + mov->cat1.src_type = mov->cat1.dst_type = + (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;; + ir3_instr_move_before(mov, instr); + dst->tied->num = dst->num; + } +} + +static void +handle_src_late(struct ra_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *interval = &ctx->intervals[src->def->name]; + reload_src_finalize(ctx, instr, src); + + /* Remove killed sources that have to be killed late due to being merged with + * other defs. 
+ */ + if (!(src->flags & IR3_REG_KILL)) + return; + + if (interval->interval.inserted) + ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); +} + +static void +handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + ra_foreach_src (src, instr) + mark_src(ctx, src); + + ra_foreach_src (src, instr) + ensure_src_live(ctx, instr, src); + + ra_foreach_src_rev (src, instr) + assign_src(ctx, src); + + ra_foreach_dst (dst, instr) + handle_dst(ctx, instr, dst); + + ra_foreach_src (src, instr) + handle_src_late(ctx, instr, src); +} + +static void +handle_split(struct ra_ctx *ctx, struct ir3_instruction *split) +{ + struct ir3_register *src = split->srcs[0]; + struct ir3_register *dst = split->dsts[0]; + + if (!(dst->flags & IR3_REG_SHARED)) + return; + + if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) { + handle_normal_instr(ctx, split); + return; + } + + struct ra_interval *src_interval = &ctx->intervals[src->def->name]; + struct ra_interval *dst_interval = &ctx->intervals[dst->name]; + + ra_interval_init(dst_interval, dst); + dst_interval->spill_def = NULL; + + if (src_interval->spill_def) { + struct ir3_instruction *spill_split = + ir3_instr_create(split->block, OPC_META_SPLIT, 1, 1); + struct ir3_register *dst = __ssa_dst(spill_split); + ir3_src_create(spill_split, INVALID_REG, IR3_REG_SSA)->def = + src_interval->spill_def; + spill_split->split.off = split->split.off; + ir3_instr_move_after(spill_split, split); + dst_interval->spill_def = dst; + return; + } + + dst_interval->physreg_start = + src_interval->physreg_start + dst->merge_set_offset - + src->def->merge_set_offset; + dst_interval->physreg_end = dst_interval->physreg_start + reg_size(dst); + ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval); + src->num = ra_interval_get_num(src_interval); + dst->num = ra_interval_get_num(dst_interval); + d("insert dst %u physreg %u", dst->name, dst_interval->physreg_start); + + if (src->flags & IR3_REG_KILL) + ir3_reg_interval_remove(&ctx->reg_ctx, &src_interval->interval); +} + +static void +handle_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) +{ + struct ir3_register *dst = phi->dsts[0]; + + if (!(dst->flags & IR3_REG_SHARED)) + return; + + struct ra_interval *dst_interval = &ctx->intervals[dst->name]; + ra_interval_init(dst_interval, dst); + + /* In some rare cases, it's possible to have a phi node with a physical-only + * source. Here's a contrived example: + * + * loop { + * if non-uniform { + * if uniform { + * x_1 = ...; + * continue; + * } + * x_2 = ...; + * } else { + * break; + * } + * // continue block + * x_3 = phi(x_1, x_2) + * } + * + * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all + * threads that stay in the loop take the same branch to the continue block, + * however execution may fall through from the assignment to x_2 to the + * break statement because the outer if is non-uniform, and then it will fall + * through again to the continue block, so if x_3 is to be in a shared reg + * then the phi needs an extra source pointing to the break statement, which + * itself needs a phi node: + * + * loop { + * if non-uniform { + * if uniform { + * x_1 = ...; + * continue; + * } + * x_2 = ...; + * } else { + * x_4 = phi(undef, x_2) + * break; + * } + * // continue block + * x_3 = phi(x_1, x_2, x_4) + * } + */ + + /* phi nodes are special because we cannot spill them normally, instead we + * have to spill the parallel copies that their sources point to and make the + * entire phi not shared anymore. 
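+ *
+ * Clearing IR3_REG_SHARED on the phi destination and its sources also
+ * un-shares the parallel-copy destinations feeding the phi, so lower_pcopy()
+ * will handle those copies as non-shared copies (inserting a spill mov first
+ * if a copy source is still shared).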
+ */ + + physreg_t physreg = get_reg(ctx, dst, false); + if (physreg == (physreg_t) ~0u) { + d("spilling phi destination"); + dst->flags &= ~IR3_REG_SHARED; + dst_interval->spill_def = dst; + phi->flags |= IR3_INSTR_SHARED_SPILL; + + foreach_src (src, phi) { + src->flags &= ~IR3_REG_SHARED; + if (src->def) + src->def->flags &= ~IR3_REG_SHARED; + } + + return; + } + + dst->num = ra_physreg_to_num(physreg, dst->flags); + dst_interval->spill_def = NULL; + dst_interval->physreg_start = physreg; + dst_interval->physreg_end = physreg + reg_size(dst); + ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval); + + ra_foreach_src_n (src, i, phi) { + /* We assume that any phis with non-logical sources aren't promoted. */ + assert(i < phi->block->predecessors_count); + src->num = dst->num; + src->def->num = dst->num; + } +} + +static void +handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *pcopy) +{ + /* For parallel copies, we only handle the source. The destination is handled + * later when processing phi nodes. + */ + + ra_foreach_src (src, pcopy) + mark_src(ctx, src); + + ra_foreach_src (src, pcopy) + ensure_src_live(ctx, pcopy, src); + + ra_foreach_src_rev (src, pcopy) + assign_src(ctx, src); + + ra_foreach_src (src, pcopy) + handle_src_late(ctx, pcopy, src); +} + +static void +handle_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + instr->flags &= ~IR3_INSTR_SHARED_SPILL; + + switch (instr->opc) { + case OPC_META_SPLIT: + handle_split(ctx, instr); + break; + case OPC_META_PHI: + handle_phi(ctx, instr); + break; + case OPC_META_PARALLEL_COPY: + handle_pcopy(ctx, instr); + break; + default: + handle_normal_instr(ctx, instr); + } +} + +/* In case we define a value outside a loop, use it inside the loop, then spill + * it afterwards inside the same loop, we could lose the value so we have to + * reload it. We have to reload it after any parallel copy instruction, when the + * live shared registers equal the live-in of the backedge. lower_pcopy() will + * then move any non-shared parallel copies down past the reload. 
+ */ +static void +reload_live_outs(struct ra_ctx *ctx, struct ir3_block *block) +{ + struct ra_block_state *state = &ctx->blocks[block->index]; + unsigned name; + BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + + struct ra_interval *interval = &ctx->intervals[name]; + if (!interval->interval.inserted) { + d("reloading %d at end of backedge", reg->name); + reload_interval(ctx, NULL, block, interval); + } + } +} + +static void +record_pred_live_out(struct ra_ctx *ctx, + struct ra_interval *interval, + struct ir3_block *pred) +{ + struct ra_block_state *state = &ctx->blocks[pred->index]; + + struct ir3_register *def = interval->interval.reg; + BITSET_SET(state->live_out, def->name); + + rb_tree_foreach (struct ra_interval, child, + &interval->interval.children, interval.node) { + record_pred_live_out(ctx, child, pred); + } +} + +static void +record_pred_live_outs(struct ra_ctx *ctx, struct ir3_block *block) +{ + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_block_state *state = &ctx->blocks[pred->index]; + if (state->visited) + continue; + + state->live_out = rzalloc_array(NULL, BITSET_WORD, + BITSET_WORDS(ctx->live->definitions_count)); + + + rb_tree_foreach (struct ra_interval, interval, + &ctx->reg_ctx.intervals, interval.node) { + record_pred_live_out(ctx, interval, pred); + } + } +} + +static void +handle_block(struct ra_ctx *ctx, struct ir3_block *block) +{ + ra_ctx_reset_block(ctx); + + unsigned name; + BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *def = ctx->live->definitions[name]; + struct ra_interval *interval = &ctx->intervals[name]; + + /* Non-shared definitions may still be definitions we spilled by demoting + * them, so we still need to initialize the interval. But we shouldn't + * make these intervals live. + */ + ra_interval_init(interval, def); + + if ((def->flags & IR3_REG_SHARED) && !interval->spill_def) { + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + } + } + + if (RA_DEBUG) { + d("after live-in block %u:\n", block->index); + ra_ctx_dump(ctx); + } + + if (block->predecessors_count > 1) + record_pred_live_outs(ctx, block); + + foreach_instr (instr, &block->instr_list) { + di(instr, "processing"); + + handle_instr(ctx, instr); + + if (RA_DEBUG) + ra_ctx_dump(ctx); + } + + if (block->successors[0]) { + struct ra_block_state *state = &ctx->blocks[block->successors[0]->index]; + + if (state->visited) { + assert(!block->successors[1]); + + reload_live_outs(ctx, block); + } + } + + ctx->blocks[block->index].visited = true; +} + +static void +lower_pcopy(struct ir3 *ir, struct ra_ctx *ctx) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + /* At this point due to spilling there may be parallel copies from + * shared to non-shared registers and vice versa. Lowering these after + * RA may produce cycles involving shared and non-shared registers, + * which would need to be resolved by swapping a shared and non-shared + * register which is something we can't handle. However by lowering + * these to moves now, we can make sure that cycles only involve + * non-shared registers. 
To avoid illegally moving a shared register + * read or write across the parallel copy, which may have other + * conflicting reads/writes if there's a cycle, we need to move copies + * from non-shared to shared below the shared copies, and we need to + * move copies from shared to non-shared above them. So, we have the + * following order: + * + * 1. shared->non-shared copies (spills) + * 2. shared->shared copies (one parallel copy as there may be cycles) + * 3. non-shared->shared copies (reloads) + * 4. non-shared->non-shared copies + * + * We split out the non-shared->non-shared copies as a separate step. + */ + if (instr->opc == OPC_META_PARALLEL_COPY) { + for (unsigned i = 0; i < instr->srcs_count; i++) { + if ((instr->srcs[i]->flags & IR3_REG_SHARED) && + !(instr->dsts[i]->flags & IR3_REG_SHARED)) { + /* shared->non-shared. Create a spill move and rewrite the + * source to be the destination of the move (so that the + * original shared->non-shared copy becomes a + * non-shared->non-shared copy). + */ + struct ir3_instruction *mov = + ir3_instr_create(block, OPC_MOV, 1, 1); + mov->flags |= IR3_INSTR_SHARED_SPILL; + struct ir3_register *dst = + ir3_dst_create(mov, INVALID_REG, instr->dsts[i]->flags); + dst->wrmask = instr->dsts[i]->wrmask; + dst->instr = mov; + mov->repeat = reg_elems(mov->dsts[0]) - 1; + struct ir3_register *src = + ir3_src_create(mov, instr->srcs[i]->num, + instr->srcs[i]->flags | + (mov->repeat ? IR3_REG_R : 0)); + src->wrmask = instr->srcs[i]->wrmask; + mov->cat1.dst_type = mov->cat1.src_type = + (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + instr->srcs[i]->flags = mov->dsts[0]->flags; + instr->srcs[i]->def = mov->dsts[0]; + ir3_instr_move_before(mov, instr); + } + } + + for (unsigned i = 0; i < instr->dsts_count;) { + if ((instr->dsts[i]->flags & IR3_REG_SHARED) && + (instr->srcs[i]->flags & IR3_REG_SSA) && + !(instr->srcs[i]->flags & IR3_REG_SHARED)) { + /* non-shared->shared. Create a reload move. + */ + struct ir3_instruction *mov = + ir3_instr_create(block, OPC_MOV, 1, 1); + mov->flags |= IR3_INSTR_SHARED_SPILL; + struct ir3_register *dst = + ir3_dst_create(mov, instr->dsts[i]->num, + instr->dsts[i]->flags); + dst->instr = mov; + dst->wrmask = instr->dsts[i]->wrmask; + mov->repeat = reg_elems(mov->dsts[0]) - 1; + struct ir3_register *src = + ir3_src_create(mov, INVALID_REG, instr->srcs[i]->flags | + (mov->repeat ? IR3_REG_R : 0)); + src->def = instr->srcs[i]->def; + src->wrmask = instr->srcs[i]->wrmask; + mov->cat1.dst_type = mov->cat1.src_type = + (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + + /* When we spill a parallel copy source, we lose the + * information of where it originally points to since we make + * it point to the spill def. If we later decide not to also + * spill the phi associated with it, we have to restore it + * here using the stashed original source so that RA + * validation can check that we did the correct thing. + * + * Because SSA-ness goes away after validation, this is really + * just about validation. 
+ */ + struct ir3_block *succ = block->successors[0]; + unsigned pred_idx = ir3_block_get_pred_index(succ, block); + foreach_instr (phi, &succ->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + + if (phi->srcs[pred_idx]->def == instr->dsts[i]) { + struct ir3_register *def = + _mesa_hash_table_search(ctx->pcopy_src_map, + instr->srcs[i])->data; + phi->srcs[pred_idx]->def = def; + break; + } + } + + instr->srcs[i] = instr->srcs[instr->srcs_count - 1]; + instr->dsts[i] = instr->dsts[instr->dsts_count - 1]; + instr->srcs_count--; + instr->dsts_count--; + ir3_instr_move_after(mov, instr); + continue; + } + + i++; + } + + /* Move any non-shared copies to a separate parallel copy + * instruction right at the end of the block, after any reloads. At + * this point all copies should be {shared,immediate}->shared or + * {non-shared,immediate}->non-shared. + */ + unsigned non_shared_copies = 0; + for (unsigned i = 0; i < instr->dsts_count; i++) { + if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) + non_shared_copies++; + } + + if (non_shared_copies != 0) { + struct ir3_instruction *pcopy = + ir3_instr_create(block, OPC_META_PARALLEL_COPY, + non_shared_copies, non_shared_copies); + + unsigned j = 0; + for (unsigned i = 0; i < instr->dsts_count;) { + if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) { + pcopy->dsts[j] = instr->dsts[i]; + pcopy->srcs[j] = instr->srcs[i]; + pcopy->dsts[j]->instr = pcopy; + instr->srcs[i] = instr->srcs[instr->srcs_count - 1]; + instr->dsts[i] = instr->dsts[instr->dsts_count - 1]; + instr->srcs_count--; + instr->dsts_count--; + j++; + continue; + } + i++; + } + + pcopy->srcs_count = pcopy->dsts_count = j; + if (instr->dsts_count == 0) + list_del(&instr->node); + } + } + } + } +} + +static void +finalize(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + for (unsigned i = 0; i < instr->dsts_count; i++) { + if (instr->dsts[i]->flags & IR3_REG_SHARED) { + instr->dsts[i]->flags &= ~IR3_REG_SSA; + } + } + + for (unsigned i = 0; i < instr->srcs_count; i++) { + if (instr->srcs[i]->flags & IR3_REG_SHARED) { + instr->srcs[i]->flags &= ~IR3_REG_SSA; + instr->srcs[i]->def = NULL; + } + } + } + } +} + +void +ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness *live) +{ + struct ra_ctx ctx; + + ra_ctx_init(&ctx); + ctx.intervals = rzalloc_array(NULL, struct ra_interval, + live->definitions_count); + ctx.blocks = rzalloc_array(NULL, struct ra_block_state, + live->block_count); + ctx.start = 0; + ctx.live = live; + ctx.pcopy_src_map = _mesa_pointer_hash_table_create(NULL); + + foreach_block (block, &v->ir->block_list) { + handle_block(&ctx, block); + } + + lower_pcopy(v->ir, &ctx); + + for (unsigned i = 0; i < live->block_count; i++) { + if (ctx.blocks[i].live_out) + ralloc_free(ctx.blocks[i].live_out); + } + + ralloc_free(ctx.intervals); + ralloc_free(ctx.pcopy_src_map); + ralloc_free(ctx.blocks); + + ir3_ra_validate(v, RA_FULL_SIZE, RA_HALF_SIZE, live->block_count, true); + finalize(v->ir); +} + diff --git a/src/freedreno/ir3/ir3_spill.c b/src/freedreno/ir3/ir3_spill.c index 475c132f6fa..0ea80bca337 100644 --- a/src/freedreno/ir3/ir3_spill.c +++ b/src/freedreno/ir3/ir3_spill.c @@ -1193,20 +1193,23 @@ is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n) static void handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy) { - foreach_dst (dst, pcopy) { + ra_foreach_dst (dst, pcopy) { struct ra_spill_interval *dst_interval = ctx->intervals[dst->name]; ra_spill_interval_init(dst_interval, dst); 
} foreach_src_n (src, i, pcopy) { - d("processing src %u", i); struct ir3_register *dst = pcopy->dsts[i]; + if (!(dst->flags & IR3_REG_SSA)) + continue; + + d("processing src %u", i); /* Skip the intermediate copy for cases where the source is merged with * the destination. Crucially this means that we also don't reload/spill * it if it's been spilled, because it shares the same spill slot. */ - if (src->def && src->def->merge_set && + if ((src->flags & IR3_REG_SSA) && src->def->merge_set && src->def->merge_set == dst->merge_set && src->def->merge_set_offset == dst->merge_set_offset) { struct ra_spill_interval *src_interval = ctx->intervals[src->def->name]; @@ -1221,7 +1224,7 @@ handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy) dst_interval->cant_spill = false; dst_interval->dst = src_interval->dst; } - } else if (src->def) { + } else if (src->flags & IR3_REG_SSA) { struct ra_spill_interval *temp_interval = create_temp_interval(ctx, dst); struct ir3_register *temp = temp_interval->interval.reg; @@ -1251,15 +1254,17 @@ handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy) foreach_src_n (src, i, pcopy) { struct ir3_register *dst = pcopy->dsts[i]; + if (!(dst->flags & IR3_REG_SSA)) + continue; - if (src->def && src->def->merge_set && + if ((src->flags & IR3_REG_SSA) && src->def->merge_set && src->def->merge_set == dst->merge_set && src->def->merge_set_offset == dst->merge_set_offset) continue; struct ra_spill_interval *dst_interval = ctx->intervals[dst->name]; - if (!src->def) { + if (!(src->flags & IR3_REG_SSA)) { dst_interval->cant_spill = true; ra_spill_ctx_insert(ctx, dst_interval); limit(ctx, pcopy); @@ -1292,6 +1297,9 @@ handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy) static void handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) { + if (!(instr->dsts[0]->flags & IR3_REG_SSA)) + return; + init_dst(ctx, instr->dsts[0]); insert_dst(ctx, instr->dsts[0]); finish_dst(ctx, instr->dsts[0]); @@ -1300,6 +1308,9 @@ handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) static void remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) { + if (!(instr->dsts[0]->flags & IR3_REG_SSA)) + return; + if (instr->opc == OPC_META_TEX_PREFETCH) { ra_foreach_src (src, instr) remove_src(ctx, instr, src); @@ -1623,6 +1634,9 @@ static void rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi, struct ir3_block *block) { + if (!(phi->dsts[0]->flags & IR3_REG_SSA)) + return; + if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) { phi->flags |= IR3_INSTR_UNUSED; return; @@ -1977,8 +1991,25 @@ cleanup_dead(struct ir3 *ir) { foreach_block (block, &ir->block_list) { foreach_instr_safe (instr, &block->instr_list) { - if (instr->flags & IR3_INSTR_UNUSED) - list_delinit(&instr->node); + if (instr->flags & IR3_INSTR_UNUSED) { + if (instr->opc == OPC_META_PARALLEL_COPY) { + /* There may be non-SSA shared copies, we need to preserve these. 
+ */ + for (unsigned i = 0; i < instr->dsts_count;) { + if (instr->dsts[i]->flags & IR3_REG_SSA) { + instr->dsts[i] = instr->dsts[--instr->dsts_count]; + instr->srcs[i] = instr->srcs[--instr->srcs_count]; + } else { + i++; + } + } + + if (instr->dsts_count == 0) + list_delinit(&instr->node); + } else { + list_delinit(&instr->node); + } + } } } } diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c index 1c74d711e0c..2b45559cab8 100644 --- a/src/freedreno/ir3/ir3_validate.c +++ b/src/freedreno/ir3/ir3_validate.c @@ -84,6 +84,9 @@ validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr, validate_assert(ctx, src->wrmask == reg->wrmask); validate_assert(ctx, reg_class_flags(src) == reg_class_flags(reg)); + if (src->flags & IR3_REG_CONST) + validate_assert(ctx, !(src->flags & IR3_REG_SHARED)); + if (reg->tied) { validate_assert(ctx, reg->tied->tied == reg); bool found = false; diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index 954b0b88fdb..e7ba1d8fe5f 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -112,6 +112,7 @@ libfreedreno_ir3_files = files( 'ir3_sched.c', 'ir3_shader.c', 'ir3_shader.h', + 'ir3_shared_ra.c', 'ir3_spill.c', 'ir3_validate.c', )
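
Note (illustration only, not part of the patch): the fixed copy ordering that
lower_pcopy() establishes can be hard to picture from the comment alone. The
self-contained C sketch below mimics that ordering on a toy parallel copy;
struct toy_copy, the register names (s*/n*/t*) and the temp-naming scheme are
made up for illustration and do not correspond to ir3 data structures.

    #include <stdbool.h>
    #include <stdio.h>

    /* One dst <- src entry of a parallel copy. */
    struct toy_copy {
       char dst[8], src[8];
       bool dst_shared, src_shared;
    };

    static void
    lower_toy_pcopy(struct toy_copy *copies, unsigned count)
    {
       unsigned next_temp = 0;

       /* 1. shared->non-shared entries: spill the shared source to a fresh
        *    non-shared temp and rewrite the entry so it becomes a plain
        *    non-shared->non-shared copy (group 4 below).
        */
       for (unsigned i = 0; i < count; i++) {
          if (copies[i].src_shared && !copies[i].dst_shared) {
             char temp[8];
             snprintf(temp, sizeof(temp), "t%u", next_temp++);
             printf("mov   %s, %s      ; spill\n", temp, copies[i].src);
             snprintf(copies[i].src, sizeof(copies[i].src), "%s", temp);
             copies[i].src_shared = false;
          }
       }

       /* 2. shared->shared entries stay together in one parallel copy, since
        *    resolving cycles among them may require shared<->shared swaps.
        */
       printf("pcopy");
       for (unsigned i = 0; i < count; i++)
          if (copies[i].src_shared && copies[i].dst_shared)
             printf(" %s<-%s", copies[i].dst, copies[i].src);
       printf("      ; shared->shared\n");

       /* 3. non-shared->shared entries become reload moves placed after the
        *    shared parallel copy.
        */
       for (unsigned i = 0; i < count; i++)
          if (!copies[i].src_shared && copies[i].dst_shared)
             printf("mov   %s, %s      ; reload\n", copies[i].dst, copies[i].src);

       /* 4. everything that is now non-shared->non-shared goes into a second
        *    parallel copy at the end of the block.
        */
       printf("pcopy");
       for (unsigned i = 0; i < count; i++)
          if (!copies[i].src_shared && !copies[i].dst_shared)
             printf(" %s<-%s", copies[i].dst, copies[i].src);
       printf("      ; non-shared->non-shared\n");
    }

    int
    main(void)
    {
       struct toy_copy copies[] = {
          { "s0", "s1", true,  true  },   /* shared->shared         */
          { "s1", "n1", true,  false },   /* reload                 */
          { "n2", "s0", false, true  },   /* spill                  */
          { "n3", "n2", false, false },   /* non-shared->non-shared */
       };
       lower_toy_pcopy(copies, 4);
       return 0;
    }

Because the original parallel copy reads all of its sources before writing any
destination, capturing the shared source in a temp before the shared copy and
reloading after it preserves those semantics while keeping any copy cycles
confined to a single register class.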
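
Note (illustration only, not part of the patch): both lower_pcopy() and
cleanup_dead() drop entries from a parallel copy by overwriting the removed
slot with the last entry and shrinking the count, which is why those loops
only advance the index when nothing was removed. A minimal standalone C
sketch of that idiom (the array contents are made up):

    #include <stdio.h>

    /* Remove all negative values, swap-with-last style; order is not kept. */
    static void
    remove_negative(int *vals, unsigned *count)
    {
       for (unsigned i = 0; i < *count;) {
          if (vals[i] < 0) {
             vals[i] = vals[--(*count)];  /* move the last entry into the hole */
             continue;                    /* re-check the entry we just moved */
          }
          i++;
       }
    }

    int
    main(void)
    {
       int vals[] = { 1, -2, 3, -4, 5 };
       unsigned count = 5;
       remove_negative(vals, &count);
       for (unsigned i = 0; i < count; i++)
          printf("%d ", vals[i]);      /* prints: 1 5 3 */
       printf("\n");
       return 0;
    }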