diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h
index 6cbd406032e..4d11fb1eaa7 100644
--- a/src/gallium/drivers/lima/ir/gp/gpir.h
+++ b/src/gallium/drivers/lima/ir/gp/gpir.h
@@ -211,11 +211,6 @@ typedef struct {
 typedef struct {
    int index;
    struct list_head list;
-
-   struct list_head defs_list;
-   struct list_head uses_list;
-
-   int start, end;
 } gpir_reg;
 
 typedef struct {
@@ -236,7 +231,6 @@ typedef struct gpir_store_node {
 
    gpir_node *child;
    gpir_reg *reg;
-   struct list_head reg_link;
 } gpir_store_node;
 
 enum gpir_instr_slot {
@@ -350,6 +344,26 @@ typedef struct gpir_block {
    struct list_head predecessors;
    struct list_head predecessors_node;
 
+   /* for regalloc */
+
+   /* The set of live registers, i.e. registers whose value may be used
+    * eventually, at the beginning of the block.
+    */
+   BITSET_WORD *live_in;
+
+   /* Set of live registers at the end of the block. */
+   BITSET_WORD *live_out;
+
+   /* Set of registers that may have a value defined at the end of the
+    * block.
+    */
+   BITSET_WORD *def_out;
+
+   /* After register allocation, the set of live physical registers at the end
+    * of the block. Needed for scheduling.
+    */
+   uint64_t live_out_phys;
+
    /* For codegen, the offset in the final program. */
    unsigned instr_offset;
 
@@ -380,8 +394,17 @@ typedef struct gpir_compiler {
    struct list_head block_list;
    int cur_index;
 
-   /* array for searching ssa node */
-   gpir_node **var_nodes;
+   /* Find the gpir node for a given NIR SSA def. */
+   gpir_node **node_for_ssa;
+
+   /* Find the gpir node for a given NIR register. */
+   gpir_node **node_for_reg;
+
+   /* Find the gpir register for a given NIR SSA def. */
+   gpir_reg **reg_for_ssa;
+
+   /* Find the gpir register for a given NIR register. */
+   gpir_reg **reg_for_reg;
 
    /* gpir block for NIR block. */
    gpir_block **blocks;
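The three bitsets added to gpir_block above are filled in by a standard backward dataflow fixpoint in regalloc.c further down. As a minimal standalone sketch of that fixpoint, using a hypothetical two-successor toy block and one 32-bit word in place of the BITSET_WORD arrays (none of these names are the driver's own):

   #include <stdbool.h>
   #include <stdint.h>

   struct toy_block {
      struct toy_block *successors[2];
      uint32_t live_in, live_out;
      uint32_t gen, kill;   /* regs read before any write / regs written */
   };

   /* One relaxation step; loop this over all blocks (reverse order
    * converges fastest) until nothing changes, as calc_liveness() does. */
   static bool toy_liveness_step(struct toy_block *b)
   {
      b->live_out = 0;
      for (int i = 0; i < 2; i++)
         if (b->successors[i])
            b->live_out |= b->successors[i]->live_in;

      uint32_t live_in = b->gen | (b->live_out & ~b->kill);
      bool changed = live_in != b->live_in;
      b->live_in = live_in;
      return changed;
   }

def_out is the forward analogue: a register ends up in def_out if some path reaching the end of the block may have written it.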
diff --git a/src/gallium/drivers/lima/ir/gp/lower.c b/src/gallium/drivers/lima/ir/gp/lower.c
index eaeeeb8f1eb..296f141216e 100644
--- a/src/gallium/drivers/lima/ir/gp/lower.c
+++ b/src/gallium/drivers/lima/ir/gp/lower.c
@@ -109,10 +109,7 @@ static bool gpir_lower_load(gpir_compiler *comp)
             gpir_load_node *nload = gpir_node_to_load(new);
             nload->index = load->index;
             nload->component = load->component;
-            if (load->reg) {
-               nload->reg = load->reg;
-               list_addtail(&nload->reg_link, &load->reg->uses_list);
-            }
+            nload->reg = load->reg;
 
             gpir_node_replace_pred(dep, new);
             gpir_node_replace_child(succ, node, new);
diff --git a/src/gallium/drivers/lima/ir/gp/nir.c b/src/gallium/drivers/lima/ir/gp/nir.c
index e2dc939f1a0..8d9a5beb98a 100644
--- a/src/gallium/drivers/lima/ir/gp/nir.c
+++ b/src/gallium/drivers/lima/ir/gp/nir.c
@@ -30,37 +30,80 @@
 #include "gpir.h"
 #include "lima_context.h"
 
+gpir_reg *gpir_create_reg(gpir_compiler *comp)
+{
+   gpir_reg *reg = ralloc(comp, gpir_reg);
+   reg->index = comp->cur_reg++;
+   list_addtail(&reg->list, &comp->reg_list);
+   return reg;
+}
 
-static inline void *gpir_node_create_ssa(gpir_block *block, gpir_op op, nir_ssa_def *ssa)
+static gpir_reg *reg_for_nir_reg(gpir_compiler *comp, nir_register *nir_reg)
+{
+   unsigned index = nir_reg->index;
+   gpir_reg *reg = comp->reg_for_reg[index];
+   if (reg)
+      return reg;
+   reg = gpir_create_reg(comp);
+   comp->reg_for_reg[index] = reg;
+   return reg;
+}
+
+static inline gpir_node *gpir_node_create_ssa(gpir_block *block, gpir_op op, nir_ssa_def *ssa)
 {
    int index = ssa->index;
    gpir_node *node = gpir_node_create(block, op);
 
-   block->comp->var_nodes[index] = node;
+   block->comp->node_for_ssa[index] = node;
    snprintf(node->name, sizeof(node->name), "ssa%d", index);
    list_addtail(&node->list, &block->node_list);
+
+   /* If any uses are outside the current block, we'll need to create a store
+    * instruction for them.
+    */
+   bool needs_register = false;
+   nir_foreach_use(use, ssa) {
+      if (use->parent_instr->block != ssa->parent_instr->block) {
+         needs_register = true;
+         break;
+      }
+   }
+
+   if (!needs_register) {
+      nir_foreach_if_use(use, ssa) {
+         if (nir_cf_node_prev(&use->parent_if->cf_node) !=
+             &ssa->parent_instr->block->cf_node) {
+            needs_register = true;
+            break;
+         }
+      }
+   }
+
+   if (needs_register) {
+      gpir_store_node *store = gpir_node_create(block, gpir_op_store_reg);
+      store->child = node;
+      store->reg = gpir_create_reg(block->comp);
+      gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
+      list_addtail(&store->node.list, &block->node_list);
+      block->comp->reg_for_ssa[ssa->index] = store->reg;
+   }
+
    return node;
 }
 
-static inline void *gpir_node_create_reg(gpir_block *block, gpir_op op, nir_reg_dest *reg)
+static inline void *gpir_node_create_reg(gpir_block *block, gpir_op op, nir_reg_dest *nir_reg)
 {
-   int index = reg->reg->index;
+   int index = nir_reg->reg->index;
    gpir_node *node = gpir_node_create(block, op);
+   block->comp->node_for_reg[index] = node;
    gpir_store_node *store = gpir_node_create(block, gpir_op_store_reg);
 
    snprintf(node->name, sizeof(node->name), "reg%d", index);
 
    store->child = node;
+   store->reg = reg_for_nir_reg(block->comp, nir_reg->reg);
    gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
 
-   list_for_each_entry(gpir_reg, reg, &block->comp->reg_list, list) {
-      if (reg->index == index) {
-         store->reg = reg;
-         list_addtail(&store->reg_link, &reg->defs_list);
-         break;
-      }
-   }
-
    list_addtail(&node->list, &block->node_list);
    list_addtail(&store->node.list, &block->node_list);
    return node;
@@ -77,35 +120,34 @@ static void *gpir_node_create_dest(gpir_block *block, gpir_op op, nir_dest *dest
 static gpir_node *gpir_node_find(gpir_block *block, gpir_node *succ, nir_src *src,
                                  int channel)
 {
+   gpir_reg *reg = NULL;
    gpir_node *pred = NULL;
-
    if (src->is_ssa) {
       if (src->ssa->num_components > 1) {
          for (int i = 0; i < GPIR_VECTOR_SSA_NUM; i++) {
            if (block->comp->vector_ssa[i].ssa == src->ssa->index) {
-              pred = block->comp->vector_ssa[i].nodes[channel];
-              break;
+              return block->comp->vector_ssa[i].nodes[channel];
            }
         }
-      } else
-         pred = block->comp->var_nodes[src->ssa->index];
-
-      assert(pred);
-   }
-   else {
-      pred = gpir_node_create(block, gpir_op_load_reg);
-      list_addtail(&pred->list, &succ->list);
-
-      gpir_load_node *load = gpir_node_to_load(pred);
-      list_for_each_entry(gpir_reg, reg, &block->comp->reg_list, list) {
-         if (reg->index == src->reg.reg->index) {
-            load->reg = reg;
-            list_addtail(&load->reg_link, &reg->uses_list);
-            break;
-         }
+      } else {
+         gpir_node *pred = block->comp->node_for_ssa[src->ssa->index];
+         if (pred->block == block)
+            return pred;
+         reg = block->comp->reg_for_ssa[src->ssa->index];
       }
+   } else {
+      pred = block->comp->node_for_reg[src->reg.reg->index];
+      if (pred && pred->block == block && pred != succ)
+         return pred;
+      reg = reg_for_nir_reg(block->comp, src->reg.reg);
    }
 
+   assert(reg);
+   pred = gpir_node_create(block, gpir_op_load_reg);
+   gpir_load_node *load = gpir_node_to_load(pred);
+   load->reg = reg;
+   list_addtail(&pred->list, &succ->list);
+
    return pred;
 }
 
@@ -345,16 +387,6 @@ static bool gpir_emit_function(gpir_compiler *comp, nir_function_impl *impl)
    return true;
 }
 
-gpir_reg *gpir_create_reg(gpir_compiler *comp)
-{
-   gpir_reg *reg = ralloc(comp, gpir_reg);
-   reg->index = comp->cur_reg++;
-   list_addtail(&reg->list, &comp->reg_list);
-   list_inithead(&reg->defs_list);
-   list_inithead(&reg->uses_list);
-   return reg;
-}
-
 static gpir_compiler *gpir_compiler_create(void *prog,
                                            unsigned num_reg, unsigned num_ssa)
 {
    gpir_compiler *comp = rzalloc(prog, gpir_compiler);
@@ -362,13 +394,13 @@ static gpir_compiler *gpir_compiler_create(void *prog, unsigned num_reg, unsigne
    list_inithead(&comp->block_list);
    list_inithead(&comp->reg_list);
 
-   for (int i = 0; i < num_reg; i++)
-      gpir_create_reg(comp);
-
   for (int i = 0; i < GPIR_VECTOR_SSA_NUM; i++)
      comp->vector_ssa[i].ssa = -1;
 
-   comp->var_nodes = rzalloc_array(comp, gpir_node *, num_ssa);
+   comp->node_for_ssa = rzalloc_array(comp, gpir_node *, num_ssa);
+   comp->node_for_reg = rzalloc_array(comp, gpir_node *, num_reg);
+   comp->reg_for_ssa = rzalloc_array(comp, gpir_reg *, num_ssa);
+   comp->reg_for_reg = rzalloc_array(comp, gpir_reg *, num_reg);
 
    comp->prog = prog;
    return comp;
 }
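To make the new lookup tables concrete, here is a hand-written illustration (all indices and names hypothetical, not the output of any tool) of what gpir_node_create_ssa() and gpir_node_find() now do for an SSA def whose use crosses a block boundary:

   /* NIR input:                        resulting gpir:
    *
    *    block b0:                      block b0:
    *       ssa_5 = fadd ssa_3, ssa_4      n5 = add n3, n4
    *       ...                            store_reg r0, n5   (created because
    *    block b1:                                             ssa_5 is used in b1)
    *       ssa_9 = fmul ssa_5, ...     block b1:
    *                                      n7 = load_reg r0   (created on demand
    *                                      n9 = mul n7, ...    by gpir_node_find())
    */

Uses of ssa_5 inside b0 itself still consume n5 directly; only the cross-block use goes through the register. This is what lets the reduce scheduler and register allocator below assume a block never loads a register it stored.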
diff --git a/src/gallium/drivers/lima/ir/gp/node.c b/src/gallium/drivers/lima/ir/gp/node.c
index e62512890b3..78f7bd130ea 100644
--- a/src/gallium/drivers/lima/ir/gp/node.c
+++ b/src/gallium/drivers/lima/ir/gp/node.c
@@ -433,17 +433,6 @@ void gpir_node_delete(gpir_node *node)
       ralloc_free(dep);
    }
 
-   if (node->type == gpir_node_type_store) {
-      gpir_store_node *store = gpir_node_to_store(node);
-      if (store->reg)
-         list_del(&store->reg_link);
-   }
-   else if (node->type == gpir_node_type_load) {
-      gpir_load_node *load = gpir_node_to_load(node);
-      if (load->reg)
-         list_del(&load->reg_link);
-   }
-
    list_del(&node->list);
    ralloc_free(node);
 }
diff --git a/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c b/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c
index a5013a59dbf..d51fc355f2b 100644
--- a/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c
@@ -190,21 +190,47 @@ static void schedule_block(gpir_block *block)
    schedule_ready_list(block, &ready_list);
 }
 
+/* Due to how we translate from NIR, we never read a register written in the
+ * same block (we just pass the node through instead), so we don't have to
+ * worry about read-after-write dependencies. We do have to worry about
+ * write-after-read though, so we add those dependencies now. For example in a
+ * loop like this we need a dependency between the write and the read of i:
+ *
+ * i = ...
+ * while (...) {
+ *    ... = i;
+ *    i = i + 1;
+ * }
+ */
+
+static void add_false_dependencies(gpir_compiler *comp)
+{
+   /* Make sure we allocate this only once, in case there are many values and
+    * many blocks.
+    */
+   gpir_node **last_written = calloc(comp->cur_reg, sizeof(gpir_node *));
+
+   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
+      list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+         if (node->op == gpir_op_load_reg) {
+            gpir_load_node *load = gpir_node_to_load(node);
+            gpir_node *store = last_written[load->reg->index];
+            if (store && store->block == block) {
+               gpir_node_add_dep(store, node, GPIR_DEP_WRITE_AFTER_READ);
+            }
+         } else if (node->op == gpir_op_store_reg) {
+            gpir_store_node *store = gpir_node_to_store(node);
+            last_written[store->reg->index] = node;
+         }
+      }
+   }
+
+   free(last_written);
+}
+
 bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp)
 {
-   /* No need to build physical reg load/store dependency here,
-    * because we just exit SSA form, there should be at most
-    * one load and one store pair for a physical reg within a
-    * block, and the store must be after load with the output
-    * of load as input after some calculation. So we don't need to
-    * insert extra write-after-read or read-after-write dependecy
-    * for load/store nodes to maintain the right sequence before
-    * scheduling.
-    *
-    * Also no need to handle SSA def/use in difference block,
-    * because we'll load/store SSA to a physical reg if def/use
-    * are not in the same block.
-    */
+   add_false_dependencies(comp);
 
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
       block->rsched.node_index = 0;
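Tracing add_false_dependencies() on the loop from the comment above makes the reverse scan easier to follow (node names hypothetical):

   /* Loop body, in program order:
    *
    *    n1 = load_reg  ri       <- read of i
    *    n2 = add       n1, 1
    *    n3 = store_reg ri, n2   <- write of i
    *
    * Walking the block bottom-up records last_written[ri] = n3 first; when
    * the walk then reaches n1 it adds
    *
    *    gpir_node_add_dep(n3, n1, GPIR_DEP_WRITE_AFTER_READ);
    *
    * so the scheduler can never move the store above the load and clobber
    * the value of i that the current iteration still needs. The
    * store->block == block check skips cross-block pairs, whose ordering
    * is already implied by block order. */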
diff --git a/src/gallium/drivers/lima/ir/gp/regalloc.c b/src/gallium/drivers/lima/ir/gp/regalloc.c
index c145bfbee81..91590d7364f 100644
--- a/src/gallium/drivers/lima/ir/gp/regalloc.c
+++ b/src/gallium/drivers/lima/ir/gp/regalloc.c
@@ -23,69 +23,435 @@
  */
 
 #include "gpir.h"
+#include "util/u_dynarray.h"
 
-/* Register allocation
- *
- * TODO: This needs to be rewritten when we support multiple basic blocks. We
- * need to do proper liveness analysis, combined with either linear scan,
- * graph coloring, or SSA-based allocation. We should also support spilling to
- * temporaries.
- *
- * For now, this only assigns fake registers to values, used to build the fake
- * dependencies that the scheduler relies on. In the future we should also be
- * assigning actual physreg numbers to load_reg/store_reg nodes.
- */
+/* Per-register information */
 
-static void regalloc_block(gpir_block *block)
+struct reg_info {
+   BITSET_WORD *conflicts;
+   struct util_dynarray conflict_list;
+
+   /* Number of conflicts that must be allocated to physical registers. */
+   unsigned phys_conflicts;
+
+   unsigned node_conflicts;
+
+   /* Number of conflicts that can be allocated to either. */
+   unsigned total_conflicts;
+
+   int assigned_color;
+
+   bool visited;
+};
+
+struct regalloc_ctx {
+   unsigned bitset_words, num_nodes_and_regs;
+   struct reg_info *registers;
+
+   /* Reusable scratch liveness array */
+   BITSET_WORD *live;
+
+   unsigned *worklist;
+   unsigned worklist_start, worklist_end;
+
+   unsigned *stack;
+   unsigned stack_size;
+
+   gpir_compiler *comp;
+   void *mem_ctx;
+};
+
+/* Liveness analysis */
+
+static void propagate_liveness_instr(gpir_node *node, BITSET_WORD *live,
+                                     gpir_compiler *comp)
 {
-   /* build each node sequence index in the block node list */
-   int index = 0;
-   list_for_each_entry(gpir_node, node, &block->node_list, list) {
-      node->vreg.index = index++;
-   }
-
-   /* find the last successor of each node by the sequence index */
-   list_for_each_entry(gpir_node, node, &block->node_list, list) {
-      node->vreg.last = NULL;
-      gpir_node_foreach_succ(node, dep) {
-         gpir_node *succ = dep->succ;
-         if (!node->vreg.last || node->vreg.last->vreg.index < succ->vreg.index)
-            node->vreg.last = succ;
+   /* KILL */
+   if (node->type == gpir_node_type_store) {
+      if (node->op == gpir_op_store_reg) {
+         gpir_store_node *store = gpir_node_to_store(node);
+         BITSET_CLEAR(live, store->reg->index);
       }
    }
 
-   /* do linear scan regalloc */
-   int reg_search_start = 0;
-   gpir_node *active[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};
+   /* GEN */
+   if (node->type == gpir_node_type_load) {
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         BITSET_SET(live, load->reg->index);
+      }
+   }
+}
+
+static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx)
+{
+   for (unsigned i = 0; i < 2; i++) {
+      if (block->successors[i]) {
+         for (unsigned j = 0; j < ctx->bitset_words; j++)
+            block->live_out[j] |= block->successors[i]->live_in[j];
+      }
+   }
+
+   memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD));
+
+   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+      propagate_liveness_instr(node, ctx->live, block->comp);
+   }
+
+   bool changed = false;
+   for (unsigned i = 0; i < ctx->bitset_words; i++) {
+      changed |= (block->live_in[i] != ctx->live[i]);
+      block->live_in[i] = ctx->live[i];
+   }
+   return changed;
+}
+
+static void calc_def_block(gpir_block *block)
+{
    list_for_each_entry(gpir_node, node, &block->node_list, list) {
-      /* if some reg is expired */
-      gpir_node_foreach_pred(node, dep) {
-         gpir_node *pred = dep->pred;
-         if (pred->vreg.last == node)
-            active[pred->value_reg] = NULL;
+      if (node->op == gpir_op_store_reg) {
+         gpir_store_node *store = gpir_node_to_store(node);
+         BITSET_SET(block->def_out, store->reg->index);
+      }
+   }
+}
+
+static void calc_liveness(struct regalloc_ctx *ctx)
+{
+   bool changed = true;
+   while (changed) {
+      changed = false;
+      list_for_each_entry_rev(gpir_block, block, &ctx->comp->block_list, list) {
+         changed |= propagate_liveness_block(block, ctx);
+      }
+   }
+
+   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+      calc_def_block(block);
+   }
+
+   changed = true;
+   while (changed) {
+      changed = false;
+      list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+         for (unsigned i = 0; i < 2; i++) {
+            gpir_block *succ = block->successors[i];
+            if (!succ)
+               continue;
+
+            for (unsigned j = 0; j < ctx->bitset_words; j++) {
+               BITSET_WORD new = block->def_out[j] & ~succ->def_out[j];
+               changed |= (new != 0);
+               succ->def_out[j] |= block->def_out[j];
+            }
+         }
+      }
+   }
+}
+
+/* Interference calculation */
+
+static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j)
+{
+   if (i == j)
+      return;
+
+   struct reg_info *a = &ctx->registers[i];
+   struct reg_info *b = &ctx->registers[j];
+
+   if (BITSET_TEST(a->conflicts, j))
+      return;
+
+   BITSET_SET(a->conflicts, j);
+   BITSET_SET(b->conflicts, i);
+
+   a->total_conflicts++;
+   b->total_conflicts++;
+   if (j < ctx->comp->cur_reg)
+      a->phys_conflicts++;
+   else
+      a->node_conflicts++;
+
+   if (i < ctx->comp->cur_reg)
+      b->phys_conflicts++;
+   else
+      b->node_conflicts++;
+
+   util_dynarray_append(&a->conflict_list, unsigned, j);
+   util_dynarray_append(&b->conflict_list, unsigned, i);
+}
+
+/* Make the register or node "i" interfere with all the other live registers
+ * and nodes.
+ */
+static void add_all_interferences(struct regalloc_ctx *ctx,
+                                  unsigned i,
+                                  BITSET_WORD *live_nodes,
+                                  BITSET_WORD *live_regs)
+{
+   int live_node;
+   BITSET_WORD tmp;
+   BITSET_FOREACH_SET(live_node, tmp, live_nodes, ctx->comp->cur_index) {
+      add_interference(ctx, i,
+                       live_node + ctx->comp->cur_reg);
+   }
+
+   int live_reg;
+   BITSET_FOREACH_SET(live_reg, tmp, ctx->live, ctx->comp->cur_index) {
+      add_interference(ctx, i, live_reg);
+   }
+}
+
+static void print_liveness(struct regalloc_ctx *ctx,
+                           BITSET_WORD *live_reg, BITSET_WORD *live_val)
+{
+   if (!(lima_debug & LIMA_DEBUG_GP))
+      return;
+
+   int live_idx;
+   BITSET_WORD tmp;
+   BITSET_FOREACH_SET(live_idx, tmp, live_reg, ctx->comp->cur_reg) {
+      printf("reg%d ", live_idx);
+   }
+   BITSET_FOREACH_SET(live_idx, tmp, live_val, ctx->comp->cur_index) {
+      printf("%d ", live_idx);
+   }
+   printf("\n");
+}
+
+static void calc_interference(struct regalloc_ctx *ctx)
+{
+   BITSET_WORD *live_nodes =
+      rzalloc_array(ctx->mem_ctx, BITSET_WORD, ctx->comp->cur_index);
+
+   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+      /* Initialize liveness at the end of the block, but exclude values that
+       * definitely aren't defined by the end. This helps out with
+       * partially-defined registers, like:
+       *
+       * if (condition) {
+       *    foo = ...;
+       * }
+       * if (condition) {
+       *    ... = foo;
+       * }
+       *
+       * If we naively propagated liveness backwards, foo would be live from
+       * the beginning of the program, but if we're not inside a loop then
+       * its value is undefined before the first if and we don't have to
+       * consider it live. Mask out registers like foo here.
+       */
+      for (unsigned i = 0; i < ctx->bitset_words; i++) {
+         ctx->live[i] = block->live_out[i] & block->def_out[i];
       }
 
-      /* no need to alloc value reg for root node */
-      if (gpir_node_is_root(node)) {
-         node->value_reg = -1;
-         continue;
+      list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+         gpir_debug("processing node %d\n", node->index);
+         print_liveness(ctx, ctx->live, live_nodes);
+         if (node->type != gpir_node_type_store &&
+             node->type != gpir_node_type_branch) {
+            add_all_interferences(ctx, node->index + ctx->comp->cur_reg,
+                                  live_nodes, ctx->live);
+
+            /* KILL */
+            BITSET_CLEAR(live_nodes, node->index);
+         } else if (node->op == gpir_op_store_reg) {
+            gpir_store_node *store = gpir_node_to_store(node);
+            add_all_interferences(ctx, store->reg->index,
+                                  live_nodes, ctx->live);
+
+            /* KILL */
+            BITSET_CLEAR(ctx->live, store->reg->index);
+         }
+
+         /* GEN */
+         if (node->type == gpir_node_type_store) {
+            gpir_store_node *store = gpir_node_to_store(node);
+            BITSET_SET(live_nodes, store->child->index);
+         } else if (node->type == gpir_node_type_alu) {
+            gpir_alu_node *alu = gpir_node_to_alu(node);
+            for (int i = 0; i < alu->num_child; i++)
+               BITSET_SET(live_nodes, alu->children[i]->index);
+         } else if (node->type == gpir_node_type_branch) {
+            gpir_branch_node *branch = gpir_node_to_branch(node);
+            BITSET_SET(live_nodes, branch->cond->index);
+         } else if (node->op == gpir_op_load_reg) {
+            gpir_load_node *load = gpir_node_to_load(node);
+            BITSET_SET(ctx->live, load->reg->index);
+         }
+      }
+   }
+}
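The allocation scheme in the rest of this hunk is the classic Chaitin-Briggs simplify/select loop, specialized for gpir's two register classes (physical regs and value regs). As a self-contained toy of the single-class version, with a hypothetical 4-node path graph and K = 2 colors, none of it driver code (do_regalloc() below adds the two-class conflict accounting and an optimistic push for graphs this loop would get stuck on):

   #include <stdbool.h>
   #include <stdio.h>

   #define N 4   /* toy interference graph: a path 0-1-2-3 */
   #define K 2   /* colors available */

   static const bool adj[N][N] = {
      {0, 1, 0, 0},
      {1, 0, 1, 0},
      {0, 1, 0, 1},
      {0, 0, 1, 0},
   };

   int main(void)
   {
      int stack[N], sp = 0, color[N] = {-1, -1, -1, -1};
      bool removed[N] = {false};

      /* Simplify: keep removing a node with fewer than K un-removed
       * neighbors; such a node is guaranteed a color later no matter how
       * the rest of the graph gets colored. (A graph with no such node
       * would spin here; that is where do_regalloc() pushes a node
       * optimistically instead.) */
      while (sp < N) {
         for (int i = 0; i < N; i++) {
            if (removed[i])
               continue;
            int degree = 0;
            for (int j = 0; j < N; j++)
               degree += adj[i][j] && !removed[j];
            if (degree < K) {
               removed[i] = true;
               stack[sp++] = i;
               break;
            }
         }
      }

      /* Select: pop in reverse order, taking the lowest color not used by
       * an already-colored neighbor. */
      while (sp--) {
         int n = stack[sp];
         for (int c = 0; c < K; c++) {
            bool used = false;
            for (int j = 0; j < N; j++)
               used |= adj[n][j] && color[j] == c;
            if (!used) {
               color[n] = c;
               break;
            }
         }
         printf("node %d -> color %d\n", n, color[n]);
      }
      return 0;
   }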
+/* Register allocation */
+
+static bool can_simplify(struct regalloc_ctx *ctx, unsigned i)
+{
+   struct reg_info *info = &ctx->registers[i];
+   if (i < ctx->comp->cur_reg) {
+      /* Physical regs. */
+      return info->phys_conflicts + info->node_conflicts < GPIR_PHYSICAL_REG_NUM;
+   } else {
+      /* Nodes: if we manage to allocate all of its conflicting physical
+       * registers, they will take up at most GPIR_PHYSICAL_REG_NUM colors, so
+       * we can ignore any more than that.
+       */
+      return MIN2(info->phys_conflicts, GPIR_PHYSICAL_REG_NUM) +
+             info->node_conflicts < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM;
+   }
+}
+
+static void push_stack(struct regalloc_ctx *ctx, unsigned i)
+{
+   ctx->stack[ctx->stack_size++] = i;
+   if (i < ctx->comp->cur_reg)
+      gpir_debug("pushing reg%u\n", i);
+   else
+      gpir_debug("pushing %d\n", i - ctx->comp->cur_reg);
+
+   struct reg_info *info = &ctx->registers[i];
+   assert(info->visited);
+
+   util_dynarray_foreach(&info->conflict_list, unsigned, conflict) {
+      struct reg_info *conflict_info = &ctx->registers[*conflict];
+      if (i < ctx->comp->cur_reg) {
+         assert(conflict_info->phys_conflicts > 0);
+         conflict_info->phys_conflicts--;
+      } else {
+         assert(conflict_info->node_conflicts > 0);
+         conflict_info->node_conflicts--;
+      }
+      if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) {
+         ctx->worklist[ctx->worklist_end++] = *conflict;
+         ctx->registers[*conflict].visited = true;
+      }
+   }
+}
+
+static bool do_regalloc(struct regalloc_ctx *ctx)
+{
+   ctx->worklist_start = 0;
+   ctx->worklist_end = 0;
+   ctx->stack_size = 0;
+
+   /* Step 1: find the initially simplifiable registers */
+   for (int i = 0; i < ctx->comp->cur_reg + ctx->comp->cur_index; i++) {
+      if (can_simplify(ctx, i)) {
+         ctx->worklist[ctx->worklist_end++] = i;
+         ctx->registers[i].visited = true;
+      }
+   }
+
+   while (true) {
+      /* Step 2: push onto the stack whatever we can */
+      while (ctx->worklist_start != ctx->worklist_end) {
+         push_stack(ctx, ctx->worklist[ctx->worklist_start++]);
       }
 
+      if (ctx->stack_size < ctx->num_nodes_and_regs) {
+         /* Step 3: if there are still unsimplifiable nodes left, we need to
+          * optimistically push a node onto the stack. Choose the one with
+          * the smallest number of current neighbors, since that's the most
+          * likely to succeed.
+          */
+         unsigned min_conflicts = UINT_MAX;
+         unsigned best_reg = 0;
+         for (unsigned reg = 0; reg < ctx->num_nodes_and_regs; reg++) {
+            struct reg_info *info = &ctx->registers[reg];
+            if (info->visited)
+               continue;
+            if (info->phys_conflicts + info->node_conflicts < min_conflicts) {
+               best_reg = reg;
+               min_conflicts = info->phys_conflicts + info->node_conflicts;
+            }
+         }
+         gpir_debug("optimistic triggered\n");
+         ctx->registers[best_reg].visited = true;
+         push_stack(ctx, best_reg);
+      } else {
+         break;
+      }
+   }
+
+   /* Step 4: pop off the stack and assign colors */
+   for (int i = ctx->num_nodes_and_regs - 1; i >= 0; i--) {
+      unsigned idx = ctx->stack[i];
+      struct reg_info *reg = &ctx->registers[idx];
+
+      unsigned num_available_regs;
+      if (idx < ctx->comp->cur_reg) {
+         num_available_regs = GPIR_PHYSICAL_REG_NUM;
+      } else {
+         num_available_regs = GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM;
+      }
+
+      bool found = false;
+      unsigned start = i % num_available_regs;
+      for (unsigned j = 0; j < num_available_regs; j++) {
+         unsigned candidate = (j + start) % num_available_regs;
+         bool available = true;
+         util_dynarray_foreach(&reg->conflict_list, unsigned, conflict_idx) {
+            struct reg_info *conflict = &ctx->registers[*conflict_idx];
+            if (conflict->assigned_color >= 0 &&
+                conflict->assigned_color == (int) candidate) {
+               available = false;
+               break;
+            }
+         }
 
-      /* find a free reg for this node */
-      int i;
-      for (i = 0; i < GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; i++) {
-         /* round robin reg select to reduce false dep when schedule */
-         int reg = (reg_search_start + i) % (GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM);
-         if (!active[reg]) {
-            active[reg] = node;
-            node->value_reg = reg;
-            reg_search_start++;
+         if (available) {
+            reg->assigned_color = candidate;
+            found = true;
             break;
          }
       }
-      /* TODO: spill */
-      assert(i != GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM);
+      /* TODO: spilling */
+      if (!found) {
+         gpir_error("Failed to allocate registers\n");
+         return false;
+      }
+   }
+
+   return true;
+}
+
+static void assign_regs(struct regalloc_ctx *ctx)
+{
+   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+      list_for_each_entry(gpir_node, node, &block->node_list, list) {
+         if (node->index >= 0) {
+            node->value_reg =
+               ctx->registers[ctx->comp->cur_reg + node->index].assigned_color;
+         }
+
+         if (node->op == gpir_op_load_reg) {
+            gpir_load_node *load = gpir_node_to_load(node);
+            unsigned color = ctx->registers[load->reg->index].assigned_color;
+            load->index = color / 4;
+            load->component = color % 4;
+         }
+
+         if (node->op == gpir_op_store_reg) {
+            gpir_store_node *store = gpir_node_to_store(node);
+            unsigned color = ctx->registers[store->reg->index].assigned_color;
+            store->index = color / 4;
+            store->component = color % 4;
+            node->value_reg = color;
+         }
+      }
+
+      block->live_out_phys = 0;
+
+      int reg_idx;
+      BITSET_WORD tmp;
+      BITSET_FOREACH_SET(reg_idx, tmp, block->live_out, ctx->comp->cur_reg) {
+         if (BITSET_TEST(block->def_out, reg_idx)) {
+            block->live_out_phys |= (1ull << ctx->registers[reg_idx].assigned_color);
+         }
+      }
+   }
+}
@@ -104,6 +470,14 @@ static void regalloc_print_result(gpir_compiler *comp)
          gpir_node *pred = dep->pred;
          printf(" %d/%d", pred->index, pred->value_reg);
       }
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         printf(" -/%d", 4 * load->index + load->component);
+         printf(" (%d)", load->reg->index);
+      } else if (node->op == gpir_op_store_reg) {
+         gpir_store_node *store = gpir_node_to_store(node);
+         printf(" (%d)", store->reg->index);
+      }
       printf("\n");
    }
    printf("----------------------------\n");
@@ -112,10 +486,38 @@
 
 bool gpir_regalloc_prog(gpir_compiler *comp)
 {
-   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
-      regalloc_block(block);
+   struct regalloc_ctx ctx;
+
+   ctx.mem_ctx = ralloc_context(NULL);
+   ctx.num_nodes_and_regs = comp->cur_reg + comp->cur_index;
+   ctx.bitset_words = BITSET_WORDS(ctx.num_nodes_and_regs);
+   ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
+   ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
+   ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
+   ctx.comp = comp;
+
+   ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, ctx.num_nodes_and_regs);
+   for (unsigned i = 0; i < ctx.num_nodes_and_regs; i++) {
+      ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD,
+                                                 ctx.bitset_words);
+      util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx);
    }
 
+   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
+      block->live_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
+      block->live_in = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
+      block->def_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
+   }
+
+   calc_liveness(&ctx);
+   calc_interference(&ctx);
+   if (!do_regalloc(&ctx)) {
+      ralloc_free(ctx.mem_ctx);
+      return false;
+   }
+   assign_regs(&ctx);
 
    regalloc_print_result(comp);
+   ralloc_free(ctx.mem_ctx);
    return true;
 }
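One subtlety above is why can_simplify() clamps the physreg conflict count with MIN2 for nodes. A worked check, assuming GPIR_PHYSICAL_REG_NUM is 64 and GPIR_VALUE_REG_NUM is 11 (these match gpir.h at the time of writing, but treat them as assumptions of this sketch):

   #include <assert.h>

   #define PHYS 64                          /* assumed GPIR_PHYSICAL_REG_NUM */
   #define VAL  11                          /* assumed GPIR_VALUE_REG_NUM */
   #define MIN2(a, b) ((a) < (b) ? (a) : (b))

   int main(void)
   {
      /* A node conflicting with 100 physical registers and 10 nodes: */
      unsigned phys_conflicts = 100, node_conflicts = 10;

      /* The conflicting physregs only have PHYS colors to share among
       * themselves, so at most MIN2(phys_conflicts, PHYS) distinct colors
       * can ever be blocked by them. */
      unsigned worst_blocked = MIN2(phys_conflicts, PHYS) + node_conflicts;

      /* 64 + 10 = 74 < 75 usable colors, so the node is still simplifiable
       * even though its raw degree (110) far exceeds the color count. */
      assert(worst_blocked < PHYS + VAL);
      return 0;
   }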
diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c
index 3b490974fc0..ac132776d43 100644
--- a/src/gallium/drivers/lima/ir/gp/scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/scheduler.c
@@ -215,6 +215,14 @@ typedef struct {
     * schedule the instruction.
     */
    int total_spill_needed;
+
+   /* For each physical register, a linked list of loads associated with it in
+    * this block. When we spill a value to a given register, and there are
+    * existing loads associated with it that haven't been scheduled yet, we
+    * have to make sure that the corresponding unspill happens after the last
+    * original use has happened, i.e. is scheduled before.
+    */
+   struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM];
 } sched_ctx;
 
 static int gpir_min_dist_alu(gpir_dep *dep)
@@ -535,6 +543,19 @@ static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
       }
    }
 
+   if (node->op == gpir_op_store_reg) {
+      /* This register may be loaded in the next basic block, in which case
+       * there still needs to be a 2 instruction gap. We do what the blob
+       * seems to do and simply disable stores in the last two instructions of
+       * the basic block.
+       *
+       * TODO: We may be able to do better than this, but we have to check
+       * first if storing a register works across branches.
+       */
+      if (instr->index < 2)
+         return false;
+   }
+
    node->sched.instr = instr;
 
    int max_node_spill_needed = INT_MAX;
@@ -1009,10 +1030,6 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
 
    ctx->live_physregs |= (1ull << physreg);
 
-   /* TODO: when we support multiple basic blocks, there may be register
-    * loads/stores to this register other than this one that haven't been
-    * scheduled yet so we may need to insert write-after-read dependencies.
-    */
    gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
    store->index = physreg / 4;
    store->component = physreg % 4;
@@ -1030,6 +1047,16 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
    }
    node->sched.physreg_store = store;
    gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
+
+   list_for_each_entry(gpir_load_node, load,
+                       &ctx->physreg_reads[physreg], reg_link) {
+      gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
+      if (load->node.sched.ready) {
+         list_del(&load->node.list);
+         load->node.sched.ready = false;
+      }
+   }
+
    node->sched.ready = false;
    schedule_insert_ready_list(ctx, &store->node);
 }
@@ -1556,13 +1583,21 @@ static bool schedule_block(gpir_block *block)
    list_inithead(&ctx.ready_list);
    ctx.block = block;
    ctx.ready_list_slots = 0;
-   /* TODO initialize with block live out once we have proper liveness
-    * tracking
-    */
-   ctx.live_physregs = 0;
+   ctx.live_physregs = block->live_out_phys;
+
+   for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
+      list_inithead(&ctx.physreg_reads[i]);
+   }
 
    /* construct the ready list from root nodes */
    list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
+      /* Add to physreg_reads */
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         unsigned index = 4 * load->index + load->component;
+         list_addtail(&load->reg_link, &ctx.physreg_reads[index]);
+      }
+
      if (gpir_node_is_root(node))
         schedule_insert_ready_list(&ctx, node);
   }
@@ -1623,22 +1658,6 @@ static void schedule_build_dependency(gpir_block *block)
       }
    }
 
-   /* Forward dependencies. We only need to add these for register loads,
-    * since value registers already have an input dependency.
-    */
-   list_for_each_entry(gpir_node, node, &block->node_list, list) {
-      if (node->op == gpir_op_load_reg) {
-         gpir_load_node *load = gpir_node_to_load(node);
-         unsigned index = 4 * load->index + load->component;
-         if (last_written[index]) {
-            gpir_node_add_dep(node, last_written[index], GPIR_DEP_READ_AFTER_WRITE);
-         }
-      }
-
-      if (node->value_reg >= 0)
-         last_written[node->value_reg] = node;
-   }
-
    memset(last_written, 0, sizeof(last_written));
 
    /* False dependencies. For value registers, these exist only to make sure
@@ -1651,6 +1670,10 @@ static void schedule_build_dependency(gpir_block *block)
          if (last_written[index]) {
            gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ);
         }
+      } else if (node->op == gpir_op_store_reg) {
+         gpir_store_node *store = gpir_node_to_store(node);
+         unsigned index = 4 * store->index + store->component;
+         last_written[index] = node;
      } else {
         add_fake_dep(node, node, last_written);
      }
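Finally, an illustration of the ordering problem the physreg_reads lists solve in try_spill_node() (positions and names hypothetical; keep in mind the scheduler works bottom-up, so a node scheduled earlier ends up later in the emitted program):

   /* r3.x holds a value live into this block and is read by a
    * not-yet-scheduled load when the scheduler decides to spill node n9
    * to r3.x:
    *
    *    n4 = load_reg  r3.x      <- original read, not yet scheduled
    *    ...
    *    store_reg r3.x, n9       <- spill store created by try_spill_node()
    *
    * The GPIR_DEP_WRITE_AFTER_READ edge added from the store to each load
    * on physreg_reads[r3.x] forces the store to land after the load in
    * program order, so the spill cannot clobber the value before its last
    * original use. A load already sitting in the ready list is pulled
    * back out, since its new unscheduled successor makes it not ready. */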