diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h
index 6cbd406032e..4d11fb1eaa7 100644
--- a/src/gallium/drivers/lima/ir/gp/gpir.h
+++ b/src/gallium/drivers/lima/ir/gp/gpir.h
@@ -211,11 +211,6 @@ typedef struct {
 typedef struct {
    int index;
    struct list_head list;
-
-   struct list_head defs_list;
-   struct list_head uses_list;
-
-   int start, end;
 } gpir_reg;
 
 typedef struct {
@@ -236,7 +231,6 @@ typedef struct gpir_store_node {
 
    gpir_node *child;
    gpir_reg *reg;
-   struct list_head reg_link;
 } gpir_store_node;
 
 enum gpir_instr_slot {
@@ -350,6 +344,26 @@ typedef struct gpir_block {
    struct list_head predecessors;
    struct list_head predecessors_node;
 
+   /* for regalloc */
+
+   /* The set of live registers, i.e. registers whose value may be used
+    * eventually, at the beginning of the block.
+    */
+   BITSET_WORD *live_in;
+
+   /* Set of live registers at the end of the block. */
+   BITSET_WORD *live_out;
+
+   /* Set of registers that may have a value defined at the end of the
+    * block.
+    */
+   BITSET_WORD *def_out;
+
+   /* After register allocation, the set of live physical registers at the end
+    * of the block. Needed for scheduling.
+    */
+   uint64_t live_out_phys;
+
    /* For codegen, the offset in the final program. */
    unsigned instr_offset;
 
@@ -380,8 +394,17 @@ typedef struct gpir_compiler {
    struct list_head block_list;
    int cur_index;
 
-   /* array for searching ssa node */
-   gpir_node **var_nodes;
+   /* Find the gpir node for a given NIR SSA def. */
+   gpir_node **node_for_ssa;
+
+   /* Find the gpir node for a given NIR register. */
+   gpir_node **node_for_reg;
+
+   /* Find the gpir register for a given NIR SSA def. */
+   gpir_reg **reg_for_ssa;
+
+   /* Find the gpir register for a given NIR register. */
+   gpir_reg **reg_for_reg;
 
    /* gpir block for NIR block. */
    gpir_block **blocks;
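The three bitsets added to gpir_block above are filled in by a standard backward dataflow fixpoint in regalloc.c further down. As a minimal standalone sketch of that fixpoint, using a hypothetical two-successor toy block and one 32-bit word in place of the BITSET_WORD arrays (none of these names are the driver's own):

   #include <stdbool.h>
   #include <stdint.h>

   struct toy_block {
      struct toy_block *successors[2];
      uint32_t live_in, live_out;
      uint32_t gen, kill;   /* regs read before any write / regs written */
   };

   /* One relaxation step; loop this over all blocks (reverse order
    * converges fastest) until nothing changes, as calc_liveness() does. */
   static bool toy_liveness_step(struct toy_block *b)
   {
      b->live_out = 0;
      for (int i = 0; i < 2; i++)
         if (b->successors[i])
            b->live_out |= b->successors[i]->live_in;

      uint32_t live_in = b->gen | (b->live_out & ~b->kill);
      bool changed = live_in != b->live_in;
      b->live_in = live_in;
      return changed;
   }

def_out is the forward analogue: a register ends up in def_out if some path reaching the end of the block may have written it.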
diff --git a/src/gallium/drivers/lima/ir/gp/lower.c b/src/gallium/drivers/lima/ir/gp/lower.c
index eaeeeb8f1eb..296f141216e 100644
--- a/src/gallium/drivers/lima/ir/gp/lower.c
+++ b/src/gallium/drivers/lima/ir/gp/lower.c
@@ -109,10 +109,7 @@ static bool gpir_lower_load(gpir_compiler *comp)
             gpir_load_node *nload = gpir_node_to_load(new);
             nload->index = load->index;
             nload->component = load->component;
-            if (load->reg) {
-               nload->reg = load->reg;
-               list_addtail(&nload->reg_link, &load->reg->uses_list);
-            }
+            nload->reg = load->reg;
 
             gpir_node_replace_pred(dep, new);
             gpir_node_replace_child(succ, node, new);
diff --git a/src/gallium/drivers/lima/ir/gp/nir.c b/src/gallium/drivers/lima/ir/gp/nir.c
index e2dc939f1a0..8d9a5beb98a 100644
--- a/src/gallium/drivers/lima/ir/gp/nir.c
+++ b/src/gallium/drivers/lima/ir/gp/nir.c
@@ -30,37 +30,80 @@
 #include "gpir.h"
 #include "lima_context.h"
 
+gpir_reg *gpir_create_reg(gpir_compiler *comp)
+{
+   gpir_reg *reg = ralloc(comp, gpir_reg);
+   reg->index = comp->cur_reg++;
+   list_addtail(&reg->list, &comp->reg_list);
+   return reg;
+}
 
-static inline void *gpir_node_create_ssa(gpir_block *block, gpir_op op, nir_ssa_def *ssa)
+static gpir_reg *reg_for_nir_reg(gpir_compiler *comp, nir_register *nir_reg)
+{
+   unsigned index = nir_reg->index;
+   gpir_reg *reg = comp->reg_for_reg[index];
+   if (reg)
+      return reg;
+   reg = gpir_create_reg(comp);
+   comp->reg_for_reg[index] = reg;
+   return reg;
+}
+
+static inline gpir_node *gpir_node_create_ssa(gpir_block *block, gpir_op op, nir_ssa_def *ssa)
 {
    int index = ssa->index;
    gpir_node *node = gpir_node_create(block, op);
 
-   block->comp->var_nodes[index] = node;
+   block->comp->node_for_ssa[index] = node;
    snprintf(node->name, sizeof(node->name), "ssa%d", index);
    list_addtail(&node->list, &block->node_list);
+
+   /* If any uses are outside the current block, we'll need to create a store
+    * instruction for them.
+    */
+   bool needs_register = false;
+   nir_foreach_use(use, ssa) {
+      if (use->parent_instr->block != ssa->parent_instr->block) {
+         needs_register = true;
+         break;
+      }
+   }
+
+   if (!needs_register) {
+      nir_foreach_if_use(use, ssa) {
+         if (nir_cf_node_prev(&use->parent_if->cf_node) !=
+             &ssa->parent_instr->block->cf_node) {
+            needs_register = true;
+            break;
+         }
+      }
+   }
+
+   if (needs_register) {
+      gpir_store_node *store = gpir_node_create(block, gpir_op_store_reg);
+      store->child = node;
+      store->reg = gpir_create_reg(block->comp);
+      gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
+      list_addtail(&store->node.list, &block->node_list);
+      block->comp->reg_for_ssa[ssa->index] = store->reg;
+   }
+
    return node;
 }
 
-static inline void *gpir_node_create_reg(gpir_block *block, gpir_op op, nir_reg_dest *reg)
+static inline void *gpir_node_create_reg(gpir_block *block, gpir_op op, nir_reg_dest *nir_reg)
 {
-   int index = reg->reg->index;
+   int index = nir_reg->reg->index;
    gpir_node *node = gpir_node_create(block, op);
+   block->comp->node_for_reg[index] = node;
    gpir_store_node *store = gpir_node_create(block, gpir_op_store_reg);
 
    snprintf(node->name, sizeof(node->name), "reg%d", index);
 
    store->child = node;
+   store->reg = reg_for_nir_reg(block->comp, nir_reg->reg);
    gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
 
-   list_for_each_entry(gpir_reg, reg, &block->comp->reg_list, list) {
-      if (reg->index == index) {
-         store->reg = reg;
-         list_addtail(&store->reg_link, &reg->defs_list);
-         break;
-      }
-   }
-
    list_addtail(&node->list, &block->node_list);
    list_addtail(&store->node.list, &block->node_list);
    return node;
@@ -77,35 +120,34 @@ static void *gpir_node_create_dest(gpir_block *block, gpir_op op, nir_dest *dest
 static gpir_node *gpir_node_find(gpir_block *block, gpir_node *succ, nir_src *src,
                                  int channel)
 {
+   gpir_reg *reg = NULL;
    gpir_node *pred = NULL;
-
    if (src->is_ssa) {
       if (src->ssa->num_components > 1) {
          for (int i = 0; i < GPIR_VECTOR_SSA_NUM; i++) {
            if (block->comp->vector_ssa[i].ssa == src->ssa->index) {
-              pred = block->comp->vector_ssa[i].nodes[channel];
-              break;
+              return block->comp->vector_ssa[i].nodes[channel];
            }
         }
-      } else
-         pred = block->comp->var_nodes[src->ssa->index];
-
-      assert(pred);
-   }
-   else {
-      pred = gpir_node_create(block, gpir_op_load_reg);
-      list_addtail(&pred->list, &succ->list);
-
-      gpir_load_node *load = gpir_node_to_load(pred);
-      list_for_each_entry(gpir_reg, reg, &block->comp->reg_list, list) {
-         if (reg->index == src->reg.reg->index) {
-            load->reg = reg;
-            list_addtail(&load->reg_link, &reg->uses_list);
-            break;
-         }
+      } else {
+         gpir_node *pred = block->comp->node_for_ssa[src->ssa->index];
+         if (pred->block == block)
+            return pred;
+         reg = block->comp->reg_for_ssa[src->ssa->index];
       }
+   } else {
+      pred = block->comp->node_for_reg[src->reg.reg->index];
+      if (pred && pred->block == block && pred != succ)
+         return pred;
+      reg = reg_for_nir_reg(block->comp, src->reg.reg);
    }
 
+   assert(reg);
+   pred = gpir_node_create(block, gpir_op_load_reg);
+   gpir_load_node *load = gpir_node_to_load(pred);
+   load->reg = reg;
+   list_addtail(&pred->list, &succ->list);
+
    return pred;
 }
 
@@ -345,16 +387,6 @@ static bool gpir_emit_function(gpir_compiler *comp, nir_function_impl *impl)
    return true;
 }
 
-gpir_reg *gpir_create_reg(gpir_compiler *comp)
-{
-   gpir_reg *reg = ralloc(comp, gpir_reg);
-   reg->index = comp->cur_reg++;
-   list_addtail(&reg->list, &comp->reg_list);
-   list_inithead(&reg->defs_list);
-   list_inithead(&reg->uses_list);
-   return reg;
-}
-
 static gpir_compiler *gpir_compiler_create(void *prog,
                                            unsigned num_reg, unsigned num_ssa)
 {
    gpir_compiler *comp = rzalloc(prog, gpir_compiler);
@@ -362,13 +394,13 @@ static gpir_compiler *gpir_compiler_create(void *prog, unsigned num_reg, unsigne
    list_inithead(&comp->block_list);
    list_inithead(&comp->reg_list);
 
-   for (int i = 0; i < num_reg; i++)
-      gpir_create_reg(comp);
-
   for (int i = 0; i < GPIR_VECTOR_SSA_NUM; i++)
      comp->vector_ssa[i].ssa = -1;
 
-   comp->var_nodes = rzalloc_array(comp, gpir_node *, num_ssa);
+   comp->node_for_ssa = rzalloc_array(comp, gpir_node *, num_ssa);
+   comp->node_for_reg = rzalloc_array(comp, gpir_node *, num_reg);
+   comp->reg_for_ssa = rzalloc_array(comp, gpir_reg *, num_ssa);
+   comp->reg_for_reg = rzalloc_array(comp, gpir_reg *, num_reg);
 
    comp->prog = prog;
    return comp;
 }
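To make the new lookup tables concrete, here is a hand-written illustration (all indices and names hypothetical, not the output of any tool) of what gpir_node_create_ssa() and gpir_node_find() now do for an SSA def whose use crosses a block boundary:

   /* NIR input:                        resulting gpir:
    *
    *    block b0:                      block b0:
    *       ssa_5 = fadd ssa_3, ssa_4      n5 = add n3, n4
    *       ...                            store_reg r0, n5   (created because
    *    block b1:                                             ssa_5 is used in b1)
    *       ssa_9 = fmul ssa_5, ...     block b1:
    *                                      n7 = load_reg r0   (created on demand
    *                                      n9 = mul n7, ...    by gpir_node_find())
    */

Uses of ssa_5 inside b0 itself still consume n5 directly; only the cross-block use goes through the register. This is what lets the reduce scheduler and register allocator below assume a block never loads a register it stored.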
diff --git a/src/gallium/drivers/lima/ir/gp/node.c b/src/gallium/drivers/lima/ir/gp/node.c
index e62512890b3..78f7bd130ea 100644
--- a/src/gallium/drivers/lima/ir/gp/node.c
+++ b/src/gallium/drivers/lima/ir/gp/node.c
@@ -433,17 +433,6 @@ void gpir_node_delete(gpir_node *node)
       ralloc_free(dep);
    }
 
-   if (node->type == gpir_node_type_store) {
-      gpir_store_node *store = gpir_node_to_store(node);
-      if (store->reg)
-         list_del(&store->reg_link);
-   }
-   else if (node->type == gpir_node_type_load) {
-      gpir_load_node *load = gpir_node_to_load(node);
-      if (load->reg)
-         list_del(&load->reg_link);
-   }
-
    list_del(&node->list);
    ralloc_free(node);
 }
diff --git a/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c b/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c
index a5013a59dbf..d51fc355f2b 100644
--- a/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c
@@ -190,21 +190,47 @@ static void schedule_block(gpir_block *block)
    schedule_ready_list(block, &ready_list);
 }
 
+/* Due to how we translate from NIR, we never read a register written in the
+ * same block (we just pass the node through instead), so we don't have to
+ * worry about read-after-write dependencies. We do have to worry about
+ * write-after-read though, so we add those dependencies now. For example in a
+ * loop like this we need a dependency between the write and the read of i:
+ *
+ * i = ...
+ * while (...) {
+ *    ... = i;
+ *    i = i + 1;
+ * }
+ */
+
+static void add_false_dependencies(gpir_compiler *comp)
+{
+   /* Make sure we allocate this only once, in case there are many values and
+    * many blocks.
+    */
+   gpir_node **last_written = calloc(comp->cur_reg, sizeof(gpir_node *));
+
+   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
+      list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+         if (node->op == gpir_op_load_reg) {
+            gpir_load_node *load = gpir_node_to_load(node);
+            gpir_node *store = last_written[load->reg->index];
+            if (store && store->block == block) {
+               gpir_node_add_dep(store, node, GPIR_DEP_WRITE_AFTER_READ);
+            }
+         } else if (node->op == gpir_op_store_reg) {
+            gpir_store_node *store = gpir_node_to_store(node);
+            last_written[store->reg->index] = node;
+         }
+      }
+   }
+
+   free(last_written);
+}
+
 bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp)
 {
-   /* No need to build physical reg load/store dependency here,
-    * because we just exit SSA form, there should be at most
-    * one load and one store pair for a physical reg within a
-    * block, and the store must be after load with the output
-    * of load as input after some calculation. So we don't need to
-    * insert extra write-after-read or read-after-write dependecy
-    * for load/store nodes to maintain the right sequence before
-    * scheduling.
-    *
-    * Also no need to handle SSA def/use in difference block,
-    * because we'll load/store SSA to a physical reg if def/use
-    * are not in the same block.
-    */
+   add_false_dependencies(comp);
 
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
       block->rsched.node_index = 0;
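Tracing add_false_dependencies() on the loop from the comment above makes the reverse scan easier to follow (node names hypothetical):

   /* Loop body, in program order:
    *
    *    n1 = load_reg  ri       <- read of i
    *    n2 = add       n1, 1
    *    n3 = store_reg ri, n2   <- write of i
    *
    * Walking the block bottom-up records last_written[ri] = n3 first; when
    * the walk then reaches n1 it adds
    *
    *    gpir_node_add_dep(n3, n1, GPIR_DEP_WRITE_AFTER_READ);
    *
    * so the scheduler can never move the store above the load and clobber
    * the value of i that the current iteration still needs. The
    * store->block == block check skips cross-block pairs, whose ordering
    * is already implied by block order. */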
diff --git a/src/gallium/drivers/lima/ir/gp/regalloc.c b/src/gallium/drivers/lima/ir/gp/regalloc.c
index c145bfbee81..91590d7364f 100644
--- a/src/gallium/drivers/lima/ir/gp/regalloc.c
+++ b/src/gallium/drivers/lima/ir/gp/regalloc.c
@@ -23,69 +23,435 @@
  */
 
 #include "gpir.h"
+#include "util/u_dynarray.h"
 
-/* Register allocation
- *
- * TODO: This needs to be rewritten when we support multiple basic blocks. We
- * need to do proper liveness analysis, combined with either linear scan,
- * graph coloring, or SSA-based allocation. We should also support spilling to
- * temporaries.
- *
- * For now, this only assigns fake registers to values, used to build the fake
- * dependencies that the scheduler relies on. In the future we should also be
- * assigning actual physreg numbers to load_reg/store_reg nodes.
- */
+/* Per-register information */
 
-static void regalloc_block(gpir_block *block)
+struct reg_info {
+   BITSET_WORD *conflicts;
+   struct util_dynarray conflict_list;
+
+   /* Number of conflicts that must be allocated to physical registers. */
+   unsigned phys_conflicts;
+
+   unsigned node_conflicts;
+
+   /* Number of conflicts that can be allocated to either. */
+   unsigned total_conflicts;
+
+   int assigned_color;
+
+   bool visited;
+};
+
+struct regalloc_ctx {
+   unsigned bitset_words, num_nodes_and_regs;
+   struct reg_info *registers;
+
+   /* Reusable scratch liveness array */
+   BITSET_WORD *live;
+
+   unsigned *worklist;
+   unsigned worklist_start, worklist_end;
+
+   unsigned *stack;
+   unsigned stack_size;
+
+   gpir_compiler *comp;
+   void *mem_ctx;
+};
+
+/* Liveness analysis */
+
+static void propagate_liveness_instr(gpir_node *node, BITSET_WORD *live,
+                                     gpir_compiler *comp)
 {
-   /* build each node sequence index in the block node list */
-   int index = 0;
-   list_for_each_entry(gpir_node, node, &block->node_list, list) {
-      node->vreg.index = index++;
-   }
-
-   /* find the last successor of each node by the sequence index */
-   list_for_each_entry(gpir_node, node, &block->node_list, list) {
-      node->vreg.last = NULL;
-      gpir_node_foreach_succ(node, dep) {
-         gpir_node *succ = dep->succ;
-         if (!node->vreg.last || node->vreg.last->vreg.index < succ->vreg.index)
-            node->vreg.last = succ;
+   /* KILL */
+   if (node->type == gpir_node_type_store) {
+      if (node->op == gpir_op_store_reg) {
+         gpir_store_node *store = gpir_node_to_store(node);
+         BITSET_CLEAR(live, store->reg->index);
       }
    }
 
-   /* do linear scan regalloc */
-   int reg_search_start = 0;
-   gpir_node *active[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};
+   /* GEN */
+   if (node->type == gpir_node_type_load) {
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         BITSET_SET(live, load->reg->index);
+      }
+   }
+}
+
+static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx)
+{
+   for (unsigned i = 0; i < 2; i++) {
+      if (block->successors[i]) {
+         for (unsigned j = 0; j < ctx->bitset_words; j++)
+            block->live_out[j] |= block->successors[i]->live_in[j];
+      }
+   }
+
+   memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD));
+
+   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+      propagate_liveness_instr(node, ctx->live, block->comp);
+   }
+
+   bool changed = false;
+   for (unsigned i = 0; i < ctx->bitset_words; i++) {
+      changed |= (block->live_in[i] != ctx->live[i]);
+      block->live_in[i] = ctx->live[i];
+   }
+   return changed;
+}
+
+static void calc_def_block(gpir_block *block)
+{
    list_for_each_entry(gpir_node, node, &block->node_list, list) {
-      /* if some reg is expired */
-      gpir_node_foreach_pred(node, dep) {
-         gpir_node *pred = dep->pred;
-         if (pred->vreg.last == node)
-            active[pred->value_reg] = NULL;
+      if (node->op == gpir_op_store_reg) {
+         gpir_store_node *store = gpir_node_to_store(node);
+         BITSET_SET(block->def_out, store->reg->index);
+      }
+   }
+}
+
+static void calc_liveness(struct regalloc_ctx *ctx)
+{
+   bool changed = true;
+   while (changed) {
+      changed = false;
+      list_for_each_entry_rev(gpir_block, block, &ctx->comp->block_list, list) {
+         changed |= propagate_liveness_block(block, ctx);
+      }
+   }
+
+   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+      calc_def_block(block);
+   }
+
+   changed = true;
+   while (changed) {
+      changed = false;
+      list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+         for (unsigned i = 0; i < 2; i++) {
+            gpir_block *succ = block->successors[i];
+            if (!succ)
+               continue;
+
+            for (unsigned j = 0; j < ctx->bitset_words; j++) {
+               BITSET_WORD new = block->def_out[j] & ~succ->def_out[j];
+               changed |= (new != 0);
+               succ->def_out[j] |= block->def_out[j];
+            }
+         }
+      }
+   }
+}
+
+/* Interference calculation */
+
+static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j)
+{
+   if (i == j)
+      return;
+
+   struct reg_info *a = &ctx->registers[i];
+   struct reg_info *b = &ctx->registers[j];
+
+   if (BITSET_TEST(a->conflicts, j))
+      return;
+
+   BITSET_SET(a->conflicts, j);
+   BITSET_SET(b->conflicts, i);
+
+   a->total_conflicts++;
+   b->total_conflicts++;
+   if (j < ctx->comp->cur_reg)
+      a->phys_conflicts++;
+   else
+      a->node_conflicts++;
+
+   if (i < ctx->comp->cur_reg)
+      b->phys_conflicts++;
+   else
+      b->node_conflicts++;
+
+   util_dynarray_append(&a->conflict_list, unsigned, j);
+   util_dynarray_append(&b->conflict_list, unsigned, i);
+}
+
+/* Make the register or node "i" interfere with all the other live registers
+ * and nodes.
+ */
+static void add_all_interferences(struct regalloc_ctx *ctx,
+                                  unsigned i,
+                                  BITSET_WORD *live_nodes,
+                                  BITSET_WORD *live_regs)
+{
+   int live_node;
+   BITSET_WORD tmp;
+   BITSET_FOREACH_SET(live_node, tmp, live_nodes, ctx->comp->cur_index) {
+      add_interference(ctx, i,
+                       live_node + ctx->comp->cur_reg);
+   }
+
+   int live_reg;
+   BITSET_FOREACH_SET(live_reg, tmp, ctx->live, ctx->comp->cur_index) {
+      add_interference(ctx, i, live_reg);
+   }
+}
+
+static void print_liveness(struct regalloc_ctx *ctx,
+                           BITSET_WORD *live_reg, BITSET_WORD *live_val)
+{
+   if (!(lima_debug & LIMA_DEBUG_GP))
+      return;
+
+   int live_idx;
+   BITSET_WORD tmp;
+   BITSET_FOREACH_SET(live_idx, tmp, live_reg, ctx->comp->cur_reg) {
+      printf("reg%d ", live_idx);
+   }
+   BITSET_FOREACH_SET(live_idx, tmp, live_val, ctx->comp->cur_index) {
+      printf("%d ", live_idx);
+   }
+   printf("\n");
+}
+
+static void calc_interference(struct regalloc_ctx *ctx)
+{
+   BITSET_WORD *live_nodes =
+      rzalloc_array(ctx->mem_ctx, BITSET_WORD, ctx->comp->cur_index);
+
+   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+      /* Initialize liveness at the end of the block, but exclude values that
+       * definitely aren't defined by the end. This helps out with
+       * partially-defined registers, like:
+       *
+       * if (condition) {
+       *    foo = ...;
+       * }
+       * if (condition) {
+       *    ... = foo;
+       * }
+       *
+       * If we naively propagated liveness backwards, foo would be live from
+       * the beginning of the program, but if we're not inside a loop then
+       * its value is undefined before the first if and we don't have to
+       * consider it live. Mask out registers like foo here.
+       */
+      for (unsigned i = 0; i < ctx->bitset_words; i++) {
+         ctx->live[i] = block->live_out[i] & block->def_out[i];
       }
 
-      /* no need to alloc value reg for root node */
-      if (gpir_node_is_root(node)) {
-         node->value_reg = -1;
-         continue;
+      list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+         gpir_debug("processing node %d\n", node->index);
+         print_liveness(ctx, ctx->live, live_nodes);
+         if (node->type != gpir_node_type_store &&
+             node->type != gpir_node_type_branch) {
+            add_all_interferences(ctx, node->index + ctx->comp->cur_reg,
+                                  live_nodes, ctx->live);
+
+            /* KILL */
+            BITSET_CLEAR(live_nodes, node->index);
+         } else if (node->op == gpir_op_store_reg) {
+            gpir_store_node *store = gpir_node_to_store(node);
+            add_all_interferences(ctx, store->reg->index,
+                                  live_nodes, ctx->live);
+
+            /* KILL */
+            BITSET_CLEAR(ctx->live, store->reg->index);
+         }
+
+         /* GEN */
+         if (node->type == gpir_node_type_store) {
+            gpir_store_node *store = gpir_node_to_store(node);
+            BITSET_SET(live_nodes, store->child->index);
+         } else if (node->type == gpir_node_type_alu) {
+            gpir_alu_node *alu = gpir_node_to_alu(node);
+            for (int i = 0; i < alu->num_child; i++)
+               BITSET_SET(live_nodes, alu->children[i]->index);
+         } else if (node->type == gpir_node_type_branch) {
+            gpir_branch_node *branch = gpir_node_to_branch(node);
+            BITSET_SET(live_nodes, branch->cond->index);
+         } else if (node->op == gpir_op_load_reg) {
+            gpir_load_node *load = gpir_node_to_load(node);
+            BITSET_SET(ctx->live, load->reg->index);
+         }
+      }
+   }
+}
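The allocation scheme in the rest of this hunk is the classic Chaitin-Briggs simplify/select loop, specialized for gpir's two register classes (physical regs and value regs). As a self-contained toy of the single-class version, with a hypothetical 4-node path graph and K = 2 colors, none of it driver code (do_regalloc() below adds the two-class conflict accounting and an optimistic push for graphs this loop would get stuck on):

   #include <stdbool.h>
   #include <stdio.h>

   #define N 4   /* toy interference graph: a path 0-1-2-3 */
   #define K 2   /* colors available */

   static const bool adj[N][N] = {
      {0, 1, 0, 0},
      {1, 0, 1, 0},
      {0, 1, 0, 1},
      {0, 0, 1, 0},
   };

   int main(void)
   {
      int stack[N], sp = 0, color[N] = {-1, -1, -1, -1};
      bool removed[N] = {false};

      /* Simplify: keep removing a node with fewer than K un-removed
       * neighbors; such a node is guaranteed a color later no matter how
       * the rest of the graph gets colored. (A graph with no such node
       * would spin here; that is where do_regalloc() pushes a node
       * optimistically instead.) */
      while (sp < N) {
         for (int i = 0; i < N; i++) {
            if (removed[i])
               continue;
            int degree = 0;
            for (int j = 0; j < N; j++)
               degree += adj[i][j] && !removed[j];
            if (degree < K) {
               removed[i] = true;
               stack[sp++] = i;
               break;
            }
         }
      }

      /* Select: pop in reverse order, taking the lowest color not used by
       * an already-colored neighbor. */
      while (sp--) {
         int n = stack[sp];
         for (int c = 0; c < K; c++) {
            bool used = false;
            for (int j = 0; j < N; j++)
               used |= adj[n][j] && color[j] == c;
            if (!used) {
               color[n] = c;
               break;
            }
         }
         printf("node %d -> color %d\n", n, color[n]);
      }
      return 0;
   }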
+/* Register allocation */
+
+static bool can_simplify(struct regalloc_ctx *ctx, unsigned i)
+{
+   struct reg_info *info = &ctx->registers[i];
+   if (i < ctx->comp->cur_reg) {
+      /* Physical regs. */
+      return info->phys_conflicts + info->node_conflicts < GPIR_PHYSICAL_REG_NUM;
+   } else {
+      /* Nodes: if we manage to allocate all of its conflicting physical
+       * registers, they will take up at most GPIR_PHYSICAL_REG_NUM colors, so
+       * we can ignore any more than that.
+       */
+      return MIN2(info->phys_conflicts, GPIR_PHYSICAL_REG_NUM) +
+             info->node_conflicts < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM;
+   }
+}
+
+static void push_stack(struct regalloc_ctx *ctx, unsigned i)
+{
+   ctx->stack[ctx->stack_size++] = i;
+   if (i < ctx->comp->cur_reg)
+      gpir_debug("pushing reg%u\n", i);
+   else
+      gpir_debug("pushing %d\n", i - ctx->comp->cur_reg);
+
+   struct reg_info *info = &ctx->registers[i];
+   assert(info->visited);
+
+   util_dynarray_foreach(&info->conflict_list, unsigned, conflict) {
+      struct reg_info *conflict_info = &ctx->registers[*conflict];
+      if (i < ctx->comp->cur_reg) {
+         assert(conflict_info->phys_conflicts > 0);
+         conflict_info->phys_conflicts--;
+      } else {
+         assert(conflict_info->node_conflicts > 0);
+         conflict_info->node_conflicts--;
+      }
+      if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) {
+         ctx->worklist[ctx->worklist_end++] = *conflict;
+         ctx->registers[*conflict].visited = true;
+      }
+   }
+}
+
+static bool do_regalloc(struct regalloc_ctx *ctx)
+{
+   ctx->worklist_start = 0;
+   ctx->worklist_end = 0;
+   ctx->stack_size = 0;
+
+   /* Step 1: find the initially simplifiable registers */
+   for (int i = 0; i < ctx->comp->cur_reg + ctx->comp->cur_index; i++) {
+      if (can_simplify(ctx, i)) {
+         ctx->worklist[ctx->worklist_end++] = i;
+         ctx->registers[i].visited = true;
+      }
+   }
+
+   while (true) {
+      /* Step 2: push onto the stack whatever we can */
+      while (ctx->worklist_start != ctx->worklist_end) {
+         push_stack(ctx, ctx->worklist[ctx->worklist_start++]);
       }
 
+      if (ctx->stack_size < ctx->num_nodes_and_regs) {
+         /* Step 3: if there are still unsimplifiable nodes left, we need to
+          * optimistically push a node onto the stack. Choose the one with
+          * the smallest number of current neighbors, since that's the most
+          * likely to succeed.
+          */
+         unsigned min_conflicts = UINT_MAX;
+         unsigned best_reg = 0;
+         for (unsigned reg = 0; reg < ctx->num_nodes_and_regs; reg++) {
+            struct reg_info *info = &ctx->registers[reg];
+            if (info->visited)
+               continue;
+            if (info->phys_conflicts + info->node_conflicts < min_conflicts) {
+               best_reg = reg;
+               min_conflicts = info->phys_conflicts + info->node_conflicts;
+            }
+         }
+         gpir_debug("optimistic triggered\n");
+         ctx->registers[best_reg].visited = true;
+         push_stack(ctx, best_reg);
+      } else {
+         break;
+      }
+   }
+
+   /* Step 4: pop off the stack and assign colors */
+   for (int i = ctx->num_nodes_and_regs - 1; i >= 0; i--) {
+      unsigned idx = ctx->stack[i];
+      struct reg_info *reg = &ctx->registers[idx];
+
+      unsigned num_available_regs;
+      if (idx < ctx->comp->cur_reg) {
+         num_available_regs = GPIR_PHYSICAL_REG_NUM;
+      } else {
+         num_available_regs = GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM;
+      }
+
+      bool found = false;
+      unsigned start = i % num_available_regs;
+      for (unsigned j = 0; j < num_available_regs; j++) {
+         unsigned candidate = (j + start) % num_available_regs;
+         bool available = true;
+         util_dynarray_foreach(&reg->conflict_list, unsigned, conflict_idx) {
+            struct reg_info *conflict = &ctx->registers[*conflict_idx];
+            if (conflict->assigned_color >= 0 &&
+                conflict->assigned_color == (int) candidate) {
+               available = false;
+               break;
+            }
+         }
 
-      /* find a free reg for this node */
-      int i;
-      for (i = 0; i < GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; i++) {
-         /* round robin reg select to reduce false dep when schedule */
-         int reg = (reg_search_start + i) % (GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM);
-         if (!active[reg]) {
-            active[reg] = node;
-            node->value_reg = reg;
-            reg_search_start++;
+         if (available) {
+            reg->assigned_color = candidate;
+            found = true;
             break;
          }
       }
-      /* TODO: spill */
-      assert(i != GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM);
+      /* TODO: spilling */
+      if (!found) {
+         gpir_error("Failed to allocate registers\n");
+         return false;
+      }
+   }
+
+   return true;
+}
+
+static void assign_regs(struct regalloc_ctx *ctx)
+{
+   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+      list_for_each_entry(gpir_node, node, &block->node_list, list) {
+         if (node->index >= 0) {
+            node->value_reg =
+               ctx->registers[ctx->comp->cur_reg + node->index].assigned_color;
+         }
+
+         if (node->op == gpir_op_load_reg) {
+            gpir_load_node *load = gpir_node_to_load(node);
+            unsigned color = ctx->registers[load->reg->index].assigned_color;
+            load->index = color / 4;
+            load->component = color % 4;
+         }
+
+         if (node->op == gpir_op_store_reg) {
+            gpir_store_node *store = gpir_node_to_store(node);
+            unsigned color = ctx->registers[store->reg->index].assigned_color;
+            store->index = color / 4;
+            store->component = color % 4;
+            node->value_reg = color;
+         }
+      }
+
+      block->live_out_phys = 0;
+
+      int reg_idx;
+      BITSET_WORD tmp;
+      BITSET_FOREACH_SET(reg_idx, tmp, block->live_out, ctx->comp->cur_reg) {
+         if (BITSET_TEST(block->def_out, reg_idx)) {
+            block->live_out_phys |= (1ull << ctx->registers[reg_idx].assigned_color);
+         }
+      }
+   }
+}
@@ -104,6 +470,14 @@ static void regalloc_print_result(gpir_compiler *comp)
          gpir_node *pred = dep->pred;
          printf(" %d/%d", pred->index, pred->value_reg);
       }
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         printf(" -/%d", 4 * load->index + load->component);
+         printf(" (%d)", load->reg->index);
+      } else if (node->op == gpir_op_store_reg) {
+         gpir_store_node *store = gpir_node_to_store(node);
+         printf(" (%d)", store->reg->index);
+      }
       printf("\n");
    }
    printf("----------------------------\n");
@@ -112,10 +486,38 @@
 
 bool gpir_regalloc_prog(gpir_compiler *comp)
 {
-   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
-      regalloc_block(block);
+   struct regalloc_ctx ctx;
+
+   ctx.mem_ctx = ralloc_context(NULL);
+   ctx.num_nodes_and_regs = comp->cur_reg + comp->cur_index;
+   ctx.bitset_words = BITSET_WORDS(ctx.num_nodes_and_regs);
+   ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
+   ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
+   ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
+   ctx.comp = comp;
+
+   ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, ctx.num_nodes_and_regs);
+   for (unsigned i = 0; i < ctx.num_nodes_and_regs; i++) {
+      ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD,
+                                                 ctx.bitset_words);
+      util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx);
    }
 
+   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
+      block->live_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
+      block->live_in = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
+      block->def_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
+   }
+
+   calc_liveness(&ctx);
+   calc_interference(&ctx);
+   if (!do_regalloc(&ctx)) {
+      ralloc_free(ctx.mem_ctx);
+      return false;
+   }
+   assign_regs(&ctx);
 
    regalloc_print_result(comp);
+   ralloc_free(ctx.mem_ctx);
    return true;
 }
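One subtlety above is why can_simplify() clamps the physreg conflict count with MIN2 for nodes. A worked check, assuming GPIR_PHYSICAL_REG_NUM is 64 and GPIR_VALUE_REG_NUM is 11 (these match gpir.h at the time of writing, but treat them as assumptions of this sketch):

   #include <assert.h>

   #define PHYS 64                          /* assumed GPIR_PHYSICAL_REG_NUM */
   #define VAL  11                          /* assumed GPIR_VALUE_REG_NUM */
   #define MIN2(a, b) ((a) < (b) ? (a) : (b))

   int main(void)
   {
      /* A node conflicting with 100 physical registers and 10 nodes: */
      unsigned phys_conflicts = 100, node_conflicts = 10;

      /* The conflicting physregs only have PHYS colors to share among
       * themselves, so at most MIN2(phys_conflicts, PHYS) distinct colors
       * can ever be blocked by them. */
      unsigned worst_blocked = MIN2(phys_conflicts, PHYS) + node_conflicts;

      /* 64 + 10 = 74 < 75 usable colors, so the node is still simplifiable
       * even though its raw degree (110) far exceeds the color count. */
      assert(worst_blocked < PHYS + VAL);
      return 0;
   }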
diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c
index 3b490974fc0..ac132776d43 100644
--- a/src/gallium/drivers/lima/ir/gp/scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/scheduler.c
@@ -215,6 +215,14 @@ typedef struct {
     * schedule the instruction.
     */
    int total_spill_needed;
+
+   /* For each physical register, a linked list of loads associated with it in
+    * this block. When we spill a value to a given register, and there are
+    * existing loads associated with it that haven't been scheduled yet, we
+    * have to make sure that the corresponding unspill happens after the last
+    * original use has happened, i.e. is scheduled before.
+    */
+   struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM];
 } sched_ctx;
 
 static int gpir_min_dist_alu(gpir_dep *dep)
@@ -535,6 +543,19 @@ static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
       }
    }
 
+   if (node->op == gpir_op_store_reg) {
+      /* This register may be loaded in the next basic block, in which case
+       * there still needs to be a 2 instruction gap. We do what the blob
+       * seems to do and simply disable stores in the last two instructions of
+       * the basic block.
+       *
+       * TODO: We may be able to do better than this, but we have to check
+       * first if storing a register works across branches.
+       */
+      if (instr->index < 2)
+         return false;
+   }
+
    node->sched.instr = instr;
 
    int max_node_spill_needed = INT_MAX;
@@ -1009,10 +1030,6 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
 
    ctx->live_physregs |= (1ull << physreg);
 
-   /* TODO: when we support multiple basic blocks, there may be register
-    * loads/stores to this register other than this one that haven't been
-    * scheduled yet so we may need to insert write-after-read dependencies.
-    */
    gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
    store->index = physreg / 4;
    store->component = physreg % 4;
@@ -1030,6 +1047,16 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
    }
    node->sched.physreg_store = store;
    gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
+
+   list_for_each_entry(gpir_load_node, load,
+                       &ctx->physreg_reads[physreg], reg_link) {
+      gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
+      if (load->node.sched.ready) {
+         list_del(&load->node.list);
+         load->node.sched.ready = false;
+      }
+   }
+
    node->sched.ready = false;
    schedule_insert_ready_list(ctx, &store->node);
 }
@@ -1556,13 +1583,21 @@ static bool schedule_block(gpir_block *block)
    list_inithead(&ctx.ready_list);
    ctx.block = block;
    ctx.ready_list_slots = 0;
-   /* TODO initialize with block live out once we have proper liveness
-    * tracking
-    */
-   ctx.live_physregs = 0;
+   ctx.live_physregs = block->live_out_phys;
+
+   for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
+      list_inithead(&ctx.physreg_reads[i]);
+   }
 
    /* construct the ready list from root nodes */
    list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
+      /* Add to physreg_reads */
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         unsigned index = 4 * load->index + load->component;
+         list_addtail(&load->reg_link, &ctx.physreg_reads[index]);
+      }
+
      if (gpir_node_is_root(node))
         schedule_insert_ready_list(&ctx, node);
   }
@@ -1623,22 +1658,6 @@ static void schedule_build_dependency(gpir_block *block)
       }
    }
 
-   /* Forward dependencies. We only need to add these for register loads,
-    * since value registers already have an input dependency.
-    */
-   list_for_each_entry(gpir_node, node, &block->node_list, list) {
-      if (node->op == gpir_op_load_reg) {
-         gpir_load_node *load = gpir_node_to_load(node);
-         unsigned index = 4 * load->index + load->component;
-         if (last_written[index]) {
-            gpir_node_add_dep(node, last_written[index], GPIR_DEP_READ_AFTER_WRITE);
-         }
-      }
-
-      if (node->value_reg >= 0)
-         last_written[node->value_reg] = node;
-   }
-
    memset(last_written, 0, sizeof(last_written));
 
    /* False dependencies. For value registers, these exist only to make sure
@@ -1651,6 +1670,10 @@ static void schedule_build_dependency(gpir_block *block)
          if (last_written[index]) {
            gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ);
         }
+      } else if (node->op == gpir_op_store_reg) {
+         gpir_store_node *store = gpir_node_to_store(node);
+         unsigned index = 4 * store->index + store->component;
+         last_written[index] = node;
      } else {
         add_fake_dep(node, node, last_written);
      }
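Finally, an illustration of the ordering problem the physreg_reads lists solve in try_spill_node() (positions and names hypothetical; keep in mind the scheduler works bottom-up, so a node scheduled earlier ends up later in the emitted program):

   /* r3.x holds a value live into this block and is read by a
    * not-yet-scheduled load when the scheduler decides to spill node n9
    * to r3.x:
    *
    *    n4 = load_reg  r3.x      <- original read, not yet scheduled
    *    ...
    *    store_reg r3.x, n9       <- spill store created by try_spill_node()
    *
    * The GPIR_DEP_WRITE_AFTER_READ edge added from the store to each load
    * on physreg_reads[r3.x] forces the store to land after the load in
    * program order, so the spill cannot clobber the value before its last
    * original use. A load already sitting in the ready list is pulled
    * back out, since its new unscheduled successor makes it not ready. */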