diff --git a/src/panfrost/compiler/bi_liveness.c b/src/panfrost/compiler/bi_liveness.c index 52e0450877b..e5271c54551 100644 --- a/src/panfrost/compiler/bi_liveness.c +++ b/src/panfrost/compiler/bi_liveness.c @@ -32,8 +32,13 @@ bi_liveness_ins_update_ssa(BITSET_WORD *live, const bi_instr *I) bi_foreach_dest(I, d) BITSET_CLEAR(live, I->dest[d].value); - bi_foreach_ssa_src(I, s) + bi_foreach_ssa_src(I, s) { + /* If the source is not live after this instruction, but becomes live + * at this instruction, this is the use that kills the source + */ + I->src[s].kill_ssa = !BITSET_TEST(live, I->src[s].value); BITSET_SET(live, I->src[s].value); + } } void @@ -89,21 +94,17 @@ bi_compute_liveness_ssa(bi_context *ctx) memcpy(live, blk->ssa_live_in, words * sizeof(BITSET_WORD)); /* Kill write */ - bi_foreach_instr_in_block(blk, I) { - if (I->op != BI_OPCODE_PHI) - break; - + bi_foreach_phi_in_block(blk, I) { BITSET_CLEAR(live, I->dest[0].value); } /* Make live the corresponding source */ - bi_foreach_instr_in_block(blk, I) { - if (I->op != BI_OPCODE_PHI) - break; - + bi_foreach_phi_in_block(blk, I) { bi_index operand = I->src[bi_predecessor_index(blk, *pred)]; - if (bi_is_ssa(operand)) + if (bi_is_ssa(operand)) { BITSET_SET(live, operand.value); + I->src[bi_predecessor_index(blk, *pred)].kill_ssa = false; + } } BITSET_WORD progress = 0; diff --git a/src/panfrost/compiler/bi_printer.c.py b/src/panfrost/compiler/bi_printer.c.py index e203aa95c57..833805d2276 100644 --- a/src/panfrost/compiler/bi_printer.c.py +++ b/src/panfrost/compiler/bi_printer.c.py @@ -88,6 +88,8 @@ bi_print_index(FILE *fp, bi_index index) { if (index.discard) fputs("^", fp); + if (index.kill_ssa) + fputs("!", fp); if (bi_is_null(index)) fprintf(fp, "_"); @@ -95,6 +97,8 @@ bi_print_index(FILE *fp, bi_index index) fprintf(fp, "#0x%x", index.value); else if (index.type == BI_INDEX_FAU && index.value >= BIR_FAU_UNIFORM) fprintf(fp, "u%u", index.value & ~BIR_FAU_UNIFORM); + else if (index.type == BI_INDEX_FAU && index.memory) + fprintf(fp, "m%u", index.value); else if (index.type == BI_INDEX_FAU) fprintf(fp, "%s", bir_fau_name(index.value)); else if (index.type == BI_INDEX_PASS) diff --git a/src/panfrost/compiler/bi_ra.c b/src/panfrost/compiler/bi_ra.c index 486180e2bf0..f9b99272419 100644 --- a/src/panfrost/compiler/bi_ra.c +++ b/src/panfrost/compiler/bi_ra.c @@ -1,27 +1,10 @@ /* - * Copyright (C) 2020 Collabora Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
+ * Copyright (C) 2020,2025 Collabora Ltd. + * SPDX-License-Identifier: MIT * * Authors (Collabora): * Alyssa Rosenzweig + * Eric R. Smith */ #include "util/u_memory.h" @@ -454,7 +437,7 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs) : (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48)); /* To test spilling, mimic a small register file */ - if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend) + if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend && (bifrost_debug & BIFROST_DBG_NOSSARA)) default_affinity &= BITFIELD64_MASK(48) << 8; bi_foreach_instr_global(ctx, ins) { @@ -550,7 +533,7 @@ bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index) /* LCRA didn't bother solving this index (how lazy!) */ signed solution = l->solutions[index.value]; if (solution < 0) { - assert(!is_offset); + assert(0 && "no solution for index"); return index; } @@ -632,7 +615,7 @@ bi_choose_spill_node(bi_context *ctx, struct lcra_state *l) } } - unsigned best_benefit = 0.0; + unsigned best_benefit = 0; signed best_node = -1; if (nodearray_is_sparse(&l->linear[l->spill_node])) { @@ -706,7 +689,7 @@ bi_tls_ptr(bool hi) return bi_fau(BIR_FAU_TLS_PTR, hi); } -static bi_instr * +bi_instr * bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) { if (b->shader->arch >= 9) { @@ -718,7 +701,7 @@ bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) } } -static void +void bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) { if (b->shader->arch >= 9) { @@ -946,6 +929,7 @@ bi_out_of_ssa(bi_context *ctx) { bi_index zero = bi_fau(BIR_FAU_IMMEDIATE | 0, false); unsigned first_reg = ctx->ssa_alloc; + bool allow_propagate; /* Trivially lower phis */ bi_foreach_block(ctx, block) { @@ -969,16 +953,28 @@ bi_out_of_ssa(bi_context *ctx) assert(!I->src[i].neg); assert(I->src[i].swizzle == BI_SWIZZLE_H01); - /* MOV of immediate needs lowering on Valhall */ - if (ctx->arch >= 9 && I->src[i].type == BI_INDEX_CONSTANT) + if (I->src[i].memory) + /* spilled register, need to un-spill */ + bi_load_tl(&b, 32, reg, I->src[i].value); + else if (ctx->arch >= 9 && I->src[i].type == BI_INDEX_CONSTANT) + /* MOV of immediate needs lowering on Valhall */ bi_iadd_imm_i32_to(&b, reg, zero, I->src[i].value); else bi_mov_i32_to(&b, reg, I->src[i]); } /* Replace the phi with a move */ + allow_propagate = false; bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - bi_mov_i32_to(&b, I->dest[0], reg); + if (I->dest[0].memory) { + /* dest was spilled to memory */ + bi_store_tl(&b, 32, reg, I->dest[0].value); + allow_propagate = false; + } else if (ctx->arch >= 9 && reg.type == BI_INDEX_CONSTANT) + /* MOV of immediate needs lowering on Valhall */ + bi_iadd_imm_i32_to(&b, I->dest[0], zero, reg.value); + else + bi_mov_i32_to(&b, I->dest[0], reg); bi_remove_instruction(I); /* Propagate that move within the block. The destination @@ -987,7 +983,7 @@ bi_out_of_ssa(bi_context *ctx) * possible in the next pass. 
 */
      bi_foreach_instr_in_block_rev(block, prop) {
-         if (prop->op == BI_OPCODE_PHI)
+         if (prop->op == BI_OPCODE_PHI || !allow_propagate)
            break;
 
         bi_foreach_src(prop, s) {
@@ -1083,14 +1079,49 @@ bi_register_allocate(bi_context *ctx)
    struct lcra_state *l = NULL;
    bool success = false;
 
-   unsigned iter_count = 2000; /* max iterations */
-
+   unsigned iter_count = 0;
+   unsigned max_iters = 2000; /* max iterations */
    /* Number of bytes of memory we've spilled into */
    unsigned spill_count = ctx->info.tls_size;
 
    if (ctx->arch >= 9)
       va_lower_split_64bit(ctx);
 
+   /* Get an estimate of register demand (this must be done in SSA form)
+    * and do a preliminary spill. This doesn't have to be perfect, since
+    * register allocation can spill too, but RA is really slow, so the
+    * closer we get to having enough registers free, the better.
+    */
+   if (!(bifrost_debug & BIFROST_DBG_NOSSARA)) {
+      unsigned regs_to_use =
+         ((bifrost_debug & BIFROST_DBG_SPILL) && !ctx->inputs->is_blend) ? 16 : BI_MAX_REGS;
+      bool verbose = bifrost_debug & BIFROST_DBG_VERBOSE;
+
+      bi_compute_liveness_ssa(ctx);
+      if (verbose) {
+         bi_print_shader(ctx, stdout);
+      }
+      unsigned register_demand = bi_calc_register_demand(ctx);
+      if (register_demand > regs_to_use) {
+         /* Spill registers if we can */
+         if (ctx->inputs->is_blend)
+            unreachable("Blend shaders may not spill");
+
+         spill_count = bi_spill_ssa(ctx, regs_to_use, spill_count);
+         /* By default, we use packed TLS addressing on Valhall.
+          * We cannot cross 16 byte boundaries with packed TLS
+          * addressing. Align to ensure this doesn't happen. This
+          * could be optimized a bit.
+          */
+         if (ctx->arch >= 9)
+            spill_count = ALIGN_POT(spill_count, 16);
+         if (verbose) {
+            printf("\nspill_bytes=%u\n", spill_count);
+            bi_print_shader(ctx, stdout);
+         }
+      }
+   }
+
    /* Lower tied operands. SSA is broken from here on. */
    unsigned first_reg = bi_out_of_ssa(ctx);
    bi_lower_vector(ctx, first_reg);
@@ -1110,11 +1141,11 @@
    }
 
    /* Otherwise, use the register file and spill until we succeed */
-   while (!success && ((iter_count--) > 0)) {
+   while (!success && ((iter_count++) < max_iters)) {
       l = bi_allocate_registers(ctx, &success, true);
 
       if (success) {
-         ctx->info.work_reg_count = 64;
+         ctx->info.work_reg_count = BI_MAX_REGS;
       } else {
          signed spill_node = bi_choose_spill_node(ctx, l);
          lcra_free(l);
diff --git a/src/panfrost/compiler/bi_ra_ssa.c b/src/panfrost/compiler/bi_ra_ssa.c
new file mode 100644
index 00000000000..8ca3cb6445b
--- /dev/null
+++ b/src/panfrost/compiler/bi_ra_ssa.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2023-2024 Alyssa Rosenzweig
+ * Copyright 2023-2024 Valve Corporation
+ * Copyright 2022 Collabora Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+#include "util/list.h"
+#include "util/set.h"
+#include "util/u_memory.h"
+#include "bifrost_compile.h"
+#include "bifrost_nir.h"
+#include "compiler.h"
+
+/*
+ * RA treats the nesting counter, the divergent shuffle temporary, and the
+ * spiller temporaries as alive throughout if used anywhere. This could be
+ * optimized. Using a single power-of-two reserved region at the start ensures
+ * these registers are never shuffled.
+ */
+static unsigned
+reserved_size(bi_context *ctx)
+{
+   if (ctx->has_spill_pcopy_reserved)
+      return 8;
+   else
+      return 0;
+}
+
+/*
+ * Calculate register demand in registers, while gathering widths and
+ * classes. Because we allocate in SSA form, this calculation is exact and
+ * runs in linear time. Depends on SSA liveness information.
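+ *
+ * As a sketch (hypothetical IR): in a block with live-in {x, y} containing
+ * only "z = FADD.f32 x, y" where both sources have kill_ssa set, demand is
+ * 2 before the FADD and 1 after it. The returned value is the maximum
+ * demand observed at any point in the shader, on top of any reserved
+ * region.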
+ */ +unsigned +bi_calc_register_demand(bi_context *ctx) +{ + /* Print detailed demand calculation, helpful to debug spilling */ + bool debug = false; + + if (debug) { + bi_print_shader(ctx, stdout); + } + + uint8_t *widths = calloc(ctx->ssa_alloc, sizeof(uint8_t)); + enum ra_class *classes = calloc(ctx->ssa_alloc, sizeof(enum ra_class)); + + bi_foreach_instr_global(ctx, I) { + bi_foreach_ssa_dest(I, d) { + unsigned v = I->dest[d].value; + assert(widths[v] == 0 && "broken SSA"); + /* Round up vectors for easier live range splitting */ + widths[v] = 1; + classes[v] = ra_class_for_index(I->dest[d]); + } + } + + /* Calculate demand at the start of each block based on live-in, then update + * for each instruction processed. Calculate rolling maximum. + */ + unsigned max_demand = 0; + + bi_foreach_block(ctx, block) { + unsigned demand = reserved_size(ctx); + + /* Everything live-in */ + { + int i; + BITSET_FOREACH_SET(i, block->ssa_live_in, ctx->ssa_alloc) { + if (classes[i] == RA_GPR) + demand += widths[i]; + } + } + + max_demand = MAX2(demand, max_demand); + + /* To handle non-power-of-two vectors, sometimes live range splitting + * needs extra registers for 1 instruction. This counter tracks the number + * of registers to be freed after 1 extra instruction. + */ + unsigned late_kill_count = 0; + + if (debug) { + printf("\n"); + } + + bi_foreach_instr_in_block(block, I) { + /* Phis happen in parallel and are already accounted for in the live-in + * set, just skip them so we don't double count. + */ + if (I->op == BI_OPCODE_PHI) + continue; + + if (debug) { + printf("%u: ", demand); + bi_print_instr(I, stdout); + } + + /* Handle late-kill registers from last instruction */ + demand -= late_kill_count; + late_kill_count = 0; + + /* Kill sources the first time we see them */ + bi_foreach_src(I, s) { + if (!I->src[s].kill_ssa) + continue; + assert(I->src[s].type == BI_INDEX_NORMAL); + if (ra_class_for_index(I->src[s]) != RA_GPR) + continue; + + bool skip = false; + + for (unsigned backwards = 0; backwards < s; ++backwards) { + if (bi_is_equiv(I->src[backwards], I->src[s])) { + skip = true; + break; + } + } + + if (!skip) + demand -= widths[I->src[s].value]; + } + + /* Make destinations live */ + bi_foreach_ssa_dest(I, d) { + if (ra_class_for_index(I->dest[d]) != RA_GPR) + continue; + + /* Live range splits allocate at power-of-two granularity. Round up + * destination sizes (temporarily) to powers-of-two. + */ + unsigned real_width = widths[I->dest[d].value]; + unsigned pot_width = util_next_power_of_two(real_width); + + demand += pot_width; + late_kill_count += (pot_width - real_width); + } + + max_demand = MAX2(demand, max_demand); + } + + demand -= late_kill_count; + } + + free(widths); + free(classes); + return max_demand; +} diff --git a/src/panfrost/compiler/bi_spill_ssa.c b/src/panfrost/compiler/bi_spill_ssa.c new file mode 100644 index 00000000000..c71054dbbba --- /dev/null +++ b/src/panfrost/compiler/bi_spill_ssa.c @@ -0,0 +1,1332 @@ +/* + * Copyright 2023-2024 Alyssa Rosenzweig + * Copyright 2023-2024 Valve Corporation + * Copyright 2022,2025 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT + */ + +#include "util/bitset.h" +#include "util/hash_table.h" +#include "util/ralloc.h" +#include "util/u_dynarray.h" +#include "util/u_qsort.h" +#include "bi_builder.h" +#include "bifrost_nir.h" +#include "compiler.h" + +/* allow this many temporaries for spilling */ +#define MAX_TEMPS_FOR_SPILL 16 + +/* + * An implementation of "Register Spilling and Live-Range Splitting for SSA-Form + * Programs" by Braun and Hack. + */ + +/* + * Next-use distances are logically in ℤ ∪ {∞}. Modeled as saturating uint32 and + * referred to as dist_t. + * + * next_uses represents a next-use map. This is a sparse data structure mapping + * variable names to next-use dist_t's. Variables with no later use (infinite + * next-use distance) are not stored explicitly, making the time/space + * requirements O(live variables). This is important for performance and memory + * usage on big shaders with many blocks. + * + * For now, next_uses is backed by a Mesa hash table, but it could be optimized + * to something more specialized in the future. + */ +#define DIST_INFINITY (UINT32_MAX) +typedef uint32_t dist_t; + +static dist_t +dist_sum(dist_t A, dist_t B) +{ + return (A + B < A) ? DIST_INFINITY : (A + B); +} + +struct next_uses { + struct hash_table_u64 *ht; +}; + +static void +init_next_uses(struct next_uses *nu, void *memctx) +{ + nu->ht = _mesa_hash_table_u64_create(memctx); +} + +static void +destroy_next_uses(struct next_uses *nu) +{ + _mesa_hash_table_u64_destroy(nu->ht); +} + +static void +clear_next_uses(struct next_uses *nu) +{ + _mesa_hash_table_u64_clear(nu->ht); +} + +static void +copy_next_uses(struct next_uses *nu, const struct next_uses *from) +{ + clear_next_uses(nu); + + hash_table_u64_foreach(from->ht, use) { + _mesa_hash_table_u64_insert(nu->ht, use.key, use.data); + } +} + +static void +set_next_use(struct next_uses *nu, unsigned node, dist_t dist) +{ + if (dist == DIST_INFINITY) { + _mesa_hash_table_u64_remove(nu->ht, node); + } else { + uintptr_t as_ptr = (uintptr_t)(dist + 1); + assert(as_ptr != 0 && "non-NULL"); + + _mesa_hash_table_u64_insert(nu->ht, node, (void *)as_ptr); + } +} + +static dist_t +search_next_uses(const struct next_uses *nu, unsigned node) +{ + void *ent = _mesa_hash_table_u64_search(nu->ht, node); + if (!ent) + return DIST_INFINITY; + + return ((uintptr_t)ent) - 1; +} + +#define foreach_next_use(nu, node, dist) \ + hash_table_u64_foreach((nu)->ht, use_) \ + for (uint32_t _terminator = 1, node = use_.key, \ + UNUSED dist = ((uintptr_t)use_.data) - 1; \ + _terminator != 0; _terminator = 0) + +/* + * Calculate the minimum of two next-use sets. Values absent from one of the + * underlying sets are infinity so do not contribute to the minimum, instead + * acting like a set union. 
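+ *
+ * A small worked example (hypothetical distances): the minimum of
+ * {a: 3, b: 7} and {b: 5, c: 2} is {a: 3, b: 5, c: 2}. Entries a and c
+ * pass through unchanged because a missing entry stands for distance ∞.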
+ */ +static bool +minimum_next_uses(struct next_uses *nu, const struct next_uses *from) +{ + bool progress = false; + + foreach_next_use(from, node, from_dist) { + dist_t nu_dist = search_next_uses(nu, node); + + if (from_dist < nu_dist) { + set_next_use(nu, node, from_dist); + progress = true; + } + } + + return progress; +} + +static uint32_t +instr_cycles(const bi_instr *I) +{ + return 1; +} + +struct spill_block { + /* Set of values available in the register file at the end */ + unsigned W_exit[BI_MAX_REGS]; + unsigned nW_exit; + + unsigned W_entry[BI_MAX_REGS]; + unsigned nW_entry; + + /* Set of live-out spilled values at the end of the block */ + unsigned *S_exit; + unsigned nS_exit; + + unsigned *S_entry; + unsigned nS_entry; + + /* Estimate */ + uint32_t cycles; + + /* Next-use maps at the start/end of the block */ + struct next_uses next_use_in; + struct next_uses next_use_out; +}; + +struct spill_ctx { + void *memctx; + bi_context *shader; + bi_block *block; + + /* Set of values currently available in the register file */ + BITSET_WORD *W; + + /* |W| = Current register pressure */ + unsigned nW; + + /* Local IPs of next-use */ + dist_t *next_uses; + + /* Current local IP relative to the start of the block */ + uint32_t ip; + + /* Set of live values that have been spilled. Contrary to the paper, this + * is not a subset of W: the definition in the paper is bogus. + */ + BITSET_WORD *S; + + /* Mapping of rematerializable values to their definitions, or NULL for nodes + * that are not materializable. + */ + bi_instr **remat; + + /* Maximum register pressure allowed */ + unsigned k; + + /* Number of variables allocated */ + unsigned n_alloc; + + /* Information on blocks indexed in source order */ + struct spill_block *blocks; + + /* first FAU index for spilled registers */ + unsigned spill_base; + + /* Max index reserved for spilled indices */ + uint32_t spill_max; + + /* count of spilled bytes */ + uint32_t spill_bytes; + + /* mapping of registers to spill locations */ + uint32_t *spill_map; + /* and the reverse */ + uint32_t *mem_map; + + /* architecture */ + unsigned arch; +}; + +static inline struct spill_block * +spill_block(struct spill_ctx *ctx, bi_block *block) +{ + return &ctx->blocks[block->index]; +} + +/* Calculate the register demand of a node. This should be rounded up to + * a power-of-two to match the equivalent calculations in RA. + * For now just punt and return 1, but we'll want to revisit this later. + */ +static inline unsigned +node_size(struct spill_ctx *ctx, unsigned node) +{ + return 1; +} + +/* + * Map a control flow edge to a block. Assumes no critical edges. + */ +static bi_block * +bi_edge_to_block(bi_block *pred, bi_block *succ) +{ + /* End of predecessor is unique if there's a single successor */ + if (bi_num_successors(pred) == 1) + return pred; + + /* The predecessor has multiple successors, meaning this is not the only + * edge leaving the predecessor. Therefore, it is the only edge entering + * the successor (otherwise the edge would be critical), so the start of + * the successor is unique. + */ + assert(bi_num_predecessors(succ) == 1 && "critical edge detected"); + return succ; +} + +/* + * Get a cursor to insert along a control flow edge: either at the start of the + * successor or the end of the predecessor. This relies on the control flow + * graph having no critical edges. 
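+ *
+ * Sketch (hypothetical CFG): if block A branches to both B and C, the edge
+ * A->B must be the only edge entering B (otherwise it would be critical),
+ * so we insert at the start of B. If B is A's sole successor, we insert at
+ * the end of A instead.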
+ */
+static bi_cursor
+bi_along_edge(bi_block *pred, bi_block *succ)
+{
+   bi_block *to = bi_edge_to_block(pred, succ);
+
+   if (to == pred)
+      return bi_after_block_logical(pred);
+   else
+      return bi_before_block(succ);
+}
+
+static bool
+bi_idx_is_memory(bi_index idx)
+{
+   return idx.memory;
+}
+
+static bi_index
+bi_index_as_mem(bi_index idx, struct spill_ctx *ctx)
+{
+   assert(idx.type == BI_INDEX_NORMAL);
+   idx.type = BI_INDEX_FAU;
+   unsigned val = idx.value;
+
+   assert(val < ctx->spill_max);
+   if (ctx->spill_map[val] == 0xFFFFFFFFU) {
+      uint32_t remap = ctx->spill_bytes;
+      ctx->spill_bytes += 4;
+      ctx->spill_map[val] = remap;
+      unsigned i = (remap - ctx->spill_base) / 4;
+      assert(i < ctx->spill_max);
+      ctx->mem_map[i] = val;
+   }
+   idx.value = ctx->spill_map[val];
+   idx.memory = true;
+   return idx;
+}
+
+static unsigned
+chase_mem_index(bi_index ref, struct spill_ctx *ctx)
+{
+   unsigned val = ref.value;
+   if (bi_idx_is_memory(ref)) {
+      unsigned i = (val - ctx->spill_base) / 4;
+      return ctx->mem_map[i];
+   }
+   return val;
+}
+
+static bi_index
+reconstruct_index(struct spill_ctx *ctx, unsigned node)
+{
+   bi_index r = bi_get_index(node);
+   /* TODO: do we need to reconstruct the swizzle here? */
+   return r;
+}
+
+static bool
+can_remat(bi_instr *I)
+{
+   switch (I->op) {
+   case BI_OPCODE_MOV_I32:
+      assert(!I->src[0].memory);
+      assert(!I->dest[0].memory);
+      assert(I->dest[0].type == BI_INDEX_NORMAL);
+      return (I->src[0].type == BI_INDEX_CONSTANT);
+   default:
+      return false;
+   }
+}
+
+static bi_instr *
+remat_to(bi_builder *b, bi_index dst, struct spill_ctx *ctx, unsigned node)
+{
+   assert(node < ctx->spill_max);
+   bi_instr *I = ctx->remat[node];
+   assert(can_remat(I));
+
+   switch (I->op) {
+   case BI_OPCODE_MOV_I32:
+      assert(I->src[0].type == BI_INDEX_CONSTANT);
+      assert(dst.type == BI_INDEX_NORMAL);
+      return bi_mov_i32_to(b, dst, I->src[0]);
+   default:
+      unreachable("invalid remat");
+   }
+}
+
+static void
+insert_spill(bi_builder *b, struct spill_ctx *ctx, unsigned node)
+{
+   assert(node < ctx->spill_max);
+   if (!ctx->remat[node]) {
+      bi_index idx = reconstruct_index(ctx, node);
+      bi_index mem = bi_index_as_mem(idx, ctx);
+      unsigned bits = 32;
+
+      bi_store_tl(b, bits, idx, mem.value);
+
+      b->shader->spills++;
+      /* We only need the extra registers reserved if we actually spilled
+       * instead of just rematerializing.
+ */ + b->shader->has_spill_pcopy_reserved = true; + } +} + +static void +insert_reload(struct spill_ctx *ctx, bi_block *block, bi_cursor cursor, + unsigned node) +{ + bi_builder b = bi_init_builder(ctx->shader, cursor); + bi_index idx = reconstruct_index(ctx, node); + + /* Reloading breaks SSA, but we're leaving SSA anyway */ + assert(node < ctx->spill_max); + if (ctx->remat[node]) { + remat_to(&b, idx, ctx, node); + } else { + bi_index mem = bi_index_as_mem(idx, ctx); + unsigned bits = 32; + bi_load_tl(&b, bits, idx, mem.value); + b.shader->fills++; + } +} + +/* Insert into the register file */ +static void +insert_W(struct spill_ctx *ctx, unsigned v) +{ + assert(v < ctx->n_alloc); + assert(!BITSET_TEST(ctx->W, v)); + + BITSET_SET(ctx->W, v); + ctx->nW += node_size(ctx, v); +} + +/* Remove from the register file */ +static void +remove_W(struct spill_ctx *ctx, unsigned v) +{ + assert(v < ctx->n_alloc); + assert(BITSET_TEST(ctx->W, v)); + + BITSET_CLEAR(ctx->W, v); + ctx->nW -= node_size(ctx, v); +} + +static void +remove_W_if_present(struct spill_ctx *ctx, unsigned v) +{ + assert(v < ctx->n_alloc); + + if (BITSET_TEST(ctx->W, v)) + remove_W(ctx, v); +} + +#define bi_worklist_init(ctx, w) u_worklist_init(w, ctx->num_blocks, ctx) +#define bi_worklist_push_head(w, block) u_worklist_push_head(w, block, index) +#define bi_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index) +#define bi_worklist_peek_head(w) u_worklist_peek_head(w, bi_block, index) +#define bi_worklist_pop_head(w) u_worklist_pop_head(w, bi_block, index) +#define bi_worklist_peek_tail(w) u_worklist_peek_tail(w, bi_block, index) +#define bi_worklist_pop_tail(w) u_worklist_pop_tail(w, bi_block, index) + +struct candidate { + unsigned node; + dist_t dist; +}; + +static int +cmp_dist(const void *left_, const void *right_, void *ctx_) +{ + struct spill_ctx *ctx = ctx_; + const struct candidate *left = left_; + const struct candidate *right = right_; + + /* We assume that rematerializing - even before every instruction - is + * cheaper than spilling. As long as one of the nodes is rematerializable + * (with distance > 0), we choose it over spilling. Within a class of nodes + * (rematerializable or not), compare by next-use-distance. + */ + assert(left->node < ctx->n_alloc); + assert(right->node < ctx->n_alloc); + bool remat_left = ctx->remat[left->node] != NULL && left->dist > 0; + bool remat_right = ctx->remat[right->node] != NULL && right->dist > 0; + + if (remat_left != remat_right) + return remat_left ? 1 : -1; + else + return (left->dist > right->dist) - (left->dist < right->dist); +} + +/* + * Insert coupling code on block boundaries. This must ensure: + * + * - anything live-in we expect to have spilled is spilled + * - anything live-in we expect to have filled is filled + * - phi sources are spilled if the destination is spilled + * - phi sources are filled if the destination is not spilled + * + * The latter two requirements ensure correct pressure calculations for phis. 
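+ *
+ * For instance (hypothetical): if succ expects v spilled but a predecessor
+ * still holds v only in a register, a store of v is inserted along that
+ * edge; conversely, a value expected in a register but only available in
+ * memory is reloaded along the edge.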
+ */ +static ATTRIBUTE_NOINLINE void +insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ) +{ + struct spill_block *sp = spill_block(ctx, pred); + struct spill_block *ss = spill_block(ctx, succ); + + bi_foreach_phi_in_block(succ, I) { + if (!bi_idx_is_memory(I->dest[0])) + continue; + + bi_builder b = + bi_init_builder(ctx->shader, bi_before_function(ctx->shader)); + + unsigned s = bi_predecessor_index(succ, pred); + + /* Copy immediate/uniform phi sources to memory variables at the start of + * the program, where pressure is zero and hence the copy is legal. + */ + if (I->src[s].type != BI_INDEX_NORMAL && I->src[s].type != BI_INDEX_FAU) { + assert(I->src[s].type == BI_INDEX_CONSTANT || + I->src[s].type == BI_INDEX_REGISTER); + + bi_index gpr = bi_temp(ctx->shader); + unsigned bits = 32; + + assert(gpr.type == BI_INDEX_NORMAL); + if (ctx->arch >= 9 && I->src[s].type == BI_INDEX_CONSTANT) { + /* MOV of immediate needs lowering on Valhall */ + bi_index zero = bi_fau(BIR_FAU_IMMEDIATE | 0, false); + bi_iadd_imm_i32_to(&b, gpr, zero, I->src[s].value); + } else + bi_mov_i32_to(&b, gpr, I->src[s]); + bi_index mem = bi_index_as_mem(gpr, ctx); + bi_store_tl(&b, bits, gpr, mem.value); + I->src[s] = mem; + continue; + } + + bool spilled = false; + for (unsigned i = 0; i < sp->nS_exit; ++i) { + if (sp->S_exit[i] == I->src[s].value) { + spilled = true; + break; + } + } + + if (!spilled) { + /* Spill the phi source. TODO: avoid redundant spills here */ + bi_builder b = + bi_init_builder(ctx->shader, bi_after_block_logical(pred)); + + insert_spill(&b, ctx, I->src[s].value); + } + + if (ctx->remat[I->src[s].value]) { + unsigned node = I->src[s].value; + bi_index idx = reconstruct_index(ctx, node); + bi_index tmp = bi_temp(ctx->shader); + unsigned bits = 32; + + remat_to(&b, tmp, ctx, node); + bi_store_tl(&b, bits, tmp, bi_index_as_mem(idx, ctx).value); + } + + /* Use the spilled version */ + I->src[s] = bi_index_as_mem(I->src[s], ctx); + } + + /* Anything assumed to be spilled at the start of succ must be spilled along + * all edges. + */ + for (unsigned i = 0; i < ss->nS_entry; ++i) { + unsigned v = ss->S_entry[i]; + + bool spilled = false; + for (unsigned j = 0; j < sp->nS_exit; ++j) { + if (sp->S_exit[j] == v) { + spilled = true; + break; + } + } + + /* We handle spilling phi destinations separately */ + bi_foreach_phi_in_block(succ, phi) { + if (chase_mem_index(phi->dest[0], ctx) == v) { + spilled = true; + break; + } + } + + if (spilled) + continue; + + bi_builder b = bi_init_builder(ctx->shader, bi_along_edge(pred, succ)); + insert_spill(&b, ctx, v); + } + + /* Variables in W at the start of succ must be defined along the edge. 
*/ + for (unsigned i = 0; i < ss->nW_entry; ++i) { + unsigned node = ss->W_entry[i]; + bool defined = false; + + /* Variables live at the end of the predecessor are live along the edge */ + for (unsigned j = 0; j < sp->nW_exit; ++j) { + if (sp->W_exit[j] == node) { + defined = true; + break; + } + } + + /* Phis are defined along the edge */ + bi_foreach_phi_in_block(succ, phi) { + if (phi->dest[0].value == node) { + defined = true; + break; + } + } + + if (defined) + continue; + + /* Otherwise, inserting a reload defines the variable along the edge */ + bi_block *reload_block = bi_edge_to_block(pred, succ); + insert_reload(ctx, reload_block, bi_along_edge(pred, succ), node); + } + + bi_foreach_phi_in_block(succ, I) { + if (bi_idx_is_memory(I->dest[0])) + continue; + + unsigned s = bi_predecessor_index(succ, pred); + + /* Treat immediate/uniform phi sources as registers for pressure + * accounting and phi lowering purposes. Parallel copy lowering can handle + * a copy from a immediate/uniform to a register, but not from an + * immediate/uniform directly to memory. + */ + if (I->src[s].type != BI_INDEX_NORMAL && !I->src[s].memory) { + assert(I->src[s].type == BI_INDEX_CONSTANT || + I->src[s].type == BI_INDEX_REGISTER || + I->src[s].type == BI_INDEX_FAU + ); + + continue; + } + + bool live = false; + for (unsigned i = 0; i < sp->nW_exit; ++i) { + if (sp->W_exit[i] == I->src[s].value) { + live = true; + break; + } + } + + /* Fill the phi source in the predecessor */ + if (!live) { + bi_block *reload_block = bi_edge_to_block(pred, succ); + insert_reload(ctx, reload_block, bi_along_edge(pred, succ), + I->src[s].value); + } + + /* Leave as-is for the GPR version */ + assert(!bi_idx_is_memory(I->src[s])); + } +} + +/* + * Produce an array of next-use IPs relative to the start of the block. This is + * an array of dist_t scalars, representing the next-use IP of each SSA dest + * (right-to-left) and SSA source (left-to-right) of each instruction in the + * block (bottom-to-top). Its size equals the # of SSA sources in the block. + */ +static ATTRIBUTE_NOINLINE void +calculate_local_next_use(struct spill_ctx *ctx, struct util_dynarray *out) +{ + struct spill_block *sb = spill_block(ctx, ctx->block); + unsigned ip = sb->cycles; + + util_dynarray_init(out, NULL); + + struct next_uses nu; + init_next_uses(&nu, NULL); + + foreach_next_use(&sb->next_use_out, i, dist) { + set_next_use(&nu, i, dist_sum(ip, dist)); + } + + bi_foreach_instr_in_block_rev(ctx->block, I) { + ip -= instr_cycles(I); + + if (I->op != BI_OPCODE_PHI) { + bi_foreach_ssa_dest_rev(I, d) { + unsigned v = I->dest[d].value; + dist_t next_dist = search_next_uses(&nu, v); + util_dynarray_append(out, dist_t, next_dist); + } + + bi_foreach_ssa_src(I, s) { + unsigned v = I->src[s].value; + dist_t next_dist = search_next_uses(&nu, v); + util_dynarray_append(out, dist_t, next_dist); + assert((next_dist == DIST_INFINITY) == I->src[s].kill_ssa); + set_next_use(&nu, v, ip); + } + } + } + + assert(ip == 0 && "cycle counting is consistent"); + destroy_next_uses(&nu); +} + + +/* + * TODO: Implement section 4.2 of the paper. + * + * For now, we implement the simpler heuristic in Hack's thesis: sort + * the live-in set (+ destinations of phis) by next-use distance. 
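+ *
+ * E.g. with k = 2 and next-use distances {a: 1, b: 4, c: 9} (hypothetical),
+ * only a and b are taken into W_entry; c stays spilled and is reloaded at
+ * its next use.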
+ */ +static ATTRIBUTE_NOINLINE void +compute_w_entry_loop_header(struct spill_ctx *ctx) +{ + bi_block *block = ctx->block; + struct spill_block *sb = spill_block(ctx, block); + + unsigned nP = __bitset_count(block->ssa_live_in, BITSET_WORDS(ctx->n_alloc)); + struct candidate *candidates = calloc(nP, sizeof(struct candidate)); + unsigned j = 0; + + foreach_next_use(&sb->next_use_in, i, dist) { + assert(j < nP); + candidates[j++] = (struct candidate){.node = i, .dist = dist}; + } + + assert(j == nP); + + /* Sort by next-use distance */ + util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx); + + /* Take as much as we can */ + for (unsigned i = 0; i < j; ++i) { + unsigned node = candidates[i].node; + unsigned comps = node_size(ctx, node); + + if ((ctx->nW + comps) <= ctx->k) { + insert_W(ctx, node); + sb->W_entry[sb->nW_entry++] = node; + } + } + + assert(ctx->nW <= ctx->k); + free(candidates); +} + +/* + * Compute W_entry for a block. Section 4.2 in the paper. + */ +static ATTRIBUTE_NOINLINE void +compute_w_entry(struct spill_ctx *ctx) +{ + bi_block *block = ctx->block; + struct spill_block *sb = spill_block(ctx, block); + + /* Nothing to do for start blocks */ + if (bi_num_predecessors(block) == 0) + return; + + /* Loop headers have a different heuristic */ + if (block->loop_header) { + compute_w_entry_loop_header(ctx); + return; + } + + /* Usual blocks follow */ + unsigned *freq = calloc(ctx->n_alloc, sizeof(unsigned)); + + /* Record what's written at the end of each predecessor */ + bi_foreach_predecessor(ctx->block, P) { + struct spill_block *sp = spill_block(ctx, *P); + + for (unsigned i = 0; i < sp->nW_exit; ++i) { + unsigned v = sp->W_exit[i]; + freq[v]++; + } + } + + struct candidate *candidates = calloc(ctx->n_alloc, sizeof(struct candidate)); + unsigned j = 0; + + /* Variables that are in all predecessors are assumed in W_entry. Phis and + * variables in some predecessors are scored by next-use. + */ + foreach_next_use(&sb->next_use_in, i, dist) { + if (freq[i] == bi_num_predecessors(ctx->block)) { + insert_W(ctx, i); + } else if (freq[i]) { + candidates[j++] = (struct candidate){.node = i, .dist = dist}; + } + } + + bi_foreach_phi_in_block(ctx->block, I) { + bool all_found = true; + + bi_foreach_predecessor(ctx->block, pred) { + struct spill_block *sp = spill_block(ctx, *pred); + bool found = false; + + bi_index src = I->src[bi_predecessor_index(ctx->block, *pred)]; + if (src.type != BI_INDEX_NORMAL) + continue; + + unsigned v = src.value; + for (unsigned i = 0; i < sp->nW_exit; ++i) { + if (sp->W_exit[i] == v) { + found = true; + break; + } + } + + all_found &= found; + } + + /* Heuristic: if any phi source is spilled, spill the whole phi. This is + * suboptimal, but it massively reduces pointless fill/spill chains with + * massive phi webs. 
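+    *
+    * E.g. (hypothetical): for z = phi(x, y), if x is missing from the
+    * W_exit of its predecessor, z is not added as a candidate, so z stays
+    * spilled rather than forcing x to be reloaded along that edge.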
+ */ + if (!all_found) + continue; + + candidates[j++] = (struct candidate){ + .node = I->dest[0].value, + .dist = search_next_uses(&sb->next_use_in, I->dest[0].value), + }; + } + + /* Sort by next-use distance */ + util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx); + + /* Take as much as we can */ + for (unsigned i = 0; i < j; ++i) { + unsigned node = candidates[i].node; + unsigned comps = node_size(ctx, node); + + if ((ctx->nW + comps) <= ctx->k) { + insert_W(ctx, node); + sb->W_entry[sb->nW_entry++] = node; + } + } + + assert(ctx->nW <= ctx->k && "invariant"); + + free(freq); + free(candidates); +} + +/* + * We initialize S with the union of S at the exit of (forward edge) + * predecessors and the complement of W, intersected with the live-in set. The + * former propagates S forward. The latter ensures we spill along the edge when + * a live value is not selected for the entry W. + */ +static ATTRIBUTE_NOINLINE void +compute_s_entry(struct spill_ctx *ctx) +{ + unsigned v; + + bi_foreach_predecessor(ctx->block, pred) { + struct spill_block *sp = spill_block(ctx, *pred); + + for (unsigned i = 0; i < sp->nS_exit; ++i) { + v = sp->S_exit[i]; + + if (BITSET_TEST(ctx->block->ssa_live_in, v)) + BITSET_SET(ctx->S, v); + } + } + + BITSET_FOREACH_SET(v, ctx->block->ssa_live_in, ctx->n_alloc) { + if (!BITSET_TEST(ctx->W, v)) + BITSET_SET(ctx->S, v); + } + + /* Copy ctx->S to S_entry for later look-ups with coupling code */ + struct spill_block *sb = spill_block(ctx, ctx->block); + unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n_alloc)); + sb->S_entry = ralloc_array(ctx->memctx, unsigned, nS); + + int i; + BITSET_FOREACH_SET(i, ctx->S, ctx->n_alloc) + sb->S_entry[sb->nS_entry++] = i; + + assert(sb->nS_entry == nS); +} + +static ATTRIBUTE_NOINLINE void +global_next_use_distances(bi_context *ctx, void *memctx, + struct spill_block *blocks) +{ + u_worklist worklist; + u_worklist_init(&worklist, ctx->num_blocks, NULL); + + bi_foreach_block(ctx, block) { + struct spill_block *sb = &blocks[block->index]; + + init_next_uses(&sb->next_use_in, memctx); + init_next_uses(&sb->next_use_out, memctx); + + bi_foreach_instr_in_block(block, I) { + sb->cycles += instr_cycles(I); + } + + bi_worklist_push_head(&worklist, block); + } + + /* Definitions that have been seen */ + BITSET_WORD *defined = + malloc(BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD)); + + struct next_uses dists; + init_next_uses(&dists, NULL); + + /* Iterate the work list in reverse order since liveness is backwards */ + while (!u_worklist_is_empty(&worklist)) { + bi_block *blk = bi_worklist_pop_head(&worklist); + struct spill_block *sb = &blocks[blk->index]; + + /* Definitions that have been seen */ + memset(defined, 0, BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD)); + + /* Initialize all distances to infinity */ + clear_next_uses(&dists); + + uint32_t cycle = 0; + + /* Calculate dists. Phis are handled separately. */ + bi_foreach_instr_in_block(blk, I) { + if (I->op == BI_OPCODE_PHI) { + cycle++; + continue; + } + + /* Record first use before def. Phi sources are handled + * above, because they logically happen in the + * predecessor. 
+ */ + bi_foreach_ssa_src(I, s) { + if (BITSET_TEST(defined, I->src[s].value)) + continue; + if (search_next_uses(&dists, I->src[s].value) < DIST_INFINITY) + continue; + + assert(I->src[s].value < ctx->ssa_alloc); + set_next_use(&dists, I->src[s].value, cycle); + } + + /* Record defs */ + bi_foreach_ssa_dest(I, d) { + assert(I->dest[d].value < ctx->ssa_alloc); + BITSET_SET(defined, I->dest[d].value); + } + + cycle += instr_cycles(I); + } + + /* Apply transfer function to get our entry state. */ + foreach_next_use(&sb->next_use_out, node, dist) { + set_next_use(&sb->next_use_in, node, dist_sum(dist, sb->cycles)); + } + + foreach_next_use(&dists, node, dist) { + set_next_use(&sb->next_use_in, node, dist); + } + + int i; + BITSET_FOREACH_SET(i, defined, ctx->ssa_alloc) { + set_next_use(&sb->next_use_in, i, DIST_INFINITY); + } + + /* Propagate the live in of the successor (blk) to the live out of + * predecessors. + * + * Phi nodes are logically on the control flow edge and act in parallel. + * To handle when propagating, we kill writes from phis and make live the + * corresponding sources. + */ + bi_foreach_predecessor(blk, pred) { + struct spill_block *sp = &blocks[(*pred)->index]; + copy_next_uses(&dists, &sb->next_use_in); + + /* Kill write */ + bi_foreach_phi_in_block(blk, I) { + assert(I->dest[0].type == BI_INDEX_NORMAL); + set_next_use(&dists, I->dest[0].value, DIST_INFINITY); + } + + /* Make live the corresponding source */ + bi_foreach_phi_in_block(blk, I) { + bi_index operand = I->src[bi_predecessor_index(blk, *pred)]; + if (operand.type == BI_INDEX_NORMAL) + set_next_use(&dists, operand.value, 0); + } + + /* Join by taking minimum */ + if (minimum_next_uses(&sp->next_use_out, &dists)) + bi_worklist_push_tail(&worklist, *pred); + } + } + + free(defined); + u_worklist_fini(&worklist); + destroy_next_uses(&dists); +} + +static ATTRIBUTE_NOINLINE void +validate_next_use_info(UNUSED bi_context *ctx, + UNUSED struct spill_block *blocks) +{ +#ifndef NDEBUG + int i; + + bi_foreach_block(ctx, blk) { + struct spill_block *sb = &blocks[blk->index]; + + /* Invariant: next-use distance is finite iff the node is live */ + BITSET_FOREACH_SET(i, blk->ssa_live_in, ctx->ssa_alloc) + assert(search_next_uses(&sb->next_use_in, i) < DIST_INFINITY); + + BITSET_FOREACH_SET(i, blk->ssa_live_out, ctx->ssa_alloc) + assert(search_next_uses(&sb->next_use_out, i) < DIST_INFINITY); + + foreach_next_use(&sb->next_use_in, i, _) + assert(BITSET_TEST(blk->ssa_live_in, i)); + + foreach_next_use(&sb->next_use_out, i, _) + assert(BITSET_TEST(blk->ssa_live_out, i)); + } +#endif +} + +/* + * Limit the register file W to maximum size m by evicting registers. + */ +static ATTRIBUTE_NOINLINE void +limit(struct spill_ctx *ctx, bi_instr *I, unsigned m) +{ + /* Nothing to do if we're already below the limit */ + if (ctx->nW <= m) + return; + + /* Gather candidates for eviction. Note that next_uses gives IPs whereas + * cmp_dist expects relative distances. This requires us to subtract ctx->ip + * to ensure that cmp_dist works properly. Even though logically it shouldn't + * affect the sorted order, practically this matters for correctness with + * rematerialization. See the dist=0 test in cmp_dist. 
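+    *
+    * Concretely (hypothetical values): at ip = 10 with next-use IPs
+    * {a: 12, b: 25}, the relative distances become {a: 2, b: 15}, so b
+    * sorts last and is the first candidate to be evicted.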
+ */ + struct candidate *candidates = alloca(ctx->nW * sizeof(struct candidate)); + unsigned j = 0; + + int i; + BITSET_FOREACH_SET(i, ctx->W, ctx->n_alloc) { + assert(j < ctx->nW); + + dist_t next_use = ctx->next_uses[i]; + if (next_use < DIST_INFINITY && next_use >= ctx->ip) + next_use -= ctx->ip; + else + next_use = DIST_INFINITY; + candidates[j++] = (struct candidate){ + .node = i, + .dist = next_use, + }; + } + + /* Sort by next-use distance */ + util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx); + + /* Evict what doesn't fit */ + unsigned new_weight = 0; + + for (i = 0; i < j; ++i) { + unsigned v = candidates[i].node; + unsigned comps = node_size(ctx, v); + + if ((new_weight + comps) <= m) { + new_weight += comps; + } else { + /* Insert a spill if we haven't spilled before and there is + * another use + */ + if (!BITSET_TEST(ctx->S, v) && candidates[i].dist < DIST_INFINITY) { + bi_builder b = bi_init_builder(ctx->shader, bi_before_instr(I)); + insert_spill(&b, ctx, v); + BITSET_SET(ctx->S, v); + } + + remove_W(ctx, v); + + /* We keep going in case we can pack in a scalar */ + } + } +} + + +/* + * validation code for next_ip info + */ +static void +validate_next_ip(struct spill_ctx *ctx, struct util_dynarray *local_next_ip) +{ +#ifndef NDEBUG + dist_t *next_ips = util_dynarray_element(local_next_ip, dist_t, 0); + unsigned next_use_cursor = + util_dynarray_num_elements(local_next_ip, dist_t); + + bi_foreach_instr_in_block(ctx->block, I) { + if (I->op == BI_OPCODE_PHI) + continue; + bi_foreach_ssa_src_rev(I, s) { + assert(next_use_cursor >= 1); + + unsigned next_ip = next_ips[--next_use_cursor]; + assert((next_ip == DIST_INFINITY) == I->src[s].kill_ssa); + } + bi_foreach_ssa_dest(I, d) { + assert(next_use_cursor >= 1); + unsigned next_ip = next_ips[--next_use_cursor]; + (void)next_ip; + } + } + assert(next_use_cursor == 0 && "exactly sized"); +#endif +} + +/* + * Insert spills/fills for a single basic block, following Belady's algorithm. + * Corresponds to minAlgorithm from the paper. + */ +static ATTRIBUTE_NOINLINE void +min_algorithm(struct spill_ctx *ctx) +{ + struct spill_block *sblock = spill_block(ctx, ctx->block); + struct util_dynarray local_next_ip; + calculate_local_next_use(ctx, &local_next_ip); + + validate_next_ip(ctx, &local_next_ip); + + /* next_uses gives the distance from the start of the block, so prepopulate + * with next_use_in. + */ + foreach_next_use(&sblock->next_use_in, key, dist) { + assert(key < ctx->n_alloc); + ctx->next_uses[key] = dist; + } + + dist_t *next_ips = util_dynarray_element(&local_next_ip, dist_t, 0); + unsigned next_use_cursor = + util_dynarray_num_elements(&local_next_ip, dist_t); + + /* Iterate each instruction in forward order */ + bi_foreach_instr_in_block(ctx->block, I) { + assert(ctx->nW <= ctx->k && "invariant"); + + /* Debug to check against our RA demand calculations */ + if (0) { + printf("%u: ", ctx->nW); + bi_print_instr(I, stdout); + } + + /* Phis are special since they happen along the edge. When we initialized + * W and S, we implicitly chose which phis are spilled. So, here we just + * need to rewrite the phis to write into memory. + * + * Phi sources are handled later. + */ + if (I->op == BI_OPCODE_PHI) { + if (!BITSET_TEST(ctx->W, I->dest[0].value)) { + I->dest[0] = bi_index_as_mem(I->dest[0], ctx); + } + + ctx->ip += instr_cycles(I); + continue; + } + + /* Any source that is not in W needs to be reloaded. Gather the set R of + * such values. 
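+       *
+       * E.g. (hypothetical): for "d = FMA.f32 a, b, c" with only a
+       * currently in W, R = {b, c}; limit() then makes room, and reloads
+       * for b and c are inserted in front of the instruction.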
+       */
+      unsigned R[BI_MAX_SRCS];
+      unsigned nR = 0;
+
+      bi_foreach_ssa_src(I, s) {
+         unsigned node = I->src[s].value;
+         if (BITSET_TEST(ctx->W, node))
+            continue;
+
+         /* Mark this variable as needing a reload. */
+         assert(node < ctx->n_alloc);
+         assert(BITSET_TEST(ctx->S, node) && "must have been spilled");
+         assert(nR < ARRAY_SIZE(R) && "maximum source count");
+         R[nR++] = node;
+
+         /* The inserted reload will add the value to the register file. */
+         insert_W(ctx, node);
+      }
+
+      /* Limit W to make space for the sources we just added */
+      limit(ctx, I, ctx->k);
+
+      /* Update next-use distances for this instruction. Unlike the paper, we
+       * prune dead values from W as we go. This doesn't affect correctness,
+       * but it speeds up limit() on average.
+       */
+      bi_foreach_ssa_src_rev(I, s) {
+         assert(next_use_cursor >= 1);
+
+         unsigned next_ip = next_ips[--next_use_cursor];
+         assert((next_ip == DIST_INFINITY) == I->src[s].kill_ssa);
+
+         if (next_ip == DIST_INFINITY)
+            remove_W_if_present(ctx, I->src[s].value);
+         else
+            ctx->next_uses[I->src[s].value] = next_ip;
+      }
+
+      bi_foreach_ssa_dest(I, d) {
+         assert(next_use_cursor >= 1);
+         unsigned next_ip = next_ips[--next_use_cursor];
+
+         if (next_ip == DIST_INFINITY)
+            remove_W_if_present(ctx, I->dest[d].value);
+         else
+            ctx->next_uses[I->dest[d].value] = next_ip;
+      }
+
+      /* Count how many registers we need for destinations. Because of
+       * SSA form, destinations are unique.
+       */
+      unsigned dest_size = 0;
+      bi_foreach_ssa_dest(I, d) {
+         dest_size += node_size(ctx, I->dest[d].value);
+      }
+
+      /* Limit W to make space for the destinations. */
+      limit(ctx, I, ctx->k - dest_size);
+
+      /* Destinations are now in the register file */
+      bi_foreach_ssa_dest(I, d) {
+         insert_W(ctx, I->dest[d].value);
+      }
+
+      /* Add reloads for the sources in front of the instruction. We need to
+       * be careful around exports, hoisting the reloads to before all
+       * exports.
+       *
+       * This is legal since all exports happen in parallel and all registers
+       * are dead after the exports. The register file must be big enough for
+       * everything exported, so it must be big enough for all the reloaded
+       * values right before the parallel exports.
+       */
+      for (unsigned i = 0; i < nR; ++i) {
+         insert_reload(ctx, ctx->block, bi_before_instr(I), R[i]);
+      }
+
+      ctx->ip += instr_cycles(I);
+   }
+
+   assert(next_use_cursor == 0 && "exactly sized");
+
+   int i;
+   BITSET_FOREACH_SET(i, ctx->W, ctx->n_alloc)
+      sblock->W_exit[sblock->nW_exit++] = i;
+
+   unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n_alloc));
+   sblock->S_exit = ralloc_array(ctx->memctx, unsigned, nS);
+
+   BITSET_FOREACH_SET(i, ctx->S, ctx->n_alloc)
+      sblock->S_exit[sblock->nS_exit++] = i;
+
+   assert(nS == sblock->nS_exit);
+   util_dynarray_fini(&local_next_ip);
+}
+
+/*
+ * Spill values so that at most `k` registers are live at any point.
+ * Returns the resulting size in bytes of the TLS spill area (including
+ * the `spill_base` bytes already in use).
+ */
+unsigned
+bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
+{
+   void *memctx = ralloc_context(NULL);
+   dist_t *next_uses =
+      rzalloc_array(memctx, dist_t, ctx->ssa_alloc + MAX_TEMPS_FOR_SPILL);
+   bi_instr **remat =
+      rzalloc_array(memctx, bi_instr *, ctx->ssa_alloc + MAX_TEMPS_FOR_SPILL);
+   unsigned spill_count = spill_base;
+   unsigned max_temps = MAX_TEMPS_FOR_SPILL;
+
+   /* Check for instructions that can be easily rematerialized */
+   bi_foreach_instr_global(ctx, I) {
+      if (can_remat(I))
+         remat[I->dest[0].value] = I;
+   }
+
+   struct spill_block *blocks =
+      rzalloc_array(memctx, struct spill_block, ctx->num_blocks);
+
+   /* Step 1.
Compute global next-use distances */ + global_next_use_distances(ctx, memctx, blocks); + validate_next_use_info(ctx, blocks); + + /* we may need to allocate some temporaries for spilling PHIs, hence the max_temps */ + unsigned n = ctx->ssa_alloc + max_temps; + BITSET_WORD *W = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n)); + BITSET_WORD *S = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n)); + uint32_t *spill_map = ralloc_array(memctx, uint32_t, n); + uint32_t *mem_map = ralloc_array(memctx, uint32_t, n); + + /* initialize to FFFFFFFF */ + memset(spill_map, 0xff, sizeof(uint32_t) * n); + memset(mem_map, 0xff, sizeof(uint32_t) * n); + + bi_foreach_block(ctx, block) { + memset(W, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD)); + memset(S, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD)); + + struct spill_ctx sctx = { + .memctx = memctx, + .shader = ctx, + .n_alloc = ctx->ssa_alloc, + .remat = remat, + .next_uses = next_uses, + .block = block, + .blocks = blocks, + .k = k, + .W = W, + .S = S, + .spill_max = n, + .spill_base = spill_base, + .spill_map = spill_map, + .spill_bytes = spill_count, + .mem_map = mem_map, + .arch = ctx->arch, + }; + + compute_w_entry(&sctx); + compute_s_entry(&sctx); + min_algorithm(&sctx); + spill_count = MAX2(spill_count, sctx.spill_bytes); + } + + /* Now that all blocks are processed separately, stitch it together */ + bi_foreach_block(ctx, block) { + struct spill_ctx sctx = { + .memctx = memctx, + .shader = ctx, + .n_alloc = ctx->ssa_alloc, + .remat = remat, + .block = block, + .blocks = blocks, + .k = k, + .W = W, + .S = S, + .spill_max = n, + .spill_base = spill_base, + .spill_map = spill_map, + .spill_bytes = spill_count, + .mem_map = mem_map, + .arch = ctx->arch, + }; + + bi_foreach_predecessor(block, pred) { + /* After spilling phi sources, insert coupling code */ + insert_coupling_code(&sctx, *pred, block); + } + spill_count = MAX2(spill_count, sctx.spill_bytes); + } + + ralloc_free(memctx); + return spill_count; +} diff --git a/src/panfrost/compiler/bifrost.h b/src/panfrost/compiler/bifrost.h index 9864f3da9ad..0880fc8093a 100644 --- a/src/panfrost/compiler/bifrost.h +++ b/src/panfrost/compiler/bifrost.h @@ -49,6 +49,7 @@ extern "C" { #define BIFROST_DBG_NOPRELOAD 0x0800 #define BIFROST_DBG_SPILL 0x1000 #define BIFROST_DBG_NOPSCHED 0x2000 +#define BIFROST_DBG_NOSSARA 0x4000 extern int bifrost_debug; diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c index db246c24906..183828a32f4 100644 --- a/src/panfrost/compiler/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost_compile.c @@ -57,6 +57,7 @@ static const struct debug_named_value bifrost_debug_options[] = { {"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"}, {"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"}, {"spill", BIFROST_DBG_SPILL, "Test register spilling"}, + {"nossara", BIFROST_DBG_NOSSARA, "Disable SSA in register allocation"}, DEBUG_NAMED_VALUE_END }; /* clang-format on */ diff --git a/src/panfrost/compiler/compiler.h b/src/panfrost/compiler/compiler.h index ee9f50c0b5b..b183783be85 100644 --- a/src/panfrost/compiler/compiler.h +++ b/src/panfrost/compiler/compiler.h @@ -193,8 +193,16 @@ typedef struct { uint32_t offset : 3; enum bi_index_type type : 3; + /* Last use of an SSA value; similar to discard, but applies to the + * SSA analysis and does not have any HW restrictions (discard gets + * sent to the hardware eventually. 
*/
+   bool kill_ssa : 1;
+
+   /* Register class: set if this value lives in a spill slot in memory */
+   bool memory : 1;
+
    /* Must be zeroed so we can hash the whole 64-bits at a time */
-   unsigned padding : (32 - 14);
+   unsigned padding : (32 - 16);
 } bi_index;
 
 static inline bi_index
@@ -207,6 +215,23 @@ bi_get_index(unsigned value)
    };
 }
 
+enum ra_class {
+   /* General purpose register */
+   RA_GPR,
+
+   /* Memory, used to assign stack slots */
+   RA_MEM,
+
+   /* Keep last */
+   RA_CLASSES,
+};
+
+static inline enum ra_class
+ra_class_for_index(bi_index idx)
+{
+   return idx.memory ? RA_MEM : RA_GPR;
+}
+
 static inline bi_index
 bi_register(unsigned reg)
 {
@@ -911,6 +936,8 @@ enum bi_idvs_mode {
    BI_IDVS_ALL = 3,
 };
 
+#define BI_MAX_REGS 64
+
 typedef struct {
    const struct pan_compile_inputs *inputs;
    nir_shader *nir;
@@ -950,7 +977,7 @@ typedef struct {
    /* During NIR->BIR, table of preloaded registers, or NULL if never
    * preloaded.
    */
-   bi_index preloaded[64];
+   bi_index preloaded[BI_MAX_REGS];
 
    /* For creating temporaries */
    unsigned ssa_alloc;
@@ -964,6 +991,15 @@ typedef struct {
    */
    struct hash_table_u64 *allocated_vec;
 
+   /* Beginning of our stack allocation used for spilling, below that is
+    * NIR-level scratch.
+    */
+   unsigned spill_base_B;
+
+   /* Beginning of stack allocation used for parallel copy lowering */
+   bool has_spill_pcopy_reserved;
+   unsigned spill_pcopy_base;
+
    /* Stats for shader-db */
    unsigned loop_count;
    unsigned spills;
@@ -1148,13 +1184,19 @@ bi_src_index(nir_src *src)
    util_dynarray_foreach(&(blk)->predecessors, bi_block *, v)
 
 #define bi_foreach_src(ins, v) for (unsigned v = 0; v < ins->nr_srcs; ++v)
+#define bi_foreach_src_rev(ins, v) for (signed v = ins->nr_srcs - 1; v >= 0; --v)
 
 #define bi_foreach_dest(ins, v) for (unsigned v = 0; v < ins->nr_dests; ++v)
+#define bi_foreach_dest_rev(ins, v) for (signed v = ins->nr_dests - 1; v >= 0; --v)
 
 #define bi_foreach_ssa_src(ins, v) \
    bi_foreach_src(ins, v) \
      if (ins->src[v].type == BI_INDEX_NORMAL)
 
+#define bi_foreach_ssa_src_rev(ins, v) \
+   bi_foreach_src_rev(ins, v) \
+      if (ins->src[v].type == BI_INDEX_NORMAL)
+
 #define bi_foreach_ssa_dest(ins, v) \
    bi_foreach_dest(ins, v) \
       if (ins->dest[v].type == BI_INDEX_NORMAL)
@@ -1163,6 +1205,25 @@ bi_src_index(nir_src *src)
    bi_foreach_instr_in_tuple(tuple, ins) \
       bi_foreach_src(ins, s)
 
+#define bi_foreach_ssa_dest_rev(ins, v) \
+   bi_foreach_dest_rev(ins, v) \
+      if (ins->dest[v].type == BI_INDEX_NORMAL)
+
+/* Phis only come at the start of a block, before all other instructions, so
+ * we stop as soon as we hit a non-phi.
+ */
+#define bi_foreach_phi_in_block(block, v) \
+   bi_foreach_instr_in_block(block, v) \
+      if (v->op != BI_OPCODE_PHI) \
+         break; \
+      else
+
+#define bi_foreach_phi_in_block_safe(block, v) \
+   bi_foreach_instr_in_block_safe(block, v) \
+      if (v->op != BI_OPCODE_PHI) \
+         break; \
+      else
+
 /*
 * Find the index of a predecessor, used as the implicit order of phi sources.
 */
@@ -1285,9 +1346,14 @@ bool bi_opt_constant_fold(bi_context *ctx);
 
 void bi_compute_liveness_ssa(bi_context *ctx);
 void bi_liveness_ins_update_ssa(BITSET_WORD *live, const bi_instr *ins);
 
+unsigned bi_calc_register_demand(bi_context *ctx);
+
 void bi_postra_liveness(bi_context *ctx);
 uint64_t MUST_CHECK bi_postra_liveness_ins(uint64_t live, bi_instr *ins);
 
+/* SSA spilling; returns the resulting TLS spill-area size in bytes */
+unsigned bi_spill_ssa(bi_context *ctx, unsigned num_registers, unsigned tls_size);
+
 /* Layout */
 
 signed bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target);
@@ -1477,6 +1543,15 @@ bi_after_clause(bi_clause *clause)
    return bi_after_instr(bi_last_instr_in_clause(clause));
 }
 
+/* Get a cursor at the start of a function, after any preloads */
+static inline bi_cursor
+bi_before_function(bi_context *ctx)
+{
+   bi_block *block = bi_start_block(&ctx->blocks);
+
+   return bi_before_block(block);
+}
+
 /* IR builder in terms of cursor infrastructure */
 
 typedef struct {
@@ -1490,6 +1565,10 @@ bi_init_builder(bi_context *ctx, bi_cursor cursor)
    return (bi_builder){.shader = ctx, .cursor = cursor};
 }
 
+/* Insert thread-local storage loads/stores for spill code */
+bi_instr *bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset);
+void bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset);
+
 /* Insert an instruction at the cursor and move the cursor */
 
 static inline void
diff --git a/src/panfrost/compiler/meson.build b/src/panfrost/compiler/meson.build
index e7ce99bf343..fc56f908a9f 100644
--- a/src/panfrost/compiler/meson.build
+++ b/src/panfrost/compiler/meson.build
@@ -22,6 +22,8 @@ libpanfrost_bifrost_files = files(
   'bi_opt_dual_tex.c',
   'bi_pressure_schedule.c',
   'bi_ra.c',
+  'bi_ra_ssa.c',
+  'bi_spill_ssa.c',
   'bi_validate.c',
   'bir.c',
   'bifrost_compile.c',