diff --git a/src/.clang-format b/src/.clang-format
index f55d7db76e9..3f2d13ac366 100644
--- a/src/.clang-format
+++ b/src/.clang-format
@@ -218,6 +218,7 @@ ForEachMacros:
   - agx_foreach_ssa_src
   - agx_foreach_ssa_src_rev
   - agx_foreach_successor
+  - foreach_next_use
 
   # radv
   - PHASE
diff --git a/src/asahi/compiler/agx_compiler.h b/src/asahi/compiler/agx_compiler.h
index 75200b8b629..c3c4732fece 100644
--- a/src/asahi/compiler/agx_compiler.h
+++ b/src/asahi/compiler/agx_compiler.h
@@ -938,6 +938,7 @@ void agx_lower_uniform_sources(agx_context *ctx);
 void agx_opt_cse(agx_context *ctx);
 void agx_dce(agx_context *ctx, bool partial);
 void agx_pressure_schedule(agx_context *ctx);
+void agx_spill(agx_context *ctx, unsigned k);
 void agx_repair_ssa(agx_context *ctx);
 void agx_reindex_ssa(agx_context *ctx);
 void agx_ra(agx_context *ctx);
diff --git a/src/asahi/compiler/agx_spill.c b/src/asahi/compiler/agx_spill.c
new file mode 100644
index 00000000000..7e9425afc01
--- /dev/null
+++ b/src/asahi/compiler/agx_spill.c
@@ -0,0 +1,1192 @@
+/*
+ * Copyright 2023-2024 Alyssa Rosenzweig
+ * Copyright 2023-2024 Valve Corporation
+ * Copyright 2022 Collabora Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/bitset.h"
+#include "util/hash_table.h"
+#include "util/ralloc.h"
+#include "util/u_dynarray.h"
+#include "util/u_qsort.h"
+#include "agx_builder.h"
+#include "agx_compiler.h"
+#include "agx_opcodes.h"
+
+/*
+ * An implementation of "Register Spilling and Live-Range Splitting for SSA-Form
+ * Programs" by Braun and Hack.
+ */
+
+/*
+ * Next-use distances are logically in ℤ ∪ {∞}. Modeled as saturating uint32 and
+ * referred to as dist_t.
+ *
+ * next_uses represents a next-use map. This is a sparse data structure mapping
+ * variable names to next-use dist_t's. Variables with no later use (infinite
+ * next-use distance) are not stored explicitly, making the time/space
+ * requirements O(live variables). This is important for performance and memory
+ * usage on big shaders with many blocks.
+ *
+ * For now, next_uses is backed by a Mesa hash table, but it could be optimized
+ * to something more specialized in the future.
+ */
+#define DIST_INFINITY (UINT32_MAX)
+typedef uint32_t dist_t;
+
+static dist_t
+dist_sum(dist_t A, dist_t B)
+{
+   return (A + B < A) ? DIST_INFINITY : (A + B);
+}
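+
+/*
+ * Illustrative only: because the addition saturates, dist_sum(DIST_INFINITY, x)
+ * stays DIST_INFINITY for any x, while finite distances add normally (e.g.
+ * dist_sum(10, 20) == 30), so "no later use" survives any chain of offsets.
+ */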
+
+struct next_uses {
+   struct hash_table_u64 *ht;
+};
+
+static void
+init_next_uses(struct next_uses *nu, void *memctx)
+{
+   nu->ht = _mesa_hash_table_u64_create(memctx);
+}
+
+static void
+destroy_next_uses(struct next_uses *nu)
+{
+   _mesa_hash_table_u64_destroy(nu->ht);
+}
+
+static void
+clear_next_uses(struct next_uses *nu)
+{
+   _mesa_hash_table_u64_clear(nu->ht);
+}
+
+static void
+copy_next_uses(struct next_uses *nu, const struct next_uses *from)
+{
+   clear_next_uses(nu);
+
+   hash_table_u64_foreach(from->ht, use) {
+      _mesa_hash_table_u64_insert(nu->ht, use.key, use.data);
+   }
+}
+
+static void
+set_next_use(struct next_uses *nu, unsigned node, dist_t dist)
+{
+   if (dist == DIST_INFINITY) {
+      _mesa_hash_table_u64_remove(nu->ht, node);
+   } else {
+      uintptr_t as_ptr = (uintptr_t)(dist + 1);
+      assert(as_ptr != 0 && "non-NULL");
+
+      _mesa_hash_table_u64_insert(nu->ht, node, (void *)as_ptr);
+   }
+}
+
+static dist_t
+search_next_uses(const struct next_uses *nu, unsigned node)
+{
+   void *ent = _mesa_hash_table_u64_search(nu->ht, node);
+   if (!ent)
+      return DIST_INFINITY;
+
+   uintptr_t raw = (uintptr_t)ent;
+   return raw - 1;
+}
+
+#define foreach_next_use(nu, node, dist)                                       \
+   hash_table_u64_foreach((nu)->ht, use_)                                      \
+      for (uint32_t _terminator = 1, node = use_.key,                          \
+                    UNUSED dist = ((uintptr_t)use_.data) - 1;                  \
+           _terminator != 0; _terminator = 0)
+
+/*
+ * Calculate the minimum of two next-use sets. Values absent from one of the
+ * underlying sets are infinity so do not contribute to the minimum, instead
+ * acting like a set union.
+ */
+static bool
+minimum_next_uses(struct next_uses *nu, const struct next_uses *from)
+{
+   bool progress = false;
+
+   foreach_next_use(from, node, from_dist) {
+      dist_t nu_dist = search_next_uses(nu, node);
+
+      if (from_dist < nu_dist) {
+         set_next_use(nu, node, from_dist);
+         progress = true;
+      }
+   }
+
+   return progress;
+}
+
+static uint32_t
+instr_cycles(const agx_instr *I)
+{
+   return 1;
+}
+
+struct spill_block {
+   /* Set of values available in the register file at the end */
+   unsigned W_exit[AGX_NUM_REGS];
+   unsigned nW_exit;
+
+   unsigned W_entry[AGX_NUM_REGS];
+   unsigned nW_entry;
+
+   /* Set of live-out spilled values at the end of the block */
+   unsigned *S_exit;
+   unsigned nS_exit;
+
+   unsigned *S_entry;
+   unsigned nS_entry;
+
+   /* Estimated cycle count of the block */
+   uint32_t cycles;
+
+   /* Next-use maps at the start/end of the block */
+   struct next_uses next_use_in;
+   struct next_uses next_use_out;
+};
+
+struct spill_ctx {
+   void *memctx;
+   agx_context *shader;
+   agx_block *block;
+
+   /* Set of values currently available in the register file */
+   BITSET_WORD *W;
+
+   /* |W| = Current register pressure */
+   unsigned nW;
+
+   /* Local IPs of next-use */
+   dist_t *next_uses;
+
+   /* Current local IP relative to the start of the block */
+   uint32_t ip;
+
+   /* Set of live values that have been spilled. Contrary to the paper, this
+    * is not a subset of W: the definition in the paper is bogus.
+    */
+   BITSET_WORD *S;
+
+   /* Widths of vectors */
+   uint8_t *channels;
+   enum agx_size *size;
+
+   /* Mapping of rematerializable values to their definitions, or NULL for nodes
+    * that are not rematerializable.
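+    *
+    * Only results of AGX_OPCODE_MOV_IMM are recorded here; rather than being
+    * spilled to memory, such a value is recreated at its uses by re-emitting
+    * the immediate move (see insert_reload).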
+    */
+   agx_instr **remat;
+
+   /* Maximum register pressure allowed */
+   unsigned k;
+
+   /* Number of variables */
+   unsigned n;
+
+   /* Information on blocks indexed in source order */
+   struct spill_block *blocks;
+
+   /* Base memory index reserved for spilled indices */
+   unsigned spill_base;
+};
+
+static inline struct spill_block *
+spill_block(struct spill_ctx *ctx, agx_block *block)
+{
+   return &ctx->blocks[block->index];
+}
+
+/* Calculate the register demand of a node. This is rounded up to a power-of-two
+ * to match the equivalent calculations in RA.
+ */
+static unsigned
+node_size(struct spill_ctx *ctx, unsigned node)
+{
+   return util_next_power_of_two(ctx->channels[node]) *
+          agx_size_align_16(ctx->size[node]);
+}
+
+/*
+ * Map a control flow edge to a block. Assumes no critical edges.
+ */
+static agx_block *
+agx_edge_to_block(agx_block *pred, agx_block *succ)
+{
+   /* End of predecessor is unique if there's a single successor */
+   if (agx_num_successors(pred) == 1)
+      return pred;
+
+   /* The predecessor has multiple successors, meaning this is not the only
+    * edge leaving the predecessor. Therefore, it is the only edge entering
+    * the successor (otherwise the edge would be critical), so the start of
+    * the successor is unique.
+    */
+   assert(agx_num_predecessors(succ) == 1 && "critical edge detected");
+   return succ;
+}
+
+/*
+ * Get a cursor to insert along a control flow edge: either at the start of the
+ * successor or the end of the predecessor. This relies on the control flow
+ * graph having no critical edges.
+ */
+static agx_cursor
+agx_along_edge(agx_block *pred, agx_block *succ)
+{
+   agx_block *to = agx_edge_to_block(pred, succ);
+
+   if (to == pred)
+      return agx_after_block_logical(pred);
+   else
+      return agx_before_block(succ);
+}
+
+static inline agx_index
+agx_index_as_mem(agx_index idx, unsigned mem_base)
+{
+   assert(idx.type == AGX_INDEX_NORMAL);
+   assert(!idx.memory);
+   idx.memory = true;
+   idx.value = mem_base + idx.value;
+   return idx;
+}
+
+static unsigned
+chase_mem_index(agx_index ref, unsigned mem_base)
+{
+   assert(ref.type == AGX_INDEX_NORMAL);
+   return ref.memory ? (ref.value - mem_base) : ref.value;
+}
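+
+/*
+ * Naming convention used throughout this pass: the spilled copy of SSA
+ * variable v is the memory variable (spill_base + v), so converting between
+ * the two (agx_index_as_mem / chase_mem_index) is plain index arithmetic and
+ * needs no lookup table.
+ */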
+
+static agx_index
+reconstruct_index(struct spill_ctx *ctx, unsigned node)
+{
+   return agx_get_vec_index(node, ctx->size[node], ctx->channels[node]);
+}
+
+static void
+insert_spill(agx_builder *b, struct spill_ctx *ctx, unsigned node)
+{
+   if (!ctx->remat[node]) {
+      agx_index idx = reconstruct_index(ctx, node);
+      agx_mov_to(b, agx_index_as_mem(idx, ctx->spill_base), idx);
+   }
+}
+
+static void
+insert_reload(struct spill_ctx *ctx, agx_block *block, agx_cursor cursor,
+              unsigned node)
+{
+   agx_builder b = agx_init_builder(ctx->shader, cursor);
+   agx_index idx = reconstruct_index(ctx, node);
+
+   /* Reloading breaks SSA, but agx_repair_ssa will repair */
+   if (ctx->remat[node]) {
+      assert(ctx->remat[node]->op == AGX_OPCODE_MOV_IMM);
+      agx_mov_imm_to(&b, idx, ctx->remat[node]->imm);
+   } else {
+      agx_mov_to(&b, idx, agx_index_as_mem(idx, ctx->spill_base));
+   }
+}
+
+/* Insert into the register file */
+static void
+insert_W(struct spill_ctx *ctx, unsigned v)
+{
+   assert(v < ctx->n);
+   assert(!BITSET_TEST(ctx->W, v));
+
+   BITSET_SET(ctx->W, v);
+   ctx->nW += node_size(ctx, v);
+}
+
+/* Remove from the register file */
+static void
+remove_W(struct spill_ctx *ctx, unsigned v)
+{
+   assert(v < ctx->n);
+   assert(BITSET_TEST(ctx->W, v));
+
+   BITSET_CLEAR(ctx->W, v);
+   ctx->nW -= node_size(ctx, v);
+}
+
+static void
+remove_W_if_present(struct spill_ctx *ctx, unsigned v)
+{
+   assert(v < ctx->n);
+
+   if (BITSET_TEST(ctx->W, v))
+      remove_W(ctx, v);
+}
+
+struct candidate {
+   unsigned node;
+   dist_t dist;
+};
+
+static int
+cmp_dist(const void *left_, const void *right_, void *ctx_)
+{
+   struct spill_ctx *ctx = ctx_;
+   const struct candidate *left = left_;
+   const struct candidate *right = right_;
+
+   /* We assume that rematerializing - even before every instruction - is
+    * cheaper than spilling. As long as one of the nodes is rematerializable
+    * (with distance > 0), we choose it over spilling. Within a class of nodes
+    * (rematerializable or not), compare by next-use-distance.
+    */
+   bool remat_left = ctx->remat[left->node] != NULL && left->dist > 0;
+   bool remat_right = ctx->remat[right->node] != NULL && right->dist > 0;
+
+   if (remat_left != remat_right)
+      return remat_left ? 1 : -1;
+   else
+      return (left->dist > right->dist) - (left->dist < right->dist);
+}
+
+/*
+ * Limit the register file W to maximum size m by evicting registers.
+ */
+static ATTRIBUTE_NOINLINE void
+limit(struct spill_ctx *ctx, agx_instr *I, unsigned m)
+{
+   /* Nothing to do if we're already below the limit */
+   if (ctx->nW <= m)
+      return;
+
+   /* Gather candidates for eviction. Note that next_uses gives IPs whereas
+    * cmp_dist expects relative distances. This requires us to subtract ctx->ip
+    * to ensure that cmp_dist works properly. Even though logically it shouldn't
+    * affect the sorted order, practically this matters for correctness with
+    * rematerialization. See the dist=0 test in cmp_dist.
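+    *
+    * After sorting, candidates are ordered by increasing next-use distance,
+    * with rematerializable values pushed to the back; the loop below keeps
+    * entries from the front until the budget m is filled and evicts the rest,
+    * so the furthest uses and cheapest-to-recreate values go first.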
+    */
+   struct candidate *candidates = alloca(ctx->nW * sizeof(struct candidate));
+   unsigned j = 0;
+
+   int i;
+   BITSET_FOREACH_SET(i, ctx->W, ctx->n) {
+      assert(j < ctx->nW);
+
+      candidates[j++] = (struct candidate){
+         .node = i,
+         .dist = ctx->next_uses[i] - ctx->ip,
+      };
+   }
+
+   /* Sort by next-use distance */
+   util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);
+
+   /* Evict what doesn't fit */
+   unsigned new_weight = 0;
+
+   for (i = 0; i < j; ++i) {
+      unsigned v = candidates[i].node;
+      unsigned comps = node_size(ctx, v);
+
+      if ((new_weight + comps) <= m) {
+         new_weight += comps;
+      } else {
+         /* Insert a spill if we haven't spilled before and there is
+          * another use
+          */
+         if (!BITSET_TEST(ctx->S, v) && candidates[i].dist < DIST_INFINITY) {
+            agx_builder b = agx_init_builder(ctx->shader, agx_before_instr(I));
+            insert_spill(&b, ctx, v);
+            BITSET_SET(ctx->S, v);
+         }
+
+         remove_W(ctx, v);
+
+         /* We keep going in case we can pack in a scalar */
+      }
+   }
+}
+
+/*
+ * Insert coupling code on block boundaries. This must ensure:
+ *
+ * - anything live-in we expect to have spilled is spilled
+ * - anything live-in we expect to have filled is filled
+ * - phi sources are spilled if the destination is spilled
+ * - phi sources are filled if the destination is not spilled
+ *
+ * The latter two requirements ensure correct pressure calculations for phis.
+ */
+static ATTRIBUTE_NOINLINE void
+insert_coupling_code(struct spill_ctx *ctx, agx_block *pred, agx_block *succ)
+{
+   struct spill_block *sp = spill_block(ctx, pred);
+   struct spill_block *ss = spill_block(ctx, succ);
+
+   agx_foreach_phi_in_block(succ, I) {
+      if (!I->dest[0].memory)
+         continue;
+
+      agx_builder b =
+         agx_init_builder(ctx->shader, agx_before_function(ctx->shader));
+
+      unsigned s = agx_predecessor_index(succ, pred);
+
+      /* Copy immediate/uniform phi sources to memory variables at the start of
+       * the program, where pressure is zero and hence the copy is legal.
+       */
+      if (I->src[s].type != AGX_INDEX_NORMAL) {
+         assert(I->src[s].type == AGX_INDEX_IMMEDIATE ||
+                I->src[s].type == AGX_INDEX_UNIFORM);
+
+         agx_index mem = agx_temp_like(ctx->shader, I->dest[0]);
+         assert(mem.memory);
+
+         agx_index gpr = agx_temp_like(ctx->shader, I->dest[0]);
+         gpr.memory = false;
+
+         if (I->src[s].type == AGX_INDEX_IMMEDIATE)
+            agx_mov_imm_to(&b, gpr, I->src[s].value);
+         else
+            agx_mov_to(&b, gpr, I->src[s]);
+
+         agx_mov_to(&b, mem, gpr);
+         I->src[s] = mem;
+         continue;
+      }
+
+      bool spilled = false;
+      for (unsigned i = 0; i < sp->nS_exit; ++i) {
+         if (sp->S_exit[i] == I->src[s].value) {
+            spilled = true;
+            break;
+         }
+      }
+
+      if (!spilled) {
+         /* Spill the phi source. TODO: avoid redundant spills here */
+         agx_builder b =
+            agx_init_builder(ctx->shader, agx_after_block_logical(pred));
+
+         insert_spill(&b, ctx, I->src[s].value);
+      }
+
+      if (ctx->remat[I->src[s].value]) {
+         unsigned node = I->src[s].value;
+         agx_index idx = reconstruct_index(ctx, node);
+
+         assert(ctx->remat[node]->op == AGX_OPCODE_MOV_IMM);
+         agx_mov_to(&b, agx_index_as_mem(idx, ctx->spill_base),
+                    agx_mov_imm(&b, agx_size_align_16(idx.size) * 16,
+                                ctx->remat[node]->imm));
+      }
+
+      /* Use the spilled version */
+      I->src[s] = agx_index_as_mem(I->src[s], ctx->spill_base);
+   }
+
+   /* Anything assumed to be spilled at the start of succ must be spilled along
+    * all edges.
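+    *
+    * Concretely: if v is in S at the start of succ but was not spilled at the
+    * end of this predecessor, a spill of v is inserted on the pred->succ edge,
+    * except for spilled phi destinations, which already write memory directly.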
+    */
+   for (unsigned i = 0; i < ss->nS_entry; ++i) {
+      unsigned v = ss->S_entry[i];
+
+      bool spilled = false;
+      for (unsigned j = 0; j < sp->nS_exit; ++j) {
+         if (sp->S_exit[j] == v) {
+            spilled = true;
+            break;
+         }
+      }
+
+      /* We handle spilling phi destinations separately */
+      agx_foreach_phi_in_block(succ, phi) {
+         if (chase_mem_index(phi->dest[0], ctx->spill_base) == v) {
+            spilled = true;
+            break;
+         }
+      }
+
+      if (spilled)
+         continue;
+
+      agx_builder b = agx_init_builder(ctx->shader, agx_along_edge(pred, succ));
+      insert_spill(&b, ctx, v);
+   }
+
+   /* Variables in W at the start of succ must be defined along the edge. */
+   for (unsigned i = 0; i < ss->nW_entry; ++i) {
+      unsigned node = ss->W_entry[i];
+      bool defined = false;
+
+      /* Variables live at the end of the predecessor are live along the edge */
+      for (unsigned j = 0; j < sp->nW_exit; ++j) {
+         if (sp->W_exit[j] == node) {
+            defined = true;
+            break;
+         }
+      }
+
+      /* Phis are defined along the edge */
+      agx_foreach_phi_in_block(succ, phi) {
+         if (phi->dest[0].value == node) {
+            defined = true;
+            break;
+         }
+      }
+
+      if (defined)
+         continue;
+
+      /* Otherwise, inserting a reload defines the variable along the edge */
+      agx_block *reload_block = agx_edge_to_block(pred, succ);
+      insert_reload(ctx, reload_block, agx_along_edge(pred, succ), node);
+   }
+
+   agx_foreach_phi_in_block(succ, I) {
+      if (I->dest[0].memory)
+         continue;
+
+      unsigned s = agx_predecessor_index(succ, pred);
+
+      /* Treat immediate/uniform phi sources as registers for pressure
+       * accounting and phi lowering purposes. Parallel copy lowering can handle
+       * a copy from an immediate/uniform to a register, but not from an
+       * immediate/uniform directly to memory.
+       */
+      if (I->src[s].type != AGX_INDEX_NORMAL) {
+         assert(I->src[s].type == AGX_INDEX_IMMEDIATE ||
+                I->src[s].type == AGX_INDEX_UNIFORM);
+
+         continue;
+      }
+
+      bool live = false;
+      for (unsigned i = 0; i < sp->nW_exit; ++i) {
+         if (sp->W_exit[i] == I->src[s].value) {
+            live = true;
+            break;
+         }
+      }
+
+      /* Fill the phi source in the predecessor */
+      if (!live) {
+         agx_block *reload_block = agx_edge_to_block(pred, succ);
+         insert_reload(ctx, reload_block, agx_along_edge(pred, succ),
+                       I->src[s].value);
+      }
+
+      /* Leave as-is for the GPR version */
+      assert(!I->src[s].memory);
+   }
+}
+
+/*
+ * Produce an array of next-use IPs relative to the start of the block. This is
+ * an array of dist_t scalars, representing the next-use IP of each SSA dest
+ * (right-to-left) and SSA source (left-to-right) of each instruction in the
+ * block (bottom-to-top). Its size equals the # of SSA dests and sources in
+ * the block.
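+ *
+ * min_algorithm() consumes this array back-to-front while walking the block
+ * forward, which is why it pops source entries with the _rev iterator.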
+ */
+static ATTRIBUTE_NOINLINE void
+calculate_local_next_use(struct spill_ctx *ctx, struct util_dynarray *out)
+{
+   struct spill_block *sb = spill_block(ctx, ctx->block);
+   unsigned ip = sb->cycles;
+
+   util_dynarray_init(out, NULL);
+
+   struct next_uses nu;
+   init_next_uses(&nu, NULL);
+
+   foreach_next_use(&sb->next_use_out, i, dist) {
+      set_next_use(&nu, i, dist_sum(ip, dist));
+   }
+
+   agx_foreach_instr_in_block_rev(ctx->block, I) {
+      ip -= instr_cycles(I);
+
+      if (I->op != AGX_OPCODE_PHI) {
+         agx_foreach_ssa_dest_rev(I, d) {
+            unsigned v = I->dest[d].value;
+
+            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
+         }
+
+         agx_foreach_ssa_src(I, s) {
+            unsigned v = I->src[s].value;
+
+            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
+            set_next_use(&nu, v, ip);
+         }
+      }
+   }
+
+   assert(ip == 0 && "cycle counting is consistent");
+   destroy_next_uses(&nu);
+}
+
+/*
+ * Insert spills/fills for a single basic block, following Belady's algorithm.
+ * Corresponds to minAlgorithm from the paper.
+ */
+static ATTRIBUTE_NOINLINE void
+min_algorithm(struct spill_ctx *ctx)
+{
+   struct spill_block *sblock = spill_block(ctx, ctx->block);
+   struct util_dynarray local_next_ip;
+   calculate_local_next_use(ctx, &local_next_ip);
+
+   /* next_uses gives the distance from the start of the block, so prepopulate
+    * with next_use_in.
+    */
+   foreach_next_use(&sblock->next_use_in, key, dist) {
+      assert(key < ctx->n);
+      ctx->next_uses[key] = dist;
+   }
+
+   dist_t *next_ips = util_dynarray_element(&local_next_ip, dist_t, 0);
+   unsigned next_use_cursor =
+      util_dynarray_num_elements(&local_next_ip, dist_t);
+
+   /* Iterate each instruction in forward order */
+   agx_foreach_instr_in_block(ctx->block, I) {
+      assert(ctx->nW <= ctx->k && "invariant");
+
+      /* Phis are special since they happen along the edge. When we initialized
+       * W and S, we implicitly chose which phis are spilled. So, here we just
+       * need to rewrite the phis to write into memory.
+       *
+       * Phi sources are handled later.
+       */
+      if (I->op == AGX_OPCODE_PHI) {
+         if (!BITSET_TEST(ctx->W, I->dest[0].value)) {
+            I->dest[0] = agx_index_as_mem(I->dest[0], ctx->spill_base);
+         }
+
+         ctx->ip += instr_cycles(I);
+         continue;
+      }
+
+      /* Any source that is not in W needs to be reloaded. Gather the set R of
+       * such values.
+       */
+      unsigned R[AGX_MAX_NORMAL_SOURCES];
+      unsigned nR = 0;
+
+      agx_foreach_ssa_src(I, s) {
+         unsigned node = I->src[s].value;
+         if (BITSET_TEST(ctx->W, node))
+            continue;
+
+         /* Mark this variable as needing a reload. */
+         assert(node < ctx->n);
+         assert(BITSET_TEST(ctx->S, node) && "must have been spilled");
+         assert(nR < ARRAY_SIZE(R) && "maximum source count");
+         R[nR++] = node;
+
+         /* The inserted reload will add the value to the register file. */
+         insert_W(ctx, node);
+      }
+
+      /* Limit W to make space for the sources we just added */
+      limit(ctx, I, ctx->k);
+
+      /* Update next-use distances for this instruction. Unlike the paper, we
+       * prune dead values from W as we go. This doesn't affect correctness, but
+       * it speeds up limit() on average.
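+       *
+       * A set kill flag on a source corresponds exactly to an infinite
+       * next-use IP here, which is what the assert below checks.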
+       */
+      agx_foreach_ssa_src_rev(I, s) {
+         assert(next_use_cursor >= 1);
+
+         unsigned next_ip = next_ips[--next_use_cursor];
+         assert((next_ip == DIST_INFINITY) == I->src[s].kill);
+
+         if (next_ip == DIST_INFINITY)
+            remove_W_if_present(ctx, I->src[s].value);
+         else
+            ctx->next_uses[I->src[s].value] = next_ip;
+      }
+
+      agx_foreach_ssa_dest(I, d) {
+         assert(next_use_cursor >= 1);
+         unsigned next_ip = next_ips[--next_use_cursor];
+
+         if (next_ip == DIST_INFINITY)
+            remove_W_if_present(ctx, I->dest[d].value);
+         else
+            ctx->next_uses[I->dest[d].value] = next_ip;
+      }
+
+      /* Count how many registers we need for destinations. Because of
+       * SSA form, destinations are unique.
+       */
+      unsigned dest_size = 0;
+      agx_foreach_ssa_dest(I, d) {
+         dest_size += node_size(ctx, I->dest[d].value);
+      }
+
+      /* Limit W to make space for the destinations. */
+      limit(ctx, I, ctx->k - dest_size);
+
+      /* Destinations are now in the register file */
+      agx_foreach_ssa_dest(I, d) {
+         insert_W(ctx, I->dest[d].value);
+      }
+
+      /* Add reloads for the sources in front of the instruction */
+      for (unsigned i = 0; i < nR; ++i) {
+         insert_reload(ctx, ctx->block, agx_before_instr(I), R[i]);
+      }
+
+      ctx->ip += instr_cycles(I);
+   }
+
+   assert(next_use_cursor == 0 && "exactly sized");
+
+   int i;
+   BITSET_FOREACH_SET(i, ctx->W, ctx->n)
+      sblock->W_exit[sblock->nW_exit++] = i;
+
+   unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n));
+   sblock->S_exit = ralloc_array(ctx->memctx, unsigned, nS);
+
+   BITSET_FOREACH_SET(i, ctx->S, ctx->n)
+      sblock->S_exit[sblock->nS_exit++] = i;
+
+   assert(nS == sblock->nS_exit);
+   util_dynarray_fini(&local_next_ip);
+}
+
+/*
+ * TODO: Implement section 4.2 of the paper.
+ *
+ * For now, we implement the simpler heuristic in Hack's thesis: sort
+ * the live-in set (+ destinations of phis) by next-use distance.
+ */
+static ATTRIBUTE_NOINLINE void
+compute_w_entry_loop_header(struct spill_ctx *ctx)
+{
+   agx_block *block = ctx->block;
+   struct spill_block *sb = spill_block(ctx, block);
+
+   unsigned nP = __bitset_count(block->live_in, BITSET_WORDS(ctx->n));
+   struct candidate *candidates = calloc(nP, sizeof(struct candidate));
+   unsigned j = 0;
+
+   foreach_next_use(&sb->next_use_in, i, dist) {
+      assert(j < nP);
+      candidates[j++] = (struct candidate){.node = i, .dist = dist};
+   }
+
+   assert(j == nP);
+
+   /* Sort by next-use distance */
+   util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);
+
+   /* Take as much as we can */
+   for (unsigned i = 0; i < j; ++i) {
+      unsigned node = candidates[i].node;
+      unsigned comps = node_size(ctx, node);
+
+      if ((ctx->nW + comps) <= ctx->k) {
+         insert_W(ctx, node);
+         sb->W_entry[sb->nW_entry++] = node;
+      }
+   }
+
+   assert(ctx->nW <= ctx->k);
+   free(candidates);
+}
+
+/*
+ * Compute W_entry for a block. Section 4.2 in the paper.
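+ *
+ * Values in W at the end of every predecessor are taken directly; everything
+ * else that is live-in, plus phi destinations whose sources are all in
+ * registers, competes for the remaining space by next-use distance.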
+ */
+static ATTRIBUTE_NOINLINE void
+compute_w_entry(struct spill_ctx *ctx)
+{
+   agx_block *block = ctx->block;
+   struct spill_block *sb = spill_block(ctx, block);
+
+   /* Nothing to do for start blocks */
+   if (agx_num_predecessors(block) == 0)
+      return;
+
+   /* Loop headers have a different heuristic */
+   if (block->loop_header) {
+      compute_w_entry_loop_header(ctx);
+      return;
+   }
+
+   /* Usual blocks follow */
+   unsigned *freq = calloc(ctx->n, sizeof(unsigned));
+
+   /* Record what's written at the end of each predecessor */
+   agx_foreach_predecessor(ctx->block, P) {
+      struct spill_block *sp = spill_block(ctx, *P);
+
+      for (unsigned i = 0; i < sp->nW_exit; ++i) {
+         unsigned v = sp->W_exit[i];
+         freq[v]++;
+      }
+   }
+
+   struct candidate *candidates = calloc(ctx->n, sizeof(struct candidate));
+   unsigned j = 0;
+
+   /* Variables that are in all predecessors are assumed in W_entry. Phis and
+    * variables in some predecessors are scored by next-use.
+    */
+   foreach_next_use(&sb->next_use_in, i, dist) {
+      if (freq[i] == agx_num_predecessors(ctx->block)) {
+         insert_W(ctx, i);
+      } else if (freq[i]) {
+         candidates[j++] = (struct candidate){.node = i, .dist = dist};
+      }
+   }
+
+   agx_foreach_phi_in_block(ctx->block, I) {
+      bool all_found = true;
+
+      agx_foreach_predecessor(ctx->block, pred) {
+         struct spill_block *sp = spill_block(ctx, *pred);
+         bool found = false;
+
+         agx_index src = I->src[agx_predecessor_index(ctx->block, *pred)];
+         if (src.type != AGX_INDEX_NORMAL)
+            continue;
+
+         unsigned v = src.value;
+         for (unsigned i = 0; i < sp->nW_exit; ++i) {
+            if (sp->W_exit[i] == v) {
+               found = true;
+               break;
+            }
+         }
+
+         all_found &= found;
+      }
+
+      /* Heuristic: if any phi source is spilled, spill the whole phi. This is
+       * suboptimal, but it massively reduces pointless fill/spill chains with
+       * massive phi webs.
+       */
+      if (!all_found)
+         continue;
+
+      candidates[j++] = (struct candidate){
+         .node = I->dest[0].value,
+         .dist = search_next_uses(&sb->next_use_in, I->dest[0].value),
+      };
+   }
+
+   /* Sort by next-use distance */
+   util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);
+
+   /* Take as much as we can */
+   for (unsigned i = 0; i < j; ++i) {
+      unsigned node = candidates[i].node;
+      unsigned comps = node_size(ctx, node);
+
+      if ((ctx->nW + comps) <= ctx->k) {
+         insert_W(ctx, node);
+         sb->W_entry[sb->nW_entry++] = node;
+      }
+   }
+
+   assert(ctx->nW <= ctx->k && "invariant");
+
+   free(freq);
+   free(candidates);
+}
+
+/*
+ * We initialize S with the union of S at the exit of (forward edge)
+ * predecessors and the complement of W, intersected with the live-in set. The
+ * former propagates S forward. The latter ensures we spill along the edge when
+ * a live value is not selected for the entry W.
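+ *
+ * That is, S_entry = live_in ∩ ((∪ S_exit(pred)) ∪ (live_in \ W_entry)).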
+ */
+static ATTRIBUTE_NOINLINE void
+compute_s_entry(struct spill_ctx *ctx)
+{
+   unsigned v;
+
+   agx_foreach_predecessor(ctx->block, pred) {
+      struct spill_block *sp = spill_block(ctx, *pred);
+
+      for (unsigned i = 0; i < sp->nS_exit; ++i) {
+         v = sp->S_exit[i];
+
+         if (BITSET_TEST(ctx->block->live_in, v))
+            BITSET_SET(ctx->S, v);
+      }
+   }
+
+   BITSET_FOREACH_SET(v, ctx->block->live_in, ctx->n) {
+      if (!BITSET_TEST(ctx->W, v))
+         BITSET_SET(ctx->S, v);
+   }
+
+   /* Copy ctx->S to S_entry for later look-ups with coupling code */
+   struct spill_block *sb = spill_block(ctx, ctx->block);
+   unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n));
+   sb->S_entry = ralloc_array(ctx->memctx, unsigned, nS);
+
+   int i;
+   BITSET_FOREACH_SET(i, ctx->S, ctx->n)
+      sb->S_entry[sb->nS_entry++] = i;
+
+   assert(sb->nS_entry == nS);
+}
+
+static ATTRIBUTE_NOINLINE void
+global_next_use_distances(agx_context *ctx, void *memctx,
+                          struct spill_block *blocks)
+{
+   u_worklist worklist;
+   u_worklist_init(&worklist, ctx->num_blocks, NULL);
+
+   agx_foreach_block(ctx, block) {
+      struct spill_block *sb = &blocks[block->index];
+
+      init_next_uses(&sb->next_use_in, memctx);
+      init_next_uses(&sb->next_use_out, memctx);
+
+      agx_foreach_instr_in_block(block, I) {
+         sb->cycles += instr_cycles(I);
+      }
+
+      agx_worklist_push_head(&worklist, block);
+   }
+
+   /* Definitions that have been seen */
+   BITSET_WORD *defined =
+      malloc(BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));
+
+   struct next_uses dists;
+   init_next_uses(&dists, NULL);
+
+   /* Iterate the work list in reverse order since liveness is backwards */
+   while (!u_worklist_is_empty(&worklist)) {
+      agx_block *blk = agx_worklist_pop_head(&worklist);
+      struct spill_block *sb = &blocks[blk->index];
+
+      /* Definitions that have been seen */
+      memset(defined, 0, BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));
+
+      /* Initialize all distances to infinity */
+      clear_next_uses(&dists);
+
+      uint32_t cycle = 0;
+
+      /* Calculate dists. Phis are handled separately. */
+      agx_foreach_instr_in_block(blk, I) {
+         if (I->op == AGX_OPCODE_PHI) {
+            cycle++;
+            continue;
+         }
+
+         /* Record first use before def. Phi sources are handled when
+          * propagating into predecessors below, because they logically
+          * happen in the predecessor.
+          */
+         agx_foreach_ssa_src(I, s) {
+            if (BITSET_TEST(defined, I->src[s].value))
+               continue;
+            if (search_next_uses(&dists, I->src[s].value) < DIST_INFINITY)
+               continue;
+
+            assert(I->src[s].value < ctx->alloc);
+            set_next_use(&dists, I->src[s].value, cycle);
+         }
+
+         /* Record defs */
+         agx_foreach_ssa_dest(I, d) {
+            assert(I->dest[d].value < ctx->alloc);
+            BITSET_SET(defined, I->dest[d].value);
+         }
+
+         cycle += instr_cycles(I);
+      }
+
+      /* Apply transfer function to get our entry state. */
+      foreach_next_use(&sb->next_use_out, node, dist) {
+         set_next_use(&sb->next_use_in, node, dist_sum(dist, sb->cycles));
+      }
+
+      foreach_next_use(&dists, node, dist) {
+         set_next_use(&sb->next_use_in, node, dist);
+      }
+
+      int i;
+      BITSET_FOREACH_SET(i, defined, ctx->alloc) {
+         set_next_use(&sb->next_use_in, i, DIST_INFINITY);
+      }
+
+      /* Propagate the live in of the successor (blk) to the live out of
+       * predecessors.
+       *
+       * Phi nodes are logically on the control flow edge and act in parallel.
+       * To handle this when propagating, we kill writes from phis and make
+       * live the corresponding sources.
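+       *
+       * The per-predecessor join is an element-wise minimum, so this loop is
+       * a standard backwards dataflow fixpoint: a predecessor is re-queued
+       * only when one of its next-use-out distances actually decreases.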
+       */
+      agx_foreach_predecessor(blk, pred) {
+         struct spill_block *sp = &blocks[(*pred)->index];
+         copy_next_uses(&dists, &sb->next_use_in);
+
+         /* Kill write */
+         agx_foreach_phi_in_block(blk, I) {
+            assert(I->dest[0].type == AGX_INDEX_NORMAL);
+            set_next_use(&dists, I->dest[0].value, DIST_INFINITY);
+         }
+
+         /* Make live the corresponding source */
+         agx_foreach_phi_in_block(blk, I) {
+            agx_index operand = I->src[agx_predecessor_index(blk, *pred)];
+            if (operand.type == AGX_INDEX_NORMAL)
+               set_next_use(&dists, operand.value, 0);
+         }
+
+         /* Join by taking minimum */
+         if (minimum_next_uses(&sp->next_use_out, &dists))
+            agx_worklist_push_tail(&worklist, *pred);
+      }
+   }
+
+   free(defined);
+   u_worklist_fini(&worklist);
+   destroy_next_uses(&dists);
+}
+
+static ATTRIBUTE_NOINLINE void
+validate_next_use_info(UNUSED agx_context *ctx,
+                       UNUSED struct spill_block *blocks)
+{
+#ifndef NDEBUG
+   int i;
+
+   agx_foreach_block(ctx, blk) {
+      struct spill_block *sb = &blocks[blk->index];
+
+      /* Invariant: next-use distance is finite iff the node is live */
+      BITSET_FOREACH_SET(i, blk->live_in, ctx->alloc)
+         assert(search_next_uses(&sb->next_use_in, i) < DIST_INFINITY);
+
+      BITSET_FOREACH_SET(i, blk->live_out, ctx->alloc)
+         assert(search_next_uses(&sb->next_use_out, i) < DIST_INFINITY);
+
+      foreach_next_use(&sb->next_use_in, i, _)
+         assert(BITSET_TEST(blk->live_in, i));
+
+      foreach_next_use(&sb->next_use_out, i, _)
+         assert(BITSET_TEST(blk->live_out, i));
+   }
+#endif
+}
+
+void
+agx_spill(agx_context *ctx, unsigned k)
+{
+   void *memctx = ralloc_context(NULL);
+
+   /* If control flow is used, we force the nesting counter (r0l) live
+    * throughout the shader. Just subtract that from our limit so we can forget
+    * about it while spilling.
+    */
+   if (ctx->any_cf)
+      k--;
+
+   uint8_t *channels = rzalloc_array(memctx, uint8_t, ctx->alloc);
+   dist_t *next_uses = rzalloc_array(memctx, dist_t, ctx->alloc);
+   enum agx_size *sizes = rzalloc_array(memctx, enum agx_size, ctx->alloc);
+   agx_instr **remat = rzalloc_array(memctx, agx_instr *, ctx->alloc);
+
+   agx_foreach_instr_global(ctx, I) {
+      if (I->op == AGX_OPCODE_MOV_IMM)
+         remat[I->dest[0].value] = I;
+
+      /* Measure vectors */
+      agx_foreach_ssa_dest(I, d) {
+         assert(sizes[I->dest[d].value] == 0 && "broken SSA");
+         assert(channels[I->dest[d].value] == 0 && "broken SSA");
+
+         sizes[I->dest[d].value] = I->dest[d].size;
+         channels[I->dest[d].value] = agx_channels(I->dest[d]);
+      }
+   }
+
+   struct spill_block *blocks =
+      rzalloc_array(memctx, struct spill_block, ctx->num_blocks);
+
+   /* Step 1. Compute global next-use distances */
+   global_next_use_distances(ctx, memctx, blocks);
+   validate_next_use_info(ctx, blocks);
+
+   /* Reserve a memory variable for every regular variable */
+   unsigned n = ctx->alloc;
+   ctx->alloc *= 2;
+
+   BITSET_WORD *W = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
+   BITSET_WORD *S = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
+
+   agx_foreach_block(ctx, block) {
+      memset(W, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD));
+      memset(S, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD));
+
+      struct spill_ctx sctx = {
+         .memctx = memctx,
+         .shader = ctx,
+         .n = n,
+         .channels = channels,
+         .size = sizes,
+         .remat = remat,
+         .next_uses = next_uses,
+         .block = block,
+         .blocks = blocks,
+         .k = k,
+         .W = W,
+         .S = S,
+         .spill_base = n,
+      };
+
+      compute_w_entry(&sctx);
+      compute_s_entry(&sctx);
+      min_algorithm(&sctx);
+   }
+
+   /* Now that all blocks are processed separately, stitch it together */
+   agx_foreach_block(ctx, block) {
+      struct spill_ctx sctx = {
+         .memctx = memctx,
+         .shader = ctx,
+         .n = n,
+         .channels = channels,
+         .size = sizes,
+         .remat = remat,
+         .block = block,
+         .blocks = blocks,
+         .k = k,
+         .W = W,
+         .S = S,
+         .spill_base = n,
+      };
+
+      agx_foreach_predecessor(block, pred) {
+         /* After spilling phi sources, insert coupling code */
+         insert_coupling_code(&sctx, *pred, block);
+      }
+   }
+
+   ralloc_free(memctx);
+
+   /* Spilling breaks SSA, so we need to repair before validating */
+   agx_repair_ssa(ctx);
+   agx_validate(ctx, "Spilling");
+
+   /* Remat can introduce dead code */
+   agx_dce(ctx, false);
+}
diff --git a/src/asahi/compiler/meson.build b/src/asahi/compiler/meson.build
index c2ef0ef24ac..46dc0eebb36 100644
--- a/src/asahi/compiler/meson.build
+++ b/src/asahi/compiler/meson.build
@@ -36,6 +36,7 @@ libasahi_agx_files = files(
   'agx_optimizer.c',
   'agx_repair_ssa.c',
   'agx_reindex_ssa.c',
+  'agx_spill.c',
   'agx_register_allocate.c',
   'agx_validate.c',
 )