From 79c786ea6efb23545aab7ed293bd6f424a2a256c Mon Sep 17 00:00:00 2001 From: Christoph Pillmayer Date: Sun, 14 Dec 2025 19:40:06 +0100 Subject: [PATCH] pan/bi: Abstract away spills/fills when spilling We want to lower spills and fills to actual LS instructions only after SSA register allocation which future commits will add. For now, we just lower the new MEMMOV instr immediately. Reviewed-by: Eric R. Smith Part-of: --- src/panfrost/compiler/bifrost/IR_pseudo.xml | 5 + .../compiler/bifrost/bi_lower_spill.c | 113 ++++++++++++++++++ src/panfrost/compiler/bifrost/bi_ra.c | 4 +- src/panfrost/compiler/bifrost/bi_spill_ssa.c | 70 ++++------- src/panfrost/compiler/bifrost/compiler.h | 7 +- src/panfrost/compiler/bifrost/meson.build | 1 + 6 files changed, 152 insertions(+), 48 deletions(-) create mode 100644 src/panfrost/compiler/bifrost/bi_lower_spill.c diff --git a/src/panfrost/compiler/bifrost/IR_pseudo.xml b/src/panfrost/compiler/bifrost/IR_pseudo.xml index 60264b39c09..b53d8f6d6c8 100644 --- a/src/panfrost/compiler/bifrost/IR_pseudo.xml +++ b/src/panfrost/compiler/bifrost/IR_pseudo.xml @@ -173,4 +173,9 @@ + + + + + diff --git a/src/panfrost/compiler/bifrost/bi_lower_spill.c b/src/panfrost/compiler/bifrost/bi_lower_spill.c new file mode 100644 index 00000000000..5d3a0198800 --- /dev/null +++ b/src/panfrost/compiler/bifrost/bi_lower_spill.c @@ -0,0 +1,113 @@ +/* + * Copyright 2025 Arm Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "compiler.h" + +struct lower_spill_ctx { + bi_context *shader; + /* If bi_index::memory, tls_loc[idx.value] is valid. */ + uint32_t *tls_loc; + /* Bytes of spill memory used so far. */ + uint32_t spill_count; +}; + +static void +lower_memmov(struct lower_spill_ctx* ctx, bi_instr *I, uint32_t tls_base) +{ + assert(I->op == BI_OPCODE_MEMMOV); + assert(I->nr_srcs == 1 && I->nr_dests == 1); + assert(I->src[0].memory || I->dest[0].memory); + + bi_builder b = bi_init_builder(ctx->shader, bi_before_instr(I)); + const unsigned bits = 32; + + bi_index src = I->src[0]; + bi_index dst = I->dest[0]; + + if (I->src[0].memory) { + unsigned offset = ctx->tls_loc[src.value]; + bi_load_tl(&b, bits, dst, tls_base + offset); + } else { + unsigned offset = ctx->tls_loc[dst.value]; + bi_store_tl(&b, bits, src, tls_base + offset); + } + + bi_remove_instruction(I); +} + +static void +lower_mem_phi(struct lower_spill_ctx* ctx, bi_instr *I, uint32_t tls_base) +{ + assert(I->op == BI_OPCODE_PHI); + assert(I->nr_dests == 1); + + /* PHIs get lowered in bi_out_of_ssa, which expects memory operands to + * provide the actual TLS offset as bi_index::value. + */ + + if (I->dest[0].memory) { + const bi_index dst = I->dest[0]; + I->dest[0].value = tls_base + ctx->tls_loc[dst.value]; + } + + bi_foreach_src(I, s) { + const bi_index src = I->src[s]; + if (!src.memory) + continue; + + assert(ctx->tls_loc[src.value] != UINT32_MAX && "Undefined source"); + I->src[s].value = tls_base + ctx->tls_loc[src.value]; + } +} + +static void +assign_tls_locations(struct lower_spill_ctx *ctx) { + bi_foreach_instr_global(ctx->shader, I) { + bi_foreach_ssa_dest(I, d) { + bi_index dst = I->dest[d]; + if (!dst.memory) + continue; + + assert(I->op == BI_OPCODE_MEMMOV || I->op == BI_OPCODE_PHI); + assert(ctx->tls_loc[dst.value] == UINT32_MAX && "Broken SSA"); + + ctx->tls_loc[dst.value] = ctx->spill_count; + ctx->spill_count += 4; + } + } +} + +unsigned +bi_lower_spill(bi_context* ctx, uint32_t tls_base) { + void* memctx = ralloc_context(NULL); + + uint32_t *tls_loc = ralloc_array(memctx, uint32_t, ctx->ssa_alloc); + memset(tls_loc, 0xff, sizeof(uint32_t) * ctx->ssa_alloc); + + struct lower_spill_ctx lctx = { + .shader = ctx, + .tls_loc = tls_loc, + .spill_count = 0, + }; + + assign_tls_locations(&lctx); + + bi_foreach_instr_global_safe(ctx, I) { + switch (I->op) { + case BI_OPCODE_MEMMOV: + lower_memmov(&lctx, I, tls_base); + break; + case BI_OPCODE_PHI: + lower_mem_phi(&lctx, I, tls_base); + break; + default: + break; + } + } + + ralloc_free(memctx); + + return lctx.spill_count; +} diff --git a/src/panfrost/compiler/bifrost/bi_ra.c b/src/panfrost/compiler/bifrost/bi_ra.c index ecf070906b6..dd8175e8157 100644 --- a/src/panfrost/compiler/bifrost/bi_ra.c +++ b/src/panfrost/compiler/bifrost/bi_ra.c @@ -1214,7 +1214,9 @@ bi_register_allocate(bi_context *ctx) if (ctx->inputs->is_blend) UNREACHABLE("Blend shaders may not spill"); - spill_count = bi_spill_ssa(ctx, regs_to_use, spill_count); + bi_spill_ssa(ctx, regs_to_use); + spill_count += bi_lower_spill(ctx, spill_count); + /* By default, we use packed TLS addressing on Valhall. * We cannot cross 16 byte boundaries with packed TLS * addressing. Align to ensure this doesn't happen. This diff --git a/src/panfrost/compiler/bifrost/bi_spill_ssa.c b/src/panfrost/compiler/bifrost/bi_spill_ssa.c index 6b1d05e6249..6859aa0adc9 100644 --- a/src/panfrost/compiler/bifrost/bi_spill_ssa.c +++ b/src/panfrost/compiler/bifrost/bi_spill_ssa.c @@ -277,8 +277,8 @@ bi_along_edge(bi_block *pred, bi_block *succ) return bi_before_block(succ); } -static bool bi_idx_is_memory(bi_index idx) { -// return (idx.type == BI_INDEX_FAU); +static inline bool +bi_idx_is_memory(bi_index idx) { return idx.memory; } @@ -286,32 +286,18 @@ static bi_index bi_index_as_mem(bi_index idx, struct spill_ctx *ctx) { assert(idx.type == BI_INDEX_NORMAL); - idx.type = BI_INDEX_FAU; - unsigned val = idx.value; + assert(!idx.memory); - assert(val < ctx->spill_max); - if (ctx->spill_map[val] == 0xFFFFFFFFU) { - uint32_t remap = ctx->spill_bytes; - ctx->spill_bytes += 4; - ctx->spill_map[val] = remap; - unsigned i = (remap - ctx->spill_base)/4; - assert(i < ctx->spill_max); - ctx->mem_map[i] = val; - } - idx.value = ctx->spill_map[val]; idx.memory = true; + idx.value = ctx->spill_base + idx.value; + return idx; } static unsigned chase_mem_index(bi_index ref, struct spill_ctx *ctx) { - unsigned val = ref.value; - if (bi_idx_is_memory(ref)) { - unsigned i = (val - ctx->spill_base)/4; - return ctx->mem_map[i]; - } - return val; + return bi_idx_is_memory(ref) ? (ref.value - ctx->spill_base) : ref.value; } static bi_index @@ -416,9 +402,8 @@ insert_spill(bi_builder *b, struct spill_ctx *ctx, unsigned node) if (!ctx->remat[node] && !BITSET_TEST(ctx->spill_map_store, node)) { bi_index idx = reconstruct_index(ctx, node); bi_index mem = bi_index_as_mem(idx, ctx); - unsigned bits = 32; - bi_store_tl(b, bits, idx, mem.value); + bi_memmov_to(b, mem, idx); b->shader->spills++; /* We only need the extra registers reserved if we actually spilled @@ -442,9 +427,7 @@ insert_reload(struct spill_ctx *ctx, bi_block *block, bi_cursor cursor, if (ctx->remat[node]) { remat_to(&b, idx, ctx, node); } else { - bi_index mem = bi_index_as_mem(idx, ctx); - unsigned bits = 32; - bi_load_tl(&b, bits, idx, mem.value); + bi_memmov_to(&b, idx, bi_index_as_mem(idx, ctx)); b.shader->fills++; } } @@ -551,7 +534,6 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ) I->src[s].type == BI_INDEX_REGISTER); bi_index gpr = bi_temp(ctx->shader); - unsigned bits = 32; assert(gpr.type == BI_INDEX_NORMAL); if (ctx->arch >= 9 && I->src[s].type == BI_INDEX_CONSTANT) { @@ -560,8 +542,9 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ) bi_iadd_imm_i32_to(&b, gpr, zero, I->src[s].value); } else bi_mov_i32_to(&b, gpr, I->src[s]); + bi_index mem = bi_index_as_mem(gpr, ctx); - bi_store_tl(&b, bits, gpr, mem.value); + bi_memmov_to(&b, mem, gpr); I->src[s] = mem; continue; } @@ -586,10 +569,9 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ) unsigned node = I->src[s].value; bi_index idx = reconstruct_index(ctx, node); bi_index tmp = bi_temp(ctx->shader); - unsigned bits = 32; remat_to(&b, tmp, ctx, node); - bi_store_tl(&b, bits, tmp, bi_index_as_mem(idx, ctx).value); + bi_memmov_to(&b, bi_index_as_mem(idx, ctx), tmp); } /* Use the spilled version */ @@ -1459,11 +1441,10 @@ record_ssa_defs(bi_context *ctx, bi_instr **defs, bi_block **blocks) * returns number of registers spilled */ -unsigned -bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) +void +bi_spill_ssa(bi_context *ctx, unsigned k) { void *memctx = ralloc_context(NULL); - unsigned spill_count = spill_base; unsigned max_temps = MIN_TEMPS_FOR_SPILL; /* calculate how many temporaries we may need */ @@ -1498,8 +1479,11 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) global_next_use_distances(ctx, memctx, blocks); validate_next_use_info(ctx, blocks); + /* Reserve a memory variable for every regular variable */ + const uint32_t n = ctx->ssa_alloc; + ctx->ssa_alloc *= 2; + /* we may need to allocate some temporaries for spilling PHIs, hence the max_temps */ - unsigned n = ctx->ssa_alloc + max_temps; BITSET_WORD *W = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n)); BITSET_WORD *S = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n)); uint32_t *spill_map = ralloc_array(memctx, uint32_t, n); @@ -1524,7 +1508,7 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) struct spill_ctx sctx = { .memctx = memctx, .shader = ctx, - .n_alloc = ctx->ssa_alloc, + .n_alloc = n, .remat = remat, .next_uses = next_uses, .block = block, @@ -1533,10 +1517,10 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) .W = W, .S = S, .size = sizes, - .spill_max = n, - .spill_base = spill_base, + .spill_max = 2*n, + .spill_base = n, .spill_map = spill_map, - .spill_bytes = spill_count, + .spill_bytes = 0, .spill_map_store = spill_map_store, .mem_map = mem_map, .ssa_defs = ssa_defs, @@ -1548,7 +1532,6 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) compute_w_entry(&sctx); compute_s_entry(&sctx); min_algorithm(&sctx); - spill_count = MAX2(spill_count, sctx.spill_bytes); } /* Now that all blocks are processed separately, stitch it together */ @@ -1556,7 +1539,7 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) struct spill_ctx sctx = { .memctx = memctx, .shader = ctx, - .n_alloc = ctx->ssa_alloc, + .n_alloc = n, .remat = remat, .block = block, .blocks = blocks, @@ -1564,10 +1547,10 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) .W = W, .S = S, .size = sizes, - .spill_max = n, - .spill_base = spill_base, + .spill_max = 2*n, + .spill_base = n, .spill_map = spill_map, - .spill_bytes = spill_count, + .spill_bytes = 0, .spill_map_store = spill_map_store, .mem_map = mem_map, .ssa_defs = ssa_defs, @@ -1580,12 +1563,9 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) /* After spilling phi sources, insert coupling code */ insert_coupling_code(&sctx, *pred, block); } - spill_count = MAX2(spill_count, sctx.spill_bytes); } ralloc_free(memctx); bi_repair_ssa(ctx); - - return spill_count; } diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h index b3fb580d37c..0167f50af1c 100644 --- a/src/panfrost/compiler/bifrost/compiler.h +++ b/src/panfrost/compiler/bifrost/compiler.h @@ -1491,14 +1491,17 @@ uint64_t MUST_CHECK bi_postra_liveness_ins(uint64_t live, bi_instr *ins); /* Record sizes of SSA values into the provided array. */ void bi_record_sizes(bi_context *ctx, uint32_t *sizes); -/* SSA spilling; returns number of spilled registers */ -unsigned bi_spill_ssa(bi_context *ctx, unsigned num_registers, unsigned tls_size); +/* SSA spilling */ +void bi_spill_ssa(bi_context *ctx, unsigned num_registers); void bi_repair_ssa(bi_context *ctx); /* Reindex SSA to reduce memory usage */ void bi_reindex_ssa(bi_context *ctx); +/* Lower memory operands created during spilling. */ +unsigned bi_lower_spill(bi_context* ctx, uint32_t tls_base); + /* Layout */ signed bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target); diff --git a/src/panfrost/compiler/bifrost/meson.build b/src/panfrost/compiler/bifrost/meson.build index 3ae3308f319..6556b716ac5 100644 --- a/src/panfrost/compiler/bifrost/meson.build +++ b/src/panfrost/compiler/bifrost/meson.build @@ -11,6 +11,7 @@ libpanfrost_bifrost_files = files( 'bi_layout.c', 'bi_liveness.c', 'bi_lower_divergent_indirects.c', + 'bi_lower_spill.c', 'bi_lower_swizzle.c', 'bi_print.c', 'bi_opt_control_flow.c',