diff --git a/src/panfrost/compiler/bifrost/IR_pseudo.xml b/src/panfrost/compiler/bifrost/IR_pseudo.xml index 60264b39c09..b53d8f6d6c8 100644 --- a/src/panfrost/compiler/bifrost/IR_pseudo.xml +++ b/src/panfrost/compiler/bifrost/IR_pseudo.xml @@ -173,4 +173,9 @@ + + + + + diff --git a/src/panfrost/compiler/bifrost/bi_lower_spill.c b/src/panfrost/compiler/bifrost/bi_lower_spill.c new file mode 100644 index 00000000000..5d3a0198800 --- /dev/null +++ b/src/panfrost/compiler/bifrost/bi_lower_spill.c @@ -0,0 +1,113 @@ +/* + * Copyright 2025 Arm Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "compiler.h" + /* Working state shared by the spill-lowering helpers in this file. */ +struct lower_spill_ctx { + bi_context *shader; + /* If bi_index::memory, tls_loc[idx.value] is valid. */ + uint32_t *tls_loc; + /* Bytes of spill memory used so far. */ + uint32_t spill_count; +}; + /* Replace one MEMMOV with a 32-bit TLS access at tls_base + tls_loc[value] of its memory operand: bi_load_tl when the source is memory, bi_store_tl otherwise. The new instruction is built before I, then I is removed. NOTE(review): the assert below accepts a MEMMOV whose source and destination are both memory, in which case the load path is taken -- confirm such moves are never created. */ +static void +lower_memmov(struct lower_spill_ctx* ctx, bi_instr *I, uint32_t tls_base) +{ + assert(I->op == BI_OPCODE_MEMMOV); + assert(I->nr_srcs == 1 && I->nr_dests == 1); + assert(I->src[0].memory || I->dest[0].memory); + + bi_builder b = bi_init_builder(ctx->shader, bi_before_instr(I)); + const unsigned bits = 32; /* width passed to both the load and the store */ + + bi_index src = I->src[0]; + bi_index dst = I->dest[0]; + + if (I->src[0].memory) { /* memory source: load the slot into dst */ + unsigned offset = ctx->tls_loc[src.value]; + bi_load_tl(&b, bits, dst, tls_base + offset); + } else { /* memory destination: store src into the slot */ + unsigned offset = ctx->tls_loc[dst.value]; + bi_store_tl(&b, bits, src, tls_base + offset); + } + + bi_remove_instruction(I); +} + /* Rewrite the memory operands of a PHI in place so that their value field becomes tls_base + tls_loc[old value]. Operands without the memory flag are left untouched and the PHI itself is kept. */ +static void +lower_mem_phi(struct lower_spill_ctx* ctx, bi_instr *I, uint32_t tls_base) +{ + assert(I->op == BI_OPCODE_PHI); + assert(I->nr_dests == 1); + + /* PHIs get lowered in bi_out_of_ssa, which expects memory operands to + * provide the actual TLS offset as bi_index::value. 
+ */ + + if (I->dest[0].memory) { + const bi_index dst = I->dest[0]; + I->dest[0].value = tls_base + ctx->tls_loc[dst.value]; + } + + bi_foreach_src(I, s) { + const bi_index src = I->src[s]; + if (!src.memory) + continue; + + assert(ctx->tls_loc[src.value] != UINT32_MAX && "Undefined source"); + I->src[s].value = tls_base + ctx->tls_loc[src.value]; + } +} + /* Walk every instruction of the shader and give each memory destination its own slot: tls_loc[value] receives the current spill_count, which is then advanced by 4. Asserts that only MEMMOV and PHI define memory values and that no value is assigned twice. */ +static void +assign_tls_locations(struct lower_spill_ctx *ctx) { + bi_foreach_instr_global(ctx->shader, I) { + bi_foreach_ssa_dest(I, d) { + bi_index dst = I->dest[d]; + if (!dst.memory) + continue; + + assert(I->op == BI_OPCODE_MEMMOV || I->op == BI_OPCODE_PHI); + assert(ctx->tls_loc[dst.value] == UINT32_MAX && "Broken SSA"); + + ctx->tls_loc[dst.value] = ctx->spill_count; + ctx->spill_count += 4; /* slot offsets advance in 4-byte steps */ + } + } +} + /* Entry point: assigns a slot to every memory destination, then lowers each MEMMOV and PHI, adding tls_base to every slot offset. Returns the number of bytes assigned by this call, counted from zero (tls_base is not included in the result). */ +unsigned +bi_lower_spill(bi_context* ctx, uint32_t tls_base) { + void* memctx = ralloc_context(NULL); + + uint32_t *tls_loc = ralloc_array(memctx, uint32_t, ctx->ssa_alloc); + memset(tls_loc, 0xff, sizeof(uint32_t) * ctx->ssa_alloc); /* all-ones bytes: every entry starts as UINT32_MAX, the "unassigned" marker tested by the asserts */ + + struct lower_spill_ctx lctx = { + .shader = ctx, + .tls_loc = tls_loc, + .spill_count = 0, + }; + + assign_tls_locations(&lctx); + /* presumably the _safe iterator is used because lower_memmov removes the instruction being visited -- confirm */ + bi_foreach_instr_global_safe(ctx, I) { + switch (I->op) { + case BI_OPCODE_MEMMOV: + lower_memmov(&lctx, I, tls_base); + break; + case BI_OPCODE_PHI: + lower_mem_phi(&lctx, I, tls_base); + break; + default: + break; + } + } + + ralloc_free(memctx); + + return lctx.spill_count; +} diff --git a/src/panfrost/compiler/bifrost/bi_ra.c b/src/panfrost/compiler/bifrost/bi_ra.c index ecf070906b6..dd8175e8157 100644 --- a/src/panfrost/compiler/bifrost/bi_ra.c +++ b/src/panfrost/compiler/bifrost/bi_ra.c @@ -1214,7 +1214,9 @@ bi_register_allocate(bi_context *ctx) if (ctx->inputs->is_blend) UNREACHABLE("Blend shaders may not spill"); - spill_count = bi_spill_ssa(ctx, regs_to_use, spill_count); + bi_spill_ssa(ctx, regs_to_use); + spill_count += bi_lower_spill(ctx, spill_count); + /* By default, we use packed TLS addressing on Valhall. 
* We cannot cross 16 byte boundaries with packed TLS * addressing. Align to ensure this doesn't happen. This diff --git a/src/panfrost/compiler/bifrost/bi_spill_ssa.c b/src/panfrost/compiler/bifrost/bi_spill_ssa.c index 6b1d05e6249..6859aa0adc9 100644 --- a/src/panfrost/compiler/bifrost/bi_spill_ssa.c +++ b/src/panfrost/compiler/bifrost/bi_spill_ssa.c @@ -277,8 +277,8 @@ bi_along_edge(bi_block *pred, bi_block *succ) return bi_before_block(succ); } -static bool bi_idx_is_memory(bi_index idx) { -// return (idx.type == BI_INDEX_FAU); +static inline bool +bi_idx_is_memory(bi_index idx) { return idx.memory; } @@ -286,32 +286,18 @@ static bi_index bi_index_as_mem(bi_index idx, struct spill_ctx *ctx) { assert(idx.type == BI_INDEX_NORMAL); - idx.type = BI_INDEX_FAU; - unsigned val = idx.value; + assert(!idx.memory); - assert(val < ctx->spill_max); - if (ctx->spill_map[val] == 0xFFFFFFFFU) { - uint32_t remap = ctx->spill_bytes; - ctx->spill_bytes += 4; - ctx->spill_map[val] = remap; - unsigned i = (remap - ctx->spill_base)/4; - assert(i < ctx->spill_max); - ctx->mem_map[i] = val; - } - idx.value = ctx->spill_map[val]; idx.memory = true; + idx.value = ctx->spill_base + idx.value; + return idx; } static unsigned chase_mem_index(bi_index ref, struct spill_ctx *ctx) { - unsigned val = ref.value; - if (bi_idx_is_memory(ref)) { - unsigned i = (val - ctx->spill_base)/4; - return ctx->mem_map[i]; - } - return val; + return bi_idx_is_memory(ref) ? 
(ref.value - ctx->spill_base) : ref.value; } static bi_index @@ -416,9 +402,8 @@ insert_spill(bi_builder *b, struct spill_ctx *ctx, unsigned node) if (!ctx->remat[node] && !BITSET_TEST(ctx->spill_map_store, node)) { bi_index idx = reconstruct_index(ctx, node); bi_index mem = bi_index_as_mem(idx, ctx); - unsigned bits = 32; - bi_store_tl(b, bits, idx, mem.value); + bi_memmov_to(b, mem, idx); b->shader->spills++; /* We only need the extra registers reserved if we actually spilled @@ -442,9 +427,7 @@ insert_reload(struct spill_ctx *ctx, bi_block *block, bi_cursor cursor, if (ctx->remat[node]) { remat_to(&b, idx, ctx, node); } else { - bi_index mem = bi_index_as_mem(idx, ctx); - unsigned bits = 32; - bi_load_tl(&b, bits, idx, mem.value); + bi_memmov_to(&b, idx, bi_index_as_mem(idx, ctx)); b.shader->fills++; } } @@ -551,7 +534,6 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ) I->src[s].type == BI_INDEX_REGISTER); bi_index gpr = bi_temp(ctx->shader); - unsigned bits = 32; assert(gpr.type == BI_INDEX_NORMAL); if (ctx->arch >= 9 && I->src[s].type == BI_INDEX_CONSTANT) { @@ -560,8 +542,9 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ) bi_iadd_imm_i32_to(&b, gpr, zero, I->src[s].value); } else bi_mov_i32_to(&b, gpr, I->src[s]); + bi_index mem = bi_index_as_mem(gpr, ctx); - bi_store_tl(&b, bits, gpr, mem.value); + bi_memmov_to(&b, mem, gpr); I->src[s] = mem; continue; } @@ -586,10 +569,9 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ) unsigned node = I->src[s].value; bi_index idx = reconstruct_index(ctx, node); bi_index tmp = bi_temp(ctx->shader); - unsigned bits = 32; remat_to(&b, tmp, ctx, node); - bi_store_tl(&b, bits, tmp, bi_index_as_mem(idx, ctx).value); + bi_memmov_to(&b, bi_index_as_mem(idx, ctx), tmp); } /* Use the spilled version */ @@ -1459,11 +1441,10 @@ record_ssa_defs(bi_context *ctx, bi_instr **defs, bi_block **blocks) * returns number of registers spilled */ -unsigned 
-bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) +void +bi_spill_ssa(bi_context *ctx, unsigned k) { void *memctx = ralloc_context(NULL); - unsigned spill_count = spill_base; unsigned max_temps = MIN_TEMPS_FOR_SPILL; /* calculate how many temporaries we may need */ @@ -1498,8 +1479,11 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) global_next_use_distances(ctx, memctx, blocks); validate_next_use_info(ctx, blocks); + /* Reserve a memory variable for every regular variable */ + const uint32_t n = ctx->ssa_alloc; + ctx->ssa_alloc *= 2; + /* we may need to allocate some temporaries for spilling PHIs, hence the max_temps */ - unsigned n = ctx->ssa_alloc + max_temps; BITSET_WORD *W = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n)); BITSET_WORD *S = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n)); uint32_t *spill_map = ralloc_array(memctx, uint32_t, n); @@ -1524,7 +1508,7 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) struct spill_ctx sctx = { .memctx = memctx, .shader = ctx, - .n_alloc = ctx->ssa_alloc, + .n_alloc = n, .remat = remat, .next_uses = next_uses, .block = block, @@ -1533,10 +1517,10 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) .W = W, .S = S, .size = sizes, - .spill_max = n, - .spill_base = spill_base, + .spill_max = 2*n, + .spill_base = n, .spill_map = spill_map, - .spill_bytes = spill_count, + .spill_bytes = 0, .spill_map_store = spill_map_store, .mem_map = mem_map, .ssa_defs = ssa_defs, @@ -1548,7 +1532,6 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) compute_w_entry(&sctx); compute_s_entry(&sctx); min_algorithm(&sctx); - spill_count = MAX2(spill_count, sctx.spill_bytes); } /* Now that all blocks are processed separately, stitch it together */ @@ -1556,7 +1539,7 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) struct spill_ctx sctx = { .memctx = memctx, .shader = ctx, - .n_alloc = ctx->ssa_alloc, + .n_alloc = n, .remat = remat, .block = block, 
.blocks = blocks, @@ -1564,10 +1547,10 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) .W = W, .S = S, .size = sizes, - .spill_max = n, - .spill_base = spill_base, + .spill_max = 2*n, + .spill_base = n, .spill_map = spill_map, - .spill_bytes = spill_count, + .spill_bytes = 0, .spill_map_store = spill_map_store, .mem_map = mem_map, .ssa_defs = ssa_defs, @@ -1580,12 +1563,9 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base) /* After spilling phi sources, insert coupling code */ insert_coupling_code(&sctx, *pred, block); } - spill_count = MAX2(spill_count, sctx.spill_bytes); } ralloc_free(memctx); bi_repair_ssa(ctx); - - return spill_count; } diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h index b3fb580d37c..0167f50af1c 100644 --- a/src/panfrost/compiler/bifrost/compiler.h +++ b/src/panfrost/compiler/bifrost/compiler.h @@ -1491,14 +1491,17 @@ uint64_t MUST_CHECK bi_postra_liveness_ins(uint64_t live, bi_instr *ins); /* Record sizes of SSA values into the provided array. */ void bi_record_sizes(bi_context *ctx, uint32_t *sizes); -/* SSA spilling; returns number of spilled registers */ -unsigned bi_spill_ssa(bi_context *ctx, unsigned num_registers, unsigned tls_size); +/* SSA spilling */ +void bi_spill_ssa(bi_context *ctx, unsigned num_registers); void bi_repair_ssa(bi_context *ctx); /* Reindex SSA to reduce memory usage */ void bi_reindex_ssa(bi_context *ctx); +/* Lower memory operands created during spilling. 
*/ +unsigned bi_lower_spill(bi_context* ctx, uint32_t tls_base); + /* Layout */ signed bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target); diff --git a/src/panfrost/compiler/bifrost/meson.build b/src/panfrost/compiler/bifrost/meson.build index 3ae3308f319..6556b716ac5 100644 --- a/src/panfrost/compiler/bifrost/meson.build +++ b/src/panfrost/compiler/bifrost/meson.build @@ -11,6 +11,7 @@ libpanfrost_bifrost_files = files( 'bi_layout.c', 'bi_liveness.c', 'bi_lower_divergent_indirects.c', + 'bi_lower_spill.c', 'bi_lower_swizzle.c', 'bi_print.c', 'bi_opt_control_flow.c',