diff --git a/src/panfrost/compiler/bifrost/IR_pseudo.xml b/src/panfrost/compiler/bifrost/IR_pseudo.xml
index 60264b39c09..b53d8f6d6c8 100644
--- a/src/panfrost/compiler/bifrost/IR_pseudo.xml
+++ b/src/panfrost/compiler/bifrost/IR_pseudo.xml
@@ -173,4 +173,9 @@
+
+
+
+
+
diff --git a/src/panfrost/compiler/bifrost/bi_lower_spill.c b/src/panfrost/compiler/bifrost/bi_lower_spill.c
new file mode 100644
index 00000000000..5d3a0198800
--- /dev/null
+++ b/src/panfrost/compiler/bifrost/bi_lower_spill.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2025 Arm Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler.h"
+
+struct lower_spill_ctx {
+ bi_context *shader;
+ /* If bi_index::memory, tls_loc[idx.value] is valid. */
+ uint32_t *tls_loc;
+ /* Bytes of spill memory used so far. */
+ uint32_t spill_count;
+};
+
+static void
+lower_memmov(struct lower_spill_ctx* ctx, bi_instr *I, uint32_t tls_base)
+{
+ assert(I->op == BI_OPCODE_MEMMOV);
+ assert(I->nr_srcs == 1 && I->nr_dests == 1);
+ assert(I->src[0].memory || I->dest[0].memory);
+
+ bi_builder b = bi_init_builder(ctx->shader, bi_before_instr(I));
+ const unsigned bits = 32;
+
+ bi_index src = I->src[0];
+ bi_index dst = I->dest[0];
+
+ if (I->src[0].memory) {
+ unsigned offset = ctx->tls_loc[src.value];
+ bi_load_tl(&b, bits, dst, tls_base + offset);
+ } else {
+ unsigned offset = ctx->tls_loc[dst.value];
+ bi_store_tl(&b, bits, src, tls_base + offset);
+ }
+
+ bi_remove_instruction(I);
+}
+
+static void
+lower_mem_phi(struct lower_spill_ctx* ctx, bi_instr *I, uint32_t tls_base)
+{
+ assert(I->op == BI_OPCODE_PHI);
+ assert(I->nr_dests == 1);
+
+ /* PHIs get lowered in bi_out_of_ssa, which expects memory operands to
+ * provide the actual TLS offset as bi_index::value.
+ */
+
+ if (I->dest[0].memory) {
+ const bi_index dst = I->dest[0];
+ I->dest[0].value = tls_base + ctx->tls_loc[dst.value];
+ }
+
+ bi_foreach_src(I, s) {
+ const bi_index src = I->src[s];
+ if (!src.memory)
+ continue;
+
+ assert(ctx->tls_loc[src.value] != UINT32_MAX && "Undefined source");
+ I->src[s].value = tls_base + ctx->tls_loc[src.value];
+ }
+}
+
+static void
+assign_tls_locations(struct lower_spill_ctx *ctx) {
+ bi_foreach_instr_global(ctx->shader, I) {
+ bi_foreach_ssa_dest(I, d) {
+ bi_index dst = I->dest[d];
+ if (!dst.memory)
+ continue;
+
+ assert(I->op == BI_OPCODE_MEMMOV || I->op == BI_OPCODE_PHI);
+ assert(ctx->tls_loc[dst.value] == UINT32_MAX && "Broken SSA");
+
+ ctx->tls_loc[dst.value] = ctx->spill_count;
+ ctx->spill_count += 4;
+ }
+ }
+}
+
+unsigned
+bi_lower_spill(bi_context* ctx, uint32_t tls_base) {
+ void* memctx = ralloc_context(NULL);
+
+ uint32_t *tls_loc = ralloc_array(memctx, uint32_t, ctx->ssa_alloc);
+ memset(tls_loc, 0xff, sizeof(uint32_t) * ctx->ssa_alloc);
+
+ struct lower_spill_ctx lctx = {
+ .shader = ctx,
+ .tls_loc = tls_loc,
+ .spill_count = 0,
+ };
+
+ assign_tls_locations(&lctx);
+
+ bi_foreach_instr_global_safe(ctx, I) {
+ switch (I->op) {
+ case BI_OPCODE_MEMMOV:
+ lower_memmov(&lctx, I, tls_base);
+ break;
+ case BI_OPCODE_PHI:
+ lower_mem_phi(&lctx, I, tls_base);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ralloc_free(memctx);
+
+ return lctx.spill_count;
+}
diff --git a/src/panfrost/compiler/bifrost/bi_ra.c b/src/panfrost/compiler/bifrost/bi_ra.c
index ecf070906b6..dd8175e8157 100644
--- a/src/panfrost/compiler/bifrost/bi_ra.c
+++ b/src/panfrost/compiler/bifrost/bi_ra.c
@@ -1214,7 +1214,9 @@ bi_register_allocate(bi_context *ctx)
if (ctx->inputs->is_blend)
UNREACHABLE("Blend shaders may not spill");
- spill_count = bi_spill_ssa(ctx, regs_to_use, spill_count);
+ bi_spill_ssa(ctx, regs_to_use);
+ spill_count += bi_lower_spill(ctx, spill_count);
+
/* By default, we use packed TLS addressing on Valhall.
* We cannot cross 16 byte boundaries with packed TLS
* addressing. Align to ensure this doesn't happen. This
diff --git a/src/panfrost/compiler/bifrost/bi_spill_ssa.c b/src/panfrost/compiler/bifrost/bi_spill_ssa.c
index 6b1d05e6249..6859aa0adc9 100644
--- a/src/panfrost/compiler/bifrost/bi_spill_ssa.c
+++ b/src/panfrost/compiler/bifrost/bi_spill_ssa.c
@@ -277,8 +277,8 @@ bi_along_edge(bi_block *pred, bi_block *succ)
return bi_before_block(succ);
}
-static bool bi_idx_is_memory(bi_index idx) {
-// return (idx.type == BI_INDEX_FAU);
+static inline bool
+bi_idx_is_memory(bi_index idx) {
return idx.memory;
}
@@ -286,32 +286,18 @@ static bi_index
bi_index_as_mem(bi_index idx, struct spill_ctx *ctx)
{
assert(idx.type == BI_INDEX_NORMAL);
- idx.type = BI_INDEX_FAU;
- unsigned val = idx.value;
+ assert(!idx.memory);
- assert(val < ctx->spill_max);
- if (ctx->spill_map[val] == 0xFFFFFFFFU) {
- uint32_t remap = ctx->spill_bytes;
- ctx->spill_bytes += 4;
- ctx->spill_map[val] = remap;
- unsigned i = (remap - ctx->spill_base)/4;
- assert(i < ctx->spill_max);
- ctx->mem_map[i] = val;
- }
- idx.value = ctx->spill_map[val];
idx.memory = true;
+ idx.value = ctx->spill_base + idx.value;
+
return idx;
}
static unsigned
chase_mem_index(bi_index ref, struct spill_ctx *ctx)
{
- unsigned val = ref.value;
- if (bi_idx_is_memory(ref)) {
- unsigned i = (val - ctx->spill_base)/4;
- return ctx->mem_map[i];
- }
- return val;
+ return bi_idx_is_memory(ref) ? (ref.value - ctx->spill_base) : ref.value;
}
static bi_index
@@ -416,9 +402,8 @@ insert_spill(bi_builder *b, struct spill_ctx *ctx, unsigned node)
if (!ctx->remat[node] && !BITSET_TEST(ctx->spill_map_store, node)) {
bi_index idx = reconstruct_index(ctx, node);
bi_index mem = bi_index_as_mem(idx, ctx);
- unsigned bits = 32;
- bi_store_tl(b, bits, idx, mem.value);
+ bi_memmov_to(b, mem, idx);
b->shader->spills++;
/* We only need the extra registers reserved if we actually spilled
@@ -442,9 +427,7 @@ insert_reload(struct spill_ctx *ctx, bi_block *block, bi_cursor cursor,
if (ctx->remat[node]) {
remat_to(&b, idx, ctx, node);
} else {
- bi_index mem = bi_index_as_mem(idx, ctx);
- unsigned bits = 32;
- bi_load_tl(&b, bits, idx, mem.value);
+ bi_memmov_to(&b, idx, bi_index_as_mem(idx, ctx));
b.shader->fills++;
}
}
@@ -551,7 +534,6 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ)
I->src[s].type == BI_INDEX_REGISTER);
bi_index gpr = bi_temp(ctx->shader);
- unsigned bits = 32;
assert(gpr.type == BI_INDEX_NORMAL);
if (ctx->arch >= 9 && I->src[s].type == BI_INDEX_CONSTANT) {
@@ -560,8 +542,9 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ)
bi_iadd_imm_i32_to(&b, gpr, zero, I->src[s].value);
} else
bi_mov_i32_to(&b, gpr, I->src[s]);
+
bi_index mem = bi_index_as_mem(gpr, ctx);
- bi_store_tl(&b, bits, gpr, mem.value);
+ bi_memmov_to(&b, mem, gpr);
I->src[s] = mem;
continue;
}
@@ -586,10 +569,9 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ)
unsigned node = I->src[s].value;
bi_index idx = reconstruct_index(ctx, node);
bi_index tmp = bi_temp(ctx->shader);
- unsigned bits = 32;
remat_to(&b, tmp, ctx, node);
- bi_store_tl(&b, bits, tmp, bi_index_as_mem(idx, ctx).value);
+ bi_memmov_to(&b, bi_index_as_mem(idx, ctx), tmp);
}
/* Use the spilled version */
@@ -1459,11 +1441,10 @@ record_ssa_defs(bi_context *ctx, bi_instr **defs, bi_block **blocks)
* returns number of registers spilled
*/
-unsigned
-bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
+void
+bi_spill_ssa(bi_context *ctx, unsigned k)
{
void *memctx = ralloc_context(NULL);
- unsigned spill_count = spill_base;
unsigned max_temps = MIN_TEMPS_FOR_SPILL;
/* calculate how many temporaries we may need */
@@ -1498,8 +1479,11 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
global_next_use_distances(ctx, memctx, blocks);
validate_next_use_info(ctx, blocks);
+ /* Reserve a memory variable for every regular variable */
+ const uint32_t n = ctx->ssa_alloc;
+ ctx->ssa_alloc *= 2;
+
/* we may need to allocate some temporaries for spilling PHIs, hence the max_temps */
- unsigned n = ctx->ssa_alloc + max_temps;
BITSET_WORD *W = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
BITSET_WORD *S = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
uint32_t *spill_map = ralloc_array(memctx, uint32_t, n);
@@ -1524,7 +1508,7 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
struct spill_ctx sctx = {
.memctx = memctx,
.shader = ctx,
- .n_alloc = ctx->ssa_alloc,
+ .n_alloc = n,
.remat = remat,
.next_uses = next_uses,
.block = block,
@@ -1533,10 +1517,10 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
.W = W,
.S = S,
.size = sizes,
- .spill_max = n,
- .spill_base = spill_base,
+ .spill_max = 2*n,
+ .spill_base = n,
.spill_map = spill_map,
- .spill_bytes = spill_count,
+ .spill_bytes = 0,
.spill_map_store = spill_map_store,
.mem_map = mem_map,
.ssa_defs = ssa_defs,
@@ -1548,7 +1532,6 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
compute_w_entry(&sctx);
compute_s_entry(&sctx);
min_algorithm(&sctx);
- spill_count = MAX2(spill_count, sctx.spill_bytes);
}
/* Now that all blocks are processed separately, stitch it together */
@@ -1556,7 +1539,7 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
struct spill_ctx sctx = {
.memctx = memctx,
.shader = ctx,
- .n_alloc = ctx->ssa_alloc,
+ .n_alloc = n,
.remat = remat,
.block = block,
.blocks = blocks,
@@ -1564,10 +1547,10 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
.W = W,
.S = S,
.size = sizes,
- .spill_max = n,
- .spill_base = spill_base,
+ .spill_max = 2*n,
+ .spill_base = n,
.spill_map = spill_map,
- .spill_bytes = spill_count,
+ .spill_bytes = 0,
.spill_map_store = spill_map_store,
.mem_map = mem_map,
.ssa_defs = ssa_defs,
@@ -1580,12 +1563,9 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
/* After spilling phi sources, insert coupling code */
insert_coupling_code(&sctx, *pred, block);
}
- spill_count = MAX2(spill_count, sctx.spill_bytes);
}
ralloc_free(memctx);
bi_repair_ssa(ctx);
-
- return spill_count;
}
diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h
index b3fb580d37c..0167f50af1c 100644
--- a/src/panfrost/compiler/bifrost/compiler.h
+++ b/src/panfrost/compiler/bifrost/compiler.h
@@ -1491,14 +1491,17 @@ uint64_t MUST_CHECK bi_postra_liveness_ins(uint64_t live, bi_instr *ins);
/* Record sizes of SSA values into the provided array. */
void bi_record_sizes(bi_context *ctx, uint32_t *sizes);
-/* SSA spilling; returns number of spilled registers */
-unsigned bi_spill_ssa(bi_context *ctx, unsigned num_registers, unsigned tls_size);
+/* SSA spilling */
+void bi_spill_ssa(bi_context *ctx, unsigned num_registers);
void bi_repair_ssa(bi_context *ctx);
/* Reindex SSA to reduce memory usage */
void bi_reindex_ssa(bi_context *ctx);
+/* Lower memory operands created during spilling. */
+unsigned bi_lower_spill(bi_context* ctx, uint32_t tls_base);
+
/* Layout */
signed bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target);
diff --git a/src/panfrost/compiler/bifrost/meson.build b/src/panfrost/compiler/bifrost/meson.build
index 3ae3308f319..6556b716ac5 100644
--- a/src/panfrost/compiler/bifrost/meson.build
+++ b/src/panfrost/compiler/bifrost/meson.build
@@ -11,6 +11,7 @@ libpanfrost_bifrost_files = files(
'bi_layout.c',
'bi_liveness.c',
'bi_lower_divergent_indirects.c',
+ 'bi_lower_spill.c',
'bi_lower_swizzle.c',
'bi_print.c',
'bi_opt_control_flow.c',