pan/bi: Abstract away spills/fills when spilling

We want to lower spills and fills to actual LS instructions only
after SSA register allocation which future commits will add.
For now, we just lower the new MEMMOV instr immediately.

Reviewed-by: Eric R. Smith <eric.smith@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39305>
This commit is contained in:
Christoph Pillmayer 2025-12-14 19:40:06 +01:00 committed by Marge Bot
parent 0e55a87901
commit 79c786ea6e
6 changed files with 152 additions and 48 deletions

View file

@ -173,4 +173,9 @@
<src start="0"/>
</ins>
<!-- To model spills / fills -->
<ins name="MEMMOV" pseudo="true" unit="add">
<src start="0"/>
</ins>
</bifrost>

View file

@ -0,0 +1,113 @@
/*
* Copyright 2025 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
#include "compiler.h"
struct lower_spill_ctx {
bi_context *shader;
/* If bi_index::memory, tls_loc[idx.value] is valid. */
uint32_t *tls_loc;
/* Bytes of spill memory used so far. */
uint32_t spill_count;
};
static void
lower_memmov(struct lower_spill_ctx* ctx, bi_instr *I, uint32_t tls_base)
{
assert(I->op == BI_OPCODE_MEMMOV);
assert(I->nr_srcs == 1 && I->nr_dests == 1);
assert(I->src[0].memory || I->dest[0].memory);
bi_builder b = bi_init_builder(ctx->shader, bi_before_instr(I));
const unsigned bits = 32;
bi_index src = I->src[0];
bi_index dst = I->dest[0];
if (I->src[0].memory) {
unsigned offset = ctx->tls_loc[src.value];
bi_load_tl(&b, bits, dst, tls_base + offset);
} else {
unsigned offset = ctx->tls_loc[dst.value];
bi_store_tl(&b, bits, src, tls_base + offset);
}
bi_remove_instruction(I);
}
static void
lower_mem_phi(struct lower_spill_ctx* ctx, bi_instr *I, uint32_t tls_base)
{
assert(I->op == BI_OPCODE_PHI);
assert(I->nr_dests == 1);
/* PHIs get lowered in bi_out_of_ssa, which expects memory operands to
* provide the actual TLS offset as bi_index::value.
*/
if (I->dest[0].memory) {
const bi_index dst = I->dest[0];
I->dest[0].value = tls_base + ctx->tls_loc[dst.value];
}
bi_foreach_src(I, s) {
const bi_index src = I->src[s];
if (!src.memory)
continue;
assert(ctx->tls_loc[src.value] != UINT32_MAX && "Undefined source");
I->src[s].value = tls_base + ctx->tls_loc[src.value];
}
}
static void
assign_tls_locations(struct lower_spill_ctx *ctx) {
bi_foreach_instr_global(ctx->shader, I) {
bi_foreach_ssa_dest(I, d) {
bi_index dst = I->dest[d];
if (!dst.memory)
continue;
assert(I->op == BI_OPCODE_MEMMOV || I->op == BI_OPCODE_PHI);
assert(ctx->tls_loc[dst.value] == UINT32_MAX && "Broken SSA");
ctx->tls_loc[dst.value] = ctx->spill_count;
ctx->spill_count += 4;
}
}
}
unsigned
bi_lower_spill(bi_context* ctx, uint32_t tls_base) {
void* memctx = ralloc_context(NULL);
uint32_t *tls_loc = ralloc_array(memctx, uint32_t, ctx->ssa_alloc);
memset(tls_loc, 0xff, sizeof(uint32_t) * ctx->ssa_alloc);
struct lower_spill_ctx lctx = {
.shader = ctx,
.tls_loc = tls_loc,
.spill_count = 0,
};
assign_tls_locations(&lctx);
bi_foreach_instr_global_safe(ctx, I) {
switch (I->op) {
case BI_OPCODE_MEMMOV:
lower_memmov(&lctx, I, tls_base);
break;
case BI_OPCODE_PHI:
lower_mem_phi(&lctx, I, tls_base);
break;
default:
break;
}
}
ralloc_free(memctx);
return lctx.spill_count;
}

View file

@ -1214,7 +1214,9 @@ bi_register_allocate(bi_context *ctx)
if (ctx->inputs->is_blend)
UNREACHABLE("Blend shaders may not spill");
spill_count = bi_spill_ssa(ctx, regs_to_use, spill_count);
bi_spill_ssa(ctx, regs_to_use);
spill_count += bi_lower_spill(ctx, spill_count);
/* By default, we use packed TLS addressing on Valhall.
* We cannot cross 16 byte boundaries with packed TLS
* addressing. Align to ensure this doesn't happen. This

View file

@ -277,8 +277,8 @@ bi_along_edge(bi_block *pred, bi_block *succ)
return bi_before_block(succ);
}
static bool bi_idx_is_memory(bi_index idx) {
// return (idx.type == BI_INDEX_FAU);
static inline bool
bi_idx_is_memory(bi_index idx) {
return idx.memory;
}
@ -286,32 +286,18 @@ static bi_index
bi_index_as_mem(bi_index idx, struct spill_ctx *ctx)
{
assert(idx.type == BI_INDEX_NORMAL);
idx.type = BI_INDEX_FAU;
unsigned val = idx.value;
assert(!idx.memory);
assert(val < ctx->spill_max);
if (ctx->spill_map[val] == 0xFFFFFFFFU) {
uint32_t remap = ctx->spill_bytes;
ctx->spill_bytes += 4;
ctx->spill_map[val] = remap;
unsigned i = (remap - ctx->spill_base)/4;
assert(i < ctx->spill_max);
ctx->mem_map[i] = val;
}
idx.value = ctx->spill_map[val];
idx.memory = true;
idx.value = ctx->spill_base + idx.value;
return idx;
}
static unsigned
chase_mem_index(bi_index ref, struct spill_ctx *ctx)
{
unsigned val = ref.value;
if (bi_idx_is_memory(ref)) {
unsigned i = (val - ctx->spill_base)/4;
return ctx->mem_map[i];
}
return val;
return bi_idx_is_memory(ref) ? (ref.value - ctx->spill_base) : ref.value;
}
static bi_index
@ -416,9 +402,8 @@ insert_spill(bi_builder *b, struct spill_ctx *ctx, unsigned node)
if (!ctx->remat[node] && !BITSET_TEST(ctx->spill_map_store, node)) {
bi_index idx = reconstruct_index(ctx, node);
bi_index mem = bi_index_as_mem(idx, ctx);
unsigned bits = 32;
bi_store_tl(b, bits, idx, mem.value);
bi_memmov_to(b, mem, idx);
b->shader->spills++;
/* We only need the extra registers reserved if we actually spilled
@ -442,9 +427,7 @@ insert_reload(struct spill_ctx *ctx, bi_block *block, bi_cursor cursor,
if (ctx->remat[node]) {
remat_to(&b, idx, ctx, node);
} else {
bi_index mem = bi_index_as_mem(idx, ctx);
unsigned bits = 32;
bi_load_tl(&b, bits, idx, mem.value);
bi_memmov_to(&b, idx, bi_index_as_mem(idx, ctx));
b.shader->fills++;
}
}
@ -551,7 +534,6 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ)
I->src[s].type == BI_INDEX_REGISTER);
bi_index gpr = bi_temp(ctx->shader);
unsigned bits = 32;
assert(gpr.type == BI_INDEX_NORMAL);
if (ctx->arch >= 9 && I->src[s].type == BI_INDEX_CONSTANT) {
@ -560,8 +542,9 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ)
bi_iadd_imm_i32_to(&b, gpr, zero, I->src[s].value);
} else
bi_mov_i32_to(&b, gpr, I->src[s]);
bi_index mem = bi_index_as_mem(gpr, ctx);
bi_store_tl(&b, bits, gpr, mem.value);
bi_memmov_to(&b, mem, gpr);
I->src[s] = mem;
continue;
}
@ -586,10 +569,9 @@ insert_coupling_code(struct spill_ctx *ctx, bi_block *pred, bi_block *succ)
unsigned node = I->src[s].value;
bi_index idx = reconstruct_index(ctx, node);
bi_index tmp = bi_temp(ctx->shader);
unsigned bits = 32;
remat_to(&b, tmp, ctx, node);
bi_store_tl(&b, bits, tmp, bi_index_as_mem(idx, ctx).value);
bi_memmov_to(&b, bi_index_as_mem(idx, ctx), tmp);
}
/* Use the spilled version */
@ -1459,11 +1441,10 @@ record_ssa_defs(bi_context *ctx, bi_instr **defs, bi_block **blocks)
* returns number of registers spilled
*/
unsigned
bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
void
bi_spill_ssa(bi_context *ctx, unsigned k)
{
void *memctx = ralloc_context(NULL);
unsigned spill_count = spill_base;
unsigned max_temps = MIN_TEMPS_FOR_SPILL;
/* calculate how many temporaries we may need */
@ -1498,8 +1479,11 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
global_next_use_distances(ctx, memctx, blocks);
validate_next_use_info(ctx, blocks);
/* Reserve a memory variable for every regular variable */
const uint32_t n = ctx->ssa_alloc;
ctx->ssa_alloc *= 2;
/* we may need to allocate some temporaries for spilling PHIs, hence the max_temps */
unsigned n = ctx->ssa_alloc + max_temps;
BITSET_WORD *W = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
BITSET_WORD *S = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
uint32_t *spill_map = ralloc_array(memctx, uint32_t, n);
@ -1524,7 +1508,7 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
struct spill_ctx sctx = {
.memctx = memctx,
.shader = ctx,
.n_alloc = ctx->ssa_alloc,
.n_alloc = n,
.remat = remat,
.next_uses = next_uses,
.block = block,
@ -1533,10 +1517,10 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
.W = W,
.S = S,
.size = sizes,
.spill_max = n,
.spill_base = spill_base,
.spill_max = 2*n,
.spill_base = n,
.spill_map = spill_map,
.spill_bytes = spill_count,
.spill_bytes = 0,
.spill_map_store = spill_map_store,
.mem_map = mem_map,
.ssa_defs = ssa_defs,
@ -1548,7 +1532,6 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
compute_w_entry(&sctx);
compute_s_entry(&sctx);
min_algorithm(&sctx);
spill_count = MAX2(spill_count, sctx.spill_bytes);
}
/* Now that all blocks are processed separately, stitch it together */
@ -1556,7 +1539,7 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
struct spill_ctx sctx = {
.memctx = memctx,
.shader = ctx,
.n_alloc = ctx->ssa_alloc,
.n_alloc = n,
.remat = remat,
.block = block,
.blocks = blocks,
@ -1564,10 +1547,10 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
.W = W,
.S = S,
.size = sizes,
.spill_max = n,
.spill_base = spill_base,
.spill_max = 2*n,
.spill_base = n,
.spill_map = spill_map,
.spill_bytes = spill_count,
.spill_bytes = 0,
.spill_map_store = spill_map_store,
.mem_map = mem_map,
.ssa_defs = ssa_defs,
@ -1580,12 +1563,9 @@ bi_spill_ssa(bi_context *ctx, unsigned k, unsigned spill_base)
/* After spilling phi sources, insert coupling code */
insert_coupling_code(&sctx, *pred, block);
}
spill_count = MAX2(spill_count, sctx.spill_bytes);
}
ralloc_free(memctx);
bi_repair_ssa(ctx);
return spill_count;
}

View file

@ -1491,14 +1491,17 @@ uint64_t MUST_CHECK bi_postra_liveness_ins(uint64_t live, bi_instr *ins);
/* Record sizes of SSA values into the provided array. */
void bi_record_sizes(bi_context *ctx, uint32_t *sizes);
/* SSA spilling; returns number of spilled registers */
unsigned bi_spill_ssa(bi_context *ctx, unsigned num_registers, unsigned tls_size);
/* SSA spilling */
void bi_spill_ssa(bi_context *ctx, unsigned num_registers);
void bi_repair_ssa(bi_context *ctx);
/* Reindex SSA to reduce memory usage */
void bi_reindex_ssa(bi_context *ctx);
/* Lower memory operands created during spilling. */
unsigned bi_lower_spill(bi_context* ctx, uint32_t tls_base);
/* Layout */
signed bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target);

View file

@ -11,6 +11,7 @@ libpanfrost_bifrost_files = files(
'bi_layout.c',
'bi_liveness.c',
'bi_lower_divergent_indirects.c',
'bi_lower_spill.c',
'bi_lower_swizzle.c',
'bi_print.c',
'bi_opt_control_flow.c',