panfrost: align spills to reduce TLS memory usage

When spilling registers on Valhall we are careful to leave the TLS
pointer aligned on 16 byte boundaries (so as to avoid accesses
crossing those boundaries). However, within the spill code we don't
need to have 16 byte alignment for spills of 32 or 64 bit values.
In the common case where most spills are 32 bits, we can save nearly
75% of the memory used by just aligning to 32 bit boundaries.

Reviewed-by: Aksel Hjerpbakk <aksel.hjerpbakk@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36676>
This commit is contained in:
Eric R. Smith 2025-08-07 14:57:06 +00:00 committed by Marge Bot
parent acd7cae0fa
commit b03cd7bdce
2 changed files with 50 additions and 12 deletions

View file

@ -693,6 +693,7 @@ bi_instr *
bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
if (b->shader->arch >= 9) {
assert(offset < 0x8000); /* valhall has 16 bit signed offset */
return bi_load_to(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true),
BI_SEG_TL, offset);
} else {
@ -705,6 +706,7 @@ void
bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
if (b->shader->arch >= 9) {
assert(offset < 0x8000); /* valhall has 16 bit signed offset */
bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL,
offset);
} else {
@ -712,14 +714,50 @@ bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
}
}
/* Once we've chosen a spill node, spill it and returns bytes spilled */
static void
bi_compute_reg_alignment(bi_context *ctx)
{
unsigned idx;
unsigned count;
ctx->reg_alignment = rzalloc_array(ctx, uint8_t, ctx->ssa_alloc);
bi_foreach_instr_global(ctx, I) {
bi_foreach_ssa_dest(I, d) {
idx = I->dest[d].value;
count = bi_count_write_registers(I, d);
if (count == 3) count = 4;
assert(idx < ctx->ssa_alloc);
ctx->reg_alignment[idx] = MAX2(count*4, ctx->reg_alignment[idx]);
}
bi_foreach_ssa_src(I, s) {
idx = I->src[s].value;
count = bi_count_read_index(I, I->src[s]);
if (count == 3) count = 4;
assert(idx < ctx->ssa_alloc);
ctx->reg_alignment[idx] = MAX2(count*4, ctx->reg_alignment[idx]);
}
}
}
/* Once we've chosen a spill node, spill it and return new (aligned) offset */
static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
bi_builder b = {.shader = ctx};
unsigned alignment = 4;
unsigned channels = 0;
/* first figure out the alignment we will need, based on the
* maximum count we see
*/
if (ctx->arch >= 9) {
if (!ctx->reg_alignment)
bi_compute_reg_alignment(ctx);
assert(index.value < ctx->ssa_alloc);
alignment = ctx->reg_alignment[index.value];
}
offset = ALIGN_POT(offset, alignment);
/* Spill after every store, fill before every load */
bi_foreach_instr_global_safe(ctx, I) {
bi_foreach_dest(I, d) {
@ -737,7 +775,6 @@ bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
b.cursor = bi_after_instr(I);
bi_store_tl(&b, bits, tmp, offset + 4 * extra);
ctx->spills++;
channels = MAX2(channels, extra + count);
}
@ -755,7 +792,7 @@ bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
}
}
return (channels * 4);
return offset + (channels * 4);
}
/*
@ -868,6 +905,10 @@ bi_is_tied(const bi_instr *I)
static void
bi_coalesce_tied(bi_context *ctx)
{
if (ctx->reg_alignment) {
ralloc_free(ctx->reg_alignment);
ctx->reg_alignment = NULL;
}
bi_foreach_instr_global(ctx, I) {
if (!bi_is_tied(I))
continue;
@ -1158,15 +1199,7 @@ bi_register_allocate(bi_context *ctx)
if (ctx->inputs->is_blend)
UNREACHABLE("Blend shaders may not spill");
/* By default, we use packed TLS addressing on Valhall.
* We cannot cross 16 byte boundaries with packed TLS
* addressing. Align to ensure this doesn't happen. This
* could be optimized a bit.
*/
if (ctx->arch >= 9)
spill_count = ALIGN_POT(spill_count, 16);
spill_count +=
spill_count =
bi_spill_register(ctx, bi_get_index(spill_node), spill_count);
/* In case the spill affected an instruction with tied
@ -1179,6 +1212,8 @@ bi_register_allocate(bi_context *ctx)
assert(success);
assert(l != NULL);
if (ctx->arch >= 9)
spill_count = ALIGN_POT(spill_count, 16);
ctx->info.tls_size = spill_count;
bi_install_registers(ctx, l);

View file

@ -1070,6 +1070,9 @@ typedef struct {
unsigned loop_count;
unsigned spills;
unsigned fills;
/* alignment needed for registers during register allocation */
uint8_t *reg_alignment;
} bi_context;
static inline enum bi_round