panfrost: align spills to reduce TLS memory usage

When spilling registers on Valhall we are careful to keep the TLS pointer aligned to 16-byte boundaries (so that accesses never cross those boundaries). However, within the spill code we don't need 16-byte alignment for spills of 32- or 64-bit values. In the common case where most spills are 32 bits, we can save nearly 75% of the memory used by aligning only to 32-bit boundaries.

Reviewed-by: Aksel Hjerpbakk <aksel.hjerpbakk@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36676>
2026-01-04 00:30:11 +01:00 · 2025-08-07 14:57:06 +00:00 · 2025-08-07 14:57:06 +00:00 · b03cd7bdce
commit b03cd7bdce
parent acd7cae0fa
2 changed files with 50 additions and 12 deletions
--- a/src/panfrost/compiler/bi_ra.c
+++ b/src/panfrost/compiler/bi_ra.c
@ -693,6 +693,7 @@ bi_instr *
 bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
 {
   if (b->shader->arch >= 9) {
+      assert(offset < 0x8000);  /* valhall has 16 bit signed offset */
      return bi_load_to(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true),
                        BI_SEG_TL, offset);
   } else {
@ -705,6 +706,7 @@ void
 bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
 {
   if (b->shader->arch >= 9) {
+      assert(offset < 0x8000);  /* valhall has 16 bit signed offset */
      bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL,
               offset);
   } else {
@ -712,14 +714,50 @@ bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
   }
 }

-/* Once we've chosen a spill node, spill it and returns bytes spilled */
+static void
+bi_compute_reg_alignment(bi_context *ctx)
+{ /* Fill ctx->reg_alignment[] with a byte alignment for every SSA index, derived from the widest def/use seen. */
+   unsigned idx;
+   unsigned count;
+   ctx->reg_alignment = rzalloc_array(ctx, uint8_t, ctx->ssa_alloc); /* one byte per SSA index; the MAX2 below relies on entries starting at 0 */
+   bi_foreach_instr_global(ctx, I) {
+      bi_foreach_ssa_dest(I, d) {
+         idx = I->dest[d].value;
+         count = bi_count_write_registers(I, d); /* registers written by destination d */
+         if (count == 3) count = 4; /* round up so count*4 stays a power of two; the value is later fed to ALIGN_POT */
+         assert(idx < ctx->ssa_alloc);
+         ctx->reg_alignment[idx] = MAX2(count*4, ctx->reg_alignment[idx]); /* keep the largest requirement so far, 4 bytes per register */
+      }
+      bi_foreach_ssa_src(I, s) {
+         idx = I->src[s].value;
+         count = bi_count_read_index(I, I->src[s]); /* registers read through this source */
+         if (count == 3) count = 4; /* NOTE(review): only 3 is rounded; counts such as 5 or 6 would give a non-power-of-two alignment - confirm they cannot occur */
+         assert(idx < ctx->ssa_alloc);
+         ctx->reg_alignment[idx] = MAX2(count*4, ctx->reg_alignment[idx]); /* NOTE(review): an index never seen as a def or use keeps alignment 0 - confirm spill nodes always appear */
+      }
+   }
+}
+
+/* Once we've chosen a spill node, spill it at the suitably aligned offset and return the first offset past the spilled data */

 static unsigned
 bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
 {
   bi_builder b = {.shader = ctx};
+   unsigned alignment = 4;
   unsigned channels = 0;

+   /* first figure out the alignment we will need, based on the
+    * maximum count we see
+    */
+   if (ctx->arch >= 9) {
+      if (!ctx->reg_alignment)
+         bi_compute_reg_alignment(ctx);
+      assert(index.value < ctx->ssa_alloc);
+      alignment = ctx->reg_alignment[index.value];
+   }
+   offset = ALIGN_POT(offset, alignment);
+
   /* Spill after every store, fill before every load */
   bi_foreach_instr_global_safe(ctx, I) {
      bi_foreach_dest(I, d) {
@ -737,7 +775,6 @@ bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)

         b.cursor = bi_after_instr(I);
         bi_store_tl(&b, bits, tmp, offset + 4 * extra);
-
         ctx->spills++;
         channels = MAX2(channels, extra + count);
      }
@ -755,7 +792,7 @@ bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
      }
   }

-   return (channels * 4);
+   return offset + (channels * 4);
 }

 /*
@ -868,6 +905,10 @@ bi_is_tied(const bi_instr *I)
 static void
 bi_coalesce_tied(bi_context *ctx)
 {
+   if (ctx->reg_alignment) {
+      ralloc_free(ctx->reg_alignment);
+      ctx->reg_alignment = NULL;
+   }
   bi_foreach_instr_global(ctx, I) {
      if (!bi_is_tied(I))
         continue;
@ -1158,15 +1199,7 @@ bi_register_allocate(bi_context *ctx)
         if (ctx->inputs->is_blend)
            UNREACHABLE("Blend shaders may not spill");

-         /* By default, we use packed TLS addressing on Valhall.
-          * We cannot cross 16 byte boundaries with packed TLS
-          * addressing. Align to ensure this doesn't happen. This
-          * could be optimized a bit.
-          */
-         if (ctx->arch >= 9)
-            spill_count = ALIGN_POT(spill_count, 16);
-
-         spill_count +=
+         spill_count =
            bi_spill_register(ctx, bi_get_index(spill_node), spill_count);

         /* In case the spill affected an instruction with tied
@ -1179,6 +1212,8 @@ bi_register_allocate(bi_context *ctx)
   assert(success);
   assert(l != NULL);

+   if (ctx->arch >= 9)
+      spill_count = ALIGN_POT(spill_count, 16);
   ctx->info.tls_size = spill_count;
   bi_install_registers(ctx, l);

--- a/src/panfrost/compiler/compiler.h
+++ b/src/panfrost/compiler/compiler.h
@ -1070,6 +1070,9 @@ typedef struct {
   unsigned loop_count;
   unsigned spills;
   unsigned fills;
+
+   /* alignment needed for registers during register allocation */
+   uint8_t *reg_alignment;
 } bi_context;

 static inline enum bi_round