pan: Add spill cost metric

Our SSA spilling logic should avoid inserting spill code inside loops. Add a metric that reflects this goal. Reviewed-by: Eric R. Smith <eric.smith@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38181>
2026-05-08 04:48:08 +02:00 · 2025-10-30 21:55:46 +00:00 · 2025-10-30 21:55:46 +00:00 · bb7b0b6b1b
commit bb7b0b6b1b
parent 47f4b00cb2
4 changed files with 114 additions and 0 deletions
--- a/src/panfrost/compiler/bi_ra.c
+++ b/src/panfrost/compiler/bi_ra.c
@ -1119,6 +1119,67 @@ bi_out_of_ssa(bi_context *ctx)
   return first_reg;
 }

+/* Whether op is one of the sized LOAD/STORE opcodes (I8 through I128). */
+static bool
+op_is_load_store(enum bi_opcode op)
+{
+   switch (op) {
+   case BI_OPCODE_LOAD_I8:
+   case BI_OPCODE_LOAD_I16:
+   case BI_OPCODE_LOAD_I24:
+   case BI_OPCODE_LOAD_I32:
+   case BI_OPCODE_LOAD_I48:
+   case BI_OPCODE_LOAD_I64:
+   case BI_OPCODE_LOAD_I96:
+   case BI_OPCODE_LOAD_I128:
+   case BI_OPCODE_STORE_I8:
+   case BI_OPCODE_STORE_I16:
+   case BI_OPCODE_STORE_I24:
+   case BI_OPCODE_STORE_I32:
+   case BI_OPCODE_STORE_I48:
+   case BI_OPCODE_STORE_I64:
+   case BI_OPCODE_STORE_I96:
+   case BI_OPCODE_STORE_I128:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/* Sums a cost over every load/store whose segment is BI_SEG_TL: each such
+ * instruction costs 10 * (number of loops containing its block + 1), so
+ * spill/fill code inside loops weighs more than code outside any loop. */
+static uint64_t
+compute_spill_cost(bi_context *ctx)
+{
+   /* depth[i] = number of loop headers whose loop contains block i */
+   uint32_t *depth = calloc(ctx->num_blocks, sizeof(uint32_t));
+
+   bi_foreach_block(ctx, header) {
+      if (header->loop_header) {
+         bool *in_loop = bi_find_loop_blocks(ctx, header);
+
+         for (uint32_t i = 0; i < ctx->num_blocks; ++i)
+            depth[i] += in_loop[i] ? 1 : 0;
+
+         free(in_loop);
+      }
+   }
+
+   uint64_t total = 0;
+   bi_foreach_block(ctx, block) {
+      const uint32_t weight = 10 * (depth[block->index] + 1);
+
+      bi_foreach_instr_in_block(block, I) {
+         if (op_is_load_store(I->op) && I->seg == BI_SEG_TL)
+            total += weight;
+      }
+   }
+
+   free(depth);
+   return total;
+}
+
 void
 bi_register_allocate(bi_context *ctx)
 {
@ -1214,6 +1275,8 @@ bi_register_allocate(bi_context *ctx)
      }
   }

+   ctx->spill_cost = compute_spill_cost(ctx);
+
   assert(success);
   assert(l != NULL);

--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@ -5095,6 +5095,7 @@ bi_gather_stats(bi_context *ctx, unsigned size, struct bifrost_stats *out)
      .loops = ctx->loop_count,
      .spills = ctx->spills,
      .fills = ctx->fills,
+      .spill_cost = ctx->spill_cost,
   };

   out->cycles = MAX2(out->arith, MAX3(out->t, out->v, out->ldst));
@ -5134,6 +5135,7 @@ va_gather_stats(bi_context *ctx, unsigned size, struct valhall_stats *out)
      .loops = ctx->loop_count,
      .spills = ctx->spills,
      .fills = ctx->fills,
+      .spill_cost = ctx->spill_cost,
   };
   struct valhall_stats stats = stats_abs;
   stats.fma /= model->rates.fma;
@ -6615,3 +6617,45 @@ bifrost_compile_shader_nir(nir_shader *nir,

   info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
 }
+
+/* Returns a calloc'd array of ctx->num_blocks flags, indexed by block index,
+ * that the caller must free(). A block is flagged as in the loop if header is
+ * both a transitive predecessor and a transitive successor of it. */
+bool *
+bi_find_loop_blocks(const bi_context *ctx, bi_block *header)
+{
+   bool *h_as_suc = calloc(ctx->num_blocks, sizeof(bool));
+   bool *h_as_pred = calloc(ctx->num_blocks, sizeof(bool));
+   h_as_suc[header->index] = true;
+   h_as_pred[header->index] = true;
+
+   /* If the CFG was one long chain, we would require |blocks|-1 iters to
+    * propagate the in_loop info all the way through.
+    */
+   for (uint32_t iter = 0; iter < ctx->num_blocks - 1; ++iter) {
+      bi_foreach_block(ctx, block) {
+         bi_foreach_successor(block, succ) {
+            if (h_as_suc[succ->index]) {
+               h_as_suc[block->index] = true;
+               break;
+            }
+         }
+
+         bi_foreach_predecessor(block, pred) {
+            if (h_as_pred[(*pred)->index]) {
+               h_as_pred[block->index] = true;
+               break;
+            }
+         }
+      }
+   }
+
+   /* Intersect both sets over every block, including the last one. */
+   for (uint32_t bidx = 0; bidx < ctx->num_blocks; ++bidx) {
+      h_as_suc[bidx] &= h_as_pred[bidx];
+   }
+
+   free(h_as_pred);
+
+   return h_as_suc;
+}
--- a/src/panfrost/compiler/compiler.h
+++ b/src/panfrost/compiler/compiler.h
@ -1102,6 +1102,9 @@ typedef struct {

   /* alignment needed for registers during register allocation */
   uint8_t *reg_alignment;
+
+   /* Computed after RA */
+   uint64_t spill_cost;
 } bi_context;

 static inline enum bi_round
@ -1735,6 +1738,8 @@ bi_record_use(bi_instr **uses, BITSET_WORD *multiple, bi_instr *I, unsigned s)

 bool bi_lower_divergent_indirects(nir_shader *shader, unsigned lanes);

+bool *bi_find_loop_blocks(const bi_context *ctx, bi_block *header);
+
 #ifdef __cplusplus
 } /* extern C */
 #endif
--- a/src/util/shader_stats.xml
+++ b/src/util/shader_stats.xml
@ -65,6 +65,7 @@
        <stat name="Loops">Number of hardware loops</stat>
        <stat name="Spills">Number of spill instructions</stat>
        <stat name="Fills">Number of fill instructions</stat>
+        <stat name="Spill cost">Cost of spill and fill instructions</stat>
     </isa>

     <isa name="Valhall">
@ -81,6 +82,7 @@
        <stat name="Loops">Number of hardware loops</stat>
        <stat name="Spills">Number of spill instructions</stat>
        <stat name="Fills">Number of fill instructions</stat>
+        <stat name="Spill cost">Cost of spill and fill instructions</stat>
     </isa>
   </family>