agx: model sources as late-kill in demand calcs

This hurts us in two ways:
* slightly more spilling (not actually a big problem)
* slightly worse occupancy (the shaders "helped" in the stats below only
  improve because we try less hard to fit them at higher occupancy levels)

However, in exchange we get a LOT more flexibility in the RA.
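
For intuition, here is a self-contained toy model of the two accounting schemes
(nothing here is Mesa code; toy_instr, peak_demand, and the three-instruction
block are made up for illustration). With early-kill, a destination can reuse
the registers of a source that dies at the same instruction; with late-kill,
dying sources are only freed one instruction later, so sources and destinations
briefly overlap and peak demand can only go up:

   #include <stdbool.h>
   #include <stdio.h>

   #define MAX2(a, b) ((a) > (b) ? (a) : (b))

   struct toy_instr {
      unsigned dest_width;   /* registers written */
      unsigned src_width[2]; /* registers read (0 = source unused) */
      bool src_kill[2];      /* is this the last use of that source? */
   };

   /* Peak register demand for a straight-line block. With late_kill, a
    * killed source is freed one instruction after its last use, so it
    * still counts against demand at the instruction that kills it. */
   static unsigned
   peak_demand(const struct toy_instr *instrs, unsigned count, bool late_kill)
   {
      unsigned demand = 0, max_demand = 0, late_kill_count = 0;

      for (unsigned i = 0; i < count; ++i) {
         const struct toy_instr *I = &instrs[i];

         /* Free the sources that died at the previous instruction */
         demand -= late_kill_count;
         late_kill_count = 0;

         for (unsigned s = 0; s < 2; ++s) {
            if (!I->src_kill[s])
               continue;

            if (late_kill)
               late_kill_count += I->src_width[s]; /* defer the free */
            else
               demand -= I->src_width[s]; /* free immediately */
         }

         demand += I->dest_width;
         max_demand = MAX2(demand, max_demand);
      }

      return max_demand;
   }

   int
   main(void)
   {
      /* v0 = load (2 regs); v1 = load (2 regs); v2 = v0 + v1 (2 regs) */
      const struct toy_instr block[] = {
         {.dest_width = 2},
         {.dest_width = 2},
         {.dest_width = 2, .src_width = {2, 2}, .src_kill = {true, true}},
      };

      printf("early-kill peak: %u\n", peak_demand(block, 3, false)); /* 4 */
      printf("late-kill peak:  %u\n", peak_demand(block, 3, true));  /* 6 */
      return 0;
   }

That brief overlap is where the small gpr/thread/spill regressions in the stats
below come from; in exchange, RA never has to place a destination on top of a
dying source.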

total instrs in shared programs: 2847015 -> 2846065 (-0.03%)
instrs in affected programs: 84134 -> 83184 (-1.13%)

total alu in shared programs: 2330406 -> 2329477 (-0.04%)
alu in affected programs: 62305 -> 61376 (-1.49%)

total code size in shared programs: 20497326 -> 20491690 (-0.03%)
code size in affected programs: 586664 -> 581028 (-0.96%)

total gprs in shared programs: 894202 -> 894762 (0.06%)
gprs in affected programs: 8900 -> 9460 (6.29%)

total scratch in shared programs: 13292 -> 13304 (0.09%)
scratch in affected programs: 2924 -> 2936 (0.41%)

total threads in shared programs: 27819712 -> 27814272 (-0.02%)
threads in affected programs: 55296 -> 49856 (-9.84%)

total spills in shared programs: 907 -> 914 (0.77%)
spills in affected programs: 419 -> 426 (1.67%)

total fills in shared programs: 857 -> 862 (0.58%)
fills in affected programs: 389 -> 394 (1.29%)

Backport-to: 25.1
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34595>

@@ -237,9 +237,8 @@ agx_calc_register_demand(agx_context *ctx)
       max_demand = MAX2(demand, max_demand);
-      /* To handle non-power-of-two vectors, sometimes live range splitting
-       * needs extra registers for 1 instruction. This counter tracks the number
-       * of registers to be freed after 1 extra instruction.
+      /* To handle late-kill sources, this counter tracks the number of
+       * registers to be freed after 1 extra instruction.
        */
       unsigned late_kill_count = 0;
@@ -271,7 +270,9 @@ agx_calc_register_demand(agx_context *ctx)
       demand -= late_kill_count;
       late_kill_count = 0;
-      /* Kill sources the first time we see them */
+      /* Late-kill sources the first time we see them. This simplifies RA. We
+       * could optimize to early-kill in some situations if we wanted.
+       */
       agx_foreach_src(I, s) {
          if (!I->src[s].kill)
             continue;
@@ -289,7 +290,7 @@ agx_calc_register_demand(agx_context *ctx)
          }
          if (!skip)
-            demand -= widths[I->src[s].value];
+            late_kill_count += widths[I->src[s].value];
       }
       /* Make destinations live */
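
The spiller hunks that follow apply the same model: killed sources stay in the
in-register set W through the limit() calls and are only pruned afterwards, so
making room for destinations may evict a bit more than strictly necessary. A
rough sketch of that effect (made-up counts and helper, not Mesa's data
structures):

   #include <stdio.h>

   /* How many values limit() would have to evict to fit `needed` more
    * slots, with `in_regs` values already in registers and k slots total. */
   static unsigned
   evictions(unsigned in_regs, unsigned needed, unsigned k)
   {
      return in_regs + needed > k ? in_regs + needed - k : 0;
   }

   int
   main(void)
   {
      const unsigned k = 4;           /* register budget */
      const unsigned w = 3;           /* values currently in registers */
      const unsigned killed_srcs = 2; /* sources of I that die at I */
      const unsigned dests = 2;       /* destinations of I */

      /* Early-kill: dying sources leave W before we make room for dests. */
      printf("evictions, early-kill: %u\n",
             evictions(w - killed_srcs, dests, k)); /* 0 */

      /* Late-kill (this change): dying sources stay in W through limit(),
       * so fitting the destinations may evict extra values. */
      printf("evictions, late-kill:  %u\n",
             evictions(w, dests, k)); /* 1 */
      return 0;
   }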

@@ -642,18 +642,18 @@ calculate_local_next_use(struct spill_ctx *ctx, struct util_dynarray *out)
       ip -= instr_cycles(I);
       if (I->op != AGX_OPCODE_PHI) {
-         agx_foreach_ssa_dest_rev(I, d) {
-            unsigned v = I->dest[d].value;
-            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
-         }
          agx_foreach_ssa_src(I, s) {
             unsigned v = I->src[s].value;
             util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
             set_next_use(&nu, v, ip);
          }
+         agx_foreach_ssa_dest_rev(I, d) {
+            unsigned v = I->dest[d].value;
+            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
+         }
       }
    }
@@ -753,22 +753,6 @@ min_algorithm(struct spill_ctx *ctx)
       /* Limit W to make space for the sources we just added */
       limit(ctx, I, ctx->k);
-      /* Update next-use distances for this instruction. Unlike the paper, we
-       * prune dead values from W as we go. This doesn't affect correctness, but
-       * it speeds up limit() on average.
-       */
-      agx_foreach_ssa_src_rev(I, s) {
-         assert(next_use_cursor >= 1);
-         unsigned next_ip = next_ips[--next_use_cursor];
-         assert((next_ip == DIST_INFINITY) == I->src[s].kill);
-         if (next_ip == DIST_INFINITY)
-            remove_W_if_present(ctx, I->src[s].value);
-         else
-            ctx->next_uses[I->src[s].value] = next_ip;
-      }
       agx_foreach_ssa_dest(I, d) {
          assert(next_use_cursor >= 1);
          unsigned next_ip = next_ips[--next_use_cursor];
@@ -795,6 +779,27 @@ min_algorithm(struct spill_ctx *ctx)
          insert_W(ctx, I->dest[d].value);
       }
+      /* Update next-use distances for this instruction. Unlike the paper, we
+       * prune dead values from W as we go. This doesn't affect correctness, but
+       * it speeds up limit() on average.
+       *
+       * This happens after the above limit() calls to model sources as
+       * late-kill. This is conservative and could be improved, but it matches
+       * how we currently estimate register demand.
+       */
+      agx_foreach_ssa_src_rev(I, s) {
+         assert(next_use_cursor >= 1);
+         unsigned next_ip = next_ips[--next_use_cursor];
+         assert((next_ip == DIST_INFINITY) == I->src[s].kill);
+         if (I->src[s].kill) {
+            remove_W_if_present(ctx, I->src[s].value);
+         } else {
+            ctx->next_uses[I->src[s].value] = next_ip;
+         }
+      }
       /* Add reloads for the sources in front of the instruction. We need to be
        * careful around exports, hoisting the reloads to before all exports.
        *