agx: model sources as late-kill in demand calcs

This hurts us in two ways:
* slightly more spilling (not actually a big problem)
* slightly worse occupancy (the shaders "helped" in the stats below only
  improve because we try less hard to fit them at higher occupancy levels)

However, in exchange we get a LOT more flexibility in the RA.
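
For intuition, here is a self-contained toy model of the two accounting schemes
(nothing here is Mesa code; toy_instr, peak_demand, and the three-instruction
block are made up for illustration). With early-kill, a destination can reuse
the registers of a source that dies at the same instruction; with late-kill,
dying sources are only freed one instruction later, so sources and destinations
briefly overlap and peak demand can only go up:

   #include <stdbool.h>
   #include <stdio.h>

   #define MAX2(a, b) ((a) > (b) ? (a) : (b))

   struct toy_instr {
      unsigned dest_width;   /* registers written */
      unsigned src_width[2]; /* registers read (0 = source unused) */
      bool src_kill[2];      /* is this the last use of that source? */
   };

   /* Peak register demand for a straight-line block. With late_kill, a
    * killed source is freed one instruction after its last use, so it
    * still counts against demand at the instruction that kills it. */
   static unsigned
   peak_demand(const struct toy_instr *instrs, unsigned count, bool late_kill)
   {
      unsigned demand = 0, max_demand = 0, late_kill_count = 0;

      for (unsigned i = 0; i < count; ++i) {
         const struct toy_instr *I = &instrs[i];

         /* Free the sources that died at the previous instruction */
         demand -= late_kill_count;
         late_kill_count = 0;

         for (unsigned s = 0; s < 2; ++s) {
            if (!I->src_kill[s])
               continue;

            if (late_kill)
               late_kill_count += I->src_width[s]; /* defer the free */
            else
               demand -= I->src_width[s]; /* free immediately */
         }

         demand += I->dest_width;
         max_demand = MAX2(demand, max_demand);
      }

      return max_demand;
   }

   int
   main(void)
   {
      /* v0 = load (2 regs); v1 = load (2 regs); v2 = v0 + v1 (2 regs) */
      const struct toy_instr block[] = {
         {.dest_width = 2},
         {.dest_width = 2},
         {.dest_width = 2, .src_width = {2, 2}, .src_kill = {true, true}},
      };

      printf("early-kill peak: %u\n", peak_demand(block, 3, false)); /* 4 */
      printf("late-kill peak:  %u\n", peak_demand(block, 3, true));  /* 6 */
      return 0;
   }

That brief overlap is where the small gpr/thread/spill regressions in the stats
below come from; in exchange, RA never has to place a destination on top of a
dying source.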

total instrs in shared programs: 2847015 -> 2846065 (-0.03%)
instrs in affected programs: 84134 -> 83184 (-1.13%)

total alu in shared programs: 2330406 -> 2329477 (-0.04%)
alu in affected programs: 62305 -> 61376 (-1.49%)

total code size in shared programs: 20497326 -> 20491690 (-0.03%)
code size in affected programs: 586664 -> 581028 (-0.96%)

total gprs in shared programs: 894202 -> 894762 (0.06%)
gprs in affected programs: 8900 -> 9460 (6.29%)

total scratch in shared programs: 13292 -> 13304 (0.09%)
scratch in affected programs: 2924 -> 2936 (0.41%)

total threads in shared programs: 27819712 -> 27814272 (-0.02%)
threads in affected programs: 55296 -> 49856 (-9.84%)

total spills in shared programs: 907 -> 914 (0.77%)
spills in affected programs: 419 -> 426 (1.67%)

total fills in shared programs: 857 -> 862 (0.58%)
fills in affected programs: 389 -> 394 (1.29%)

Backport-to: 25.1
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34595>

@@ -237,9 +237,8 @@ agx_calc_register_demand(agx_context *ctx)
       max_demand = MAX2(demand, max_demand);
-      /* To handle non-power-of-two vectors, sometimes live range splitting
-       * needs extra registers for 1 instruction. This counter tracks the number
-       * of registers to be freed after 1 extra instruction.
+      /* To handle late-kill sources, this counter tracks the number of
+       * registers to be freed after 1 extra instruction.
        */
       unsigned late_kill_count = 0;
@@ -271,7 +270,9 @@ agx_calc_register_demand(agx_context *ctx)
       demand -= late_kill_count;
       late_kill_count = 0;
-      /* Kill sources the first time we see them */
+      /* Late-kill sources the first time we see them. This simplifies RA. We
+       * could optimize to early-kill in some situations if we wanted.
+       */
       agx_foreach_src(I, s) {
          if (!I->src[s].kill)
             continue;
@@ -289,7 +290,7 @@ agx_calc_register_demand(agx_context *ctx)
          }
          if (!skip)
-            demand -= widths[I->src[s].value];
+            late_kill_count += widths[I->src[s].value];
       }
       /* Make destinations live */
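
The spiller hunks that follow apply the same model: killed sources stay in the
in-register set W through the limit() calls and are only pruned afterwards, so
making room for destinations may evict a bit more than strictly necessary. A
rough sketch of that effect (made-up counts and helper, not Mesa's data
structures):

   #include <stdio.h>

   /* How many values limit() would have to evict to fit `needed` more
    * slots, with `in_regs` values already in registers and k slots total. */
   static unsigned
   evictions(unsigned in_regs, unsigned needed, unsigned k)
   {
      return in_regs + needed > k ? in_regs + needed - k : 0;
   }

   int
   main(void)
   {
      const unsigned k = 4;           /* register budget */
      const unsigned w = 3;           /* values currently in registers */
      const unsigned killed_srcs = 2; /* sources of I that die at I */
      const unsigned dests = 2;       /* destinations of I */

      /* Early-kill: dying sources leave W before we make room for dests. */
      printf("evictions, early-kill: %u\n",
             evictions(w - killed_srcs, dests, k)); /* 0 */

      /* Late-kill (this change): dying sources stay in W through limit(),
       * so fitting the destinations may evict extra values. */
      printf("evictions, late-kill:  %u\n",
             evictions(w, dests, k)); /* 1 */
      return 0;
   }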

@@ -642,18 +642,18 @@ calculate_local_next_use(struct spill_ctx *ctx, struct util_dynarray *out)
       ip -= instr_cycles(I);
       if (I->op != AGX_OPCODE_PHI) {
-         agx_foreach_ssa_dest_rev(I, d) {
-            unsigned v = I->dest[d].value;
-            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
-         }
          agx_foreach_ssa_src(I, s) {
             unsigned v = I->src[s].value;
             util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
             set_next_use(&nu, v, ip);
          }
+         agx_foreach_ssa_dest_rev(I, d) {
+            unsigned v = I->dest[d].value;
+            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
+         }
       }
    }
@@ -753,22 +753,6 @@ min_algorithm(struct spill_ctx *ctx)
       /* Limit W to make space for the sources we just added */
       limit(ctx, I, ctx->k);
-      /* Update next-use distances for this instruction. Unlike the paper, we
-       * prune dead values from W as we go. This doesn't affect correctness, but
-       * it speeds up limit() on average.
-       */
-      agx_foreach_ssa_src_rev(I, s) {
-         assert(next_use_cursor >= 1);
-         unsigned next_ip = next_ips[--next_use_cursor];
-         assert((next_ip == DIST_INFINITY) == I->src[s].kill);
-         if (next_ip == DIST_INFINITY)
-            remove_W_if_present(ctx, I->src[s].value);
-         else
-            ctx->next_uses[I->src[s].value] = next_ip;
-      }
       agx_foreach_ssa_dest(I, d) {
          assert(next_use_cursor >= 1);
          unsigned next_ip = next_ips[--next_use_cursor];
@@ -795,6 +779,27 @@ min_algorithm(struct spill_ctx *ctx)
          insert_W(ctx, I->dest[d].value);
       }
+      /* Update next-use distances for this instruction. Unlike the paper, we
+       * prune dead values from W as we go. This doesn't affect correctness, but
+       * it speeds up limit() on average.
+       *
+       * This happens after the above limit() calls to model sources as
+       * late-kill. This is conservative and could be improved, but it matches
+       * how we currently estimate register demand.
+       */
+      agx_foreach_ssa_src_rev(I, s) {
+         assert(next_use_cursor >= 1);
+         unsigned next_ip = next_ips[--next_use_cursor];
+         assert((next_ip == DIST_INFINITY) == I->src[s].kill);
+         if (I->src[s].kill) {
+            remove_W_if_present(ctx, I->src[s].value);
+         } else {
+            ctx->next_uses[I->src[s].value] = next_ip;
+         }
+      }
       /* Add reloads for the sources in front of the instruction. We need to be
        * careful around exports, hoisting the reloads to before all exports.
        *