Mirror of https://gitlab.freedesktop.org/mesa/mesa.git, synced 2025-12-27 10:30:08 +01:00
agx: model sources as late-kill in demand calcs
This hurts us in two ways:

* slightly more spilling (not actually a big problem)
* slightly worse occupancy (the shaders that are "helped" here are from trying less hard to fit at higher occupancy levels)

However, in exchange we get a LOT more flexibility in the RA.

   total instrs in shared programs: 2847015 -> 2846065 (-0.03%)
   instrs in affected programs: 84134 -> 83184 (-1.13%)
   total alu in shared programs: 2330406 -> 2329477 (-0.04%)
   alu in affected programs: 62305 -> 61376 (-1.49%)
   total code size in shared programs: 20497326 -> 20491690 (-0.03%)
   code size in affected programs: 586664 -> 581028 (-0.96%)
   total gprs in shared programs: 894202 -> 894762 (0.06%)
   gprs in affected programs: 8900 -> 9460 (6.29%)
   total scratch in shared programs: 13292 -> 13304 (0.09%)
   scratch in affected programs: 2924 -> 2936 (0.41%)
   total threads in shared programs: 27819712 -> 27814272 (-0.02%)
   threads in affected programs: 55296 -> 49856 (-9.84%)
   total spills in shared programs: 907 -> 914 (0.77%)
   spills in affected programs: 419 -> 426 (1.67%)
   total fills in shared programs: 857 -> 862 (0.58%)
   fills in affected programs: 389 -> 394 (1.29%)

Backport-to: 25.1
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34595>
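As a worked illustration of the tradeoff described above, here is a minimal standalone C sketch (not Mesa code; the register counts are invented) of why modelling sources as late-kill raises the demand estimate: a destination can no longer reuse the registers of sources that die at the same instruction.

/* Hypothetical standalone sketch, not Mesa code: demand at one instruction
 * that reads two dying 2-register sources and writes a 4-register dest. */
#include <stdio.h>

int main(void)
{
   unsigned live_before = 10;   /* registers live entering the instruction */
   unsigned dying_srcs = 2 + 2; /* widths of sources whose last use is here */
   unsigned dest = 4;           /* width of the destination being written */

   /* Early-kill: dying sources free their registers before the destination
    * is counted, so the destination may reuse them. */
   unsigned early_kill = live_before - dying_srcs + dest;

   /* Late-kill: dying sources stay live across the instruction, so the
    * destination cannot overlap them; their registers free one instruction
    * later. */
   unsigned late_kill = live_before + dest;

   printf("early-kill demand: %u, late-kill demand: %u\n",
          early_kill, late_kill);
   return 0;
}

Compiled on its own this prints a demand of 10 under early-kill but 14 under late-kill, which is the direction of the gpr/spill regressions above, traded for not having to handle destinations that alias dying sources in the RA.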
This commit is contained in:
parent cc7aa31b30
commit 7fad96d194

2 changed files with 33 additions and 27 deletions
@@ -237,9 +237,8 @@ agx_calc_register_demand(agx_context *ctx)
       max_demand = MAX2(demand, max_demand);
 
-      /* To handle non-power-of-two vectors, sometimes live range splitting
-       * needs extra registers for 1 instruction. This counter tracks the number
-       * of registers to be freed after 1 extra instruction.
+      /* To handle late-kill sources, this counter tracks the number of
+       * registers to be freed after 1 extra instruction.
        */
       unsigned late_kill_count = 0;
 
@@ -271,7 +270,9 @@ agx_calc_register_demand(agx_context *ctx)
          demand -= late_kill_count;
          late_kill_count = 0;
 
-         /* Kill sources the first time we see them */
+         /* Late-kill sources the first time we see them. This simplifies RA. We
+          * could optimize to early-kill in some situations if we wanted.
+          */
          agx_foreach_src(I, s) {
             if (!I->src[s].kill)
                continue;
@@ -289,7 +290,7 @@ agx_calc_register_demand(agx_context *ctx)
             }
 
             if (!skip)
-               demand -= widths[I->src[s].value];
+               late_kill_count += widths[I->src[s].value];
          }
 
          /* Make destinations live */
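The three hunks above defer freeing the registers of killed sources by one instruction via late_kill_count, instead of subtracting their widths from demand immediately. A hedged standalone sketch of that accounting pattern, with an invented toy instruction record in place of the real agx_* IR:

/* Hedged standalone sketch of the deferred-release pattern; the toy
 * instruction record and widths are invented, not the real agx_* IR. */
#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

struct toy_instr {
   unsigned dest_width;       /* registers written by this instruction */
   unsigned killed_src_width; /* total width of sources dying here */
};

int main(void)
{
   /* i0 defines a 4-wide value A; i1 kills A and defines a 2-wide value B;
    * i2 kills B and defines nothing. */
   struct toy_instr prog[] = {
      {.dest_width = 4, .killed_src_width = 0},
      {.dest_width = 2, .killed_src_width = 4},
      {.dest_width = 0, .killed_src_width = 2},
   };

   unsigned demand = 0, max_demand = 0, late_kill_count = 0;

   for (unsigned i = 0; i < sizeof(prog) / sizeof(prog[0]); ++i) {
      /* Registers of sources killed by the previous instruction become
       * free only now, one instruction late. */
      demand -= late_kill_count;
      late_kill_count = 0;

      /* Killed sources stay live across this instruction, so defer their
       * release instead of subtracting their widths right away. */
      late_kill_count += prog[i].killed_src_width;

      /* Destinations become live on top of everything still counted. */
      demand += prog[i].dest_width;
      max_demand = MAX2(demand, max_demand);
   }

   printf("peak register demand: %u\n", max_demand);
   return 0;
}

Under early-kill this toy program would peak at 4 registers (B could reuse A's registers); deferring the release pushes the peak to 6, the kind of modest demand increase the stats in the commit message show.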
@@ -642,18 +642,18 @@ calculate_local_next_use(struct spill_ctx *ctx, struct util_dynarray *out)
       ip -= instr_cycles(I);
 
       if (I->op != AGX_OPCODE_PHI) {
-         agx_foreach_ssa_dest_rev(I, d) {
-            unsigned v = I->dest[d].value;
-
-            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
-         }
-
          agx_foreach_ssa_src(I, s) {
             unsigned v = I->src[s].value;
 
             util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
             set_next_use(&nu, v, ip);
          }
+
+         agx_foreach_ssa_dest_rev(I, d) {
+            unsigned v = I->dest[d].value;
+
+            util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
+         }
       }
    }
 
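For context on the hunk above: calculate_local_next_use computes next-use distances with a backwards walk over the block, and the reordering records destinations after sources within each instruction. A rough standalone sketch of the backwards-walk idea, using a toy IR and a placeholder infinity value rather than the real agx data structures or dist_t:

/* Rough standalone sketch of a backwards next-use-distance walk; the toy
 * IR, value numbering, and TOY_INFINITY are placeholders, not the real
 * agx data structures. */
#include <stdio.h>

#define TOY_INFINITY 0xffffu
#define NUM_VALUES   4

int main(void)
{
   /* Each toy instruction defines "dest" and reads "src" (-1 means none). */
   struct { int dest, src; } prog[] = {
      {0, -1}, {1, 0}, {2, 0}, {3, 2},
   };
   unsigned n = sizeof(prog) / sizeof(prog[0]);

   unsigned next_use[NUM_VALUES];
   for (unsigned v = 0; v < NUM_VALUES; ++v)
      next_use[v] = TOY_INFINITY;

   /* Walk instructions in reverse. On reaching an instruction, the table
    * holds each value's next use strictly below it (or infinity if it is
    * dead from here on); that is what gets recorded. Sources then update
    * their own entry, and destinations are recorded afterwards. */
   for (unsigned ip = n; ip-- > 0;) {
      if (prog[ip].src >= 0) {
         printf("ip %u: src v%d next used at %u\n", ip, prog[ip].src,
                next_use[prog[ip].src]);
         next_use[prog[ip].src] = ip;
      }

      printf("ip %u: dest v%d next used at %u\n", ip, prog[ip].dest,
             next_use[prog[ip].dest]);
   }

   return 0;
}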
@@ -753,22 +753,6 @@ min_algorithm(struct spill_ctx *ctx)
       /* Limit W to make space for the sources we just added */
       limit(ctx, I, ctx->k);
 
-      /* Update next-use distances for this instruction. Unlike the paper, we
-       * prune dead values from W as we go. This doesn't affect correctness, but
-       * it speeds up limit() on average.
-       */
-      agx_foreach_ssa_src_rev(I, s) {
-         assert(next_use_cursor >= 1);
-
-         unsigned next_ip = next_ips[--next_use_cursor];
-         assert((next_ip == DIST_INFINITY) == I->src[s].kill);
-
-         if (next_ip == DIST_INFINITY)
-            remove_W_if_present(ctx, I->src[s].value);
-         else
-            ctx->next_uses[I->src[s].value] = next_ip;
-      }
-
       agx_foreach_ssa_dest(I, d) {
          assert(next_use_cursor >= 1);
          unsigned next_ip = next_ips[--next_use_cursor];
@@ -795,6 +779,27 @@ min_algorithm(struct spill_ctx *ctx)
          insert_W(ctx, I->dest[d].value);
       }
 
+      /* Update next-use distances for this instruction. Unlike the paper, we
+       * prune dead values from W as we go. This doesn't affect correctness, but
+       * it speeds up limit() on average.
+       *
+       * This happens after the above limit() calls to model sources as
+       * late-kill. This is conservative and could be improved, but it matches
+       * how we currently estimate register demand.
+       */
+      agx_foreach_ssa_src_rev(I, s) {
+         assert(next_use_cursor >= 1);
+
+         unsigned next_ip = next_ips[--next_use_cursor];
+         assert((next_ip == DIST_INFINITY) == I->src[s].kill);
+
+         if (I->src[s].kill) {
+            remove_W_if_present(ctx, I->src[s].value);
+         } else {
+            ctx->next_uses[I->src[s].value] = next_ip;
+         }
+      }
+
       /* Add reloads for the sources in front of the instruction. We need to be
        * careful around exports, hoisting the reloads to before all exports.
        *
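Keeping killed sources in W until after the limit() calls models them as late-kill in the spiller as well, consistent with the demand estimate above. A small standalone sketch (invented numbers, not the real spill_ctx or limit()) of why this ordering is conservative:

/* Small standalone sketch with invented numbers: how many W entries must
 * be evicted to fit a destination when dying sources are modelled as
 * early-kill vs late-kill. Not the real spill_ctx. */
#include <stdio.h>

int main(void)
{
   unsigned k = 8;       /* register budget */
   unsigned w_live = 6;  /* values in W that stay live past the instruction */
   unsigned w_dying = 2; /* values in W whose last use is this instruction */
   unsigned dest = 3;    /* registers needed by the destination */

   /* Early-kill: dying values leave W before room is made for the dest,
    * so they do not count against the budget. */
   unsigned evict_early = (w_live + dest > k) ? w_live + dest - k : 0;

   /* Late-kill: dying values are pruned from W only after limiting, so they
    * still count against the budget while the dest is being fitted. */
   unsigned evict_late =
      (w_live + w_dying + dest > k) ? w_live + w_dying + dest - k : 0;

   printf("evictions under early-kill: %u, under late-kill: %u\n",
          evict_early, evict_late);
   return 0;
}

With these invented numbers the late-kill model evicts three values instead of one, which lines up with the small spill/fill increase reported in the commit message and with the hunk's own remark that the approach is conservative.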