ir3: add helper to calculate src read delay

cat3 instructions read their 3rd src later than their first two srcs.
This was implemented in two different places: once for scheduling and
once for legalization. Extract this logic into a new helper and also add
similar logic for gat/swz there (which the scheduling logic failed to
account for).

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33183>
This commit is contained in:
Job Noorman 2025-01-23 12:09:35 +01:00 committed by Marge Bot
parent e7ac1094f6
commit 2c7c62dfd9
3 changed files with 29 additions and 21 deletions

View file

@ -2118,6 +2118,8 @@ struct log_stream;
void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
/* delay calculation: */
/* Returns the number of cycles, counted from the start of instr, until its
 * source with index src_n is read.
 */
unsigned ir3_src_read_delay(struct ir3_compiler *compiler,
struct ir3_instruction *instr, unsigned src_n);
/* Returns the number of delay slots required between assigner, the
 * instruction that writes a value, and consumer, the instruction that reads
 * it through its source with index n.
 */
int ir3_delayslots(struct ir3_compiler *compiler,
struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned n, bool soft);

View file

@ -23,6 +23,28 @@
* src iterators work.
*/
/* Compute how many cycles after the start of instr its source with index
 * src_n gets read. Sources that are read right away yield 0. The compiler
 * argument is currently not consulted.
 */
unsigned
ir3_src_read_delay(struct ir3_compiler *compiler, struct ir3_instruction *instr,
                   unsigned src_n)
{
   /* The scalar sources of gat and swz are read one per cycle, in order, so
    * the delay of a source equals its index.
    */
   const bool one_src_per_cycle =
      instr->opc == OPC_GAT || instr->opc == OPC_SWZ;
   if (one_src_per_cycle)
      return src_n;

   /* For cat3 (mad/madsh) the third source is read two cycles after the
    * first two.
    */
   const bool cat3 = is_mad(instr->opc) || is_madsh(instr->opc);
   return (cat3 && src_n == 2) ? 2 : 0;
}
/* calculate required # of delay slots between the instruction that
* assigns a value and the one that consumes
*/
@ -85,12 +107,7 @@ ir3_delayslots(struct ir3_compiler *compiler,
bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
(consumer->srcs[n]->flags & IR3_REG_HALF);
unsigned penalty = mismatched_half ? 3 : 0;
if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
/* special case, 3rd src to cat3 not required on first cycle */
return 1 + penalty;
} else {
return 3 + penalty;
}
return 3 + penalty - ir3_src_read_delay(compiler, consumer, n);
}
}

View file

@ -178,7 +178,8 @@ get_ready_slot(struct ir3_legalize_state *state,
}
static unsigned
delay_calc(struct ir3_legalize_state *state,
delay_calc(struct ir3_legalize_ctx *ctx,
struct ir3_legalize_state *state,
struct ir3_instruction *instr,
unsigned cycle)
{
@ -193,19 +194,7 @@ delay_calc(struct ir3_legalize_state *state,
unsigned elems = post_ra_reg_elems(src);
unsigned num = post_ra_reg_num(src);
unsigned src_cycle = cycle;
/* gat and swz have scalar sources and each source is read in a
* subsequent cycle.
*/
if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ)
src_cycle += n;
/* cat3 instructions consume their last source two cycles later, so they
* only need a delay of 1.
*/
if ((is_mad(instr->opc) || is_madsh(instr->opc)) && n == 2)
src_cycle += 2;
unsigned src_cycle = cycle + ir3_src_read_delay(ctx->compiler, instr, n);
for (unsigned elem = 0; elem < elems; elem++, num++) {
unsigned ready_cycle =
@ -560,7 +549,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
cycle++;
}
unsigned delay = delay_calc(state, n, cycle);
unsigned delay = delay_calc(ctx, state, n, cycle);
/* NOTE: I think the nopN encoding works for a5xx and
* probably a4xx, but not a3xx. So far only tested on