ir3: make delay slots a compiler property

The delay slot counts changed on a7xx, so we want to make them configurable.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33183>
This commit is contained in:
Job Noorman 2025-01-23 12:09:35 +01:00 committed by Marge Bot
parent 2c7c62dfd9
commit 5460be5d33
4 changed files with 35 additions and 8 deletions

View file

@ -160,6 +160,9 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->has_branch_and_or = false;
compiler->has_rpt_bary_f = false;
compiler->has_alias_tex = false;
compiler->delay_slots.alu_to_alu = 3;
compiler->delay_slots.non_alu = 6;
compiler->delay_slots.cat3_src2_read = 2;
if (compiler->gen >= 6) {
compiler->samgq_workaround = true;

View file

@ -289,6 +289,26 @@ struct ir3_compiler {
bool has_alias_rt;
bool reading_shading_rate_requires_smask_quirk;
/* Delay slot counts, in cycles, consulted when computing the delay required
 * between dependent instructions. They are stored on the compiler rather
 * than hard-coded so that they can be configured per GPU generation.
 */
struct {
/* The number of cycles needed for the result of one ALU operation to be
 * available to another ALU operation. Only valid when the halfness of the
 * source and destination match.
 */
unsigned alu_to_alu;
/* The number of cycles needed for the result of one instruction to be
 * available to another. Valid for a0.x, a1.x, and p0.c destinations, ALU
 * to non-ALU dependencies, and ALU to ALU dependencies with mismatched
 * halfness.
 */
unsigned non_alu;
/* The number of cycles from the start of the instruction until a cat3
 * instruction reads its 3rd src (source index 2).
 */
unsigned cat3_src2_read;
} delay_slots;
};
void ir3_compiler_destroy(struct ir3_compiler *compiler);

View file

@ -39,7 +39,7 @@ ir3_src_read_delay(struct ir3_compiler *compiler, struct ir3_instruction *instr,
/* cat3 instructions consume their last source one or two cycles later. */
if ((is_mad(instr->opc) || is_madsh(instr->opc)) && src_n == 2) {
return 2;
return compiler->delay_slots.cat3_src2_read;
}
return 0;
@ -68,7 +68,7 @@ ir3_delayslots(struct ir3_compiler *compiler,
return 0;
if (writes_addr0(assigner) || writes_addr1(assigner))
return 6;
return compiler->delay_slots.non_alu;
if (soft && needs_ss(compiler, assigner, consumer))
return soft_ss_delay(assigner);
@ -98,7 +98,7 @@ ir3_delayslots(struct ir3_compiler *compiler,
/* assigner must be alu: */
if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
is_mem(consumer)) {
return 6;
return compiler->delay_slots.non_alu;
} else {
/* In mergedregs mode, there is an extra 2-cycle penalty when half of
* a full-reg is read as a half-reg or when a half-reg is read as a
@ -107,7 +107,8 @@ ir3_delayslots(struct ir3_compiler *compiler,
bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
(consumer->srcs[n]->flags & IR3_REG_HALF);
unsigned penalty = mismatched_half ? 3 : 0;
return 3 + penalty - ir3_src_read_delay(compiler, consumer, n);
return compiler->delay_slots.alu_to_alu + penalty -
ir3_src_read_delay(compiler, consumer, n);
}
}

View file

@ -213,7 +213,8 @@ delay_calc(struct ir3_legalize_ctx *ctx,
}
static void
delay_update(struct ir3_legalize_state *state,
delay_update(struct ir3_legalize_ctx *ctx,
struct ir3_legalize_state *state,
struct ir3_instruction *instr,
unsigned cycle,
bool mergedregs)
@ -265,11 +266,13 @@ delay_update(struct ir3_legalize_state *state,
reset_ready_slot = true;
} else if ((dst->flags & IR3_REG_PREDICATE) ||
reg_num(dst) == REG_A0) {
delay = 6;
delay = ctx->compiler->delay_slots.non_alu;
if (!matching_size)
continue;
} else {
delay = (consumer_alu && matching_size) ? 3 : 6;
delay = (consumer_alu && matching_size)
? ctx->compiler->delay_slots.alu_to_alu
: ctx->compiler->delay_slots.non_alu;
}
if (!matching_size) {
@ -697,7 +700,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
if (count)
cycle += 1;
delay_update(state, n, cycle, mergedregs);
delay_update(ctx, state, n, cycle, mergedregs);
if (count)
cycle += n->repeat + n->nop;