From 4808037f6ebd285b36a9b8b55c649a5eef8ff5dc Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Tue, 14 Apr 2026 17:44:05 +0200 Subject: [PATCH] ir3/legalize: track need_ss/sy_for_const per const reg Instead of tracking if *any* const reg has been written since the last sync, use a bitset to track exactly which const regs have been written. This often helps us prevent stalls. Preamble stats: Totals from 32893 (18.66% of 176258) affected shaders: Instrs: 3540796 -> 3540370 (-0.01%); split: -0.08%, +0.07% CodeSize: 30635588 -> 30627370 (-0.03%); split: -0.09%, +0.07% NOPs: 491600 -> 491174 (-0.09%); split: -0.58%, +0.49% (ss): 465746 -> 450057 (-3.37%); split: -3.54%, +0.17% (sy): 89251 -> 85497 (-4.21%); split: -4.30%, +0.09% (ss)-stall: 1210233 -> 1164381 (-3.79%); split: -4.44%, +0.66% (sy)-stall: 1286176 -> 1283034 (-0.24%); split: -0.94%, +0.70% Cat0: 594508 -> 594082 (-0.07%); split: -0.48%, +0.41% Signed-off-by: Job Noorman Part-of: --- src/freedreno/ir3/ir3.h | 7 +++-- src/freedreno/ir3/ir3_legalize.c | 46 +++++++++++++++++++++++--------- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index bddd2c352ce..fafa51f1842 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1544,6 +1544,7 @@ writes_pred(struct ir3_instruction *instr) #define SHARED_REG_SIZE (4 * 8) #define NONGPR_REG_START (SHARED_REG_START + SHARED_REG_SIZE) #define NONGPR_REG_SIZE (4 * 8) +#define CONST_REG_SIZE (4 * 512) enum ir3_reg_file { IR3_FILE_FULL, @@ -3347,6 +3348,8 @@ struct ir3_nop_state { unsigned half_ready[GPR_REG_SIZE]; }; +typedef BITSET_DECLARE(conststate_t, CONST_REG_SIZE); + struct ir3_legalize_state { regmask_t needs_ss; regmask_t needs_ss_scalar_full; /* half scalar ALU producer -> full scalar ALU consumer */ @@ -3357,8 +3360,8 @@ struct ir3_legalize_state { regmask_t needs_ss_scalar_war; /* scalar ALU write -> ALU write */ regmask_t needs_ss_or_sy_scalar_war; regmask_t needs_sy; - bool 
needs_ss_for_const; - bool needs_sy_for_const; + conststate_t needs_ss_for_const; + conststate_t needs_sy_for_const; /* Next instruction needs (ss)/(sy), no matter its dsts/srcs. */ bool force_ss; diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index 9748367c572..635af257b1a 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -149,11 +149,23 @@ ir3_required_sync_flags(struct ir3_legalize_state *state, flags |= IR3_INSTR_SY; } } else if ((reg->flags & IR3_REG_CONST)) { - if (state->needs_ss_for_const) { - flags |= IR3_INSTR_SS; - } - if (state->needs_sy_for_const) { - flags |= IR3_INSTR_SY; + if (reg->flags & IR3_REG_RELATIV) { + /* Since we don't know which const reg is accessed, add sync flags + * if any const reg needs them. + */ + if (!BITSET_IS_EMPTY(state->needs_ss_for_const)) { + flags |= IR3_INSTR_SS; + } + if (!BITSET_IS_EMPTY(state->needs_sy_for_const)) { + flags |= IR3_INSTR_SY; + } + } else { + if (BITSET_TEST(state->needs_ss_for_const, reg->num)) { + flags |= IR3_INSTR_SS; + } + if (BITSET_TEST(state->needs_sy_for_const, reg->num)) { + flags |= IR3_INSTR_SY; + } } } else if (!(reg->flags & (IR3_REG_IMMED | IR3_REG_RT))) { if (regmask_get(&state->needs_ss, reg)) { @@ -186,7 +198,7 @@ apply_ss(struct ir3_legalize_state *state, bool mergedregs) regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs); regmask_init(&state->needs_ss_scalar_full, mergedregs); regmask_init(&state->needs_ss_scalar_half, mergedregs); - state->needs_ss_for_const = false; + BITSET_ZERO(state->needs_ss_for_const); state->force_ss = false; } @@ -197,7 +209,7 @@ apply_sy(struct ir3_legalize_state *state, bool mergedregs) regmask_init(&state->needs_sy_war, mergedregs); regmask_init(&state->needs_ss_or_sy_war, mergedregs); regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs); - state->needs_sy_for_const = false; + BITSET_ZERO(state->needs_sy_for_const); state->force_sy = false; } @@ -258,10 +270,18 @@
sync_update(struct ir3_legalize_state *state, struct ir3_compiler *compiler, } else { regmask_set(&state->needs_ss, n->dsts[0]); } - } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO || n->opc == OPC_STC) { - state->needs_ss_for_const = true; + } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) { + unsigned const_dst = n->push_consts.dst_base; + unsigned const_size = n->push_consts.src_size * 2; + BITSET_SET_COUNT(state->needs_ss_for_const, const_dst, const_size); + } else if (n->opc == OPC_STC) { + unsigned const_dst = n->cat6.dst_offset; + unsigned const_size = n->cat6.iim_val; + BITSET_SET_COUNT(state->needs_ss_for_const, const_dst, const_size); } else if (n->opc == OPC_LDC_K) { - state->needs_sy_for_const = true; + unsigned const_dst = n->cat6.dst_offset; + unsigned const_size = n->cat6.iim_val * 4; + BITSET_SET_COUNT(state->needs_sy_for_const, const_dst, const_size); } /* both tex/sfu appear to not always immediately consume @@ -370,8 +390,10 @@ ir3_merge_pred_legalize_states(struct ir3_legalize_state *state, regmask_or(&state->needs_ss_or_sy_war, &state->needs_ss_or_sy_war, &pstate->needs_ss_or_sy_war); regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy); - state->needs_ss_for_const |= pstate->needs_ss_for_const; - state->needs_sy_for_const |= pstate->needs_sy_for_const; + BITSET_OR(state->needs_ss_for_const, state->needs_ss_for_const, + pstate->needs_ss_for_const); + BITSET_OR(state->needs_sy_for_const, state->needs_sy_for_const, + pstate->needs_sy_for_const); state->force_ss |= pstate->force_ss; state->force_sy |= pstate->force_sy;