ir3/legalize: track needs_ss/sy_for_const per const reg

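Instead of tracking whether *any* const reg has been written since the last
sync, use a bitset to track exactly which const regs have been written.
Relative const accesses still add sync flags if any const reg needs them,
since the accessed reg is not known. This often helps us prevent stalls.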

Preamble stats:

Totals from 32893 (18.66% of 176258) affected shaders:
Instrs: 3540796 -> 3540370 (-0.01%); split: -0.08%, +0.07%
CodeSize: 30635588 -> 30627370 (-0.03%); split: -0.09%, +0.07%
NOPs: 491600 -> 491174 (-0.09%); split: -0.58%, +0.49%
(ss): 465746 -> 450057 (-3.37%); split: -3.54%, +0.17%
(sy): 89251 -> 85497 (-4.21%); split: -4.30%, +0.09%
(ss)-stall: 1210233 -> 1164381 (-3.79%); split: -4.44%, +0.66%
(sy)-stall: 1286176 -> 1283034 (-0.24%); split: -0.94%, +0.70%
Cat0: 594508 -> 594082 (-0.07%); split: -0.48%, +0.41%

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40954>
This commit is contained in:
Job Noorman 2026-04-14 17:44:05 +02:00 committed by Marge Bot
parent c1bf9d6fd5
commit 4808037f6e
2 changed files with 39 additions and 14 deletions

View file

@ -1544,6 +1544,7 @@ writes_pred(struct ir3_instruction *instr)
#define SHARED_REG_SIZE (4 * 8)
#define NONGPR_REG_START (SHARED_REG_START + SHARED_REG_SIZE)
#define NONGPR_REG_SIZE (4 * 8)
#define CONST_REG_SIZE (4 * 512)
enum ir3_reg_file {
IR3_FILE_FULL,
@ -3347,6 +3348,8 @@ struct ir3_nop_state {
unsigned half_ready[GPR_REG_SIZE];
};
typedef BITSET_DECLARE(conststate_t, CONST_REG_SIZE);
struct ir3_legalize_state {
regmask_t needs_ss;
regmask_t needs_ss_scalar_full; /* half scalar ALU producer -> full scalar ALU consumer */
@ -3357,8 +3360,8 @@ struct ir3_legalize_state {
regmask_t needs_ss_scalar_war; /* scalar ALU write -> ALU write */
regmask_t needs_ss_or_sy_scalar_war;
regmask_t needs_sy;
bool needs_ss_for_const;
bool needs_sy_for_const;
conststate_t needs_ss_for_const;
conststate_t needs_sy_for_const;
/* Next instruction needs (ss)/(sy), no matter its dsts/srcs. */
bool force_ss;

View file

@ -149,11 +149,23 @@ ir3_required_sync_flags(struct ir3_legalize_state *state,
flags |= IR3_INSTR_SY;
}
} else if ((reg->flags & IR3_REG_CONST)) {
if (state->needs_ss_for_const) {
flags |= IR3_INSTR_SS;
}
if (state->needs_sy_for_const) {
flags |= IR3_INSTR_SY;
if (reg->flags & IR3_REG_RELATIV) {
/* Since we don't know which const reg is accessed, add sync flags
* if any const reg need them.
*/
if (!BITSET_IS_EMPTY(state->needs_ss_for_const)) {
flags |= IR3_INSTR_SS;
}
if (!BITSET_IS_EMPTY(state->needs_sy_for_const)) {
flags |= IR3_INSTR_SY;
}
} else {
if (BITSET_TEST(state->needs_ss_for_const, reg->num)) {
flags |= IR3_INSTR_SS;
}
if (BITSET_TEST(state->needs_sy_for_const, reg->num)) {
flags |= IR3_INSTR_SY;
}
}
} else if (!(reg->flags & (IR3_REG_IMMED | IR3_REG_RT))) {
if (regmask_get(&state->needs_ss, reg)) {
@ -186,7 +198,7 @@ apply_ss(struct ir3_legalize_state *state, bool mergedregs)
regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
regmask_init(&state->needs_ss_scalar_full, mergedregs);
regmask_init(&state->needs_ss_scalar_half, mergedregs);
state->needs_ss_for_const = false;
BITSET_ZERO(state->needs_ss_for_const);
state->force_ss = false;
}
@ -197,7 +209,7 @@ apply_sy(struct ir3_legalize_state *state, bool mergedregs)
regmask_init(&state->needs_sy_war, mergedregs);
regmask_init(&state->needs_ss_or_sy_war, mergedregs);
regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
state->needs_sy_for_const = false;
BITSET_ZERO(state->needs_sy_for_const);
state->force_sy = false;
}
@ -258,10 +270,18 @@ sync_update(struct ir3_legalize_state *state, struct ir3_compiler *compiler,
} else {
regmask_set(&state->needs_ss, n->dsts[0]);
}
} else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO || n->opc == OPC_STC) {
state->needs_ss_for_const = true;
} else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
unsigned const_dst = n->push_consts.dst_base;
unsigned const_size = n->push_consts.src_size * 2;
BITSET_SET_COUNT(state->needs_ss_for_const, const_dst, const_size);
} else if (n->opc == OPC_STC) {
unsigned const_dst = n->cat6.dst_offset;
unsigned const_size = n->cat6.iim_val;
BITSET_SET_COUNT(state->needs_ss_for_const, const_dst, const_size);
} else if (n->opc == OPC_LDC_K) {
state->needs_sy_for_const = true;
unsigned const_dst = n->cat6.dst_offset;
unsigned const_size = n->cat6.iim_val * 4;
BITSET_SET_COUNT(state->needs_sy_for_const, const_dst, const_size);
}
/* both tex/sfu appear to not always immediately consume
@ -370,8 +390,10 @@ ir3_merge_pred_legalize_states(struct ir3_legalize_state *state,
regmask_or(&state->needs_ss_or_sy_war, &state->needs_ss_or_sy_war,
&pstate->needs_ss_or_sy_war);
regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
state->needs_ss_for_const |= pstate->needs_ss_for_const;
state->needs_sy_for_const |= pstate->needs_sy_for_const;
BITSET_OR(state->needs_ss_for_const, state->needs_ss_for_const,
pstate->needs_ss_for_const);
BITSET_OR(state->needs_sy_for_const, state->needs_sy_for_const,
pstate->needs_sy_for_const);
state->force_ss |= pstate->force_ss;
state->force_sy |= pstate->force_sy;