diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 90c3f4a51ae..3ceaa5a3f75 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -168,6 +168,9 @@ typedef enum ir3_register_flags { * Note: This effectively has the same semantics as IR3_REG_KILL. */ IR3_REG_LAST_USE = BIT(18), + + /* Predicate register (p0.c). Cannot be combined with half or shared. */ + IR3_REG_PREDICATE = BIT(19), } ir3_register_flags; struct ir3_register { @@ -563,9 +566,6 @@ struct ir3 { /* same for a1.x: */ DECLARE_ARRAY(struct ir3_instruction *, a1_users); - /* and same for instructions that consume predicate register: */ - DECLARE_ARRAY(struct ir3_instruction *, predicates); - /* Track texture sample instructions which need texture state * patched in (for astc-srgb workaround): */ @@ -952,7 +952,7 @@ is_same_type_mov(struct ir3_instruction *instr) dst = instr->dsts[0]; /* mov's that write to a0 or p0.x are special: */ - if (dst->num == regid(REG_P0, 0)) + if (dst->flags & IR3_REG_PREDICATE) return false; if (reg_num(dst) == REG_A0) return false; @@ -1295,7 +1295,9 @@ is_dest_gpr(struct ir3_register *dst) { if (dst->wrmask == 0) return false; - if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0))) + if (reg_num(dst) == REG_A0) + return false; + if (dst->flags & IR3_REG_PREDICATE) return false; return true; } @@ -1333,10 +1335,10 @@ writes_addr1(struct ir3_instruction *instr) static inline bool writes_pred(struct ir3_instruction *instr) { - /* Note: only the first dest can write to p0.x */ + /* Note: only the first dest can write to p0 */ if (instr->dsts_count > 0) { struct ir3_register *dst = instr->dsts[0]; - return reg_num(dst) == REG_P0; + return !!(dst->flags & IR3_REG_PREDICATE); } return false; } @@ -1349,8 +1351,8 @@ writes_pred(struct ir3_instruction *instr) static inline bool is_reg_special(const struct ir3_register *reg) { - return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) || - (reg_num(reg) == REG_P0); + return (reg->flags & (IR3_REG_SHARED | IR3_REG_PREDICATE) || + (reg_num(reg) == REG_A0)); } /* Same as above but in cases where we don't have a register. 
r48.x and above @@ -1381,9 +1383,9 @@ conflicts(struct ir3_register *a, struct ir3_register *b) static inline bool reg_gpr(struct ir3_register *r) { - if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_PREDICATE)) return false; - if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) + if (reg_num(r) == REG_A0) return false; return true; } @@ -1999,6 +2001,7 @@ bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v); /* register assignment: */ int ir3_ra(struct ir3_shader_variant *v); +void ir3_ra_predicates(struct ir3_shader_variant *v); /* lower subgroup ops: */ bool ir3_lower_subgroups(struct ir3 *ir); diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 5a254be1fe1..5f651a7b622 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -156,6 +156,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->local_mem_size = dev_info->cs_shared_mem_size; + compiler->num_predicates = 1; + if (compiler->gen >= 6) { compiler->samgq_workaround = true; /* a6xx split the pipeline state into geometry and fragment state, in @@ -212,6 +214,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk; compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble; compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk; + compiler->num_predicates = 4; } else { compiler->max_const_pipeline = 512; compiler->max_const_geom = 512; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 23eb3eeb161..9703052fd5d 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -214,6 +214,9 @@ struct ir3_compiler { */ bool has_getfiberid; + /* Number of available predicate registers (p0.c) */ + uint32_t num_predicates; + /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */ uint32_t max_variable_workgroup_size; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 1c5d253e433..6b3b46df509 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -2614,8 +2614,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) cond->cat2.condition = IR3_COND_NE; /* condition always goes in predicate register: */ - cond->dsts[0]->num = regid(REG_P0, 0); - cond->dsts[0]->flags &= ~IR3_REG_SSA; + cond->dsts[0]->flags |= IR3_REG_PREDICATE; if (intr->intrinsic == nir_intrinsic_demote || intr->intrinsic == nir_intrinsic_demote_if) { @@ -2631,8 +2630,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) IR3_BARRIER_ACTIVE_FIBERS_W; kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W | IR3_BARRIER_ACTIVE_FIBERS_R; - kill->srcs[0]->num = regid(REG_P0, 0); - array_insert(ctx->ir, ctx->ir->predicates, kill); + kill->srcs[0]->flags |= IR3_REG_PREDICATE; array_insert(b, b->keeps, kill); ctx->so->has_kill = true; @@ -2653,14 +2651,13 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) cond->cat2.condition = IR3_COND_NE; /* condition always goes in predicate register: */ - cond->dsts[0]->num = regid(REG_P0, 0); + cond->dsts[0]->flags |= IR3_REG_PREDICATE; - kill = ir3_PREDT(b, cond, 0); + kill = ir3_PREDT(b, cond, IR3_REG_PREDICATE); kill->barrier_class = IR3_BARRIER_EVERYTHING; kill->barrier_conflict = 
IR3_BARRIER_EVERYTHING; - array_insert(ctx->ir, ctx->ir->predicates, kill); array_insert(b, b->keeps, kill); break; } @@ -2673,8 +2670,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0); else dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0); - dst[0]->srcs[0]->num = regid(REG_P0, 0); - array_insert(ctx->ir, ctx->ir->predicates, dst[0]); + dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE; break; } case nir_intrinsic_elect: @@ -2690,8 +2686,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0, src, 0); dst[0]->dsts[0]->flags |= IR3_REG_SHARED; - dst[0]->srcs[0]->num = regid(REG_P0, 0); - array_insert(ctx->ir, ctx->ir->predicates, dst[0]); + dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE; break; } @@ -2712,8 +2707,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0]; struct ir3_instruction *pred = ir3_get_predicate(ctx, src); ballot = ir3_BALLOT_MACRO(ctx->block, pred, components); - ballot->srcs[0]->num = regid(REG_P0, 0); - array_insert(ctx->ir, ctx->ir->predicates, ballot); + ballot->srcs[0]->flags |= IR3_REG_PREDICATE; } ballot->barrier_class = IR3_BARRIER_ACTIVE_FIBERS_R; @@ -3820,11 +3814,11 @@ emit_if(struct ir3_context *ctx, nir_if *nif) if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) { struct ir3_instruction *pred = ssa(condition->srcs[0]); - ir3_BANY(ctx->block, pred, 0); + ir3_BANY(ctx->block, pred, IR3_REG_PREDICATE); } else if (condition->opc == OPC_ALL_MACRO && condition->block == ctx->block) { struct ir3_instruction *pred = ssa(condition->srcs[0]); - ir3_BALL(ctx->block, pred, 0); + ir3_BALL(ctx->block, pred, IR3_REG_PREDICATE); } else if (condition->opc == OPC_ELECT_MACRO && condition->block == ctx->block) { ir3_GETONE(ctx->block); @@ -3837,7 +3831,7 @@ emit_if(struct ir3_context *ctx, nir_if *nif) ir3_SHPS(ctx->block); } else { struct ir3_instruction *pred = ir3_get_predicate(ctx, condition); - ir3_BR(ctx->block, pred, 0); + ir3_BR(ctx->block, pred, IR3_REG_PREDICATE); } emit_cf_list(ctx, &nif->then_list); @@ -3959,15 +3953,14 @@ emit_stream_out(struct ir3_context *ctx) /* setup 'if (vtxcnt < maxvtxcnt)' condition: */ cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); - cond->dsts[0]->num = regid(REG_P0, 0); - cond->dsts[0]->flags &= ~IR3_REG_SSA; + cond->dsts[0]->flags |= IR3_REG_PREDICATE; cond->cat2.condition = IR3_COND_LT; /* condition goes on previous block to the conditional, * since it is used to pick which of the two successor * paths to take: */ - ir3_BR(orig_end_block, cond, 0); + ir3_BR(orig_end_block, cond, IR3_REG_PREDICATE); /* switch to stream_out_block to generate the stream-out * instructions: diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index f57f8118020..5618188f03a 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -463,15 +463,14 @@ ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src) struct ir3_block *b = ctx->block; struct ir3_instruction *cond; - /* NOTE: only cmps.*.* can write p0.x: */ + /* NOTE: we use cmps.s.ne x, 0 to move x into a predicate register */ struct ir3_instruction *zero = create_immed_typed(b, 0, is_half(src) ?
TYPE_U16 : TYPE_U32); cond = ir3_CMPS_S(b, src, 0, zero, 0); cond->cat2.condition = IR3_COND_NE; /* condition always goes in predicate register: */ - cond->dsts[0]->num = regid(REG_P0, 0); - cond->dsts[0]->flags &= ~IR3_REG_SSA; + cond->dsts[0]->flags |= IR3_REG_PREDICATE; return cond; } diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c index 6a872dc904b..bdf8628cee6 100644 --- a/src/freedreno/ir3/ir3_cp.c +++ b/src/freedreno/ir3/ir3_cp.c @@ -100,7 +100,7 @@ static bool is_foldable_double_cmp(struct ir3_instruction *cmp) { struct ir3_instruction *cond = ssa(cmp->srcs[0]); - return (cmp->dsts[0]->num == regid(REG_P0, 0)) && cond && + return (cmp->dsts[0]->flags & IR3_REG_PREDICATE) && cond && (cmp->srcs[1]->flags & IR3_REG_IMMED) && (cmp->srcs[1]->iim_val == 0) && (cmp->cat2.condition == IR3_COND_NE) && diff --git a/src/freedreno/ir3/ir3_dce.c b/src/freedreno/ir3/ir3_dce.c index c640f6e5c35..456a1c805c4 100644 --- a/src/freedreno/ir3/ir3_dce.c +++ b/src/freedreno/ir3/ir3_dce.c @@ -133,12 +133,6 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so) struct ir3_instruction *terminator = ir3_block_get_terminator(block); if (terminator) { instr_dce(terminator, false); - - /* Temporary workaround for predicates not being SSA. Won't be - * necessary anymore once we have RA for predicates. - */ - foreach_src (src, terminator) - instr_dce(src->def->instr, false); } } @@ -181,12 +175,6 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so) ir->a1_users[i] = NULL; } - for (i = 0; i < ir->predicates_count; i++) { - struct ir3_instruction *instr = ir->predicates[i]; - if (instr && (instr->flags & IR3_INSTR_UNUSED)) - ir->predicates[i] = NULL; - } - /* cleanup unused inputs: */ foreach_input_n (in, n, ir) if (in->flags & IR3_INSTR_UNUSED) diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index e9ad732eadb..69a59b67fe6 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -727,9 +727,6 @@ invert_branch(struct ir3_instruction *branch) * Initially this is done naively, without considering if the successor * block immediately follows the current block (ie. so no jump required), * but that is cleaned up in opt_jump(). - * - * TODO what ensures that the last write to p0.x in a block is the - * branch condition? Have we been getting lucky all this time? */ static void block_sched(struct ir3 *ir) diff --git a/src/freedreno/ir3/ir3_parser.y b/src/freedreno/ir3/ir3_parser.y index eb3732be9fa..6549e4bbe56 100644 --- a/src/freedreno/ir3/ir3_parser.y +++ b/src/freedreno/ir3/ir3_parser.y @@ -845,11 +845,11 @@ instr: iflags cat0_instr label: T_IDENTIFIER ':' { new_label($1); } -cat0_src1: '!' T_P0 { instr->cat0.inv1 = true; $$ = new_src((62 << 3) + $2, 0); } -| T_P0 { $$ = new_src((62 << 3) + $1, 0); } +cat0_src1: '!' T_P0 { instr->cat0.inv1 = true; $$ = new_src((62 << 3) + $2, IR3_REG_PREDICATE); } +| T_P0 { $$ = new_src((62 << 3) + $1, IR3_REG_PREDICATE); } -cat0_src2: '!' T_P0 { instr->cat0.inv2 = true; $$ = new_src((62 << 3) + $2, 0); } -| T_P0 { $$ = new_src((62 << 3) + $1, 0); } +cat0_src2: '!' 
T_P0 { instr->cat0.inv2 = true; $$ = new_src((62 << 3) + $2, IR3_REG_PREDICATE); } +| T_P0 { $$ = new_src((62 << 3) + $1, IR3_REG_PREDICATE); } cat0_immed: '#' integer { instr->cat0.immed = $2; } | '#' T_IDENTIFIER { ralloc_steal(instr, (void *)$2); instr->cat0.target_label = $2; } @@ -1434,12 +1434,12 @@ meta_print: meta_print_start meta_print_regs { src: T_REGISTER { $$ = new_src($1, 0); } | T_A0 { $$ = new_src((61 << 3), IR3_REG_HALF); } | T_A1 { $$ = new_src((61 << 3) + 1, IR3_REG_HALF); } -| T_P0 { $$ = new_src((62 << 3) + $1, 0); } +| T_P0 { $$ = new_src((62 << 3) + $1, IR3_REG_PREDICATE); } dst: T_REGISTER { $$ = new_dst($1, 0); } | T_A0 { $$ = new_dst((61 << 3), IR3_REG_HALF); } | T_A1 { $$ = new_dst((61 << 3) + 1, IR3_REG_HALF); } -| T_P0 { $$ = new_dst((62 << 3) + $1, 0); } +| T_P0 { $$ = new_dst((62 << 3) + $1, IR3_REG_PREDICATE); } const: T_CONSTANT { $$ = new_src($1, IR3_REG_CONST); } diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c index bd5d0cf0452..92e55bf9a4b 100644 --- a/src/freedreno/ir3/ir3_print.c +++ b/src/freedreno/ir3/ir3_print.c @@ -257,9 +257,18 @@ print_ssa_name(struct log_stream *stream, struct ir3_register *reg, bool dst) print_ssa_def_name(stream, reg); } - if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY)) - mesa_log_stream_printf(stream, "(" SYN_REG("r%u.%c") ")", reg_num(reg), + if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY)) { + const char *prefix = "r"; + unsigned num = reg_num(reg); + + if (reg->flags & IR3_REG_PREDICATE) { + prefix = "p"; + num = 0; + } + + mesa_log_stream_printf(stream, "(" SYN_REG("%s%u.%c") ")", prefix, num, "xyzw"[reg_comp(reg)]); + } } static void @@ -296,6 +305,8 @@ print_reg_name(struct log_stream *stream, struct ir3_instruction *instr, mesa_log_stream_printf(stream, "s"); if (reg->flags & IR3_REG_HALF) mesa_log_stream_printf(stream, "h"); + if (reg->flags & IR3_REG_PREDICATE) + mesa_log_stream_printf(stream, "p"); if (reg->flags & IR3_REG_IMMED) { mesa_log_stream_printf(stream, SYN_IMMED("imm[%f,%d,0x%x]"), reg->fim_val, @@ -325,6 +336,9 @@ print_reg_name(struct log_stream *stream, struct ir3_instruction *instr, if (reg->flags & IR3_REG_CONST) mesa_log_stream_printf(stream, SYN_CONST("c%u.%c"), reg_num(reg), "xyzw"[reg_comp(reg)]); + else if (reg->flags & IR3_REG_PREDICATE) + mesa_log_stream_printf(stream, SYN_REG("p0.%c"), + "xyzw"[reg_comp(reg)]); else mesa_log_stream_printf(stream, SYN_REG("r%u.%c"), reg_num(reg), "xyzw"[reg_comp(reg)]); diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index ddc3da4b76b..2debdeab167 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -2544,6 +2544,9 @@ ir3_ra(struct ir3_shader_variant *v) { ir3_calc_dominance(v->ir); + /* Predicate RA needs dominance. */ + ir3_ra_predicates(v); + ir3_create_parallel_copies(v->ir); struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index bc194627834..6dea545c3ce 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -87,7 +87,7 @@ ra_reg_get_physreg(const struct ir3_register *reg) static inline bool def_is_gpr(const struct ir3_register *reg) { - return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0; + return reg_num(reg) != REG_A0 && !(reg->flags & IR3_REG_PREDICATE); } /* Note: don't count undef as a source. 
@@ -105,6 +105,12 @@ ra_reg_is_dst(const struct ir3_register *reg) ((reg->flags & IR3_REG_ARRAY) || reg->wrmask); } +static inline bool +ra_reg_is_predicate(const struct ir3_register *reg) +{ + return (reg->flags & IR3_REG_SSA) && (reg->flags & IR3_REG_PREDICATE); +} + /* Iterators for sources and destinations which: * - Don't include fake sources (irrelevant for RA) * - Don't include non-SSA sources (immediates and constants, also irrelevant) diff --git a/src/freedreno/ir3/ir3_ra_predicates.c b/src/freedreno/ir3/ir3_ra_predicates.c new file mode 100644 index 00000000000..9c3e553275b --- /dev/null +++ b/src/freedreno/ir3/ir3_ra_predicates.c @@ -0,0 +1,467 @@ +/* + * Copyright 2024 Igalia S.L. + * SPDX-License-Identifier: MIT + */ + +#include "ir3.h" +#include "ir3_ra.h" +#include "ir3_shader.h" + +/* Represents a def that is currently live. We keep track of both the pre-RA def + * a register refers to and, in case of spilling and reloading, the def of the + * reloaded instruction. This allows us to assign reloaded defs to sources and + * prevents additional reloads. + */ +struct live_def { + /* The pre-RA def. */ + struct ir3_register *def; + + /* The reloaded def. NULL if def was not reloaded. */ + struct ir3_register *reloaded_def; + + /* Set when used for a src marked first-kill. We cannot immediately free the + * register because then it might get reused for another src in the same + * instruction. Instead, we free it after an instruction's sources have been + * processed. + */ + bool killed; +}; + +/* Per-block liveness information. Stores live defs per supported register, + * indexed by register component. + */ +struct block_liveness { + /* Live-in defs taken from the intersection of the block's predecessors' + * live-out defs. + */ + struct live_def *live_in_defs; + + /* Currently live defs. Starts from live-in and is updated while processing + * the instructions in a block. Contains the live-out defs after the whole + * block has been processed. + */ + struct live_def *live_defs; +}; + +struct ra_predicates_ctx { + struct ir3 *ir; + unsigned num_regs; + struct ir3_liveness *liveness; + struct block_liveness *blocks_liveness; + + /* True once we spilled a register. This allows us to postpone the + * calculation of SSA uses and instruction counting until the first time we + * need to spill. This is useful since spilling is rare in general. + */ + bool spilled; +}; + +static bool +has_free_regs(struct ra_predicates_ctx *ctx, struct block_liveness *live) +{ + for (unsigned i = 0; i < ctx->num_regs; ++i) { + if (live->live_defs[i].def == NULL) + return true; + } + + return false; +} + +static unsigned +alloc_reg_comp(struct ra_predicates_ctx *ctx, struct block_liveness *live) +{ + for (unsigned i = 0; i < ctx->num_regs; ++i) { + if (live->live_defs[i].def == NULL) + return i; + } + + unreachable("Reg availability should have been checked before"); +} + +static struct live_def * +assign_reg(struct ra_predicates_ctx *ctx, struct block_liveness *live, + struct ir3_register *def, struct ir3_register *reloaded_def, + unsigned comp) +{ + assert(comp < ctx->num_regs); + + struct ir3_register *current_def = + (reloaded_def == NULL) ?
def : reloaded_def; + + current_def->num = regid(REG_P0, comp); + + struct live_def *live_def = &live->live_defs[comp]; + assert((live_def->def == NULL) && (live_def->reloaded_def == NULL)); + + live_def->def = def; + live_def->reloaded_def = reloaded_def; + return live_def; +} + +static struct live_def * +alloc_reg(struct ra_predicates_ctx *ctx, struct block_liveness *live, + struct ir3_register *def, struct ir3_register *reloaded_def) +{ + unsigned comp = alloc_reg_comp(ctx, live); + return assign_reg(ctx, live, def, reloaded_def, comp); +} + +static void +free_reg(struct ra_predicates_ctx *ctx, struct block_liveness *live, + struct ir3_register *reg) +{ + assert((reg->flags & IR3_REG_PREDICATE) && (reg->num != INVALID_REG)); + + unsigned comp = reg_comp(reg); + assert(comp < ctx->num_regs); + + struct live_def *reg_live_def = &live->live_defs[comp]; + assert((reg_live_def->def == reg) || (reg_live_def->reloaded_def == reg)); + + reg_live_def->def = NULL; + reg_live_def->reloaded_def = NULL; + reg_live_def->killed = false; +} + +static struct ir3_instruction * +first_non_allocated_use_after(struct ir3_register *def, + struct ir3_instruction *after) +{ + uint32_t first_ip = UINT32_MAX; + struct ir3_instruction *first = NULL; + + foreach_ssa_use (use, def->instr) { + if (!ir3_block_dominates(after->block, use->block)) + continue; + + /* Do not filter out after itself. This ensures that if after is a use of + * def, def will not get selected to get spilled because there must be + * another register with a further first use. We have to ensure that def + * doesn't get spilled in this case because otherwise, we might spill a + * register used by an earlier source of after. + */ + if (use->ip < after->ip) + continue; + + foreach_ssa_src_n (src, n, use) { + if (__is_false_dep(use, n)) + continue; + + struct ir3_register *src_reg = use->srcs[n]; + if (!ra_reg_is_predicate(src_reg) || src_reg->def != def) + continue; + if (use->ip >= first_ip) + continue; + + first_ip = use->ip; + first = use; + } + } + + return first; +} + +static bool +is_predicate_use(struct ir3_instruction *instr, unsigned src_n) +{ + if (__is_false_dep(instr, src_n)) + return false; + return ra_reg_is_predicate(instr->srcs[src_n]); +} + +/* Spill a register by simply removing one from the live defs. We don't need to + * store its value anywhere since it can be rematerialized (see reload). We + * choose the register whose def's first use is the furthest.
+ */ +static void +spill(struct ra_predicates_ctx *ctx, struct block_liveness *live, + struct ir3_instruction *spill_location) +{ + if (!ctx->spilled) { + ir3_count_instructions_ra(ctx->ir); + ir3_find_ssa_uses_for(ctx->ir, ctx, is_predicate_use); + ctx->spilled = true; + } + + unsigned furthest_first_use = 0; + unsigned spill_reg = ~0; + + for (unsigned i = 0; i < ctx->num_regs; ++i) { + struct ir3_register *candidate = live->live_defs[i].def; + assert(candidate != NULL); + + struct ir3_instruction *first_use = + first_non_allocated_use_after(candidate, spill_location); + + if (first_use == NULL) { + spill_reg = i; + break; + } + + if (first_use->ip > furthest_first_use) { + furthest_first_use = first_use->ip; + spill_reg = i; + } + } + + assert(spill_reg != ~0); + + live->live_defs[spill_reg].def = NULL; + live->live_defs[spill_reg].reloaded_def = NULL; +} + +static struct live_def * +find_live_def(struct ra_predicates_ctx *ctx, struct block_liveness *live, + struct ir3_register *def) +{ + for (unsigned i = 0; i < ctx->num_regs; ++i) { + struct live_def *live_def = &live->live_defs[i]; + if (live_def->def == def) + return live_def; + } + + return NULL; +} + +static struct ir3_register * +get_def(struct live_def *live_def) +{ + return live_def->reloaded_def == NULL ? live_def->def + : live_def->reloaded_def; +} + +/* Reload a def into a specific register, which must be free. Reloading is + * implemented by cloning the instruction that produced the def and moving it in + * front of the use. + */ +static struct live_def * +reload_into(struct ra_predicates_ctx *ctx, struct block_liveness *live, + struct ir3_register *def, struct ir3_instruction *use, + unsigned comp) +{ + struct ir3_instruction *reloaded_instr = NULL; + bool def_is_allocated = !(def->flags & IR3_REG_UNUSED); + + if (!def_is_allocated && use->block == def->instr->block) { + /* If def has not been allocated a register yet, no source is currently + * using it. If it's in the same block as the current use, just move it in + * front of it. + */ + reloaded_instr = def->instr; + } else { + /* If the def is either 1) already allocated or 2) in a different block + * than the current use, we have to clone it. For 1) because its allocated + * register isn't currently live (we wouldn't be reloading it otherwise). + * For 2) because it might have other uses in blocks that aren't + * successors of the use. + */ + reloaded_instr = ir3_instr_clone(def->instr); + } + + reloaded_instr->block = use->block; + ir3_instr_move_before(reloaded_instr, use); + struct ir3_register *reloaded_def = reloaded_instr->dsts[0]; + return assign_reg(ctx, live, def, reloaded_def, comp); +} + +/* Reload a def into a register, spilling one if necessary. */ +static struct live_def * +reload(struct ra_predicates_ctx *ctx, struct block_liveness *live, + struct ir3_register *def, struct ir3_instruction *use) +{ + if (!has_free_regs(ctx, live)) + spill(ctx, live, use); + + unsigned comp = alloc_reg_comp(ctx, live); + return reload_into(ctx, live, def, use, comp); +} + +static int +ra_block(struct ra_predicates_ctx *ctx, struct ir3_block *block) +{ + struct block_liveness *live = &ctx->blocks_liveness[block->index]; + + foreach_instr (instr, &block->instr_list) { + /* Assign registers to sources based on their defs.
*/ + foreach_src (src, instr) { + if (!ra_reg_is_predicate(src)) + continue; + + struct live_def *live_def = find_live_def(ctx, live, src->def); + if (live_def == NULL) + live_def = reload(ctx, live, src->def, instr); + + assert(live_def != NULL); + + struct ir3_register *def = get_def(live_def); + src->num = def->num; + src->def = def; + + /* Mark the def as used to make sure we won't move it anymore. */ + def->flags &= ~IR3_REG_UNUSED; + + /* If this source kills the def, don't free the register right away to + * prevent it being reused for another source of this instruction. We + * can free it after all sources of this instruction have been + * processed. + */ + if (src->flags & IR3_REG_FIRST_KILL) + live_def->killed = true; + } + + /* After all sources of an instruction have been processed, we can free + * the registers that were killed by a source. + */ + for (unsigned reg = 0; reg < ctx->num_regs; ++reg) { + struct live_def *live_def = &live->live_defs[reg]; + if (live_def->def == NULL) + continue; + + if (live_def->killed) + free_reg(ctx, live, get_def(live_def)); + } + + /* Allocate registers for new defs. */ + foreach_dst (dst, instr) { + if (!ra_reg_is_predicate(dst)) + continue; + + /* Mark it as unused until we encounter the first use. This allows us + * to know when it is legal to move the instruction. + */ + dst->flags |= IR3_REG_UNUSED; + + /* If we don't have any free registers, ignore the def for now. If we + * start spilling right away, we might end up with a cascade of spills + * when there are a lot of defs before their first uses. + */ + if (!has_free_regs(ctx, live)) + continue; + + alloc_reg(ctx, live, dst, NULL); + } + } + + /* Process loop back edges. Since we ignore them while calculating a block's + * live-in defs in init_block_liveness, we now make sure that we satisfy our + * successor's live-in requirements by producing the correct defs in the + * required registers. + */ + for (unsigned i = 0; i < 2; ++i) { + struct ir3_block *succ = block->successors[i]; + if (!succ) + continue; + + struct live_def *succ_live_in = + ctx->blocks_liveness[succ->index].live_in_defs; + + /* If live_in_defs has not been set yet, it's not a back edge. */ + if (!succ_live_in) + continue; + + for (unsigned reg = 0; reg < ctx->num_regs; ++reg) { + struct live_def *succ_def = &succ_live_in[reg]; + if (!succ_def->def) + continue; + + struct live_def *cur_def = &live->live_defs[reg]; + + /* Same def in the same register, nothing to be done. */ + if (cur_def->def == succ_def->def) + continue; + + /* Different def in the same register, free it first. */ + if (cur_def->def) + free_reg(ctx, live, cur_def->def); + + /* Reload the def in the required register right before the block's + * terminator. + */ + struct ir3_instruction *use = ir3_block_get_terminator(block); + reload_into(ctx, live, succ_def->def, use, reg); + } + } + + return 0; +} + +/* Propagate live-out defs of a block's predecessors to the block's live-in + * defs. This takes the intersection of all predecessors' live-out defs. That is, + * a def will be live-in if it's live-out in the same register in all + * predecessors.
+ */ +static void +init_block_liveness(struct ra_predicates_ctx *ctx, struct ir3_block *block) +{ + struct block_liveness *live = &ctx->blocks_liveness[block->index]; + live->live_defs = rzalloc_array(ctx, struct live_def, ctx->num_regs); + BITSET_WORD *live_in = ctx->liveness->live_in[block->index]; + + for (unsigned i = 0; i < block->predecessors_count; ++i) { + struct ir3_block *pred = block->predecessors[i]; + assert(pred != NULL); + + struct block_liveness *pred_live = &ctx->blocks_liveness[pred->index]; + + /* If the predecessor has not been processed yet it means it's the back + * edge of a loop. We ignore it now, take the live-out defs of the block's + * other predecessors, and make sure the live-out defs of the back edge + * match this block's live-in defs after processing the back edge. + */ + if (pred_live->live_defs == NULL) + continue; + + for (unsigned reg = 0; reg < ctx->num_regs; ++reg) { + struct live_def *cur_def = &live->live_defs[reg]; + struct live_def *pred_def = &pred_live->live_defs[reg]; + + if (i == 0 && pred_def->def != NULL) { + /* If the first predecessor has a def in reg, use it if it's live-in + * in this block. + */ + if (BITSET_TEST(live_in, pred_def->def->name)) + *cur_def = *pred_def; + } else if (cur_def->def != pred_def->def) { + /* Different predecessors have different live-out defs in reg so we + * cannot use it as live-in. + */ + cur_def->def = NULL; + cur_def->reloaded_def = NULL; + } + } + } + + live->live_in_defs = rzalloc_array(ctx, struct live_def, ctx->num_regs); + memcpy(live->live_in_defs, live->live_defs, + sizeof(struct live_def) * ctx->num_regs); +} + +void +ir3_ra_predicates(struct ir3_shader_variant *v) +{ + struct ra_predicates_ctx *ctx = rzalloc(NULL, struct ra_predicates_ctx); + ctx->ir = v->ir; + ctx->num_regs = v->compiler->num_predicates; + ctx->liveness = ir3_calc_liveness_for(ctx, v->ir, ra_reg_is_predicate, + ra_reg_is_predicate); + ctx->blocks_liveness = + rzalloc_array(ctx, struct block_liveness, ctx->liveness->block_count); + + foreach_block (block, &v->ir->block_list) { + init_block_liveness(ctx, block); + ra_block(ctx, block); + } + + /* Remove instructions that became unused. This happens when a def was never + * used directly but only through its reloaded clones. + * Note that index 0 in the liveness definitions is always NULL. 
+ */ + for (unsigned i = 1; i < ctx->liveness->definitions_count; ++i) { + struct ir3_register *def = ctx->liveness->definitions[i]; + + if (def->flags & IR3_REG_UNUSED) + list_delinit(&def->instr->node); + } + + ralloc_free(ctx); +} diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c index 3991c0c6861..aad52b194a3 100644 --- a/src/freedreno/ir3/ir3_sched.c +++ b/src/freedreno/ir3/ir3_sched.c @@ -97,9 +97,8 @@ struct ir3_sched_ctx { struct ir3_instruction *scheduled; /* last scheduled instr */ struct ir3_instruction *addr0; /* current a0.x user, if any */ struct ir3_instruction *addr1; /* current a1.x user, if any */ - struct ir3_instruction *pred; /* current p0.x user, if any */ - struct ir3_instruction *split; /* most-recently-split a0/a1/p0 producer */ + struct ir3_instruction *split; /* most-recently-split a0/a1 producer */ int remaining_kills; int remaining_tex; @@ -277,11 +276,6 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) ctx->addr1 = instr; } - if (writes_pred(instr)) { - assert(ctx->pred == NULL); - ctx->pred = instr; - } - instr->flags |= IR3_INSTR_MARK; di(instr, "schedule"); @@ -366,9 +360,9 @@ struct ir3_sched_notes { */ bool blocked_kill; /* there is at least one instruction that could be scheduled, - * except for conflicting address/predicate register usage: + * except for conflicting address register usage: */ - bool addr0_conflict, addr1_conflict, pred_conflict; + bool addr0_conflict, addr1_conflict; }; static bool @@ -485,12 +479,6 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, return false; } - if (writes_pred(instr) && ctx->pred) { - assert(ctx->pred != instr); - notes->pred_conflict = true; - return false; - } - /* if the instruction is a kill, we need to ensure *every* * bary.f is scheduled. The hw seems unhappy if the thread * gets killed before the end-input (ei) flag is hit. @@ -955,58 +943,6 @@ split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr, return new_addr; } -/* "spill" the predicate register by remapping any unscheduled - * instructions which depend on the current predicate register - * to a clone of the instruction which wrote the address reg. - */ -static struct ir3_instruction * -split_pred(struct ir3_sched_ctx *ctx) -{ - struct ir3 *ir; - struct ir3_instruction *new_pred = NULL; - unsigned i; - - assert(ctx->pred); - - ir = ctx->pred->block->shader; - - for (i = 0; i < ir->predicates_count; i++) { - struct ir3_instruction *predicated = ir->predicates[i]; - - if (!predicated) - continue; - - /* skip instructions already scheduled: */ - if (is_scheduled(predicated)) - continue; - - /* remap remaining instructions using current pred - * to new pred: - * - * TODO is there ever a case when pred isn't first - * (and only) src? 
- */ - if (ssa(predicated->srcs[0]) == ctx->pred) { - if (!new_pred) { - new_pred = split_instr(ctx, ctx->pred); - /* original pred is scheduled, but new one isn't: */ - new_pred->flags &= ~IR3_INSTR_MARK; - } - predicated->srcs[0]->def->instr = new_pred; - /* don't need to remove old dag edge since old pred is - * already scheduled: - */ - sched_node_add_dep(predicated, new_pred, 0); - di(predicated, "new predicate"); - } - } - - /* all remaining predicated remapped to new pred: */ - ctx->pred = NULL; - - return new_pred; -} - static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { @@ -1182,7 +1118,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) /* addr/pred writes are per-block: */ ctx->addr0 = NULL; ctx->addr1 = NULL; - ctx->pred = NULL; ctx->sy_delay = 0; ctx->ss_delay = 0; ctx->sy_index = ctx->first_outstanding_sy_index = 0; @@ -1267,8 +1202,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) } else if (notes.addr1_conflict) { new_instr = split_addr(ctx, &ctx->addr1, ir->a1_users, ir->a1_users_count); - } else if (notes.pred_conflict) { - new_instr = split_pred(ctx); } else { d("unscheduled_list:"); foreach_instr (instr, &ctx->unscheduled_list) diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c index 73a65c1916e..c8113f79197 100644 --- a/src/freedreno/ir3/ir3_validate.c +++ b/src/freedreno/ir3/ir3_validate.c @@ -65,7 +65,7 @@ validate_error(struct ir3_validate_ctx *ctx, const char *condstr) static unsigned reg_class_flags(struct ir3_register *reg) { - return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED); + return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED | IR3_REG_PREDICATE); } static void @@ -78,6 +78,9 @@ validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr, if (!(reg->flags & IR3_REG_SSA) || !reg->def) return; + if (reg->flags & IR3_REG_PREDICATE) + validate_assert(ctx, !(reg->flags & (IR3_REG_SHARED | IR3_REG_HALF))); + struct ir3_register *src = reg->def; validate_assert(ctx, _mesa_set_search(ctx->defs, src->instr)); diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index 3578bbaf9b8..b67d4b9a2de 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -108,6 +108,7 @@ libfreedreno_ir3_files = files( 'ir3_print.c', 'ir3_ra.c', 'ir3_ra.h', + 'ir3_ra_predicates.c', 'ir3_ra_validate.c', 'ir3_reconvergence.c', 'ir3_remove_unreachable.c',