diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h index e17ace592d9..86fd5574d2b 100644 --- a/src/freedreno/ir3/instr-a3xx.h +++ b/src/freedreno/ir3/instr-a3xx.h @@ -523,6 +523,7 @@ regid(int num, int comp) /* special registers: */ #define REG_A0 61 /* address register */ #define REG_P0 62 /* predicate register */ +#define REG_P0_X regid(REG_P0, 0) /* p0.x */ /* With is_bindless_s2en = 1, this determines whether bindless is enabled and * if so, how to get the (base, index) pair for both sampler and texture. diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index dd1263d3a60..3bdf6cc2ef4 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -891,6 +891,8 @@ is_terminator(struct ir3_instruction *instr) case OPC_SHPS: case OPC_GETONE: case OPC_GETLAST: + case OPC_PREDT: + case OPC_PREDF: return true; default: return false; @@ -2388,7 +2390,7 @@ INSTR0(END) INSTR0(CHSH) INSTR0(CHMASK) INSTR1NODST(PREDT) -INSTR0(PREDF) +INSTR1NODST(PREDF) INSTR0(PREDE) INSTR0(GETONE) INSTR0(GETLAST) diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 4bd1ffd6372..769e6c562d9 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -222,6 +222,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->num_predicates = 4; compiler->bitops_can_write_predicates = true; compiler->has_branch_and_or = true; + compiler->has_predication = true; } else { compiler->max_const_pipeline = 512; compiler->max_const_geom = 512; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 23f3aedc2c0..cd86462e291 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -226,6 +226,9 @@ struct ir3_compiler { /* True if braa/brao are available. */ bool has_branch_and_or; + /* True if predt/predf/prede are supported. */ + bool has_predication; + /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */ uint32_t max_variable_workgroup_size; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index fe9fea27e16..f5e89b4d0ec 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3765,8 +3765,10 @@ emit_block(struct ir3_context *ctx, nir_block *nblock) /* Emit unconditional branch if we only have one successor. Conditional * branches are emitted in emit_if. */ - if (ctx->block->successors[0] && !ctx->block->successors[1]) - ir3_JUMP(ctx->block); + if (ctx->block->successors[0] && !ctx->block->successors[1]) { + if (!ir3_block_get_terminator(ctx->block)) + ir3_JUMP(ctx->block); + } _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL); } @@ -3843,13 +3845,153 @@ fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond) return branch; } -static struct ir3_instruction * -emit_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond) +static bool +instr_can_be_predicated(nir_instr *instr) { + /* Anything that doesn't expand to control-flow can be predicated. */ + switch (instr->type) { + case nir_instr_type_alu: + case nir_instr_type_deref: + case nir_instr_type_tex: + case nir_instr_type_load_const: + case nir_instr_type_undef: + case nir_instr_type_phi: + case nir_instr_type_parallel_copy: + return true; + case nir_instr_type_call: + case nir_instr_type_jump: + return false; + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_reduce: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_reduce_clusters_ir3: + case nir_intrinsic_inclusive_scan_clusters_ir3: + case nir_intrinsic_exclusive_scan_clusters_ir3: + case nir_intrinsic_brcst_active_ir3: + case nir_intrinsic_ballot: + case nir_intrinsic_elect: + case nir_intrinsic_read_invocation_cond_ir3: + case nir_intrinsic_discard_if: + case nir_intrinsic_discard: + case nir_intrinsic_demote: + case nir_intrinsic_demote_if: + case nir_intrinsic_terminate: + case nir_intrinsic_terminate_if: + return false; + default: + return true; + } + } + } + + unreachable("Checked all cases"); +} + +static bool +nif_can_be_predicated(nir_if *nif) +{ + /* For non-divergent branches, predication is more expensive than a branch + * because the latter can potentially skip all instructions. + */ + if (!nir_src_is_divergent(nif->condition)) + return false; + + /* Although it could potentially be possible to allow a limited form of + * nested predication (e.g., by resetting the predication mask after a nested + * branch), let's avoid this for now and only use predication for leaf + * branches. That is, for ifs that contain exactly one block in both branches + * (note that they always contain at least one block). + */ + if (!exec_list_is_singular(&nif->then_list) || + !exec_list_is_singular(&nif->else_list)) { + return false; + } + + nir_foreach_instr (instr, nir_if_first_then_block(nif)) { + if (!instr_can_be_predicated(instr)) + return false; + } + + nir_foreach_instr (instr, nir_if_first_else_block(nif)) { + if (!instr_can_be_predicated(instr)) + return false; + } + + return true; +} + +/* A typical if-else block like this: + * if (cond) { + * tblock; + * } else { + * fblock; + * } + * Will be emitted as: + * |-- i --| + * | ... | + * | predt | + * |-------| + * succ0 / \ succ1 + * |-- i+1 --| |-- i+2 --| + * | tblock | | fblock | + * | predf | | jump | + * |---------| |---------| + * succ0 \ / succ0 + * |-- j --| + * | ... | + * |-------| + * Where the numbers at the top of blocks are their indices. That is, the true + * block and false block are laid-out contiguously after the current block. This + * layout is verified during legalization in prede_sched which also inserts the + * final prede instruction. Note that we don't insert prede right away to allow + * opt_jump to optimize the jump in the false block. + */ +static struct ir3_instruction * +emit_predicated_branch(struct ir3_context *ctx, nir_if *nif) +{ + if (!ctx->compiler->has_predication) + return NULL; + if (!nif_can_be_predicated(nif)) + return NULL; + + struct ir3_block *then_block = get_block(ctx, nir_if_first_then_block(nif)); + struct ir3_block *else_block = get_block(ctx, nir_if_first_else_block(nif)); + assert(list_is_empty(&then_block->instr_list) && + list_is_empty(&else_block->instr_list)); + + bool inv; + struct ir3_instruction *condition = + get_branch_condition(ctx, &nif->condition, &inv); + struct ir3_instruction *pred, *pred_inv; + + if (!inv) { + pred = ir3_PREDT(ctx->block, condition, IR3_REG_PREDICATE); + pred_inv = ir3_PREDF(then_block, condition, IR3_REG_PREDICATE); + } else { + pred = ir3_PREDF(ctx->block, condition, IR3_REG_PREDICATE); + pred_inv = ir3_PREDT(then_block, condition, IR3_REG_PREDICATE); + } + + pred->srcs[0]->num = REG_P0_X; + pred_inv->srcs[0]->num = REG_P0_X; + return pred; +} + +static struct ir3_instruction * +emit_conditional_branch(struct ir3_context *ctx, nir_if *nif) +{ + nir_src *nir_cond = &nif->condition; struct ir3_instruction *folded = fold_conditional_branch(ctx, nir_cond); if (folded) return folded; + struct ir3_instruction *predicated = emit_predicated_branch(ctx, nif); + if (predicated) + return predicated; + bool inv1; struct ir3_instruction *cond1 = get_branch_condition(ctx, nir_cond, &inv1); struct ir3_instruction *branch = @@ -3881,7 +4023,7 @@ emit_if(struct ir3_context *ctx, nir_if *nif) */ ir3_SHPS(ctx->block); } else { - emit_conditional_branch(ctx, &nif->condition); + emit_conditional_branch(ctx, nif); } ctx->block->divergent_condition = nif->condition.ssa->divergent; diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index b8148c236d9..b3c34ba5f3b 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -1038,7 +1038,8 @@ block_sched(struct ir3 *ir) br2 = ir3_JUMP(block); br2->cat0.target = block->successors[0]; - } else { + } else if (opc == OPC_BR || opc == OPC_BRAA || opc == OPC_BRAO || + opc == OPC_BALL || opc == OPC_BANY) { /* create "else" branch first (since "then" block should * frequently/always end up being a fall-thru): */ @@ -1046,6 +1047,12 @@ block_sched(struct ir3 *ir) br2 = ir3_instr_clone(br1); invert_branch(br1); br2->cat0.target = block->successors[0]; + } else { + assert(opc == OPC_PREDT || opc == OPC_PREDF); + + /* Handled by prede_sched. */ + terminator->cat0.target = block->successors[0]; + continue; } /* Creating br2 caused it to be moved before the terminator b1, move it @@ -1053,16 +1060,113 @@ block_sched(struct ir3 *ir) */ ir3_instr_move_after(br2, br1); } else if (block->successors[0]) { - /* otherwise unconditional jump to next block which should already have - * been inserted. + /* otherwise unconditional jump or predt/predf to next block which + * should already have been inserted. */ assert(terminator); - assert(terminator->opc == OPC_JUMP); + assert(terminator->opc == OPC_JUMP || terminator->opc == OPC_PREDT || + terminator->opc == OPC_PREDF); terminator->cat0.target = block->successors[0]; } } } +static void +prede_sched(struct ir3 *ir) +{ + unsigned index = 0; + foreach_block (block, &ir->block_list) + block->index = index++; + + foreach_block (block, &ir->block_list) { + /* Look for the following pattern generated by NIR lowering. The numbers + * at the top of blocks are their index. + * |--- i ----| + * | ... | + * | pred[tf] | + * |----------| + * succ0 / \ succ1 + * |-- i+1 ---| |-- i+2 ---| + * | ... | | ... | + * | pred[ft] | | ... | + * |----------| |----------| + * succ0 \ / succ0 + * |--- j ----| + * | ... | + * |----------| + */ + struct ir3_block *succ0 = block->successors[0]; + struct ir3_block *succ1 = block->successors[1]; + + if (!succ1) + continue; + + struct ir3_instruction *terminator = ir3_block_get_terminator(block); + if (!terminator) + continue; + if (terminator->opc != OPC_PREDT && terminator->opc != OPC_PREDF) + continue; + + assert(!succ0->successors[1] && !succ1->successors[1]); + assert(succ0->successors[0] == succ1->successors[0]); + assert(succ0->predecessors_count == 1 && succ1->predecessors_count == 1); + assert(succ0->index == (block->index + 1)); + assert(succ1->index == (block->index + 2)); + + struct ir3_instruction *succ0_terminator = + ir3_block_get_terminator(succ0); + assert(succ0_terminator); + assert(succ0_terminator->opc == + (terminator->opc == OPC_PREDT ? OPC_PREDF : OPC_PREDT)); + + ASSERTED struct ir3_instruction *succ1_terminator = + ir3_block_get_terminator(succ1); + assert(!succ1_terminator || (succ1_terminator->opc == OPC_JUMP)); + + /* Simple case: both successors contain instructions. Keep both blocks and + * insert prede before the second successor's terminator: + * |--- i ----| + * | ... | + * | pred[tf] | + * |----------| + * succ0 / \ succ1 + * |-- i+1 ---| |-- i+2 ---| + * | ... | | ... | + * | pred[ft] | | prede | + * |----------| |----------| + * succ0 \ / succ0 + * |--- j ----| + * | ... | + * |----------| + */ + if (!list_is_empty(&succ1->instr_list)) { + ir3_PREDE(succ1); + continue; + } + + /* Second successor is empty so we can remove it: + * |--- i ----| + * | ... | + * | pred[tf] | + * |----------| + * succ0 / \ succ1 + * |-- i+1 ---| | + * | ... | | + * | prede | | + * |----------| | + * succ0 \ / + * |--- j ----| + * | ... | + * |----------| + */ + list_delinit(&succ0_terminator->node); + ir3_PREDE(succ0); + remove_unused_block(succ1); + block->successors[1] = succ0->successors[0]; + ir3_block_add_predecessor(succ0->successors[0], block); + } +} + /* Here we workaround the fact that kill doesn't actually kill the thread as * GL expects. The last instruction always needs to be an end instruction, * which means that if we're stuck in a loop where kill is the only way out, @@ -1352,6 +1456,8 @@ helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir, */ if (is_alu(instr) || is_sfu(instr)) continue; + if (instr->opc == OPC_PREDE) + continue; expensive_instruction_in_block = true; break; @@ -1457,6 +1563,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) while (opt_jump(ir)) ; + prede_sched(ir); + /* TODO: does (eq) exist before a6xx? */ if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod && so->compiler->gen >= 6) diff --git a/src/freedreno/ir3/ir3_reconvergence.c b/src/freedreno/ir3/ir3_reconvergence.c index 5870193b4b8..bddd733efca 100644 --- a/src/freedreno/ir3/ir3_reconvergence.c +++ b/src/freedreno/ir3/ir3_reconvergence.c @@ -181,6 +181,11 @@ ir3_calc_reconvergence(struct ir3_shader_variant *so) * reconvergence point. */ foreach_block (block, &so->ir->block_list) { + struct ir3_instruction *terminator = ir3_block_get_terminator(block); + if (!terminator) + continue; + if (terminator->opc == OPC_PREDT || terminator->opc == OPC_PREDF) + continue; if (block->successors[0] && block->successors[1] && block->divergent_condition) { unsigned idx = block->successors[0]->index >