From 9de628b65ca36b920dc6181251b33c436cad1b68 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Thu, 1 Feb 2024 14:51:25 +0100 Subject: [PATCH] ir3: fold and/or and negations into branches Fold and/or into braa/brao when profitable. Only do this when the and/or is not used for any non-branch instructions as this would increase total instruction count. Add an algebraic nir pass that performs the inverse DeMorgan's laws to try to bring and/or in front of branches. Again, only do this when the original inot in only used for branches. This should always decrease instruction count since the extra inots can be folded into the branch. Fold inot into branches by using the inv1/inv2 cat0 fields. Signed-off-by: Job Noorman Part-of: --- src/freedreno/ir3/ir3_compiler.c | 2 + src/freedreno/ir3/ir3_compiler.h | 3 + src/freedreno/ir3/ir3_compiler_nir.c | 88 ++++++++++++++++++- src/freedreno/ir3/ir3_context.c | 4 + src/freedreno/ir3/ir3_delay.c | 3 +- src/freedreno/ir3/ir3_legalize.c | 8 +- src/freedreno/ir3/ir3_nir.h | 1 + .../ir3/ir3_nir_branch_and_or_not.py | 37 ++++++++ src/freedreno/ir3/meson.build | 13 ++- 9 files changed, 153 insertions(+), 6 deletions(-) create mode 100644 src/freedreno/ir3/ir3_nir_branch_and_or_not.py diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index d62752f5901..7e03a0a64cd 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -158,6 +158,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->num_predicates = 1; compiler->bitops_can_write_predicates = false; + compiler->has_branch_and_or = false; if (compiler->gen >= 6) { compiler->samgq_workaround = true; @@ -217,6 +218,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk; compiler->num_predicates = 4; compiler->bitops_can_write_predicates = true; + compiler->has_branch_and_or = true; } else { compiler->max_const_pipeline = 512; compiler->max_const_geom = 512; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 271b01fa12b..c48ceda1a8c 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -220,6 +220,9 @@ struct ir3_compiler { /* True if bitops (and.b, or.b, xor.b, not.b) can write to p0.c */ bool bitops_can_write_predicates; + /* True if braa/brao are available. */ + bool has_branch_and_or; + /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */ uint32_t max_variable_workgroup_size; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index b1ed40355a7..9cb73cbc00c 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3797,6 +3797,91 @@ emit_block(struct ir3_context *ctx, nir_block *nblock) static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list); +/* Get the ir3 branch condition for a given nir source. This will strip any inot + * instructions and set *inv when the condition should be inverted. This + * inversion can be directly folded into branches (in the inv1/inv2 fields) + * instead of adding an explicit not.b/sub.u instruction. + */ +static struct ir3_instruction * +get_branch_condition(struct ir3_context *ctx, nir_src *src, bool *inv) +{ + struct ir3_instruction *condition = ir3_get_src(ctx, src)[0]; + + if (src->ssa->parent_instr->type == nir_instr_type_alu) { + nir_alu_instr *nir_cond = nir_instr_as_alu(src->ssa->parent_instr); + + if (nir_cond->op == nir_op_inot) { + struct ir3_instruction *inv_cond = + get_branch_condition(ctx, &nir_cond->src[0].src, inv); + *inv = !*inv; + return inv_cond; + } + } + + *inv = false; + return ir3_get_predicate(ctx, condition); +} + +/* Try to fold br (and/or cond1, cond2) into braa/brao cond1, cond2. + */ +static struct ir3_instruction * +fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond) +{ + if (!ctx->compiler->has_branch_and_or) + return false; + + if (nir_cond->ssa->parent_instr->type != nir_instr_type_alu) + return NULL; + + nir_alu_instr *alu_cond = nir_instr_as_alu(nir_cond->ssa->parent_instr); + + if ((alu_cond->op != nir_op_iand) && (alu_cond->op != nir_op_ior)) + return NULL; + + /* If the result of the and/or is also used for something else than an if + * condition, the and/or cannot be removed. In that case, we will end-up with + * extra predicate conversions for the conditions without actually removing + * any instructions, resulting in an increase of instructions. Let's not fold + * the conditions in the branch in that case. + */ + if (!nir_def_only_used_by_if(&alu_cond->def)) + return NULL; + + bool inv1, inv2; + struct ir3_instruction *cond1 = + get_branch_condition(ctx, &alu_cond->src[0].src, &inv1); + struct ir3_instruction *cond2 = + get_branch_condition(ctx, &alu_cond->src[1].src, &inv2); + + struct ir3_instruction *branch; + if (alu_cond->op == nir_op_iand) { + branch = ir3_BRAA(ctx->block, cond1, IR3_REG_PREDICATE, cond2, + IR3_REG_PREDICATE); + } else { + branch = ir3_BRAO(ctx->block, cond1, IR3_REG_PREDICATE, cond2, + IR3_REG_PREDICATE); + } + + branch->cat0.inv1 = inv1; + branch->cat0.inv2 = inv2; + return branch; +} + +static struct ir3_instruction * +emit_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond) +{ + struct ir3_instruction *folded = fold_conditional_branch(ctx, nir_cond); + if (folded) + return folded; + + bool inv1; + struct ir3_instruction *cond1 = get_branch_condition(ctx, nir_cond, &inv1); + struct ir3_instruction *branch = + ir3_BR(ctx->block, cond1, IR3_REG_PREDICATE); + branch->cat0.inv1 = inv1; + return branch; +} + static void emit_if(struct ir3_context *ctx, nir_if *nif) { @@ -3820,8 +3905,7 @@ emit_if(struct ir3_context *ctx, nir_if *nif) */ ir3_SHPS(ctx->block); } else { - struct ir3_instruction *pred = ir3_get_predicate(ctx, condition); - ir3_BR(ctx->block, pred, IR3_REG_PREDICATE); + emit_conditional_branch(ctx, &nif->condition); } emit_cf_list(ctx, &nif->then_list); diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index fb9bd1d03d5..b4845b6da2e 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -110,6 +110,10 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader, NIR_PASS(progress, ctx->s, nir_opt_dce); } + /* This must run after the last nir_opt_algebraic or it gets undone. */ + if (compiler->has_branch_and_or) + NIR_PASS_V(ctx->s, ir3_nir_opt_branch_and_or_not); + /* Enable the texture pre-fetch feature only a4xx onwards. But * only enable it on generations that have been tested: */ diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c index a6a8ff70b69..db5b5871c48 100644 --- a/src/freedreno/ir3/ir3_delay.c +++ b/src/freedreno/ir3/ir3_delay.c @@ -103,7 +103,8 @@ count_instruction(struct ir3_instruction *n) * earlier so we don't have this constraint. */ return is_alu(n) || - (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)); + (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) && + (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO)); } /* Post-RA, we don't have arrays any more, so we have to be a bit careful here diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index 69a59b67fe6..f843d78baef 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -714,6 +714,12 @@ invert_branch(struct ir3_instruction *branch) case OPC_BANY: branch->opc = OPC_BALL; break; + case OPC_BRAA: + branch->opc = OPC_BRAO; + break; + case OPC_BRAO: + branch->opc = OPC_BRAA; + break; default: unreachable("can't get here"); } @@ -753,8 +759,6 @@ block_sched(struct ir3 *ir) br2 = ir3_JUMP(block); br2->cat0.target = block->successors[0]; } else { - assert(terminator->srcs_count == 1); - /* create "else" branch first (since "then" block should * frequently/always end up being a fall-thru): */ diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index 5320340ea2c..a17e2883c96 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -69,6 +69,7 @@ bool ir3_nir_lower_64b_undef(nir_shader *shader); bool ir3_nir_lower_64b_global(nir_shader *shader); bool ir3_nir_lower_64b_regs(nir_shader *shader); +bool ir3_nir_opt_branch_and_or_not(nir_shader *nir); void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s); void ir3_nir_lower_io_to_temporaries(nir_shader *s); void ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s); diff --git a/src/freedreno/ir3/ir3_nir_branch_and_or_not.py b/src/freedreno/ir3/ir3_nir_branch_and_or_not.py new file mode 100644 index 00000000000..02e39548c8a --- /dev/null +++ b/src/freedreno/ir3/ir3_nir_branch_and_or_not.py @@ -0,0 +1,37 @@ +# Copyright (C) 2024 Igalia S.L. +# SPDX-License-Identifier: MIT + + +import argparse +import sys + + +# Inverse DeMorgan's laws to facilitate folding iand/ior into braa/brao. Only +# apply if the inot is only used by branch conditions. Otherwise, it would just +# end-up generating more instructions. +cond_lowering = [ + (('inot(is_only_used_by_if)', ('iand', 'a', 'b')), + ('ior', ('inot', 'a'), ('inot', 'b'))), + (('inot(is_only_used_by_if)', ('ior', 'a', 'b')), + ('iand', ('inot', 'a'), ('inot', 'b'))), +] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + run() + + +def run(): + import nir_algebraic # pylint: disable=import-error + + print('#include "ir3_nir.h"') + print(nir_algebraic.AlgebraicPass("ir3_nir_opt_branch_and_or_not", + cond_lowering).render()) + + +if __name__ == '__main__': + main() diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index e400bd179f3..aacfd1e2fdf 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -40,6 +40,17 @@ ir3_nir_imul_c = custom_target( depend_files : nir_algebraic_depends, ) +ir3_nir_branch_and_or_not_c = custom_target( + 'ir3_nir_branch_and_or_not.c', + input : 'ir3_nir_branch_and_or_not.py', + output : 'ir3_nir_branch_and_or_not.c', + command : [ + prog_python, '@INPUT@', '-p', dir_compiler_nir, + ], + capture : true, + depend_files : nir_algebraic_depends, +) + ir3_parser = custom_target( 'ir3_parser.[ch]', input: 'ir3_parser.y', @@ -123,7 +134,7 @@ libfreedreno_ir3_files = files( libfreedreno_ir3 = static_library( 'freedreno_ir3', - [libfreedreno_ir3_files, ir3_nir_trig_c, ir3_nir_imul_c, ir3_parser[0], ir3_parser[1], ir3_lexer], + [libfreedreno_ir3_files, ir3_nir_trig_c, ir3_nir_imul_c, ir3_nir_branch_and_or_not_c, ir3_parser[0], ir3_parser[1], ir3_lexer], include_directories : [inc_freedreno, inc_include, inc_src], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden',