ir3: fold and/or and negations into branches

Fold and/or into braa/brao when profitable. Only do this when the and/or
is not used for any non-branch instructions as this would increase total
instruction count.

Add an algebraic nir pass that performs the inverse DeMorgan's laws to
try to bring and/or in front of branches. Again, only do this when the
original inot in only used for branches. This should always decrease
instruction count since the extra inots can be folded into the branch.

Fold inot into branches by using the inv1/inv2 cat0 fields.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27411>
This commit is contained in:
Job Noorman 2024-02-01 14:51:25 +01:00 committed by Marge Bot
parent 96c2fe3e1a
commit 9de628b65c
9 changed files with 153 additions and 6 deletions

View file

@ -158,6 +158,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->num_predicates = 1;
compiler->bitops_can_write_predicates = false;
compiler->has_branch_and_or = false;
if (compiler->gen >= 6) {
compiler->samgq_workaround = true;
@ -217,6 +218,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
compiler->num_predicates = 4;
compiler->bitops_can_write_predicates = true;
compiler->has_branch_and_or = true;
} else {
compiler->max_const_pipeline = 512;
compiler->max_const_geom = 512;

View file

@ -220,6 +220,9 @@ struct ir3_compiler {
/* True if bitops (and.b, or.b, xor.b, not.b) can write to p0.c */
bool bitops_can_write_predicates;
/* True if braa/brao are available. */
bool has_branch_and_or;
/* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
uint32_t max_variable_workgroup_size;

View file

@ -3797,6 +3797,91 @@ emit_block(struct ir3_context *ctx, nir_block *nblock)
static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
/* Get the ir3 branch condition for a given nir source. This will strip any inot
* instructions and set *inv when the condition should be inverted. This
* inversion can be directly folded into branches (in the inv1/inv2 fields)
* instead of adding an explicit not.b/sub.u instruction.
*/
static struct ir3_instruction *
get_branch_condition(struct ir3_context *ctx, nir_src *src, bool *inv)
{
struct ir3_instruction *condition = ir3_get_src(ctx, src)[0];
if (src->ssa->parent_instr->type == nir_instr_type_alu) {
nir_alu_instr *nir_cond = nir_instr_as_alu(src->ssa->parent_instr);
if (nir_cond->op == nir_op_inot) {
struct ir3_instruction *inv_cond =
get_branch_condition(ctx, &nir_cond->src[0].src, inv);
*inv = !*inv;
return inv_cond;
}
}
*inv = false;
return ir3_get_predicate(ctx, condition);
}
/* Try to fold br (and/or cond1, cond2) into braa/brao cond1, cond2.
*/
static struct ir3_instruction *
fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
{
if (!ctx->compiler->has_branch_and_or)
return false;
if (nir_cond->ssa->parent_instr->type != nir_instr_type_alu)
return NULL;
nir_alu_instr *alu_cond = nir_instr_as_alu(nir_cond->ssa->parent_instr);
if ((alu_cond->op != nir_op_iand) && (alu_cond->op != nir_op_ior))
return NULL;
/* If the result of the and/or is also used for something else than an if
* condition, the and/or cannot be removed. In that case, we will end-up with
* extra predicate conversions for the conditions without actually removing
* any instructions, resulting in an increase of instructions. Let's not fold
* the conditions in the branch in that case.
*/
if (!nir_def_only_used_by_if(&alu_cond->def))
return NULL;
bool inv1, inv2;
struct ir3_instruction *cond1 =
get_branch_condition(ctx, &alu_cond->src[0].src, &inv1);
struct ir3_instruction *cond2 =
get_branch_condition(ctx, &alu_cond->src[1].src, &inv2);
struct ir3_instruction *branch;
if (alu_cond->op == nir_op_iand) {
branch = ir3_BRAA(ctx->block, cond1, IR3_REG_PREDICATE, cond2,
IR3_REG_PREDICATE);
} else {
branch = ir3_BRAO(ctx->block, cond1, IR3_REG_PREDICATE, cond2,
IR3_REG_PREDICATE);
}
branch->cat0.inv1 = inv1;
branch->cat0.inv2 = inv2;
return branch;
}
static struct ir3_instruction *
emit_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
{
struct ir3_instruction *folded = fold_conditional_branch(ctx, nir_cond);
if (folded)
return folded;
bool inv1;
struct ir3_instruction *cond1 = get_branch_condition(ctx, nir_cond, &inv1);
struct ir3_instruction *branch =
ir3_BR(ctx->block, cond1, IR3_REG_PREDICATE);
branch->cat0.inv1 = inv1;
return branch;
}
static void
emit_if(struct ir3_context *ctx, nir_if *nif)
{
@ -3820,8 +3905,7 @@ emit_if(struct ir3_context *ctx, nir_if *nif)
*/
ir3_SHPS(ctx->block);
} else {
struct ir3_instruction *pred = ir3_get_predicate(ctx, condition);
ir3_BR(ctx->block, pred, IR3_REG_PREDICATE);
emit_conditional_branch(ctx, &nif->condition);
}
emit_cf_list(ctx, &nif->then_list);

View file

@ -110,6 +110,10 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
NIR_PASS(progress, ctx->s, nir_opt_dce);
}
/* This must run after the last nir_opt_algebraic or it gets undone. */
if (compiler->has_branch_and_or)
NIR_PASS_V(ctx->s, ir3_nir_opt_branch_and_or_not);
/* Enable the texture pre-fetch feature only a4xx onwards. But
* only enable it on generations that have been tested:
*/

View file

@ -103,7 +103,8 @@ count_instruction(struct ir3_instruction *n)
* earlier so we don't have this constraint.
*/
return is_alu(n) ||
(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR));
(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
(n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
}
/* Post-RA, we don't have arrays any more, so we have to be a bit careful here

View file

@ -714,6 +714,12 @@ invert_branch(struct ir3_instruction *branch)
case OPC_BANY:
branch->opc = OPC_BALL;
break;
case OPC_BRAA:
branch->opc = OPC_BRAO;
break;
case OPC_BRAO:
branch->opc = OPC_BRAA;
break;
default:
unreachable("can't get here");
}
@ -753,8 +759,6 @@ block_sched(struct ir3 *ir)
br2 = ir3_JUMP(block);
br2->cat0.target = block->successors[0];
} else {
assert(terminator->srcs_count == 1);
/* create "else" branch first (since "then" block should
* frequently/always end up being a fall-thru):
*/

View file

@ -69,6 +69,7 @@ bool ir3_nir_lower_64b_undef(nir_shader *shader);
bool ir3_nir_lower_64b_global(nir_shader *shader);
bool ir3_nir_lower_64b_regs(nir_shader *shader);
bool ir3_nir_opt_branch_and_or_not(nir_shader *nir);
void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s);
void ir3_nir_lower_io_to_temporaries(nir_shader *s);
void ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s);

View file

@ -0,0 +1,37 @@
# Copyright (C) 2024 Igalia S.L.
# SPDX-License-Identifier: MIT
import argparse
import sys
# Inverse DeMorgan's laws to facilitate folding iand/ior into braa/brao. Only
# apply if the inot is only used by branch conditions. Otherwise, it would just
# end-up generating more instructions.
cond_lowering = [
(('inot(is_only_used_by_if)', ('iand', 'a', 'b')),
('ior', ('inot', 'a'), ('inot', 'b'))),
(('inot(is_only_used_by_if)', ('ior', 'a', 'b')),
('iand', ('inot', 'a'), ('inot', 'b'))),
]
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--import-path', required=True)
args = parser.parse_args()
sys.path.insert(0, args.import_path)
run()
def run():
import nir_algebraic # pylint: disable=import-error
print('#include "ir3_nir.h"')
print(nir_algebraic.AlgebraicPass("ir3_nir_opt_branch_and_or_not",
cond_lowering).render())
if __name__ == '__main__':
main()

View file

@ -40,6 +40,17 @@ ir3_nir_imul_c = custom_target(
depend_files : nir_algebraic_depends,
)
ir3_nir_branch_and_or_not_c = custom_target(
'ir3_nir_branch_and_or_not.c',
input : 'ir3_nir_branch_and_or_not.py',
output : 'ir3_nir_branch_and_or_not.c',
command : [
prog_python, '@INPUT@', '-p', dir_compiler_nir,
],
capture : true,
depend_files : nir_algebraic_depends,
)
ir3_parser = custom_target(
'ir3_parser.[ch]',
input: 'ir3_parser.y',
@ -123,7 +134,7 @@ libfreedreno_ir3_files = files(
libfreedreno_ir3 = static_library(
'freedreno_ir3',
[libfreedreno_ir3_files, ir3_nir_trig_c, ir3_nir_imul_c, ir3_parser[0], ir3_parser[1], ir3_lexer],
[libfreedreno_ir3_files, ir3_nir_trig_c, ir3_nir_imul_c, ir3_nir_branch_and_or_not_c, ir3_parser[0], ir3_parser[1], ir3_lexer],
include_directories : [inc_freedreno, inc_include, inc_src],
c_args : [no_override_init_args],
gnu_symbol_visibility : 'hidden',