From b79994e92d26aa8daec34175e63992d317a9e29b Mon Sep 17 00:00:00 2001 From: Karmjit Mahil Date: Tue, 19 Nov 2024 13:10:13 +0100 Subject: [PATCH] nir,ir3: Add icsel_eqz In IR3, `sel.b32` selects between its two value sources depending on whether its condition source is zero, so add `icsel_eqz` to fuse the cmp and sel that we'd otherwise need. total Instruction Count in shared programs: 1112814 -> 1110473 (-0.21%) Instruction Count in affected programs: 162701 -> 160360 (-1.44%) helped: 81 HURT: 29 Instruction count are helped. total MOV Count in shared programs: 86777 -> 88671 (2.18%) MOV Count in affected programs: 28119 -> 30013 (6.74%) helped: 1 HURT: 292 Mov count are HURT. total COV Count in shared programs: 15070 -> 14962 (-0.72%) COV Count in affected programs: 5770 -> 5662 (-1.87%) helped: 76 HURT: 2 Cov count are helped. total Last helper instruction in shared programs: 592729 -> 590638 (-0.35%) Last helper instruction in affected programs: 91331 -> 89240 (-2.29%) helped: 30 HURT: 1 Last helper instruction are helped. total Instructions with SS sync bit in shared programs: 29336 -> 29546 (0.72%) Instructions with SS sync bit in affected programs: 4702 -> 4912 (4.47%) helped: 8 HURT: 43 Instructions with ss sync bit are HURT. total Estimated cycles stalled on SS in shared programs: 111590 -> 112401 (0.73%) Estimated cycles stalled on SS in affected programs: 27708 -> 28519 (2.93%) helped: 21 HURT: 61 Estimated cycles stalled on ss are HURT. total cat1 instructions in shared programs: 101933 -> 103695 (1.73%) cat1 instructions in affected programs: 35804 -> 37566 (4.92%) helped: 18 HURT: 290 Cat1 instructions are HURT. total cat2 instructions in shared programs: 380299 -> 377499 (-0.74%) cat2 instructions in affected programs: 128609 -> 125809 (-2.18%) helped: 322 HURT: 0 Cat2 instructions are helped. 
Signed-off-by: Karmjit Mahil Reviewed-by: Connor Abbott Part-of: --- src/compiler/nir/nir.h | 4 ++++ src/compiler/nir/nir_opcodes.py | 2 ++ src/compiler/nir/nir_opt_algebraic.py | 6 ++++++ src/freedreno/ir3/ir3_compiler.c | 2 ++ src/freedreno/ir3/ir3_compiler_nir.c | 8 ++++++++ 5 files changed, 22 insertions(+) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index f7c0e25406f..a07437ffa6d 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -4180,6 +4180,10 @@ typedef struct nir_shader_compiler_options { /* Backend supports fused comapre against zero and csel */ bool has_fused_comp_and_csel; + /* Backend supports fused int eq/ne against zero and csel. */ + bool has_icsel_eqz64; + bool has_icsel_eqz32; + bool has_icsel_eqz16; /* Backend supports fneo, fequ, fltu, fgeu. */ bool has_fneo_fcmpu; diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 05786b7397b..e315022c49b 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -1064,6 +1064,8 @@ opcode("b32csel", 0, tuint, [0, 0, 0], [tbool32, tuint, tuint], False, selection, "src0 ? src1 : src2", description = csel_description.format("a 32-bit", "0 vs ~0")) +triop("icsel_eqz", tint, selection, "(src0 == 0) ? src1 : src2") + triop("i32csel_gt", tint32, selection, "(src0 > 0) ? src1 : src2") triop("i32csel_ge", tint32, selection, "(src0 >= 0) ? 
src1 : src2") diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index e132bc5b210..e8172690d41 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -3677,6 +3677,12 @@ late_optimizations += [ (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), ] +for s in [16, 32, 64]: + late_optimizations.extend([ + (('bcsel@{}'.format(s), ('ieq', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, b, c), "options->has_icsel_eqz{} && !options->no_integers".format(s)), + (('bcsel@{}'.format(s), ('ine', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, c, b), "options->has_icsel_eqz{} && !options->no_integers".format(s)), + ]) + distribute_src_mods = [ # Try to remove some spurious negations rather than pushing them down. (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)), diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index b45b3083e58..7566b9a057b 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -98,6 +98,8 @@ static const nir_shader_compiler_options ir3_base_options = { .lower_pack_split = true, .lower_to_scalar = true, .has_imul24 = true, + .has_icsel_eqz32 = true, + .has_icsel_eqz16 = true, .has_fsub = true, .has_isub = true, .force_indirect_unrolling_sampler = true, diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 4e5578c6edc..6498ae48af0 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -922,6 +922,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE); break; + case nir_op_icsel_eqz: case nir_op_bcsel: { struct ir3_instruction_rpt conds; @@ -965,12 +966,19 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) conds.rpts[rpt] = cond; } + if (alu->op == 
nir_op_icsel_eqz) { + struct ir3_instruction_rpt tmp = src[1]; + src[1] = src[2]; + src[2] = tmp; + } + if (is_half(src[1].rpts[0])) dst = ir3_SEL_B16_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0); else dst = ir3_SEL_B32_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0); break; } + case nir_op_bit_count: { if (ctx->compiler->gen < 5 || (src[0].rpts[0]->dsts[0]->flags & IR3_REG_HALF)) {