nir,ir3: Add icsel_eqz

In IR3, `sel.b32` selects based on whether its condition source is zero,
so add `icsel_eqz` to fuse the cmp and sel that we'd otherwise need.

total Instruction Count in shared programs: 1112814 -> 1110473 (-0.21%)
Instruction Count in affected programs: 162701 -> 160360 (-1.44%)
helped: 81
HURT: 29
Instruction count are helped.

total MOV Count in shared programs: 86777 -> 88671 (2.18%)
MOV Count in affected programs: 28119 -> 30013 (6.74%)
helped: 1
HURT: 292
Mov count are HURT.

total COV Count in shared programs: 15070 -> 14962 (-0.72%)
COV Count in affected programs: 5770 -> 5662 (-1.87%)
helped: 76
HURT: 2
Cov count are helped.

total Last helper instruction in shared programs: 592729 -> 590638 (-0.35%)
Last helper instruction in affected programs: 91331 -> 89240 (-2.29%)
helped: 30
HURT: 1
Last helper instruction are helped.

total Instructions with SS sync bit in shared programs: 29336 -> 29546 (0.72%)
Instructions with SS sync bit in affected programs: 4702 -> 4912 (4.47%)
helped: 8
HURT: 43
Instructions with ss sync bit are HURT.

total Estimated cycles stalled on SS in shared programs: 111590 -> 112401 (0.73%)
Estimated cycles stalled on SS in affected programs: 27708 -> 28519 (2.93%)
helped: 21
HURT: 61
Estimated cycles stalled on ss are HURT.

total cat1 instructions in shared programs: 101933 -> 103695 (1.73%)
cat1 instructions in affected programs: 35804 -> 37566 (4.92%)
helped: 18
HURT: 290
Cat1 instructions are HURT.

total cat2 instructions in shared programs: 380299 -> 377499 (-0.74%)
cat2 instructions in affected programs: 128609 -> 125809 (-2.18%)
helped: 322
HURT: 0
Cat2 instructions are helped.

Signed-off-by: Karmjit Mahil <karmjit.mahil@igalia.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32189>
This commit is contained in:
Karmjit Mahil 2024-11-19 13:10:13 +01:00 committed by Marge Bot
parent aad0aa0a9c
commit b79994e92d
5 changed files with 22 additions and 0 deletions

View file

@ -4180,6 +4180,10 @@ typedef struct nir_shader_compiler_options {
/* Backend supports fused comapre against zero and csel */ /* Backend supports fused comapre against zero and csel */
bool has_fused_comp_and_csel; bool has_fused_comp_and_csel;
/* Backend supports fused int eq/ne against zero and csel. */
bool has_icsel_eqz64;
bool has_icsel_eqz32;
bool has_icsel_eqz16;
/* Backend supports fneo, fequ, fltu, fgeu. */ /* Backend supports fneo, fequ, fltu, fgeu. */
bool has_fneo_fcmpu; bool has_fneo_fcmpu;

View file

@ -1064,6 +1064,8 @@ opcode("b32csel", 0, tuint, [0, 0, 0],
[tbool32, tuint, tuint], False, selection, "src0 ? src1 : src2", [tbool32, tuint, tuint], False, selection, "src0 ? src1 : src2",
description = csel_description.format("a 32-bit", "0 vs ~0")) description = csel_description.format("a 32-bit", "0 vs ~0"))
triop("icsel_eqz", tint, selection, "(src0 == 0) ? src1 : src2")
triop("i32csel_gt", tint32, selection, "(src0 > 0) ? src1 : src2") triop("i32csel_gt", tint32, selection, "(src0 > 0) ? src1 : src2")
triop("i32csel_ge", tint32, selection, "(src0 >= 0) ? src1 : src2") triop("i32csel_ge", tint32, selection, "(src0 >= 0) ? src1 : src2")

View file

@ -3677,6 +3677,12 @@ late_optimizations += [
(('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
] ]
for s in [16, 32, 64]:
late_optimizations.extend([
(('bcsel@{}'.format(s), ('ieq', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, b, c), "options->has_icsel_eqz{} && !options->no_integers".format(s)),
(('bcsel@{}'.format(s), ('ine', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, c, b), "options->has_icsel_eqz{} && !options->no_integers".format(s)),
])
distribute_src_mods = [ distribute_src_mods = [
# Try to remove some spurious negations rather than pushing them down. # Try to remove some spurious negations rather than pushing them down.
(('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)), (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),

View file

@ -98,6 +98,8 @@ static const nir_shader_compiler_options ir3_base_options = {
.lower_pack_split = true, .lower_pack_split = true,
.lower_to_scalar = true, .lower_to_scalar = true,
.has_imul24 = true, .has_imul24 = true,
.has_icsel_eqz32 = true,
.has_icsel_eqz16 = true,
.has_fsub = true, .has_fsub = true,
.has_isub = true, .has_isub = true,
.force_indirect_unrolling_sampler = true, .force_indirect_unrolling_sampler = true,

View file

@ -922,6 +922,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE); set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
break; break;
case nir_op_icsel_eqz:
case nir_op_bcsel: { case nir_op_bcsel: {
struct ir3_instruction_rpt conds; struct ir3_instruction_rpt conds;
@ -965,12 +966,19 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
conds.rpts[rpt] = cond; conds.rpts[rpt] = cond;
} }
if (alu->op == nir_op_icsel_eqz) {
struct ir3_instruction_rpt tmp = src[1];
src[1] = src[2];
src[2] = tmp;
}
if (is_half(src[1].rpts[0])) if (is_half(src[1].rpts[0]))
dst = ir3_SEL_B16_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0); dst = ir3_SEL_B16_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
else else
dst = ir3_SEL_B32_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0); dst = ir3_SEL_B32_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
break; break;
} }
case nir_op_bit_count: { case nir_op_bit_count: {
if (ctx->compiler->gen < 5 || if (ctx->compiler->gen < 5 ||
(src[0].rpts[0]->dsts[0]->flags & IR3_REG_HALF)) { (src[0].rpts[0]->dsts[0]->flags & IR3_REG_HALF)) {