From b5e023777cee73d1cca597588185c70eadfb19d3 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Tue, 22 Nov 2022 15:55:30 -0800 Subject: [PATCH] brw: Change the flags written by some CMP One frustrating thing about the CMP and CMPN instructions is that they always write the flags. Sometimes, however, it is desirable to generate the comparison result without modifying the flags. This would, theoretically, reduce false dependencies that restrict the scheduler's ability to rearrange code, create more opportunities for cmod propagation, save a kitten from a tree, and make a rainbow. Consider this sequence: cmp.ge.f0.0(8) g103<1>F g101<8,8,1>F g39<8,8,1>F cmp.nz.f0.0(8) null<1>D g81<8,8,1>D 0D (+f0.0) if(8) JIP: LABEL19 UIP: LABEL19 It would be advantageous to put the first CMP between the second CMP and the IF, but this cannot be done since the IF depends on the flags generated by the second CMP. This pass enables this rescheduling by changing the first CMP to write to a different flags register. cmp.ge.f1.0(8) g103<1>F g101<8,8,1>F g39<8,8,1>F cmp.nz.f0.0(8) null<1>D g81<8,8,1>D 0D (+f0.0) if(8) JIP: LABEL19 UIP: LABEL19 Sometimes this is also possible by using a different instruction. For example, consider cmp.l.f0.0(8) g103<1>D g101<8,8,1>D 0D This produces 0xffffffff when g101 negative and zero otherwise. This instruction, which does not modifiy the flag, also produces these results: asr(8) g103<1>D g101<8,8,1>D 31D Gfx9 platforms take a hit on instructions due to the instruction added at the end of short shaders by brw_workaround_source_arf_before_eot. shader-db: Lunar Lake, Meteor Lake, DG2, Tiger Lake, and Ice Lake had similar results. (Lunar Lake shown) total instructions in shared programs: 17089451 -> 17088766 (<.01%) instructions in affected programs: 766613 -> 765928 (-0.09%) helped: 653 / HURT: 0 total cycles in shared programs: 888832986 -> 887873068 (-0.11%) cycles in affected programs: 549441852 -> 548481934 (-0.17%) helped: 10474 / HURT: 130 LOST: 9 GAINED: 0 Skylake total instructions in shared programs: 19037976 -> 19049719 (0.06%) instructions in affected programs: 3979914 -> 3991657 (0.30%) helped: 503 / HURT: 12303 total cycles in shared programs: 867918242 -> 866930801 (-0.11%) cycles in affected programs: 512773919 -> 511786478 (-0.19%) helped: 13858 / HURT: 66 LOST: 32 GAINED: 0 fossil-db: Lunar Lake Totals: Instrs: 925023504 -> 924950382 (-0.01%); split: -0.01%, +0.00% Cycle count: 106348432916 -> 106116809009 (-0.22%); split: -0.22%, +0.00% Spill count: 3423988 -> 3423930 (-0.00%); split: -0.00%, +0.00% Fill count: 4877087 -> 4876960 (-0.00%); split: -0.01%, +0.00% Max dispatch width: 49087552 -> 49078448 (-0.02%); split: +0.00%, -0.02% Totals from 1099332 (54.44% of 2019443) affected shaders: Instrs: 742670473 -> 742597351 (-0.01%); split: -0.01%, +0.00% Cycle count: 100455549635 -> 100223925728 (-0.23%); split: -0.23%, +0.00% Spill count: 3384366 -> 3384308 (-0.00%); split: -0.00%, +0.00% Fill count: 4837434 -> 4837307 (-0.00%); split: -0.01%, +0.00% Max dispatch width: 26725152 -> 26716048 (-0.03%); split: +0.00%, -0.03% Meteor Lake and DG2 had similar results. (Meteor Lake shown) Totals: Instrs: 997603774 -> 997529238 (-0.01%); split: -0.01%, +0.00% Cycle count: 93904012762 -> 93646730006 (-0.27%); split: -0.28%, +0.00% Spill count: 3710155 -> 3710125 (-0.00%); split: -0.00%, +0.00% Fill count: 5032908 -> 5032819 (-0.00%); split: -0.01%, +0.00% Max dispatch width: 37929640 -> 37811560 (-0.31%) Totals from 1334920 (58.52% of 2281134) affected shaders: Instrs: 817377787 -> 817303251 (-0.01%); split: -0.01%, +0.00% Cycle count: 88468851658 -> 88211568902 (-0.29%); split: -0.29%, +0.00% Spill count: 3663353 -> 3663323 (-0.00%); split: -0.00%, +0.00% Fill count: 4991629 -> 4991540 (-0.00%); split: -0.01%, +0.00% Max dispatch width: 20245832 -> 20127752 (-0.58%) Tiger Lake and Ice Lake had similar results. (Tiger Lake shown) Totals: Instrs: 1013433769 -> 1013363273 (-0.01%); split: -0.01%, +0.00% Cycle count: 85766921182 -> 85509316620 (-0.30%); split: -0.31%, +0.00% Spill count: 3903923 -> 3903944 (+0.00%); split: -0.00%, +0.00% Fill count: 6801983 -> 6801948 (-0.00%); split: -0.00%, +0.00% Max dispatch width: 37896320 -> 37805320 (-0.24%); split: +0.00%, -0.24% Totals from 1333814 (58.54% of 2278396) affected shaders: Instrs: 830200531 -> 830130035 (-0.01%); split: -0.01%, +0.00% Cycle count: 80746184101 -> 80488579539 (-0.32%); split: -0.32%, +0.01% Spill count: 3855771 -> 3855792 (+0.00%); split: -0.00%, +0.00% Fill count: 6755513 -> 6755478 (-0.00%); split: -0.00%, +0.00% Max dispatch width: 20301456 -> 20210456 (-0.45%); split: +0.00%, -0.45% Skylake Totals: Instrs: 519389758 -> 519874108 (+0.09%); split: -0.00%, +0.10% Cycle count: 57932316132 -> 57789433956 (-0.25%); split: -0.25%, +0.00% Spill count: 636741 -> 636715 (-0.00%); split: -0.01%, +0.00% Fill count: 860470 -> 860357 (-0.01%); split: -0.02%, +0.00% Max dispatch width: 32527800 -> 32481792 (-0.14%); split: +0.00%, -0.14% Totals from 1080380 (62.25% of 1735462) affected shaders: Instrs: 411976399 -> 412460749 (+0.12%); split: -0.00%, +0.12% Cycle count: 54291447615 -> 54148565439 (-0.26%); split: -0.27%, +0.00% Spill count: 602993 -> 602967 (-0.00%); split: -0.01%, +0.00% Fill count: 734459 -> 734346 (-0.02%); split: -0.02%, +0.00% Max dispatch width: 18626096 -> 18580088 (-0.25%); split: +0.00%, -0.25% Reviewed-by: Kenneth Graunke Part-of: --- .../brw/brw_opt_cmp_flag_destination.cpp | 169 ++++++++++++++++++ src/intel/compiler/brw/brw_shader.cpp | 20 +++ src/intel/compiler/brw/brw_shader.h | 1 + src/intel/compiler/brw/meson.build | 1 + 4 files changed, 191 insertions(+) create mode 100644 src/intel/compiler/brw/brw_opt_cmp_flag_destination.cpp diff --git a/src/intel/compiler/brw/brw_opt_cmp_flag_destination.cpp b/src/intel/compiler/brw/brw_opt_cmp_flag_destination.cpp new file mode 100644 index 00000000000..31bf3e2a68d --- /dev/null +++ b/src/intel/compiler/brw/brw_opt_cmp_flag_destination.cpp @@ -0,0 +1,169 @@ +/* + * Copyright © 2022 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +/** \file + * Change the flags written by some CMP and CMPN instructions. + * + * One frustrating thing about the CMP and CMPN instructions is that they + * always write the flags. Sometimes, however, it is desirable to generate the + * comparison result without modifying the flags. This would, theoretically, + * reduce false dependencies that restrict the scheduler's ability to + * rearrange code, create more opportunities for cmod propagation, save a + * kitten from a tree, and make a rainbow. + * + * Consider this sequence: + * + * cmp.ge.f0.0(8) g103<1>F g101<8,8,1>F g39<8,8,1>F + * cmp.nz.f0.0(8) null<1>D g81<8,8,1>D 0D + * (+f0.0) if(8) JIP: LABEL19 UIP: LABEL19 + * + * It would be advantageous to put the first CMP between the second CMP and + * the IF, but this cannot be done since the IF depends on the flags generated + * by the second CMP. + * + * This pass enables this rescheduling by changing the first CMP to write to a + * different flags register. + * + * cmp.ge.f1.0(8) g103<1>F g101<8,8,1>F g39<8,8,1>F + * cmp.nz.f0.0(8) null<1>D g81<8,8,1>D 0D + * (+f0.0) if(8) JIP: LABEL19 UIP: LABEL19 + * + * Sometimes this is also possible by using a different instruction. For + * example, consider + * + * cmp.l.f0.0(8) g103<1>D g101<8,8,1>D 0D + * + * This produces 0xffffffff when g101 negative and zero otherwise. This + * instruction, which does not modifiy the flag, also produces these results: + * + * asr(8) g103<1>D g101<8,8,1>D 31D + */ +#include "brw_shader.h" +#include "brw_cfg.h" + +static bool +opt_cmp_flag_destination_local(brw_shader &s, bblock_t *block, + bool uses_kill) +{ + bool progress = false; + + foreach_inst_in_block(brw_inst, inst, block) { + if (inst->opcode != BRW_OPCODE_CMP && inst->opcode != BRW_OPCODE_CMPN) + continue; + + if (inst->dst.is_null() || inst->predicate != BRW_PREDICATE_NONE) + continue; + + if ((s.dispatch_width <= 16 && inst->flag_subreg == 1) || + inst->flag_subreg > 1) { + continue; + } + + unsigned flags_written = inst->flags_written(s.devinfo); + + foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) { + if ((scan_inst->flags_read(s.devinfo) & flags_written) != 0) + break; + + /* If scan_inst might not modify the flags, keep scanning. */ + if (scan_inst->predicate != BRW_PREDICATE_NONE) + continue; + + flags_written &= ~scan_inst->flags_written(s.devinfo); + if (flags_written == 0) { + /* A common special case: + * + * cmp.l.f0.0(8) g103<1>D g101<8,8,1>D 0D + * + * becomes + * + * asr(8) g103<1>D g101<8,8,1>D 31D + */ + if (inst->opcode == BRW_OPCODE_CMP && + brw_type_is_sint(inst->src[0].type) && + inst->src[1].is_zero() && + inst->conditional_mod == BRW_CONDITIONAL_L) { + inst->opcode = BRW_OPCODE_ASR; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + inst->src[1] = + brw_imm_w(brw_type_size_bits(inst->src[0].type) - 1); + + progress = true; + } + + /* A common special case: + * + * cmp.nz.f0.0(8) g103<1>D g101<8,8,1>D 0D + * + * or + * + * cmp.nz.f0.0(8) g103<1>UD g101<8,8,1>UD 0UD + * + * becomes + * + * asr(8) g103<1>D -(abs)g101<8,8,1>D 31D + */ + if (inst->opcode == BRW_OPCODE_CMP && + brw_type_is_int(inst->src[0].type) && + inst->src[1].is_zero() && + inst->conditional_mod == BRW_CONDITIONAL_NZ) { + inst->opcode = BRW_OPCODE_ASR; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + inst->dst.type = + brw_type_with_size(BRW_TYPE_D, + brw_type_size_bits(inst->dst.type)); + + unsigned bits = brw_type_size_bits(inst->src[0].type); + + inst->src[0].type = brw_type_with_size(BRW_TYPE_D, bits); + inst->src[0].negate = true; + inst->src[0].abs = true; + + inst->src[1] = brw_imm_w(bits - 1); + + progress = true; + } + + if (inst->conditional_mod != BRW_CONDITIONAL_NONE) { + if (s.dispatch_width <= 16) { + inst->flag_subreg += 1; + progress = true; + } else if (s.devinfo->ver >= 20) { + /* Xe2 (and a couple exotic Xe platforms) have 4 32-bit + * flags registers. Use f2.x. + */ + inst->flag_subreg += 4; + progress = true; + } else if (!uses_kill) { + inst->flag_subreg += 2; + progress = true; + } + + assert(s.stage != MESA_SHADER_FRAGMENT || + inst->flag_subreg != sample_mask_flag_subreg(s)); + } + + break; + } + } + } + + return progress; +} + +bool +brw_opt_cmp_flag_destination(brw_shader &s, bool uses_kill) +{ + bool progress = false; + + foreach_block (block, s.cfg) { + if (opt_cmp_flag_destination_local(s, block, uses_kill)) + progress = true; + } + + return progress; +} diff --git a/src/intel/compiler/brw/brw_shader.cpp b/src/intel/compiler/brw/brw_shader.cpp index 76dc57865b6..5579e022b08 100644 --- a/src/intel/compiler/brw/brw_shader.cpp +++ b/src/intel/compiler/brw/brw_shader.cpp @@ -845,6 +845,17 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) if (OPT(brw_opt_cmod_propagation)) OPT(brw_opt_dead_code_eliminate); + if (OPT(brw_opt_cmp_flag_destination, + /* We want something like + * brw_wm_prog_data(s.prog_data)->uses_kill, but there are + * other things that can cause the sample_mask_flag_subreg to + * be used. + */ + s.stage == MESA_SHADER_FRAGMENT)) { + OPT(brw_opt_cmod_propagation); + OPT(brw_opt_dead_code_eliminate); + } + allocated = brw_assign_regs(s, allow_spilling, spill_all); } @@ -905,6 +916,15 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) */ OPT(brw_opt_cmod_propagation); + if (OPT(brw_opt_cmp_flag_destination, + /* We want something like brw_wm_prog_data(s.prog_data)->uses_kill, + * but there are other things that can cause the + * sample_mask_flag_subreg to be used. + */ + s.stage == MESA_SHADER_FRAGMENT)) { + OPT(brw_opt_cmod_propagation); + } + if (s.devinfo->ver >= 30) OPT(brw_lower_send_gather); diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h index 0129748ae7e..f241b7e56dd 100644 --- a/src/intel/compiler/brw/brw_shader.h +++ b/src/intel/compiler/brw/brw_shader.h @@ -356,6 +356,7 @@ bool brw_opt_address_reg_load(brw_shader &s); bool brw_opt_algebraic(brw_shader &s); bool brw_opt_bank_conflicts(brw_shader &s); bool brw_opt_cmod_propagation(brw_shader &s); +bool brw_opt_cmp_flag_destination(brw_shader &s, bool uses_kill); bool brw_opt_combine_constants(brw_shader &s); bool brw_opt_combine_convergent_txf(brw_shader &s); bool brw_opt_compact_virtual_grfs(brw_shader &s); diff --git a/src/intel/compiler/brw/meson.build b/src/intel/compiler/brw/meson.build index f96ea1b2650..cfdbe0281da 100644 --- a/src/intel/compiler/brw/meson.build +++ b/src/intel/compiler/brw/meson.build @@ -77,6 +77,7 @@ libintel_compiler_brw_files = files( 'brw_opt_algebraic.cpp', 'brw_opt_bank_conflicts.cpp', 'brw_opt_cmod_propagation.cpp', + 'brw_opt_cmp_flag_destination.cpp', 'brw_opt_combine_constants.cpp', 'brw_opt_copy_propagation.cpp', 'brw_opt_cse.cpp',