From 3f151c03af3d159c8725f5f6dc7050b6a418de51 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 1 May 2024 21:02:13 -0700 Subject: [PATCH] intel/brw: Handle fsign optimization in a NIR algebraic pass This is a lot less code, and it makes it easier to experiment with other pattern-based optimizations in the future. The results here are nearly identical to the results I got from Ken's "intel/brw: Make fsign (for 16/32-bit) in SSA form"... which are not particularly good. In this commit and in Ken's, all of the shader-db shaders hurt for spills and fills are from Deus Ex Mankind Divided. Each shader has a bunch of texture instructions with a single fsign between the blocks. With the dependency on the flag removed, the scheduler puts all of the texture instructions at the start... and there are a LOT of them. shader-db: All Intel platforms had similar results. (Meteor Lake shown) total instructions in shared programs: 19647060 -> 19650207 (0.02%) instructions in affected programs: 734718 -> 737865 (0.43%) helped: 382 / HURT: 1984 total cycles in shared programs: 823238442 -> 822785913 (-0.05%) cycles in affected programs: 426901157 -> 426448628 (-0.11%) helped: 3408 / HURT: 3671 total spills in shared programs: 3887 -> 3891 (0.10%) spills in affected programs: 256 -> 260 (1.56%) helped: 0 / HURT: 4 total fills in shared programs: 3236 -> 3306 (2.16%) fills in affected programs: 882 -> 952 (7.94%) helped: 0 / HURT: 12 LOST: 37 GAINED: 34 fossil-db: DG2 and Meteor Lake had similar results. (Meteor Lake shown) Totals: Instrs: 154005469 -> 154008294 (+0.00%); split: -0.00%, +0.00% Cycle count: 17551859277 -> 17554293955 (+0.01%); split: -0.02%, +0.04% Spill count: 142078 -> 142090 (+0.01%) Fill count: 266761 -> 266729 (-0.01%); split: -0.02%, +0.01% Max live registers: 32593578 -> 32593858 (+0.00%) Max dispatch width: 5535944 -> 5536816 (+0.02%); split: +0.02%, -0.01% Totals from 5867 (0.93% of 631350) affected shaders: Instrs: 5475544 -> 5478369 (+0.05%); split: -0.04%, +0.09% Cycle count: 1649032029 -> 1651466707 (+0.15%); split: -0.24%, +0.39% Spill count: 26411 -> 26423 (+0.05%) Fill count: 57364 -> 57332 (-0.06%); split: -0.10%, +0.04% Max live registers: 431561 -> 431841 (+0.06%) Max dispatch width: 49784 -> 50656 (+1.75%); split: +2.38%, -0.63% Tiger Lake Totals: Instrs: 149530671 -> 149533588 (+0.00%); split: -0.00%, +0.00% Cycle count: 15261418953 -> 15264764921 (+0.02%); split: -0.00%, +0.03% Spill count: 60317 -> 60316 (-0.00%); split: -0.02%, +0.01% Max live registers: 32249201 -> 32249464 (+0.00%) Max dispatch width: 5540608 -> 5540584 (-0.00%) Totals from 5862 (0.93% of 630309) affected shaders: Instrs: 4740800 -> 4743717 (+0.06%); split: -0.04%, +0.10% Cycle count: 566531248 -> 569877216 (+0.59%); split: -0.13%, +0.72% Spill count: 11709 -> 11708 (-0.01%); split: -0.09%, +0.08% Max live registers: 424560 -> 424823 (+0.06%) Max dispatch width: 50304 -> 50280 (-0.05%) Ice Lake Totals: Instrs: 150499705 -> 150502608 (+0.00%); split: -0.00%, +0.00% Cycle count: 15105629116 -> 15105425880 (-0.00%); split: -0.00%, +0.00% Spill count: 60087 -> 60090 (+0.00%) Fill count: 100542 -> 100541 (-0.00%); split: -0.00%, +0.00% Max live registers: 32605215 -> 32605495 (+0.00%) Max dispatch width: 5617752 -> 5617792 (+0.00%); split: +0.00%, -0.00% Totals from 5882 (0.93% of 634934) affected shaders: Instrs: 4737206 -> 4740109 (+0.06%); split: -0.04%, +0.10% Cycle count: 598882104 -> 598678868 (-0.03%); split: -0.08%, +0.05% Spill count: 10278 -> 10281 (+0.03%) Fill count: 22504 -> 22503 (-0.00%); split: -0.01%, +0.01% Max live registers: 424184 -> 424464 (+0.07%) Max dispatch width: 50216 -> 50256 (+0.08%); split: +0.25%, -0.18% Skylake Totals: Instrs: 139092612 -> 139095257 (+0.00%); split: -0.00%, +0.00% Cycle count: 14533550285 -> 14533544716 (-0.00%); split: -0.00%, +0.00% Spill count: 58176 -> 58172 (-0.01%) Fill count: 95877 -> 95796 (-0.08%) Max live registers: 31924594 -> 31924874 (+0.00%) Max dispatch width: 5484568 -> 5484552 (-0.00%); split: +0.00%, -0.00% Totals from 5789 (0.93% of 625512) affected shaders: Instrs: 4481987 -> 4484632 (+0.06%); split: -0.04%, +0.10% Cycle count: 578310124 -> 578304555 (-0.00%); split: -0.05%, +0.05% Spill count: 9248 -> 9244 (-0.04%) Fill count: 19677 -> 19596 (-0.41%) Max live registers: 415340 -> 415620 (+0.07%) Max dispatch width: 49720 -> 49704 (-0.03%); split: +0.10%, -0.13% Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 143 +--------------------- src/intel/compiler/brw_nir.c | 6 +- src/intel/compiler/brw_nir.h | 2 + src/intel/compiler/brw_nir_lower_fsign.py | 53 ++++++++ src/intel/compiler/meson.build | 13 +- 5 files changed, 73 insertions(+), 144 deletions(-) create mode 100644 src/intel/compiler/brw_nir_lower_fsign.py diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 742d4314beb..dad87ccb80f 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -849,135 +849,6 @@ try_emit_b2fi_of_inot(nir_to_brw_state &ntb, const fs_builder &bld, return true; } -/** - * Emit code for nir_op_fsign possibly fused with a nir_op_fmul - * - * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of - * the source of \c instr that is a \c nir_op_fsign. - */ -static void -emit_fsign(nir_to_brw_state &ntb, const fs_builder &bld, const nir_alu_instr *instr, - fs_reg result, fs_reg *op, unsigned fsign_src) -{ - const intel_device_info *devinfo = ntb.devinfo; - - fs_inst *inst; - - assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul); - assert(fsign_src < nir_op_infos[instr->op].num_inputs); - - if (instr->op != nir_op_fsign) { - const nir_alu_instr *const fsign_instr = - nir_src_as_alu_instr(instr->src[fsign_src].src); - - /* op[fsign_src] has the nominal result of the fsign, and op[1 - - * fsign_src] has the other multiply source. This must be rearranged so - * that op[0] is the source of the fsign op[1] is the other multiply - * source. - */ - if (fsign_src != 0) - op[1] = op[0]; - - op[0] = get_nir_src(ntb, fsign_instr->src[0].src); - - const nir_alu_type t = - (nir_alu_type)(nir_op_infos[instr->op].input_types[0] | - nir_src_bit_size(fsign_instr->src[0].src)); - - op[0].type = brw_type_for_nir_type(devinfo, t); - - unsigned channel = 0; - if (nir_op_infos[instr->op].output_size == 0) { - /* Since NIR is doing the scalarizing for us, we should only ever see - * vectorized operations with a single channel. - */ - nir_component_mask_t write_mask = get_nir_write_mask(instr->def); - assert(util_bitcount(write_mask) == 1); - channel = ffs(write_mask) - 1; - } - - op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]); - } - - if (brw_type_size_bytes(op[0].type) == 2) { - /* AND(val, 0x8000) gives the sign bit. - * - * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero. - */ - fs_reg zero = retype(brw_imm_uw(0), BRW_TYPE_HF); - bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ); - - op[0].type = BRW_TYPE_UW; - result.type = BRW_TYPE_UW; - bld.AND(result, op[0], brw_imm_uw(0x8000u)); - - if (instr->op == nir_op_fsign) - inst = bld.OR(result, result, brw_imm_uw(0x3c00u)); - else { - /* Use XOR here to get the result sign correct. */ - inst = bld.XOR(result, result, retype(op[1], BRW_TYPE_UW)); - } - - inst->predicate = BRW_PREDICATE_NORMAL; - } else if (brw_type_size_bytes(op[0].type) == 4) { - /* AND(val, 0x80000000) gives the sign bit. - * - * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not - * zero. - */ - bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); - - op[0].type = BRW_TYPE_UD; - result.type = BRW_TYPE_UD; - bld.AND(result, op[0], brw_imm_ud(0x80000000u)); - - if (instr->op == nir_op_fsign) - inst = bld.OR(result, result, brw_imm_ud(0x3f800000u)); - else { - /* Use XOR here to get the result sign correct. */ - inst = bld.XOR(result, result, retype(op[1], BRW_TYPE_UD)); - } - - inst->predicate = BRW_PREDICATE_NORMAL; - } else { - unreachable("Should have been lowered by nir_opt_algebraic."); - } -} - -/** - * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign - * - * Checks the operands of a \c nir_op_fmul to determine whether or not - * \c emit_fsign could fuse the multiplication with the \c sign() calculation. - * - * \param instr The multiplication instruction - * - * \param fsign_src The source of \c instr that may or may not be a - * \c nir_op_fsign - */ -static bool -can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src) -{ - assert(instr->op == nir_op_fmul); - - nir_alu_instr *const fsign_instr = - nir_src_as_alu_instr(instr->src[fsign_src].src); - - /* Rules: - * - * 1. instr->src[fsign_src] must be a nir_op_fsign. - * 2. The nir_op_fsign can only be used by this multiplication. - * 3. The source that is the nir_op_fsign does not have source modifiers. - * \c emit_fsign only examines the source modifiers of the source of the - * \c nir_op_fsign. - * - * The nir_op_fsign must also not have the saturate modifier, but steps - * have already been taken (in nir_opt_algebraic) to ensure that. - */ - return fsign_instr != NULL && fsign_instr->op == nir_op_fsign && - is_used_once(fsign_instr); -} - static bool is_const_zero(const nir_src &src) { @@ -1233,8 +1104,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, break; case nir_op_fsign: - emit_fsign(ntb, bld, instr, result, op, 0); - break; + unreachable("Should have been lowered by brw_nir_lower_fsign."); case nir_op_frcp: bld.RCP(result, op[0]); @@ -1322,17 +1192,6 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, } case nir_op_fmul: - for (unsigned i = 0; i < 2; i++) { - if (can_fuse_fmul_fsign(instr, i)) { - emit_fsign(ntb, bld, instr, result, op, i); - return; - } - } - - /* We emit the rounding mode after the previous fsign optimization since - * it won't result in a MUL, but will try to negate the value by other - * means. - */ if (nir_has_any_rounding_mode_enabled(execution_mode)) { brw_rnd_mode rnd = brw_rnd_mode_from_execution_mode(execution_mode); diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 87b06d4c154..3f025097c26 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -1711,7 +1711,11 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, do { progress = false; - if (OPT(nir_opt_algebraic_late)) { + + OPT(nir_opt_algebraic_late); + OPT(brw_nir_lower_fsign); + + if (progress) { OPT(nir_opt_constant_folding); OPT(nir_copy_prop); OPT(nir_opt_dce); diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 744ba46a360..561afb8bee6 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -195,6 +195,8 @@ bool brw_nir_apply_trig_workarounds(nir_shader *nir); bool brw_nir_limit_trig_input_range_workaround(nir_shader *nir); +bool brw_nir_lower_fsign(nir_shader *nir); + void brw_nir_apply_key(nir_shader *nir, const struct brw_compiler *compiler, const struct brw_base_prog_key *key, diff --git a/src/intel/compiler/brw_nir_lower_fsign.py b/src/intel/compiler/brw_nir_lower_fsign.py new file mode 100644 index 00000000000..3df2441fe77 --- /dev/null +++ b/src/intel/compiler/brw_nir_lower_fsign.py @@ -0,0 +1,53 @@ +# Copyright © 2024 Intel Corporation +# SPDX-License-Identifier: MIT + +import argparse +import sys +from math import pi + +a = 'a' +b = 'b' + +lower_fsign = [ + # This matches the behavior of the old optimization in brw_fs_nir.cpp, but + # it has some problems. + # + # The fmul version passes Vulkan float_controls2 CTS a little bit by + # luck. The use of fne means that the false path (i.e., fsign(X) == 0) is + # only taken when X is zero. For OpenCL, this path should also be taken + # when when X is NaN. This can be handled by using 'fabs(X) > 0', but this + # fails float_controls2 CTS when the other multiplication operand is NaN. + # + # This optimization is additionally problematic when fsign(X) is zero and + # the other multiplication operand is Inf. This will result in 0, but it + # should result in NaN. This does not seem to be tested by the CTS. + # + # NOTE: fcsel opcodes are currently limited to float32 in NIR. + (('fmul@32', ('fsign(is_used_once)', a), b), ('fcsel', a , ('ixor', ('iand', a, 0x80000000), b), ('iand', a, 0x80000000))), + (('fmul@16', ('fsign(is_used_once)', a), b), ('bcsel', ('fneu', a, 0), ('ixor', ('iand', a, 0x8000 ), b), ('iand', a, 0x8000 ))), + + # This is 99.99% strictly correct for OpenCL. It will provide correctly + # signed zero for ±0 inputs, and it will provide zero for NaN inputs. The + # only slight deviation is that it can provide -0 for some NaN inputs. + (('fsign@32', a), ('fcsel_gt', ('fabs', a) , ('ior', ('iand', a, 0x80000000), 0x3f800000), ('iand', a, 0x80000000))), + (('fsign@16', a), ('bcsel', ('!flt', 0, ('fabs', a)), ('ior', ('iand', a, 0x8000 ), 0x3c00 ), ('iand', a, 0x8000 ))), +] + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + run() + + +def run(): + import nir_algebraic # pylint: disable=import-error + + print('#include "brw_nir.h"') + + print(nir_algebraic.AlgebraicPass("brw_nir_lower_fsign", lower_fsign).render()) + + +if __name__ == '__main__': + main() diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index 1ad4f95e811..74237fbdef1 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -130,6 +130,17 @@ brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c', command : [prog_python, '@INPUT0@', '--outdir', meson.current_build_dir()]) +brw_nir_lower_fsign = custom_target( + 'brw_nir_lower_fsign.c', + input : 'brw_nir_lower_fsign.py', + output : 'brw_nir_lower_fsign.c', + command : [ + prog_python, '@INPUT@', '-p', dir_compiler_nir, + ], + depend_files : nir_algebraic_depends, + capture : true, +) + brw_nir_trig = custom_target( 'brw_nir_trig_workarounds.c', input : 'brw_nir_trig_workarounds.py', @@ -143,7 +154,7 @@ brw_nir_trig = custom_target( libintel_compiler_brw = static_library( 'intel_compiler', - [libintel_compiler_brw_files, intel_nir_files, brw_nir_trig, ir_expression_operation_h, [brw_device_sha1_gen_src]], + [libintel_compiler_brw_files, intel_nir_files, brw_nir_lower_fsign, brw_nir_trig, ir_expression_operation_h, [brw_device_sha1_gen_src]], include_directories : [inc_include, inc_src, inc_intel], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden',