mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 02:28:10 +02:00
intel/brw: Handle fsign optimization in a NIR algebraic pass
This is a lot less code, and it makes it easier to experiment with other pattern-based optimizations in the future. The results here are nearly identical to the results I got from Ken's "intel/brw: Make fsign (for 16/32-bit) in SSA form"... which are not particularly good. In this commit and in Ken's, all of the shader-db shaders hurt for spills and fills are from Deus Ex Mankind Divided. Each shader has a bunch of texture instructions with a single fsign between the blocks. With the dependency on the flag removed, the scheduler puts all of the texture instructions at the start... and there are a LOT of them. shader-db: All Intel platforms had similar results. (Meteor Lake shown) total instructions in shared programs: 19647060 -> 19650207 (0.02%) instructions in affected programs: 734718 -> 737865 (0.43%) helped: 382 / HURT: 1984 total cycles in shared programs: 823238442 -> 822785913 (-0.05%) cycles in affected programs: 426901157 -> 426448628 (-0.11%) helped: 3408 / HURT: 3671 total spills in shared programs: 3887 -> 3891 (0.10%) spills in affected programs: 256 -> 260 (1.56%) helped: 0 / HURT: 4 total fills in shared programs: 3236 -> 3306 (2.16%) fills in affected programs: 882 -> 952 (7.94%) helped: 0 / HURT: 12 LOST: 37 GAINED: 34 fossil-db: DG2 and Meteor Lake had similar results. (Meteor Lake shown) Totals: Instrs: 154005469 -> 154008294 (+0.00%); split: -0.00%, +0.00% Cycle count: 17551859277 -> 17554293955 (+0.01%); split: -0.02%, +0.04% Spill count: 142078 -> 142090 (+0.01%) Fill count: 266761 -> 266729 (-0.01%); split: -0.02%, +0.01% Max live registers: 32593578 -> 32593858 (+0.00%) Max dispatch width: 5535944 -> 5536816 (+0.02%); split: +0.02%, -0.01% Totals from 5867 (0.93% of 631350) affected shaders: Instrs: 5475544 -> 5478369 (+0.05%); split: -0.04%, +0.09% Cycle count: 1649032029 -> 1651466707 (+0.15%); split: -0.24%, +0.39% Spill count: 26411 -> 26423 (+0.05%) Fill count: 57364 -> 57332 (-0.06%); split: -0.10%, +0.04% Max live registers: 431561 -> 431841 (+0.06%) Max dispatch width: 49784 -> 50656 (+1.75%); split: +2.38%, -0.63% Tiger Lake Totals: Instrs: 149530671 -> 149533588 (+0.00%); split: -0.00%, +0.00% Cycle count: 15261418953 -> 15264764921 (+0.02%); split: -0.00%, +0.03% Spill count: 60317 -> 60316 (-0.00%); split: -0.02%, +0.01% Max live registers: 32249201 -> 32249464 (+0.00%) Max dispatch width: 5540608 -> 5540584 (-0.00%) Totals from 5862 (0.93% of 630309) affected shaders: Instrs: 4740800 -> 4743717 (+0.06%); split: -0.04%, +0.10% Cycle count: 566531248 -> 569877216 (+0.59%); split: -0.13%, +0.72% Spill count: 11709 -> 11708 (-0.01%); split: -0.09%, +0.08% Max live registers: 424560 -> 424823 (+0.06%) Max dispatch width: 50304 -> 50280 (-0.05%) Ice Lake Totals: Instrs: 150499705 -> 150502608 (+0.00%); split: -0.00%, +0.00% Cycle count: 15105629116 -> 15105425880 (-0.00%); split: -0.00%, +0.00% Spill count: 60087 -> 60090 (+0.00%) Fill count: 100542 -> 100541 (-0.00%); split: -0.00%, +0.00% Max live registers: 32605215 -> 32605495 (+0.00%) Max dispatch width: 5617752 -> 5617792 (+0.00%); split: +0.00%, -0.00% Totals from 5882 (0.93% of 634934) affected shaders: Instrs: 4737206 -> 4740109 (+0.06%); split: -0.04%, +0.10% Cycle count: 598882104 -> 598678868 (-0.03%); split: -0.08%, +0.05% Spill count: 10278 -> 10281 (+0.03%) Fill count: 22504 -> 22503 (-0.00%); split: -0.01%, +0.01% Max live registers: 424184 -> 424464 (+0.07%) Max dispatch width: 50216 -> 50256 (+0.08%); split: +0.25%, -0.18% Skylake Totals: Instrs: 139092612 -> 139095257 (+0.00%); split: -0.00%, +0.00% Cycle count: 14533550285 -> 14533544716 (-0.00%); split: -0.00%, +0.00% Spill count: 58176 -> 58172 (-0.01%) Fill count: 95877 -> 95796 (-0.08%) Max live registers: 31924594 -> 31924874 (+0.00%) Max dispatch width: 5484568 -> 5484552 (-0.00%); split: +0.00%, -0.00% Totals from 5789 (0.93% of 625512) affected shaders: Instrs: 4481987 -> 4484632 (+0.06%); split: -0.04%, +0.10% Cycle count: 578310124 -> 578304555 (-0.00%); split: -0.05%, +0.05% Spill count: 9248 -> 9244 (-0.04%) Fill count: 19677 -> 19596 (-0.41%) Max live registers: 415340 -> 415620 (+0.07%) Max dispatch width: 49720 -> 49704 (-0.03%); split: +0.10%, -0.13% Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29095>
This commit is contained in:
parent
cd343fb9ac
commit
3f151c03af
5 changed files with 73 additions and 144 deletions
|
|
@ -849,135 +849,6 @@ try_emit_b2fi_of_inot(nir_to_brw_state &ntb, const fs_builder &bld,
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit code for nir_op_fsign possibly fused with a nir_op_fmul
|
||||
*
|
||||
* If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
|
||||
* the source of \c instr that is a \c nir_op_fsign.
|
||||
*/
|
||||
static void
|
||||
emit_fsign(nir_to_brw_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
|
||||
fs_reg result, fs_reg *op, unsigned fsign_src)
|
||||
{
|
||||
const intel_device_info *devinfo = ntb.devinfo;
|
||||
|
||||
fs_inst *inst;
|
||||
|
||||
assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
|
||||
assert(fsign_src < nir_op_infos[instr->op].num_inputs);
|
||||
|
||||
if (instr->op != nir_op_fsign) {
|
||||
const nir_alu_instr *const fsign_instr =
|
||||
nir_src_as_alu_instr(instr->src[fsign_src].src);
|
||||
|
||||
/* op[fsign_src] has the nominal result of the fsign, and op[1 -
|
||||
* fsign_src] has the other multiply source. This must be rearranged so
|
||||
* that op[0] is the source of the fsign op[1] is the other multiply
|
||||
* source.
|
||||
*/
|
||||
if (fsign_src != 0)
|
||||
op[1] = op[0];
|
||||
|
||||
op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
|
||||
|
||||
const nir_alu_type t =
|
||||
(nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
|
||||
nir_src_bit_size(fsign_instr->src[0].src));
|
||||
|
||||
op[0].type = brw_type_for_nir_type(devinfo, t);
|
||||
|
||||
unsigned channel = 0;
|
||||
if (nir_op_infos[instr->op].output_size == 0) {
|
||||
/* Since NIR is doing the scalarizing for us, we should only ever see
|
||||
* vectorized operations with a single channel.
|
||||
*/
|
||||
nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
|
||||
assert(util_bitcount(write_mask) == 1);
|
||||
channel = ffs(write_mask) - 1;
|
||||
}
|
||||
|
||||
op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
|
||||
}
|
||||
|
||||
if (brw_type_size_bytes(op[0].type) == 2) {
|
||||
/* AND(val, 0x8000) gives the sign bit.
|
||||
*
|
||||
* Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
|
||||
*/
|
||||
fs_reg zero = retype(brw_imm_uw(0), BRW_TYPE_HF);
|
||||
bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
|
||||
|
||||
op[0].type = BRW_TYPE_UW;
|
||||
result.type = BRW_TYPE_UW;
|
||||
bld.AND(result, op[0], brw_imm_uw(0x8000u));
|
||||
|
||||
if (instr->op == nir_op_fsign)
|
||||
inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
|
||||
else {
|
||||
/* Use XOR here to get the result sign correct. */
|
||||
inst = bld.XOR(result, result, retype(op[1], BRW_TYPE_UW));
|
||||
}
|
||||
|
||||
inst->predicate = BRW_PREDICATE_NORMAL;
|
||||
} else if (brw_type_size_bytes(op[0].type) == 4) {
|
||||
/* AND(val, 0x80000000) gives the sign bit.
|
||||
*
|
||||
* Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
|
||||
* zero.
|
||||
*/
|
||||
bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
|
||||
|
||||
op[0].type = BRW_TYPE_UD;
|
||||
result.type = BRW_TYPE_UD;
|
||||
bld.AND(result, op[0], brw_imm_ud(0x80000000u));
|
||||
|
||||
if (instr->op == nir_op_fsign)
|
||||
inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
|
||||
else {
|
||||
/* Use XOR here to get the result sign correct. */
|
||||
inst = bld.XOR(result, result, retype(op[1], BRW_TYPE_UD));
|
||||
}
|
||||
|
||||
inst->predicate = BRW_PREDICATE_NORMAL;
|
||||
} else {
|
||||
unreachable("Should have been lowered by nir_opt_algebraic.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
|
||||
*
|
||||
* Checks the operands of a \c nir_op_fmul to determine whether or not
|
||||
* \c emit_fsign could fuse the multiplication with the \c sign() calculation.
|
||||
*
|
||||
* \param instr The multiplication instruction
|
||||
*
|
||||
* \param fsign_src The source of \c instr that may or may not be a
|
||||
* \c nir_op_fsign
|
||||
*/
|
||||
static bool
|
||||
can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
|
||||
{
|
||||
assert(instr->op == nir_op_fmul);
|
||||
|
||||
nir_alu_instr *const fsign_instr =
|
||||
nir_src_as_alu_instr(instr->src[fsign_src].src);
|
||||
|
||||
/* Rules:
|
||||
*
|
||||
* 1. instr->src[fsign_src] must be a nir_op_fsign.
|
||||
* 2. The nir_op_fsign can only be used by this multiplication.
|
||||
* 3. The source that is the nir_op_fsign does not have source modifiers.
|
||||
* \c emit_fsign only examines the source modifiers of the source of the
|
||||
* \c nir_op_fsign.
|
||||
*
|
||||
* The nir_op_fsign must also not have the saturate modifier, but steps
|
||||
* have already been taken (in nir_opt_algebraic) to ensure that.
|
||||
*/
|
||||
return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
|
||||
is_used_once(fsign_instr);
|
||||
}
|
||||
|
||||
static bool
|
||||
is_const_zero(const nir_src &src)
|
||||
{
|
||||
|
|
@ -1233,8 +1104,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
|
|||
break;
|
||||
|
||||
case nir_op_fsign:
|
||||
emit_fsign(ntb, bld, instr, result, op, 0);
|
||||
break;
|
||||
unreachable("Should have been lowered by brw_nir_lower_fsign.");
|
||||
|
||||
case nir_op_frcp:
|
||||
bld.RCP(result, op[0]);
|
||||
|
|
@ -1322,17 +1192,6 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
|
|||
}
|
||||
|
||||
case nir_op_fmul:
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
if (can_fuse_fmul_fsign(instr, i)) {
|
||||
emit_fsign(ntb, bld, instr, result, op, i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* We emit the rounding mode after the previous fsign optimization since
|
||||
* it won't result in a MUL, but will try to negate the value by other
|
||||
* means.
|
||||
*/
|
||||
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
|
||||
brw_rnd_mode rnd =
|
||||
brw_rnd_mode_from_execution_mode(execution_mode);
|
||||
|
|
|
|||
|
|
@ -1711,7 +1711,11 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
|
||||
do {
|
||||
progress = false;
|
||||
if (OPT(nir_opt_algebraic_late)) {
|
||||
|
||||
OPT(nir_opt_algebraic_late);
|
||||
OPT(brw_nir_lower_fsign);
|
||||
|
||||
if (progress) {
|
||||
OPT(nir_opt_constant_folding);
|
||||
OPT(nir_copy_prop);
|
||||
OPT(nir_opt_dce);
|
||||
|
|
|
|||
|
|
@ -195,6 +195,8 @@ bool brw_nir_apply_trig_workarounds(nir_shader *nir);
|
|||
|
||||
bool brw_nir_limit_trig_input_range_workaround(nir_shader *nir);
|
||||
|
||||
bool brw_nir_lower_fsign(nir_shader *nir);
|
||||
|
||||
void brw_nir_apply_key(nir_shader *nir,
|
||||
const struct brw_compiler *compiler,
|
||||
const struct brw_base_prog_key *key,
|
||||
|
|
|
|||
53
src/intel/compiler/brw_nir_lower_fsign.py
Normal file
53
src/intel/compiler/brw_nir_lower_fsign.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
# Copyright © 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from math import pi
|
||||
|
||||
a = 'a'
|
||||
b = 'b'
|
||||
|
||||
lower_fsign = [
|
||||
# This matches the behavior of the old optimization in brw_fs_nir.cpp, but
|
||||
# it has some problems.
|
||||
#
|
||||
# The fmul version passes Vulkan float_controls2 CTS a little bit by
|
||||
# luck. The use of fne means that the false path (i.e., fsign(X) == 0) is
|
||||
# only taken when X is zero. For OpenCL, this path should also be taken
|
||||
# when when X is NaN. This can be handled by using 'fabs(X) > 0', but this
|
||||
# fails float_controls2 CTS when the other multiplication operand is NaN.
|
||||
#
|
||||
# This optimization is additionally problematic when fsign(X) is zero and
|
||||
# the other multiplication operand is Inf. This will result in 0, but it
|
||||
# should result in NaN. This does not seem to be tested by the CTS.
|
||||
#
|
||||
# NOTE: fcsel opcodes are currently limited to float32 in NIR.
|
||||
(('fmul@32', ('fsign(is_used_once)', a), b), ('fcsel', a , ('ixor', ('iand', a, 0x80000000), b), ('iand', a, 0x80000000))),
|
||||
(('fmul@16', ('fsign(is_used_once)', a), b), ('bcsel', ('fneu', a, 0), ('ixor', ('iand', a, 0x8000 ), b), ('iand', a, 0x8000 ))),
|
||||
|
||||
# This is 99.99% strictly correct for OpenCL. It will provide correctly
|
||||
# signed zero for ±0 inputs, and it will provide zero for NaN inputs. The
|
||||
# only slight deviation is that it can provide -0 for some NaN inputs.
|
||||
(('fsign@32', a), ('fcsel_gt', ('fabs', a) , ('ior', ('iand', a, 0x80000000), 0x3f800000), ('iand', a, 0x80000000))),
|
||||
(('fsign@16', a), ('bcsel', ('!flt', 0, ('fabs', a)), ('ior', ('iand', a, 0x8000 ), 0x3c00 ), ('iand', a, 0x8000 ))),
|
||||
]
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-p', '--import-path', required=True)
|
||||
args = parser.parse_args()
|
||||
sys.path.insert(0, args.import_path)
|
||||
run()
|
||||
|
||||
|
||||
def run():
|
||||
import nir_algebraic # pylint: disable=import-error
|
||||
|
||||
print('#include "brw_nir.h"')
|
||||
|
||||
print(nir_algebraic.AlgebraicPass("brw_nir_lower_fsign", lower_fsign).render())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -130,6 +130,17 @@ brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c',
|
|||
command : [prog_python, '@INPUT0@', '--outdir', meson.current_build_dir()])
|
||||
|
||||
|
||||
brw_nir_lower_fsign = custom_target(
|
||||
'brw_nir_lower_fsign.c',
|
||||
input : 'brw_nir_lower_fsign.py',
|
||||
output : 'brw_nir_lower_fsign.c',
|
||||
command : [
|
||||
prog_python, '@INPUT@', '-p', dir_compiler_nir,
|
||||
],
|
||||
depend_files : nir_algebraic_depends,
|
||||
capture : true,
|
||||
)
|
||||
|
||||
brw_nir_trig = custom_target(
|
||||
'brw_nir_trig_workarounds.c',
|
||||
input : 'brw_nir_trig_workarounds.py',
|
||||
|
|
@ -143,7 +154,7 @@ brw_nir_trig = custom_target(
|
|||
|
||||
libintel_compiler_brw = static_library(
|
||||
'intel_compiler',
|
||||
[libintel_compiler_brw_files, intel_nir_files, brw_nir_trig, ir_expression_operation_h, [brw_device_sha1_gen_src]],
|
||||
[libintel_compiler_brw_files, intel_nir_files, brw_nir_lower_fsign, brw_nir_trig, ir_expression_operation_h, [brw_device_sha1_gen_src]],
|
||||
include_directories : [inc_include, inc_src, inc_intel],
|
||||
c_args : [no_override_init_args],
|
||||
gnu_symbol_visibility : 'hidden',
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue