From 97e3c6a12a2c981bf70e64fbd4ab4e7bbf8601eb Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Fri, 3 May 2024 11:07:39 -0700 Subject: [PATCH] intel/brw: Use range analysis to optimize fsign shader-db: Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown) total instructions in shared programs: 19674784 -> 19665960 (-0.04%) instructions in affected programs: 933425 -> 924601 (-0.95%) helped: 3656 / HURT: 0 total cycles in shared programs: 810343919 -> 810241030 (-0.01%) cycles in affected programs: 56752034 -> 56649145 (-0.18%) helped: 3032 / HURT: 434 LOST: 11 GAINED: 0 Ice Lake and Skylake had similar results. (Ice Lake shown) total instructions in shared programs: 20315795 -> 20305856 (-0.05%) instructions in affected programs: 979698 -> 969759 (-1.01%) helped: 3845 / HURT: 0 total cycles in shared programs: 830600281 -> 830534694 (<.01%) cycles in affected programs: 45675615 -> 45610028 (-0.14%) helped: 3250 / HURT: 325 total spills in shared programs: 4583 -> 4565 (-0.39%) spills in affected programs: 180 -> 162 (-10.00%) helped: 3 / HURT: 0 total fills in shared programs: 5245 -> 5219 (-0.50%) fills in affected programs: 379 -> 353 (-6.86%) helped: 3 / HURT: 0 LOST: 14 GAINED: 8 fossil-db: All Intel platforms except Tiger Lake had similar results. (Meteor Lake shown) Totals: Instrs: 154024263 -> 154023814 (-0.00%) Cycle count: 17463341602 -> 17461726239 (-0.01%); split: -0.01%, +0.00% Totals from 322 (0.05% of 631440) affected shaders: Instrs: 199933 -> 199484 (-0.22%) Cycle count: 168492537 -> 166877174 (-0.96%); split: -0.96%, +0.00% Tiger Lake Instrs: 149984723 -> 149984287 (-0.00%) Cycle count: 15238596937 -> 15239260415 (+0.00%); split: -0.00%, +0.01% Max dispatch width: 5553408 -> 5553424 (+0.00%) Totals from 318 (0.05% of 631414) affected shaders: Instrs: 179624 -> 179188 (-0.24%) Cycle count: 160724533 -> 161388011 (+0.41%); split: -0.06%, +0.48% Max dispatch width: 3296 -> 3312 (+0.49%) Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_nir_lower_fsign.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/intel/compiler/brw_nir_lower_fsign.py b/src/intel/compiler/brw_nir_lower_fsign.py index 6821b3d98f5..04c6a2bde68 100644 --- a/src/intel/compiler/brw_nir_lower_fsign.py +++ b/src/intel/compiler/brw_nir_lower_fsign.py @@ -16,6 +16,16 @@ lower_fsign = [ # is_finite. # # NOTE: fcsel opcodes are currently limited to float32 in NIR. + (('fmul@32(is_only_used_as_float)', ('fsign(is_used_once)', 'a(is_not_negative)'), b), ('fcsel_gt', a , b , ('fmul', b, 0.0 ))), + (('~fmul@32', ('fsign(is_used_once)', 'a(is_not_negative)'), b), ('fcsel_gt', a , b , 0.0 )), + (('fmul@32(is_only_used_as_float)', ('fsign(is_used_once)', 'a(is_not_positive)'), b), ('fcsel_gt', ('fneg', a), ('fneg', b), ('fmul', b, 0x80000000))), + (('~fmul@32', ('fsign(is_used_once)', 'a(is_not_positive)'), b), ('fcsel_gt', ('fneg', a), ('fneg', b), 0x80000000 )), + + (('fmul@16(is_only_used_as_float)', ('fsign(is_used_once)', 'a(is_not_negative)'), b), ('bcsel', ('!flt', 0, a ), b , ('fmul', b, 0.0 ))), + (('~fmul@16', ('fsign(is_used_once)', 'a(is_not_negative)'), b), ('bcsel', ('!flt', 0, a ), b , 0.0 )), + (('fmul@16(is_only_used_as_float)', ('fsign(is_used_once)', 'a(is_not_positive)'), b), ('bcsel', ('!flt', 0, ('fneg', a)), ('fneg', b), ('fmul', b, 0x8000))), + (('~fmul@16', ('fsign(is_used_once)', 'a(is_not_positive)'), b), ('bcsel', ('!flt', 0, ('fneg', a)), ('fneg', b), 0x8000 )), + (('fmul@32(is_only_used_as_float,nsz)', ('fsign(is_used_once)', a), 'b(is_finite)'), ('fcsel_gt', a, b, ('fcsel_gt', ('fneg', a), ('fneg', b), 0.0))), (('fmul@32(is_only_used_as_float,nsz,nnan)', ('fsign(is_used_once)', a), b ), ('fcsel_gt', a, b, ('fcsel_gt', ('fneg', a), ('fneg', b), 0.0))), (('~fmul@32', ('fsign(is_used_once)', a), b ), ('fcsel_gt', a, b, ('fcsel_gt', ('fneg', a), ('fneg', b), 0.0))), @@ -25,6 +35,12 @@ lower_fsign = [ # only slight deviation is that it can provide -0 for some NaN inputs. (('fsign@32', a), ('fcsel_gt', ('fabs', a) , ('ior', ('iand', a, 0x80000000), 0x3f800000), ('iand', a, 0x80000000))), (('fsign@16', a), ('bcsel', ('!flt', 0, ('fabs', a)), ('ior', ('iand', a, 0x8000 ), 0x3c00 ), ('iand', a, 0x8000 ))), + + # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN + (('fmul(nsz,nnan)', 'a', 0.0), 0.0), + (('fmul(nsz)', 'a(is_finite)', 0.0), 0.0), + (('fmul(nsz,nnan)', 'a@32', 0x80000000), 0.0), + (('fmul(nsz,nnan)', 'a@16', 0x8000 ), 0.0), ] def main():