From bcde7c14c688b0364ecfcb4753e4f57b230e4acb Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Tue, 2 Feb 2021 16:04:57 -0800 Subject: [PATCH] nir/algebraic: Fix a >> #b << #b for sizes other than 32-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The base mask previously used was 0xffffffff. This is not correct (but should still work) for 16-bit and 8-bit values, but it means the high 32-bits of 64-bit values will get chopped off. Instead of just restricting the pattern to 32-bits (as was done before 00b28a50b2c), this extends the optimization in two ways: 1. Make it correct for other bit sizes. 2. Make it work for arbitrary shift counts. This has the added benefit of reducing the number of patterns actually added (7 previously, 4 now). The "Reassociate for improved CSE" part is just reverted to its pre-00b28a50b2c behavior. I doubt that pattern is likely to have much impact outside 32-bits. This change fixes the piglit tests tests/spec/arb_gpu_shader_int64/fs-shl-of-shr-int64.shader_test and tests/spec/arb_gpu_shader_int64/fs-iand-of-iadd-int64.shader_test. All of the shaders helped in shader-db are vertex shaders on platforms with vector-oriented vertex processing. The shaders contain ((x >> 16) << 16). These platforms set lower_extract_word, so the optimization that transforms (x >> 16) to extract_u16 doesn't trigger. With only ~60 shaders involved, I didn't bother trying to add extract_XYZ versions of these patterns to try to get those cases. Fixes: 00b28a50b2c ("nir/algebraic: trivially enable existing 32-bit patterns for all bit sizes") Reviewed-by: Rhys Perry Haswell and earlier Intel GPUs had simlar results. (Haswell shown) total instructions in shared programs: 16397554 -> 16397496 (<.01%) instructions in affected programs: 7961 -> 7903 (-0.73%) helped: 58 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.36% max: 1.89% x̄: 0.99% x̃: 0.78% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -1.13% -0.85% Instructions are helped. total cycles in shared programs: 1035483770 -> 1035483504 (<.01%) cycles in affected programs: 75922 -> 75656 (-0.35%) helped: 44 HURT: 2 helped stats (abs) min: 2 max: 12 x̄: 6.14 x̃: 2 helped stats (rel) min: 0.05% max: 1.67% x̄: 0.87% x̃: 0.72% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.06% max: 0.06% x̄: 0.06% x̃: 0.06% 95% mean confidence interval for cycles value: -7.28 -4.29 95% mean confidence interval for cycles %-change: -1.03% -0.63% Cycles are helped. Part-of: (cherry picked from commit 6b0443a9008ac4c004b1f3fb846b5c1e8c961df2) --- .pick_status.json | 2 +- src/compiler/nir/nir_opt_algebraic.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index 0379e151253..3b30ea91f91 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -1903,7 +1903,7 @@ "description": "nir/algebraic: Fix a >> #b << #b for sizes other than 32-bit", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "master_sha": null, "because_sha": "00b28a50b2c492eee25ef3f75538aabe1e569ff1" }, diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 35a51aa4806..56de4ab72f4 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -273,17 +273,23 @@ for s in [8, 16, 32, 64]: # Optimize a pattern of address calculation created by DXVK where the offset is # divided by 4 and then multipled by 4. This can be turned into an iand and the # additions before can be reassociated to CSE the iand instruction. + +for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)): + a_sz = 'a@{}'.format(size) + + optimizations.extend([ + # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)' + (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))), + ]) + for log2 in range(1, 7): # powers of two from 2 to 64 v = 1 << log2 mask = 0xffffffff & ~(v - 1) b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v) optimizations.extend([ - # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)' - (('ishl', ('ushr', a, log2), log2), ('iand', a, mask)), - # Reassociate for improved CSE - (('iand', ('iadd', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)), + (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)), ]) # To save space in the state tables, reduce to the set that is known to help.