From 1ef20f1f3516162bb8f6200f11b53acce2f64315 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 19 Aug 2022 15:41:29 -0400 Subject: [PATCH] pan/bi: Optimize bitwise arithmetic of booleans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is easier to schedule on Bifrost. In theory it's also better on Valhall, but in practice the CVT unit is too overloaded on Valhall for this to help at the moment. We can revisit these rules for Valhall in the future when the Valhall optimizer is more mature and/or Valhall grows a scheduler to balance the execution units. total instructions in shared programs: 2415350 -> 2414877 (-0.02%) instructions in affected programs: 120948 -> 120475 (-0.39%) helped: 192 HURT: 49 helped stats (abs) min: 1.0 max: 5.0 x̄: 2.89 x̃: 4 helped stats (rel) min: 0.25% max: 4.35% x̄: 0.66% x̃: 0.52% HURT stats (abs) min: 1.0 max: 3.0 x̄: 1.67 x̃: 1 HURT stats (rel) min: 0.11% max: 7.14% x̄: 1.73% x̃: 0.77% 95% mean confidence interval for instructions value: -2.24 -1.68 95% mean confidence interval for instructions %-change: -0.37% 0.02% Inconclusive result (%-change mean confidence interval includes 0). total tuples in shared programs: 1928474 -> 1927478 (-0.05%) tuples in affected programs: 146482 -> 145486 (-0.68%) helped: 514 HURT: 73 helped stats (abs) min: 1.0 max: 8.0 x̄: 2.11 x̃: 1 helped stats (rel) min: 0.18% max: 9.52% x̄: 1.35% x̃: 0.76% HURT stats (abs) min: 1.0 max: 2.0 x̄: 1.23 x̃: 1 HURT stats (rel) min: 0.15% max: 7.14% x̄: 1.07% x̃: 0.76% 95% mean confidence interval for tuples value: -1.85 -1.55 95% mean confidence interval for tuples %-change: -1.19% -0.91% Tuples are helped. 
total clauses in shared programs: 354985 -> 354853 (-0.04%) clauses in affected programs: 8562 -> 8430 (-1.54%) helped: 124 HURT: 22 helped stats (abs) min: 1.0 max: 8.0 x̄: 1.24 x̃: 1 helped stats (rel) min: 0.83% max: 7.14% x̄: 2.47% x̃: 1.72% HURT stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 HURT stats (rel) min: 1.25% max: 20.00% x̄: 5.08% x̃: 4.35% 95% mean confidence interval for clauses value: -1.11 -0.70 95% mean confidence interval for clauses %-change: -1.92% -0.75% Clauses are helped. total cycles in shared programs: 166575.48 -> 166542.56 (-0.02%) cycles in affected programs: 4556.58 -> 4523.67 (-0.72%) helped: 395 HURT: 65 helped stats (abs) min: 0.041665999999999315 max: 0.33333199999999863 x̄: 0.09 x̃: 0 helped stats (rel) min: 0.19% max: 11.11% x̄: 1.42% x̃: 0.81% HURT stats (abs) min: 0.041665999999999315 max: 0.08333400000000069 x̄: 0.05 x̃: 0 HURT stats (rel) min: 0.15% max: 8.33% x̄: 1.21% x̃: 0.83% 95% mean confidence interval for cycles value: -0.08 -0.06 95% mean confidence interval for cycles %-change: -1.22% -0.87% Cycles are helped. total arith in shared programs: 73687.88 -> 73643 (-0.06%) arith in affected programs: 6339 -> 6294.13 (-0.71%) helped: 570 HURT: 72 helped stats (abs) min: 0.041665999999999315 max: 0.3333340000000007 x̄: 0.08 x̃: 0 helped stats (rel) min: 0.19% max: 12.50% x̄: 1.41% x̃: 0.77% HURT stats (abs) min: 0.041665999999999315 max: 0.08333400000000069 x̄: 0.05 x̃: 0 HURT stats (rel) min: 0.15% max: 8.33% x̄: 1.13% x̃: 0.75% 95% mean confidence interval for arith value: -0.08 -0.06 95% mean confidence interval for arith %-change: -1.27% -0.98% Arith are helped. 
total quadwords in shared programs: 1674486 -> 1673974 (-0.03%) quadwords in affected programs: 117696 -> 117184 (-0.44%) helped: 424 HURT: 127 helped stats (abs) min: 1.0 max: 6.0 x̄: 1.64 x̃: 1 helped stats (rel) min: 0.19% max: 4.88% x̄: 1.00% x̃: 0.82% HURT stats (abs) min: 1.0 max: 5.0 x̄: 1.46 x̃: 1 HURT stats (rel) min: 0.15% max: 6.25% x̄: 1.31% x̃: 0.88% 95% mean confidence interval for quadwords value: -1.07 -0.79 95% mean confidence interval for quadwords %-change: -0.58% -0.36% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/panfrost/bifrost/bifrost_compile.c | 4 ++++ src/panfrost/bifrost/bifrost_nir.h | 1 + src/panfrost/bifrost/bifrost_nir_algebraic.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index c0f466b7e4b..d316e805dd7 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -4571,6 +4571,10 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) NIR_PASS(progress, nir, nir_opt_cse); } + /* This opt currently helps on Bifrost but not Valhall */ + if (gpu_id < 0x9000) + NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise); + NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true); NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL); diff --git a/src/panfrost/bifrost/bifrost_nir.h b/src/panfrost/bifrost/bifrost_nir.h index b94ba6eaf38..ba0208012b8 100644 --- a/src/panfrost/bifrost/bifrost_nir.h +++ b/src/panfrost/bifrost/bifrost_nir.h @@ -27,3 +27,4 @@ bool bifrost_nir_lower_algebraic_late(nir_shader *shader); bool bifrost_nir_lower_xfb(nir_shader *shader); +bool bifrost_nir_opt_boolean_bitwise(nir_shader *shader); diff --git a/src/panfrost/bifrost/bifrost_nir_algebraic.py b/src/panfrost/bifrost/bifrost_nir_algebraic.py index 9a6085815fd..77fad35ff31 100644 --- 
a/src/panfrost/bifrost/bifrost_nir_algebraic.py +++ b/src/panfrost/bifrost/bifrost_nir_algebraic.py @@ -28,6 +28,20 @@ a = 'a' b = 'b' c = 'c' +# In general, bcsel is cheaper than bitwise arithmetic on Mali. On +# Bifrost, we can implement bcsel as either CSEL or MUX to schedule to either +# execution unit. On Valhall, bitwise arithmetic may be on the SFU whereas MUX +# is on the higher throughput CVT unit. We get a zero argument for free relative +# to the bitwise op, which would be LSHIFT_* internally taking a zero anyway. +# +# As such, it's beneficial to reexpress bitwise arithmetic of booleans as bcsel. +opt_bool_bitwise = [ + (('iand', 'a@1', 'b@1'), ('bcsel', a, b, False)), + (('ior', 'a@1', 'b@1'), ('bcsel', a, a, b)), + (('iand', 'a@1', ('inot', 'b@1')), ('bcsel', b, 0, a)), + (('ior', 'a@1', ('inot', 'b@1')), ('bcsel', b, a, True)), +] + algebraic_late = [ # Canonical form. The scheduler will convert back if it makes sense. (('fmul', a, 2.0), ('fadd', a, a)), @@ -69,6 +83,8 @@ def run(): print('#include "bifrost_nir.h"') + print(nir_algebraic.AlgebraicPass("bifrost_nir_opt_boolean_bitwise", + opt_bool_bitwise).render()) print(nir_algebraic.AlgebraicPass("bifrost_nir_lower_algebraic_late", algebraic_late).render())