From c950098abb5f257e83f986ccce7bab004520c069 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Wed, 3 May 2023 11:52:37 +0200 Subject: [PATCH] broadcom/compiler: move buffer loads to lower register pressure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we are trying to lower register pressure this can make a big difference in some cases. To avoid adding even more strategies, merge this with disabling ubo load sorting, since they are basically trying to do the same. total instructions in shared programs: 12848024 -> 12844510 (-0.03%) instructions in affected programs: 236537 -> 233023 (-1.49%) helped: 195 HURT: 87 Instructions are helped. total uniforms in shared programs: 3815601 -> 3814932 (-0.02%) uniforms in affected programs: 31773 -> 31104 (-2.11%) helped: 67 HURT: 115 Inconclusive result (value mean confidence interval includes 0). total max-temps in shared programs: 2210803 -> 2210622 (<.01%) max-temps in affected programs: 9362 -> 9181 (-1.93%) helped: 114 HURT: 34 Max-temps are helped. total spills in shared programs: 2556 -> 2330 (-8.84%) spills in affected programs: 1391 -> 1165 (-16.25%) helped: 39 HURT: 9 total fills in shared programs: 3840 -> 3317 (-13.62%) fills in affected programs: 2379 -> 1856 (-21.98%) helped: 39 HURT: 23 total sfu-stalls in shared programs: 21965 -> 21978 (0.06%) sfu-stalls in affected programs: 2618 -> 2631 (0.50%) helped: 45 HURT: 81 Inconclusive result (value mean confidence interval includes 0). total inst-and-stalls in shared programs: 12869989 -> 12866488 (-0.03%) inst-and-stalls in affected programs: 238771 -> 235270 (-1.47%) helped: 193 HURT: 87 Inst-and-stalls are helped. total nops in shared programs: 303501 -> 303274 (-0.07%) nops in affected programs: 4159 -> 3932 (-5.46%) helped: 87 HURT: 105 Inconclusive result (value mean confidence interval includes 0). 
Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/v3d_compiler.h | 5 ++++ src/broadcom/compiler/vir.c | 42 ++++++++++++++++++---------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 7a43c72cc36..61d0c5ccdc8 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -715,6 +715,11 @@ struct v3d_compile { bool disable_constant_ubo_load_sorting; bool sorted_any_ubo_loads; + /* Moves UBO/SSBO loads right before their first user (nir_opt_move). * This can reduce register pressure. */ + bool move_buffer_loads; + /* Emits ldunif for each new uniform, even if the uniform was already * emitted in the same block. Useful to compile shaders with high * register pressure or to disable the optimization during uniform diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 707c96911e8..5355645085d 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -549,6 +549,7 @@ struct v3d_compiler_strategy { bool disable_gcm; bool disable_loop_unrolling; bool disable_ubo_load_sorting; + bool move_buffer_loads; bool disable_tmu_pipelining; uint32_t max_tmu_spills; }; @@ -583,6 +584,7 @@ vir_compile_init(const struct v3d_compiler *compiler, c->disable_general_tmu_sched = strategy->disable_general_tmu_sched; c->disable_tmu_pipelining = strategy->disable_tmu_pipelining; c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting; + c->move_buffer_loads = strategy->move_buffer_loads; c->disable_gcm = strategy->disable_gcm; c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL) ? true : strategy->disable_loop_unrolling; @@ -1649,8 +1651,11 @@ v3d_attempt_compile(struct v3d_compile *c) if (!c->disable_constant_ubo_load_sorting) NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c); + const nir_move_options buffer_opts = c->move_buffer_loads ? 
+ (nir_move_load_ubo | nir_move_load_ssbo) : 0; NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform | - nir_move_const_undef); + nir_move_const_undef | + buffer_opts); v3d_nir_to_vir(c); } @@ -1710,19 +1715,19 @@ int v3d_shaderdb_dump(struct v3d_compile *c, * because v3d_nir_to_vir will cap this to the actual minimum. */ static const struct v3d_compiler_strategy strategies[] = { - /*0*/ { "default", 4, 4, false, false, false, false, false, 0 }, - /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, 0 }, - /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, 0 }, - /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, 0 }, - /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, 0 }, - /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, true, 0 }, - /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, -1 }, - /*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, -1 }, - /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, -1 }, - /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, -1 }, - /*10*/ { "disable UBO load sorting (2t)", 2, 1, true, true, true, true, false, -1 }, - /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, -1 }, - /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, -1 } + /*0*/ { "default", 4, 4, false, false, false, false, false, false, 0 }, + /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, false, 0 }, + /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, false, 0 }, + /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, false, 0 }, + /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, false, 0 }, + /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, false, true, 0 }, + /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, false, -1 }, + 
/*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, false, -1 }, + /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, false, -1 }, + /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, false, -1 }, + /*10*/ { "Move buffer loads (2t)", 2, 1, true, true, true, true, true, false, -1 }, + /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, true, -1 }, + /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, true, -1 } }; /** @@ -1763,8 +1768,15 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx) return !c->unrolled_any_loops; /* UBO load sorting: skip if we didn't sort any loads */ case 4: - case 10: return !c->sorted_any_ubo_loads; + /* Move buffer loads: we assume any shader with difficult RA + * most likely has UBO / SSBO loads so we never try to skip. + * For now, we only try this for 2-thread compiles since it + * is expected to impact instruction counts and latency. + */ + case 10: + assert(c->threads < 4); + return false; /* TMU pipelining: skip if we didn't pipeline any TMU ops */ case 5: case 11: