broadcom/compiler: add a strategy to disable scheduling of general TMU reads

Scheduling general TMU reads can add quite a bit of register pressure, so
it makes sense to have a strategy that disables it to prevent us from
dropping to 2 threads or increasing spills:

total instructions in shared programs: 12672813 -> 12642413 (-0.24%)
instructions in affected programs: 256721 -> 226321 (-11.84%)
helped: 719
HURT: 77

total threads in shared programs: 415534 -> 416322 (0.19%)
threads in affected programs: 788 -> 1576 (100.00%)
helped: 394
HURT: 0

total uniforms in shared programs: 3711370 -> 3703861 (-0.20%)
uniforms in affected programs: 28859 -> 21350 (-26.02%)
helped: 204
HURT: 455

total max-temps in shared programs: 2159439 -> 2150686 (-0.41%)
max-temps in affected programs: 32945 -> 24192 (-26.57%)
helped: 585
HURT: 47

total spills in shared programs: 5966 -> 3255 (-45.44%)
spills in affected programs: 2933 -> 222 (-92.43%)
helped: 192
HURT: 4

total fills in shared programs: 9328 -> 4630 (-50.36%)
fills in affected programs: 5184 -> 486 (-90.62%)
helped: 196
HURT: 0

Compared to the stats before adding scheduling of non-filtered
memory reads, we see that we have now gotten back all that was
lost and then some:

total instructions in shared programs: 12663186 -> 12642413 (-0.16%)
instructions in affected programs: 2051803 -> 2031030 (-1.01%)
helped: 4885
HURT: 3338

total threads in shared programs: 415870 -> 416322 (0.11%)
threads in affected programs: 896 -> 1348 (50.45%)
helped: 300
HURT: 74

total uniforms in shared programs: 3711629 -> 3703861 (-0.21%)
uniforms in affected programs: 158766 -> 150998 (-4.89%)
helped: 1973
HURT: 499

total max-temps in shared programs: 2138857 -> 2150686 (0.55%)
max-temps in affected programs: 177920 -> 189749 (6.65%)
helped: 2666
HURT: 2035

total spills in shared programs: 3860 -> 3255 (-15.67%)
spills in affected programs: 2653 -> 2048 (-22.80%)
helped: 77
HURT: 21

total fills in shared programs: 5573 -> 4630 (-16.92%)
fills in affected programs: 3839 -> 2896 (-24.56%)
helped: 81
HURT: 15

total sfu-stalls in shared programs: 39583 -> 38154 (-3.61%)
sfu-stalls in affected programs: 8993 -> 7564 (-15.89%)
helped: 1808
HURT: 1038

total nops in shared programs: 324894 -> 323685 (-0.37%)
nops in affected programs: 30362 -> 29153 (-3.98%)
helped: 2513
HURT: 2077

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15276>
commit a35b47a0b1 (parent f783bd0d2a)
Author: Iago Toral Quiroga, 2022-03-07 14:42:39 +01:00 (committed by Marge Bot)
3 changed files with 67 additions and 32 deletions


@@ -3200,8 +3200,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
-                if (!ntq_emit_load_unifa(c, instr))
+                if (!ntq_emit_load_unifa(c, instr)) {
                         ntq_emit_tmu_general(c, instr, false);
+                        c->has_general_tmu_load = true;
+                }
                 break;
         case nir_intrinsic_ssbo_atomic_add:
@@ -3228,14 +3230,17 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_shared_atomic_xor:
         case nir_intrinsic_shared_atomic_exchange:
         case nir_intrinsic_shared_atomic_comp_swap:
-        case nir_intrinsic_load_shared:
         case nir_intrinsic_store_shared:
-        case nir_intrinsic_load_scratch:
         case nir_intrinsic_store_scratch:
                 ntq_emit_tmu_general(c, instr, true);
                 break;
-        case nir_intrinsic_image_load:
+        case nir_intrinsic_load_scratch:
+        case nir_intrinsic_load_shared:
+                ntq_emit_tmu_general(c, instr, true);
+                c->has_general_tmu_load = true;
+                break;
         case nir_intrinsic_image_store:
         case nir_intrinsic_image_atomic_add:
         case nir_intrinsic_image_atomic_imin:
@@ -3250,6 +3255,15 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 v3d40_vir_emit_image_load_store(c, instr);
                 break;
+        case nir_intrinsic_image_load:
+                v3d40_vir_emit_image_load_store(c, instr);
+                /* Not really a general TMU load, but we only use this flag
+                 * for NIR scheduling and we do schedule these under the same
+                 * policy as general TMU.
+                 */
+                c->has_general_tmu_load = true;
+                break;
         case nir_intrinsic_get_ssbo_size:
                 ntq_store_dest(c, &instr->dest, 0,
                                vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,


@@ -710,6 +710,11 @@ struct v3d_compile {
         bool disable_loop_unrolling;
         bool unrolled_any_loops;
+        /* Disables scheduling of general TMU loads (and unfiltered image load).
+         */
+        bool disable_general_tmu_sched;
+        bool has_general_tmu_load;
+
         /* Minimum number of threads we are willing to use to register allocate
          * a shader with the current compilation strategy. This only prevents
          * us from lowering the thread count to register allocate successfully,


@@ -550,6 +550,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  uint32_t max_threads,
                  uint32_t min_threads_for_reg_alloc,
                  uint32_t max_tmu_spills,
+                 bool disable_general_tmu_sched,
                  bool disable_loop_unrolling,
                  bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
@@ -569,6 +570,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
         c->max_tmu_spills = max_tmu_spills;
         c->fallback_scheduler = fallback_scheduler;
+        c->disable_general_tmu_sched = disable_general_tmu_sched;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
         c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
         c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
@@ -1122,6 +1124,8 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
 static unsigned
 v3d_instr_delay_cb(nir_instr *instr, void *data)
 {
+        struct v3d_compile *c = (struct v3d_compile *) data;
+
         switch (instr->type) {
         case nir_instr_type_ssa_undef:
         case nir_instr_type_load_const:
@@ -1134,18 +1138,22 @@ v3d_instr_delay_cb(nir_instr *instr, void *data)
                 return 1;
         case nir_instr_type_intrinsic: {
-                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-                switch (intr->intrinsic) {
-                case nir_intrinsic_load_ssbo:
-                case nir_intrinsic_load_scratch:
-                case nir_intrinsic_load_shared:
-                case nir_intrinsic_image_load:
-                        return 30;
-                case nir_intrinsic_load_ubo:
-                        if (nir_src_is_divergent(intr->src[1]))
-                                return 30;
-                        FALLTHROUGH;
-                default:
-                        return 1;
-                }
+                if (!c->disable_general_tmu_sched) {
+                        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                        switch (intr->intrinsic) {
+                        case nir_intrinsic_load_ssbo:
+                        case nir_intrinsic_load_scratch:
+                        case nir_intrinsic_load_shared:
+                        case nir_intrinsic_image_load:
+                                return 30;
+                        case nir_intrinsic_load_ubo:
+                                if (nir_src_is_divergent(intr->src[1]))
+                                        return 30;
+                                FALLTHROUGH;
+                        default:
+                                return 1;
+                        }
+                } else {
+                        return 1;
+                }
                 break;
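
For context, this callback is the per-instruction delay estimate consulted by
NIR's scheduler: a large value tells it to move a load well ahead of its uses
to hide latency, while returning 1 for everything (as the new strategy does)
keeps general TMU loads next to their uses, trading latency hiding for shorter
live ranges. The sketch below shows how such a callback is typically handed to
the scheduler; it is not part of this commit, and the nir_schedule_options
field names and the threshold value are assumptions to be checked against
nir_schedule.h:

/* Sketch only: wiring the delay callback into NIR's scheduler.  The
 * instr_delay_cb/instr_delay_cb_data field names are assumed here. */
static void
v3d_nir_schedule_example(struct v3d_compile *c)
{
        nir_schedule_options schedule_options = {
                .threshold = 24,                  /* example pressure target */
                .instr_delay_cb = v3d_instr_delay_cb,
                .instr_delay_cb_data = c,         /* arrives as 'data' above */
        };
        NIR_PASS_V(c->s, nir_schedule, &schedule_options);
}

The cast at the top of v3d_instr_delay_cb exists precisely because the compile
context is passed through that opaque data pointer.
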
@@ -1674,20 +1682,23 @@ struct v3d_compiler_strategy {
         const char *name;
         uint32_t max_threads;
         uint32_t min_threads;
+        bool disable_general_tmu_sched;
         bool disable_loop_unrolling;
         bool disable_ubo_load_sorting;
         bool disable_tmu_pipelining;
         uint32_t max_tmu_spills;
 } static const strategies[] = {
-  /*0*/ { "default",                        4, 4, false, false, false,  0 },
-  /*1*/ { "disable loop unrolling",         4, 4, true,  false, false,  0 },
-  /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false,  0 },
-  /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,   0 },
-  /*4*/ { "lower thread count",             2, 1, false, false, false, -1 },
-  /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, -1 },
-  /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, -1 },
-  /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  -1 },
-  /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  -1 }
+  /*0*/  { "default",                        4, 4, false, false, false, false,  0 },
+  /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false,  0 },
+  /*2*/  { "disable loop unrolling",         4, 4, true,  true,  false, false,  0 },
+  /*3*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  false,  0 },
+  /*4*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,   0 },
+  /*5*/  { "lower thread count",             2, 1, false, false, false, false, -1 },
+  /*6*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, -1 },
+  /*7*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  false, false, -1 },
+  /*8*/  { "disable UBO load sorting (2t)",  2, 1, true,  true,  true,  false, -1 },
+  /*9*/  { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  -1 },
+  /*10*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  -1 }
 };
 
 /**
@@ -1695,7 +1706,7 @@ struct v3d_compiler_strategy {
  * attempt disabling it alone won't allow us to compile the shader successfuly,
  * since we'll end up with the same code. Detect these scenarios so we can
  * avoid wasting time with useless compiles. We should also consider if the
  * strategy changes other aspects of the compilation process though, like
  * spilling, and not skip it in that case.
  */
 static bool
@@ -1714,20 +1725,24 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
         }
         switch (idx) {
-        /* Loop unrolling: skip if we didn't unroll any loops */
+        /* General TMU sched.: skip if we didn't emit any TMU loads */
         case 1:
-        case 5:
+        case 6:
+                return !c->has_general_tmu_load;
+        /* Loop unrolling: skip if we didn't unroll any loops */
+        case 2:
+        case 7:
                 return !c->unrolled_any_loops;
         /* UBO load sorting: skip if we didn't sort any loads */
-        case 2:
-        case 6:
+        case 3:
+        case 8:
                 return !c->sorted_any_ubo_loads;
         /* TMU pipelining: skip if we didn't pipeline any TMU ops */
-        case 3:
-        case 7:
+        case 4:
+        case 9:
                 return !c->pipelined_any_tmu;
         /* Lower thread count: skip if we already tried less that 4 threads */
-        case 4:
+        case 5:
                 return c->threads < 4;
         default:
                 return false;
@@ -1780,6 +1795,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                 strategies[strat].max_threads,
                                 strategies[strat].min_threads,
                                 strategies[strat].max_tmu_spills,
+                                strategies[strat].disable_general_tmu_sched,
                                 strategies[strat].disable_loop_unrolling,
                                 strategies[strat].disable_ubo_load_sorting,
                                 strategies[strat].disable_tmu_pipelining,
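
Taken together, the new strategy slots into the existing fallback mechanism:
v3d_compile() walks the strategies[] table in order, and skip_compile_strategy()
prunes attempts that cannot change the previous result. A simplified,
self-contained sketch of that loop follows (illustrative only; the types and
helper names are stand-ins, not the actual Mesa code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the per-strategy knobs added in this commit. */
struct strategy {
        const char *name;
        bool disable_general_tmu_sched;
        uint32_t max_tmu_spills;
};

/* Stand-in for the feedback a failed attempt leaves behind. */
struct attempt_info {
        bool has_general_tmu_load;
};

/* Hypothetical compile attempt: fills *info and returns success.  For the
 * sketch, pretend only the reduced-pressure strategy fits in registers. */
static bool
try_compile(const struct strategy *s, struct attempt_info *info)
{
        info->has_general_tmu_load = true;
        return s->disable_general_tmu_sched;
}

/* Mirrors the idea of skip_compile_strategy(): disabling general TMU load
 * scheduling cannot help if the previous attempt emitted no such loads. */
static bool
skip_strategy(const struct strategy *s, const struct attempt_info *prev)
{
        return s->disable_general_tmu_sched && !prev->has_general_tmu_load;
}

static bool
compile_with_fallbacks(const struct strategy *strategies, int count)
{
        struct attempt_info prev = { .has_general_tmu_load = true };

        for (int i = 0; i < count; i++) {
                if (i > 0 && skip_strategy(&strategies[i], &prev))
                        continue;               /* same code would result; skip */

                struct attempt_info info;
                if (try_compile(&strategies[i], &info)) {
                        printf("compiled with strategy '%s'\n", strategies[i].name);
                        return true;
                }
                prev = info;                    /* remember what the failure told us */
        }
        return false;
}

int main(void)
{
        const struct strategy table[] = {
                { "default",                   false, 0 },
                { "disable general TMU sched", true,  0 },
        };
        return compile_with_fallbacks(table, 2) ? 0 : 1;
}

The real implementation tracks more per-attempt state (unrolled loops, sorted
UBO loads, pipelined TMU ops, spill counts), but the control flow is the same:
only strategies that can actually change the outcome are retried.
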