pan/compiler: Rework scratch memory strategy

Before this commit, all scratch memory was allocated in 16-byte chunks
and indirect references were always lowered into if-else trees.  This
patch cleans this up a bit by using a more compact layout that is still
TLS friendly, allowing indirect accesses (only lowering them as an
optimization), and switching to the newer nir_lower_explicit_io.
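
As a rough illustration (plain C, not NIR; the array name and size are
made up), this is the difference between lowering an indirect reference
into an if-else tree and keeping it as a real indexed scratch access:

#include <stdint.h>

static uint32_t scratch[4];

/* Old strategy: the variable index becomes a chain of compares and
 * selects, so no indexed memory access survives. */
uint32_t load_if_else_tree(uint32_t i)
{
   uint32_t v = scratch[0];
   v = (i == 1) ? scratch[1] : v;
   v = (i == 2) ? scratch[2] : v;
   v = (i == 3) ? scratch[3] : v;
   return v;
}

/* New strategy: keep the indirect access itself; only small arrays are
 * still turned into select chains, and only as an optimization. */
uint32_t load_indirect(uint32_t i)
{
   return scratch[i & 3];
}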

This should improve performance on some shaders, but it also lifts a lot
of dust off the compiler, uncovering some new bugs.  Those have been kept
at bay by disabling local memory vectorization.

Signed-off-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40924>
Lorenzo Rossi 2026-04-29 14:47:39 +02:00 committed by Marge Bot
parent f0d2ad9840
commit 01e6a0555c

@@ -274,11 +274,20 @@ bi_optimize_nir(nir_shader *nir, uint64_t gpu_id,
NIR_PASS(_, nir, nir_opt_shrink_vectors, false);
/* Why aren't we vectorizing nir_var_shader_temp?
* Basically, the current RA doesn't know rematerialization and is still
* learning to spill; if we vectorize temp stores, it might create long-lived
* COLLECTs that make the RA fall off the bicycle and produce very scary spills
* (spills that are just other temp STORE/LOADs).
*
* Really hope that a Metroid boss hears my prayer and saves the day soon!
* Test case: dEQP-VK.subgroups.ballot_broadcast.compute.subgroupbroadcast_u8vec3
* TODO: Fix RA and re-enable temp vectorization.
*/
nir_load_store_vectorize_options vectorize_opts = {
.modes = nir_var_mem_global |
nir_var_mem_shared |
nir_var_mem_ubo |
nir_var_shader_temp,
nir_var_mem_ubo /* | nir_var_mem_temp */,
.callback = mem_vectorize_cb,
.robust_modes = robust_modes,
};
@@ -397,22 +406,6 @@ bifrost_preprocess_nir(nir_shader *nir, uint64_t gpu_id)
/* Get rid of any global vars before we lower to scratch. */
NIR_PASS(_, nir, nir_lower_global_vars_to_local);
/* Valhall introduces packed thread local storage, which improves cache
* locality of TLS access. However, access to packed TLS cannot
* straddle 16-byte boundaries. As such, when packed TLS is in use
* (currently unconditional for Valhall), we force vec4 alignment for
* scratch access.
*/
glsl_type_size_align_func vars_to_scratch_size_align_func =
(pan_arch(gpu_id) >= 9) ? glsl_get_vec4_size_align_bytes
: glsl_get_natural_size_align_bytes;
/* Lower large arrays to scratch and small arrays to bcsel */
NIR_PASS(_, nir, nir_lower_scratch_to_var);
NIR_PASS(_, nir, nir_lower_vars_to_scratch, 256,
vars_to_scratch_size_align_func, vars_to_scratch_size_align_func);
NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees,
nir_var_function_temp, ~0);
bi_optimize_loop_nir(nir, gpu_id, true);
NIR_PASS(_, nir, nir_lower_var_copies);
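
As a side note, here is a minimal plain-C sketch of the two size/align
policies the removed comment refers to (the helper names and placement
logic are illustrative only, not the exact GLSL rules):

#include <stdint.h>

/* Round x up to a power-of-two alignment a. */
static uint32_t align_up(uint32_t x, uint32_t a)
{
   return (x + a - 1) & ~(a - 1);
}

/* Natural packing: a 12-byte element (e.g. a 3-component vector of
 * 32-bit values) may start at offset 4 and straddle a 16-byte boundary,
 * which packed TLS on Valhall cannot tolerate. */
uint32_t place_natural(uint32_t *cursor, uint32_t size, uint32_t align)
{
   uint32_t off = align_up(*cursor, align);
   *cursor = off + size;
   return off;
}

/* Forced vec4 layout: every element is padded and aligned to 16 bytes,
 * so no single access can cross a 16-byte boundary. */
uint32_t place_vec4(uint32_t *cursor, uint32_t size)
{
   uint32_t off = align_up(*cursor, 16);
   *cursor = off + align_up(size, 16);
   return off;
}
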
@@ -842,6 +835,17 @@ mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
static void bi_lower_texture_nir(nir_shader *nir, uint64_t gpu_id);
static void bi_lower_texture_late_nir(nir_shader *nir, uint64_t gpu_id);
static bool
nir_shader_has_local_variables(const nir_shader *nir)
{
nir_foreach_function(func, nir) {
if (func->impl && !exec_list_is_empty(&func->impl->locals))
return true;
}
return false;
}
void
bifrost_postprocess_nir(nir_shader *nir, uint64_t gpu_id)
{
@@ -871,6 +875,44 @@ bifrost_postprocess_nir(nir_shader *nir, uint64_t gpu_id)
NIR_PASS(_, nir, pan_nir_lower_noperspective_vs);
}
/* Our OpenCL compiler (src/panfrost/clc/pan_compile.c) has a very weird and
* suboptimal optimization pipeline that results in a lot of unoptimized
* memcpys and sparse scratch space. That code is still being used for
* panlib, so we try to re-optimize it here.
* TODO: If you want to remove this pass, first optimize clc libpan on v9
* until it doesn't emit kilobytes of scratch access.
*/
NIR_PASS(_, nir, nir_lower_scratch_to_var);
if (nir_shader_has_local_variables(nir)) {
/* Lower indirect access on small arrays to if/else trees. After
* vars_to_ssa and copy propagation, these will often end up as just a
* handful of MUX instructions instead of memory access. The threshold
* of 8 array elements is chosen fairly arbitrarily.
*/
NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees,
nir_var_function_temp, 8);
/* Turn the deref loads/stores we just made direct into SSA values */
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, nir_lower_vars_to_ssa);
NIR_PASS(_, nir, nir_opt_dce);
/* Get rid of any dead function_temp variables so they don't get
* assigned scratch space by vars_to_explicit_types().
*/
NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
/* This can create illegal memory accesses for TLS (e.g. a struct with
* four uint32_t + memcpy). Let nir_lower_mem_access_bit_sizes split it;
* bandwidth is more important than instruction count.
*/
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
glsl_get_natural_size_align_bytes);
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_function_temp,
nir_address_format_32bit_offset);
}
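
As a rough picture, in plain C with invented names rather than actual
NIR output, an array that stays in scratch after the explicit-IO
lowering above is accessed through a computed 32-bit byte offset:

#include <stdint.h>
#include <string.h>

uint32_t load_scratch_elem(const uint8_t *scratch_base, uint32_t var_base,
                           uint32_t index, uint32_t stride)
{
   /* vars_to_explicit_types assigns the variable's base offset and natural
    * stride; lower_explicit_io turns the deref into this arithmetic. */
   uint32_t offset = var_base + index * stride;
   uint32_t value;
   memcpy(&value, scratch_base + offset, sizeof(value));
   return value;
}
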
nir_lower_mem_access_bit_sizes_options mem_size_options = {
.modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo |
nir_var_mem_constant | nir_var_mem_task_payload |