diff --git a/src/panfrost/compiler/bifrost/bifrost_nir.c b/src/panfrost/compiler/bifrost/bifrost_nir.c index b43cad7469c..9faab870d95 100644 --- a/src/panfrost/compiler/bifrost/bifrost_nir.c +++ b/src/panfrost/compiler/bifrost/bifrost_nir.c @@ -274,11 +274,20 @@ bi_optimize_nir(nir_shader *nir, uint64_t gpu_id, NIR_PASS(_, nir, nir_opt_shrink_vectors, false); + /* Why aren't we vectorizing nir_var_shader_temp? + * Basically, the current RA doesn't know rematerialization and is still + * learning spills; if we vectorize temp stores it might create long-lived + * COLLECTs that make the RA fall off the bicycle and create very scary spills. + * (spills that are just other temp STORE/LOADs). + * + * Really hope that a Metroid boss hears my prayer and saves the day soon! + * test case: dEQP-VK.subgroups.ballot_broadcast.compute.subgroupbroadcast_u8vec3 + * TODO: Fix RA and re-enable temp vectorization. + */ nir_load_store_vectorize_options vectorize_opts = { .modes = nir_var_mem_global | nir_var_mem_shared | - nir_var_mem_ubo | - nir_var_shader_temp, + nir_var_mem_ubo /* | nir_var_shader_temp */, .callback = mem_vectorize_cb, .robust_modes = robust_modes, }; @@ -397,22 +406,6 @@ bifrost_preprocess_nir(nir_shader *nir, uint64_t gpu_id) /* Get rid of any global vars before we lower to scratch. */ NIR_PASS(_, nir, nir_lower_global_vars_to_local); - /* Valhall introduces packed thread local storage, which improves cache - * locality of TLS access. However, access to packed TLS cannot - * straddle 16-byte boundaries. As such, when packed TLS is in use - * (currently unconditional for Valhall), we force vec4 alignment for - * scratch access. - */ - glsl_type_size_align_func vars_to_scratch_size_align_func = - (pan_arch(gpu_id) >= 9) ? 
glsl_get_vec4_size_align_bytes - : glsl_get_natural_size_align_bytes; - /* Lower large arrays to scratch and small arrays to bcsel */ - NIR_PASS(_, nir, nir_lower_scratch_to_var); - NIR_PASS(_, nir, nir_lower_vars_to_scratch, 256, - vars_to_scratch_size_align_func, vars_to_scratch_size_align_func); - NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees, - nir_var_function_temp, ~0); - bi_optimize_loop_nir(nir, gpu_id, true); NIR_PASS(_, nir, nir_lower_var_copies); @@ -842,6 +835,17 @@ mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes, static void bi_lower_texture_nir(nir_shader *nir, uint64_t gpu_id); static void bi_lower_texture_late_nir(nir_shader *nir, uint64_t gpu_id); +static bool +nir_shader_has_local_variables(const nir_shader *nir) +{ + nir_foreach_function(func, nir) { + if (func->impl && !exec_list_is_empty(&func->impl->locals)) + return true; + } + + return false; +} + void bifrost_postprocess_nir(nir_shader *nir, uint64_t gpu_id) { @@ -871,6 +875,44 @@ bifrost_postprocess_nir(nir_shader *nir, uint64_t gpu_id) NIR_PASS(_, nir, pan_nir_lower_noperspective_vs); } + /* Our OpenCL compiler (src/panfrost/clc/pan_compile.c) has a very weird and + * suboptimal optimization pipeline that results in a lot of unoptimized + * memcpys and sparse scratch space. That code is still being used for + * panlib, so we try to re-optimize it here. + * TODO: If you want to remove this pass, first optimize clc libpan on v9 + * until it doesn't emit kilobytes of scratch access. + */ + NIR_PASS(_, nir, nir_lower_scratch_to_var); + + if (nir_shader_has_local_variables(nir)) { + /* Lower indirect access on small arrays to if/else trees. After + * vars_to_ssa and copy propagation, these will often end up as just a + * handful of MUX instructions instead of memory access. The threshold + * of 8 array elements is chosen fairly arbitrarily. 
+ */ + NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees, + nir_var_function_temp, 8); + + /* Turn the deref loads/stores we just made direct into SSA values */ + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, nir_lower_vars_to_ssa); + NIR_PASS(_, nir, nir_opt_dce); + + /* Get rid of any dead function_temp variables so they don't get + * assigned scratch space by vars_to_explicit_types(). + */ + NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_function_temp, NULL); + + /* This can create illegal memory accesses for TLS (e.g. struct with + * four uint32_t + memcpy). Let nir_lower_mem_access_bit_sizes split it. + * Bandwidth is more important than instruction count. + */ + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, + glsl_get_natural_size_align_bytes); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_function_temp, + nir_address_format_32bit_offset); + } + nir_lower_mem_access_bit_sizes_options mem_size_options = { .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo | nir_var_mem_constant | nir_var_mem_task_payload |