From 276b65dbca56dd35795cbc6708e41b6a0afb20d3 Mon Sep 17 00:00:00 2001
From: Mary Guillemard <mary.guillemard@collabora.com>
Date: Thu, 19 Jun 2025 14:45:46 +0200
Subject: [PATCH] pan/clc: Lower IO as late as possible

We were assigning the scratch size of the whole library to every
variant, which was causing very large TLS usage on draw indirect.

With this change, TLS usage is much lower, at the cost of running more
passes for every variant.

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35639>
---
 src/panfrost/clc/pan_compile.c | 95 +++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 47 deletions(-)

diff --git a/src/panfrost/clc/pan_compile.c b/src/panfrost/clc/pan_compile.c
index ace91124c3b..bd22035d2bb 100644
--- a/src/panfrost/clc/pan_compile.c
+++ b/src/panfrost/clc/pan_compile.c
@@ -75,7 +75,8 @@ optimize(nir_shader *nir)
          .limit = 64,
          .expensive_alu_ok = true,
       };
-      NIR_PASS(progress, nir, nir_opt_peephole_select, &peephole_select_options);
+      NIR_PASS(progress, nir, nir_opt_peephole_select,
+               &peephole_select_options);
       NIR_PASS(progress, nir, nir_opt_phi_precision);
       NIR_PASS(progress, nir, nir_opt_algebraic);
       NIR_PASS(progress, nir, nir_opt_constant_folding);
@@ -145,50 +146,12 @@ compile(void *memctx, const uint32_t *spirv, size_t spirv_size, unsigned arch)
             nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
                nir_var_mem_global | nir_var_mem_constant);
 
-   /* We assign explicit types early so that the optimizer can take advantage
-    * of that information and hopefully get rid of some of our memcpys.
-    */
-   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
-            nir_var_uniform | nir_var_shader_temp | nir_var_function_temp |
-               nir_var_mem_shared | nir_var_mem_global,
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_uniform,
             glsl_get_cl_type_size_align);
 
-   optimize(nir);
-
-   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
-
-   /* Lower again, this time after dead-variables to get more compact variable
-    * layouts.
-    */
-   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
-            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
-               nir_var_mem_global | nir_var_mem_constant,
-            glsl_get_cl_type_size_align);
-   assert(nir->constant_data_size == 0);
-
-   NIR_PASS(_, nir, nir_lower_memcpy);
-
-   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_constant,
-            nir_address_format_64bit_global);
-
    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_uniform,
             nir_address_format_32bit_offset_as_64bit);
 
-   NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);
-   NIR_PASS(_, nir, nir_opt_if, 0);
-   NIR_PASS(_, nir, nir_opt_idiv_const, 16);
-
-   /* Lower explicit IO here to ensure that we will not clash with different
-    * address formats inside shaders */
-   NIR_PASS(_, nir, nir_opt_deref);
-   NIR_PASS(_, nir, nir_lower_vars_to_ssa);
-   NIR_PASS(_, nir, nir_lower_explicit_io,
-            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
-               nir_var_mem_global,
-            nir_address_format_62bit_generic);
-
-   optimize(nir);
-
    return nir;
 }
 
@@ -374,11 +337,6 @@ main(int argc, const char **argv)
             libfunc, v, get_compiler_options(target_arch), &opt,
             load_kernel_input);
 
-         /* Because we do nir_lower_explicit_io on temp variable early on, we
-          * lose the scratch_size when we build the shader variant so we need
-          * to readjust it here. */
-         s->scratch_size = MAX2(s->scratch_size, nir->scratch_size);
-
          struct pan_compile_inputs inputs = {
             .gpu_id = target_arch << 12,
          };
@@ -391,6 +349,50 @@ main(int argc, const char **argv)
          NIR_PASS(_, s, nir_remove_dead_derefs);
          NIR_PASS(_, s, nir_remove_dead_variables,
                   nir_var_function_temp | nir_var_shader_temp, NULL);
+
+         /* We assign explicit types early so that the optimizer can take
+          * advantage of that information and hopefully get rid of some of our
+          * memcpys.
+          */
+         NIR_PASS(_, s, nir_lower_vars_to_explicit_types,
+                  nir_var_shader_temp | nir_var_function_temp |
+                     nir_var_mem_shared | nir_var_mem_global,
+                  glsl_get_cl_type_size_align);
+
+         optimize(s);
+
+         NIR_PASS(_, s, nir_remove_dead_variables, nir_var_all, NULL);
+
+         /* Lower again, this time after dead-variables to get more compact
+          * variable layouts.
+          */
+         NIR_PASS(_, s, nir_lower_vars_to_explicit_types,
+                  nir_var_shader_temp | nir_var_function_temp |
+                     nir_var_mem_shared | nir_var_mem_global |
+                     nir_var_mem_constant,
+                  glsl_get_cl_type_size_align);
+         assert(nir->constant_data_size == 0);
+
+         NIR_PASS(_, s, nir_lower_memcpy);
+
+         NIR_PASS(_, s, nir_lower_explicit_io, nir_var_mem_constant,
+                  nir_address_format_64bit_global);
+
+         NIR_PASS(_, s, nir_lower_convert_alu_types, NULL);
+         NIR_PASS(_, s, nir_opt_if, 0);
+         NIR_PASS(_, s, nir_opt_idiv_const, 16);
+
+         /* Lower explicit IO here to ensure that we will not clash with
+          * different address formats inside shaders */
+         NIR_PASS(_, s, nir_opt_deref);
+         NIR_PASS(_, s, nir_lower_vars_to_ssa);
+         NIR_PASS(_, s, nir_lower_explicit_io,
+                  nir_var_shader_temp | nir_var_function_temp |
+                     nir_var_mem_shared | nir_var_mem_global,
+                  nir_address_format_62bit_generic);
+
+         optimize(s);
+
          NIR_PASS(_, s, nir_lower_vars_to_explicit_types,
                   nir_var_shader_temp | nir_var_function_temp,
                   glsl_get_cl_type_size_align);
@@ -422,8 +424,7 @@ main(int argc, const char **argv)
          struct util_dynarray shader_binary;
          struct pan_shader_info shader_info = {0};
          util_dynarray_init(&shader_binary, NULL);
-         pan_shader_compile(clone, &inputs, &shader_binary,
-                            &shader_info);
+         pan_shader_compile(clone, &inputs, &shader_binary, &shader_info);
 
          assert(shader_info.push.count * 4 <=
                    BIFROST_PRECOMPILED_KERNEL_ARGS_SIZE &&