From 7fd5f76393da2e6cd8d71eef8a382769e2fac3a2 Mon Sep 17 00:00:00 2001
From: Zan Dobersek
Date: Sun, 14 Jul 2024 08:59:27 +0200
Subject: [PATCH] nir/lower_vars_to_scratch: calculate threshold-limited
 variable size separately

ir3's lowering of variables to scratch memory has to treat 8-bit values
as 16-bit ones when comparing a variable's size against the given
threshold, since those values are handled through 16-bit half-registers.
But the same values can still use their natural 8-bit size and alignment
when stored in scratch memory.

nir_lower_vars_to_scratch now accepts two size-and-alignment functions:
one for calculating the variable size that is compared against the
threshold, and one for calculating the size and alignment used for the
scratch memory layout. Non-ir3 users of the pass simply pass the
currently-used function twice. ir3 provides a separate variable-size
function that special-cases 8-bit types.

Signed-off-by: Zan Dobersek
Part-of:
---
 src/amd/common/ac_nir.c                  |  2 +-
 src/asahi/compiler/agx_compile.c         |  2 +-
 src/broadcom/compiler/vir.c              |  1 +
 src/compiler/glsl_types.c                |  2 +-
 src/compiler/glsl_types.h                |  3 +++
 src/compiler/nir/nir.h                   |  3 ++-
 src/compiler/nir/nir_lower_scratch.c     |  9 ++++----
 src/freedreno/ir3/ir3_nir.c              | 28 +++++++++++++++++++++++-
 src/gallium/drivers/r600/sfn/sfn_nir.cpp |  1 +
 src/panfrost/compiler/bifrost_compile.c  |  8 +++----
 10 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c
index edfc9c16c58..b77e8b5a4a0 100644
--- a/src/amd/common/ac_nir.c
+++ b/src/amd/common/ac_nir.c
@@ -648,7 +648,7 @@ ac_nir_lower_indirect_derefs(nir_shader *shader,
     * scratch to alloca's, assuming LLVM won't generate VGPR indexing.
     */
    NIR_PASS(progress, shader, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
-            glsl_get_natural_size_align_bytes);
+            glsl_get_natural_size_align_bytes, glsl_get_natural_size_align_bytes);
 
    /* LLVM doesn't support VGPR indexing on GFX9. */
    bool llvm_has_working_vgpr_indexing = gfx_level != GFX9;
diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index 89c9c0a831a..5350aae7cb2 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -3293,7 +3293,7 @@ agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
 
    /* Lower large arrays to scratch and small arrays to csel */
    NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
-            glsl_get_natural_size_align_bytes);
+            glsl_get_natural_size_align_bytes, glsl_get_natural_size_align_bytes);
    NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
    NIR_PASS(_, nir, nir_split_var_copies);
    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index a087282cfc8..dd638c3e64c 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -1752,6 +1752,7 @@ v3d_attempt_compile(struct v3d_compile *c)
         NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
                  nir_var_function_temp,
                  0,
+                 glsl_get_natural_size_align_bytes,
                  glsl_get_natural_size_align_bytes);
 
         NIR_PASS(_, c->s, v3d_nir_lower_global_2x32);
diff --git a/src/compiler/glsl_types.c b/src/compiler/glsl_types.c
index 7a31de2a312..808a3cb40f2 100644
--- a/src/compiler/glsl_types.c
+++ b/src/compiler/glsl_types.c
@@ -3685,7 +3685,7 @@ glsl_channel_type(const glsl_type *t)
    }
 }
 
-static void
+void
 glsl_size_align_handle_array_and_structs(const glsl_type *type,
                                          glsl_type_size_align_func size_align,
                                          unsigned *size, unsigned *align)
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index fefba4115f3..ce64318eb58 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -1350,6 +1350,9 @@ glsl_get_explicit_interface_type(const glsl_type *t, bool supports_std430)
    }
 }
 
+void glsl_size_align_handle_array_and_structs(const glsl_type *type,
+                                              glsl_type_size_align_func size_align,
+                                              unsigned *size, unsigned *align);
 void glsl_get_natural_size_align_bytes(const glsl_type *t, unsigned *size, unsigned *align);
 
 void glsl_get_vec4_size_align_bytes(const glsl_type *type, unsigned *size, unsigned *align);
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index d3024d0343e..c423398c167 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5434,7 +5434,8 @@ bool nir_lower_io_to_temporaries(nir_shader *shader,
 bool nir_lower_vars_to_scratch(nir_shader *shader,
                                nir_variable_mode modes,
                                int size_threshold,
-                               glsl_type_size_align_func size_align);
+                               glsl_type_size_align_func variable_size_align,
+                               glsl_type_size_align_func scratch_layout_size_align);
 
 void nir_lower_clip_halfz(nir_shader *shader);
diff --git a/src/compiler/nir/nir_lower_scratch.c b/src/compiler/nir/nir_lower_scratch.c
index 50eecfff6e5..3536e05d9c4 100644
--- a/src/compiler/nir/nir_lower_scratch.c
+++ b/src/compiler/nir/nir_lower_scratch.c
@@ -95,7 +95,8 @@ bool
 nir_lower_vars_to_scratch(nir_shader *shader,
                           nir_variable_mode modes,
                           int size_threshold,
-                          glsl_type_size_align_func size_align)
+                          glsl_type_size_align_func variable_size_align,
+                          glsl_type_size_align_func scratch_layout_size_align)
 {
    struct set *set = _mesa_pointer_set_create(NULL);
 
@@ -131,7 +132,7 @@ nir_lower_vars_to_scratch(nir_shader *shader,
          continue;
 
       unsigned var_size, var_align;
-      size_align(var->type, &var_size, &var_align);
+      variable_size_align(var->type, &var_size, &var_align);
       if (var_size <= size_threshold)
          continue;
 
@@ -207,13 +208,13 @@ nir_lower_vars_to_scratch(nir_shader *shader,
          if (var->data.location == INT_MAX) {
             unsigned var_size, var_align;
-            size_align(var->type, &var_size, &var_align);
+            scratch_layout_size_align(var->type, &var_size, &var_align);
 
             var->data.location = ALIGN_POT(shader->scratch_size, var_align);
             shader->scratch_size = var->data.location + var_size;
          }
 
-         lower_load_store(&build, intrin, size_align);
+         lower_load_store(&build, intrin, scratch_layout_size_align);
          impl_progress = true;
       }
    }
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 3db75b6ab9a..bd9eec6e8d1 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -181,6 +181,31 @@ ir3_lower_bit_size(const nir_instr *instr, UNUSED void *data)
    return 0;
 }
 
+static void
+ir3_get_variable_size_align_bytes(const glsl_type *type, unsigned *size, unsigned *align)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_ARRAY:
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_STRUCT:
+      glsl_size_align_handle_array_and_structs(type, ir3_get_variable_size_align_bytes,
+                                               size, align);
+      break;
+   case GLSL_TYPE_UINT8:
+   case GLSL_TYPE_INT8:
+      /* 8-bit values are handled through 16-bit half-registers, so the resulting size
+       * and alignment value has to be doubled to reflect the actual variable size
+       * requirement.
+       */
+      *size = 2 * glsl_get_components(type);
+      *align = 2;
+      break;
+   default:
+      glsl_get_natural_size_align_bytes(type, size, align);
+      break;
+   }
+}
+
 #define OPT(nir, pass, ...)                                                    \
    ({                                                                          \
       bool this_progress = false;                                              \
@@ -828,7 +853,8 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
     */
    if (so->compiler->has_pvtmem) {
       progress |= OPT(s, nir_lower_vars_to_scratch, nir_var_function_temp,
-                      16 * 16 /* bytes */, glsl_get_natural_size_align_bytes);
+                      16 * 16 /* bytes */,
+                      ir3_get_variable_size_align_bytes, glsl_get_natural_size_align_bytes);
    }
 
    /* Lower scratch writemasks */
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
index eff49b287e4..3d8632d7fb3 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
@@ -847,6 +847,7 @@ r600_lower_and_optimize_nir(nir_shader *sh,
               nir_lower_vars_to_scratch,
               nir_var_function_temp,
               40,
+              r600_get_natural_size_align_bytes,
               r600_get_natural_size_align_bytes);
 
    while (optimize_once(sh))
diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c
index 10088656e6f..4ceff8d5be9 100644
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@@ -4937,12 +4937,12 @@ bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
     * (currently unconditional for Valhall), we force vec4 alignment for
     * scratch access.
     */
-   bool packed_tls = (gpu_id >= 0x9000);
-
+   glsl_type_size_align_func vars_to_scratch_size_align_func =
+      (gpu_id >= 0x9000) ? glsl_get_vec4_size_align_bytes
+                         : glsl_get_natural_size_align_bytes;
    /* Lower large arrays to scratch and small arrays to bcsel */
    NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
-              packed_tls ? glsl_get_vec4_size_align_bytes
-                         : glsl_get_natural_size_align_bytes);
+              vars_to_scratch_size_align_func, vars_to_scratch_size_align_func);
    NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
    NIR_PASS_V(nir, nir_split_var_copies);
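---

A standalone sketch (not part of the patch) of the split the commit message
describes: the threshold check doubles 8-bit components because ir3 places
them in 16-bit half-registers, while the scratch layout keeps natural byte
sizes. The fake_type struct and helper names below are illustrative
stand-ins, not Mesa's glsl_type API.

/* Illustrative only: models the two size callbacks with a simplified type. */
#include <stdio.h>

struct fake_type {
   unsigned bit_size;   /* 8, 16 or 32 */
   unsigned components; /* number of scalar components */
};

/* Natural layout: 8-bit values occupy one byte each (scratch layout callback). */
static void natural_size_align(const struct fake_type *t, unsigned *size, unsigned *align)
{
   unsigned bytes = t->bit_size / 8;
   *size = bytes * t->components;
   *align = bytes;
}

/* ir3-style variable size: 8-bit values live in 16-bit half-registers, so they
 * count double against the spill threshold (threshold callback). */
static void ir3_style_variable_size_align(const struct fake_type *t, unsigned *size,
                                          unsigned *align)
{
   if (t->bit_size == 8) {
      *size = 2 * t->components;
      *align = 2;
      return;
   }
   natural_size_align(t, size, align);
}

int main(void)
{
   const struct fake_type u8_array = { 8, 160 };  /* e.g. a uint8_t[160] temp */
   const unsigned threshold = 16 * 16;            /* bytes, as used by ir3 */

   unsigned reg_size, reg_align, mem_size, mem_align;
   ir3_style_variable_size_align(&u8_array, &reg_size, &reg_align);
   natural_size_align(&u8_array, &mem_size, &mem_align);

   /* 320 > 256, so the variable would be moved to scratch... */
   printf("half-register footprint: %u bytes -> %s threshold\n",
          reg_size, reg_size > threshold ? "exceeds" : "within");
   /* ...but it only needs 160 bytes of scratch, aligned to 1 byte. */
   printf("scratch layout: %u bytes, align %u\n", mem_size, mem_align);
   return 0;
}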