From 0b34a7aff05d7cc0efa3d05d4887e63d5b6ecd45 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Thu, 19 Sep 2024 15:47:03 -0700
Subject: [PATCH] nir: Don't generate single iteration loops to zero-initialize
 memory

If the stride we're adding to our loop counter is larger than the total
amount of shared local memory we're trying to initialize, we know the
loop will run at most one time.  So we can skip emitting a loop.

Loop unrolling appears to be unable to detect this currently.

Reviewed-by: Alyssa Rosenzweig
Reviewed-by: Faith Ekstrand
Part-of:
---
 .../nir/nir_lower_variable_initializers.c     | 42 ++++++++++++-------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/src/compiler/nir/nir_lower_variable_initializers.c b/src/compiler/nir/nir_lower_variable_initializers.c
index 6556a8638ed..9ab4f53cdf1 100644
--- a/src/compiler/nir/nir_lower_variable_initializers.c
+++ b/src/compiler/nir/nir_lower_variable_initializers.c
@@ -153,6 +153,7 @@ nir_zero_initialize_shared_memory(nir_shader *shader,
    const unsigned local_count = shader->info.workgroup_size[0] *
                                 shader->info.workgroup_size[1] *
                                 shader->info.workgroup_size[2];
+   const unsigned stride = chunk_size * local_count;
 
    /* The initialization logic is simplified if we can always split the memory
     * in full chunk_size units.
@@ -161,30 +162,41 @@ nir_zero_initialize_shared_memory(nir_shader *shader,
 
    const unsigned chunk_comps = chunk_size / 4;
 
-   nir_variable *it = nir_local_variable_create(b.impl, glsl_uint_type(),
-                                                "zero_init_iterator");
    nir_def *local_index = nir_load_local_invocation_index(&b);
    nir_def *first_offset = nir_imul_imm(&b, local_index, chunk_size);
-   nir_store_var(&b, it, first_offset, 0x1);
 
-   nir_loop *loop = nir_push_loop(&b);
-   {
-      nir_def *offset = nir_load_var(&b, it);
-
-      nir_push_if(&b, nir_uge_imm(&b, offset, shared_size));
+   if (stride >= shared_size) {
+      nir_push_if(&b, nir_ult_imm(&b, first_offset, shared_size));
       {
-         nir_jump(&b, nir_jump_break);
+         nir_store_shared(&b, nir_imm_zero(&b, chunk_comps, 32), first_offset,
+                          .align_mul = chunk_size,
+                          .write_mask = ((1 << chunk_comps) - 1));
       }
       nir_pop_if(&b, NULL);
+   } else {
+      nir_variable *it = nir_local_variable_create(b.impl, glsl_uint_type(),
+                                                   "zero_init_iterator");
+      nir_store_var(&b, it, first_offset, 0x1);
 
-      nir_store_shared(&b, nir_imm_zero(&b, chunk_comps, 32), offset,
-                       .align_mul = chunk_size,
-                       .write_mask = ((1 << chunk_comps) - 1));
+      nir_loop *loop = nir_push_loop(&b);
+      {
+         nir_def *offset = nir_load_var(&b, it);
 
-      nir_def *new_offset = nir_iadd_imm(&b, offset, chunk_size * local_count);
-      nir_store_var(&b, it, new_offset, 0x1);
+         nir_push_if(&b, nir_uge_imm(&b, offset, shared_size));
+         {
+            nir_jump(&b, nir_jump_break);
+         }
+         nir_pop_if(&b, NULL);
+
+         nir_store_shared(&b, nir_imm_zero(&b, chunk_comps, 32), offset,
+                          .align_mul = chunk_size,
+                          .write_mask = ((1 << chunk_comps) - 1));
+
+         nir_def *new_offset = nir_iadd_imm(&b, offset, stride);
+         nir_store_var(&b, it, new_offset, 0x1);
+      }
+      nir_pop_loop(&b, loop);
    }
-   nir_pop_loop(&b, loop);
 
    nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, NIR_MEMORY_ACQ_REL,
                nir_var_mem_shared);
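
As a quick sanity check of the reasoning behind the new stride >= shared_size
condition, here is a minimal standalone C sketch of the same arithmetic.  The
workgroup dimensions, chunk_size, and shared_size values below are hypothetical
examples chosen for illustration; they are not taken from the patch.

#include <stdio.h>

/* Minimal sketch of the stride computation guarding the zero-init loop.
 * All numeric values are hypothetical examples. */
int main(void)
{
   const unsigned workgroup_size[3] = {8, 8, 1}; /* hypothetical workgroup */
   const unsigned chunk_size  = 16;   /* bytes zeroed per invocation per iteration */
   const unsigned shared_size = 1024; /* total shared memory bytes to zero */

   const unsigned local_count = workgroup_size[0] *
                                workgroup_size[1] *
                                workgroup_size[2];
   const unsigned stride = chunk_size * local_count;

   if (stride >= shared_size) {
      /* One pass of the whole workgroup already covers shared_size bytes,
       * so the zero-init loop would run at most once; a guarded store
       * (plain if) is enough. */
      printf("stride %u >= shared_size %u: no loop needed\n",
             stride, shared_size);
   } else {
      printf("stride %u < shared_size %u: loop runs up to %u times\n",
             stride, shared_size, (shared_size + stride - 1) / stride);
   }
   return 0;
}

With these example numbers, the 64 invocations cover all 1024 bytes in a single
pass, which is exactly the case the patch lowers to one guarded nir_store_shared
instead of a loop.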