From e41fec78126e467a99938c122976e8a6b1a8f666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 16 Jul 2024 02:15:34 -0400 Subject: [PATCH] radeonsi: align waves to 256B clear/copy area for the clear/copy_buffer shader This is about 10% faster in certain unaligned cases. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_compute_blit.c | 14 +++++++- src/gallium/drivers/radeonsi/si_pipe.h | 2 ++ .../drivers/radeonsi/si_shaderlib_nir.c | 32 ++++++++++++++++--- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 5120581a39e..e1dd7ea427e 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -406,6 +406,16 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource unsigned num_threads = DIV_ROUND_UP(dst_align_offset + size, dwords_per_thread * 4); key.dst_single_thread_unaligned = num_threads == 1 && dst_align_offset && key.dst_last_thread_bytes; + /* start_thread offsets threads to make sure all non-zero waves start clearing/copying from + * the beginning of a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block + * partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256. + * Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n. + */ + unsigned start_thread = + dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ? 
+ DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0; + key.has_start_thread = start_thread != 0; + void *shader = _mesa_hash_table_u64_search(sctx->cs_dma_shaders, key.key); if (!shader) { shader = si_create_dma_compute_shader(sctx, &key); @@ -415,9 +425,11 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource /* Set the value of the last thread ID, so that the shader knows which thread is the last one. */ if (key.dst_last_thread_bytes) sctx->cs_user_data[num_user_data_terms++] = num_threads - 1; + if (key.has_start_thread) + sctx->cs_user_data[num_user_data_terms++] = start_thread; struct pipe_grid_info info = {}; - set_work_size(&info, 64, 1, 1, num_threads, 1, 1); + set_work_size(&info, 64, 1, 1, start_thread + num_threads, 1, 1); si_launch_grid_internal_ssbos(sctx, &info, shader, flags, coher, is_copy ? 2 : 1, sb, is_copy ? 0x2 : 0x1); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 6a05c0146a7..1898fb3c6db 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1655,6 +1655,8 @@ union si_cs_clear_copy_buffer_key { unsigned dst_align_offset:4; /* the first thread shouldn't write this many bytes */ unsigned dst_last_thread_bytes:4; /* if non-zero, the last thread should write this many bytes */ bool dst_single_thread_unaligned:1; /* only 1 thread executes, both previous fields apply */ + bool has_start_thread:1; /* whether the first few threads should be skipped, making later + waves start on a 256B boundary */ }; uint64_t key; }; diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index ad3a3428920..3296927039a 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -284,14 +284,35 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co b.shader->info.workgroup_size[1] = 
1; b.shader->info.workgroup_size[2] = 1; b.shader->info.num_ssbos = key->is_clear ? 1 : 2; - b.shader->info.cs.user_data_components_amd = - (key->is_clear ? (key->clear_value_size_is_12 ? 3 : key->dwords_per_thread) : 0); + b.shader->info.cs.user_data_components_amd = 0; + + if (key->is_clear) { + b.shader->info.cs.user_data_components_amd += + key->clear_value_size_is_12 ? 3 : key->dwords_per_thread; + } /* Add the last thread ID value. */ + unsigned last_thread_user_data_index = b.shader->info.cs.user_data_components_amd; if (key->dst_last_thread_bytes) - b.shader->info.cs.user_data_components_amd = key->is_clear ? 5 : 1; + b.shader->info.cs.user_data_components_amd++; + + unsigned start_thread_user_data_index = b.shader->info.cs.user_data_components_amd; + if (key->has_start_thread) + b.shader->info.cs.user_data_components_amd++; nir_def *thread_id = ac_get_global_ids(&b, 1, 32); + + /* If the clear/copy area is unaligned, we launched extra threads at the beginning to make it + * aligned. Skip those threads here. + */ + nir_if *if_positive = NULL; + if (key->has_start_thread) { + nir_def *start_thread = + nir_channel(&b, nir_load_user_data_amd(&b), start_thread_user_data_index); + thread_id = nir_isub(&b, thread_id, start_thread); + if_positive = nir_push_if(&b, nir_ige_imm(&b, thread_id, 0)); + } + /* Convert the global thread ID into bytes. */ nir_def *offset = nir_imul_imm(&b, thread_id, 4 * key->dwords_per_thread); nir_def *value; @@ -446,7 +467,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co if (key->dst_last_thread_bytes) { nir_def *last_thread_id = - nir_channel(&b, nir_load_user_data_amd(&b), key->is_clear ? 
4 : 0); + nir_channel(&b, nir_load_user_data_amd(&b), last_thread_user_data_index); if_last_thread = nir_push_if(&b, nir_ieq(&b, thread_id, last_thread_id)); { @@ -517,6 +538,9 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co } } + if (key->has_start_thread) + nir_pop_if(&b, if_positive); + return si_create_shader_state(sctx, b.shader); }