radeonsi: align waves to 256B clear/copy area for the clear/copy_buffer shader

This is about 10% faster in certain unaligned cases.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30208>
This commit is contained in:
Marek Olšák 2024-07-16 02:15:34 -04:00 committed by Marge Bot
parent 2f9201e91b
commit e41fec7812
3 changed files with 43 additions and 5 deletions

View file

@@ -406,6 +406,16 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
unsigned num_threads = DIV_ROUND_UP(dst_align_offset + size, dwords_per_thread * 4);
key.dst_single_thread_unaligned = num_threads == 1 && dst_align_offset && key.dst_last_thread_bytes;
/* start_thread offsets threads to make sure all non-zero waves start clearing/copying from
 * the beginning of a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block
* partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256.
* Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n.
*/
unsigned start_thread =
dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ?
DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0;
key.has_start_thread = start_thread != 0;
void *shader = _mesa_hash_table_u64_search(sctx->cs_dma_shaders, key.key);
if (!shader) {
shader = si_create_dma_compute_shader(sctx, &key);
@@ -415,9 +425,11 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
/* Set the value of the last thread ID, so that the shader knows which thread is the last one. */
if (key.dst_last_thread_bytes)
sctx->cs_user_data[num_user_data_terms++] = num_threads - 1;
if (key.has_start_thread)
sctx->cs_user_data[num_user_data_terms++] = start_thread;
struct pipe_grid_info info = {};
set_work_size(&info, 64, 1, 1, num_threads, 1, 1);
set_work_size(&info, 64, 1, 1, start_thread + num_threads, 1, 1);
si_launch_grid_internal_ssbos(sctx, &info, shader, flags, coher, is_copy ? 2 : 1, sb,
is_copy ? 0x2 : 0x1);

View file

@@ -1655,6 +1655,8 @@ union si_cs_clear_copy_buffer_key {
unsigned dst_align_offset:4; /* the first thread shouldn't write this many bytes */
unsigned dst_last_thread_bytes:4; /* if non-zero, the last thread should write this many bytes */
bool dst_single_thread_unaligned:1; /* only 1 thread executes, both previous fields apply */
bool has_start_thread:1; /* whether the first few threads should be skipped, making later
waves start on a 256B boundary */
};
uint64_t key;
};

View file

@@ -284,14 +284,35 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
b.shader->info.num_ssbos = key->is_clear ? 1 : 2;
b.shader->info.cs.user_data_components_amd =
(key->is_clear ? (key->clear_value_size_is_12 ? 3 : key->dwords_per_thread) : 0);
b.shader->info.cs.user_data_components_amd = 0;
if (key->is_clear) {
b.shader->info.cs.user_data_components_amd +=
key->clear_value_size_is_12 ? 3 : key->dwords_per_thread;
}
/* Add the last thread ID value. */
unsigned last_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
if (key->dst_last_thread_bytes)
b.shader->info.cs.user_data_components_amd = key->is_clear ? 5 : 1;
b.shader->info.cs.user_data_components_amd++;
unsigned start_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
if (key->has_start_thread)
b.shader->info.cs.user_data_components_amd++;
nir_def *thread_id = ac_get_global_ids(&b, 1, 32);
/* If the clear/copy area is unaligned, we launched extra threads at the beginning to make it
* aligned. Skip those threads here.
*/
nir_if *if_positive = NULL;
if (key->has_start_thread) {
nir_def *start_thread =
nir_channel(&b, nir_load_user_data_amd(&b), start_thread_user_data_index);
thread_id = nir_isub(&b, thread_id, start_thread);
if_positive = nir_push_if(&b, nir_ige_imm(&b, thread_id, 0));
}
/* Convert the global thread ID into bytes. */
nir_def *offset = nir_imul_imm(&b, thread_id, 4 * key->dwords_per_thread);
nir_def *value;
@@ -446,7 +467,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
if (key->dst_last_thread_bytes) {
nir_def *last_thread_id =
nir_channel(&b, nir_load_user_data_amd(&b), key->is_clear ? 4 : 0);
nir_channel(&b, nir_load_user_data_amd(&b), last_thread_user_data_index);
if_last_thread = nir_push_if(&b, nir_ieq(&b, thread_id, last_thread_id));
{
@@ -517,6 +538,9 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
}
}
if (key->has_start_thread)
nir_pop_if(&b, if_positive);
return si_create_shader_state(sctx, b.shader);
}