ac/nir/meta: don't scalarize sparse loads if the address is aligned to load size

This should make copying sparse buffers faster if we get aligned buffer bounds.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39841>
2026-05-07 07:08:04 +02:00 · 2026-01-30 12:45:33 -05:00 · 2026-01-30 12:45:33 -05:00 · bbcfab9f4f
commit bbcfab9f4f
parent 38829bc373
2 changed files with 16 additions and 5 deletions
--- a/src/amd/common/nir/ac_nir_meta.h
+++ b/src/amd/common/nir/ac_nir_meta.h
@ -144,7 +144,7 @@ union ac_cs_clear_copy_buffer_key {
      bool is_clear:1;
      unsigned dwords_per_thread:3; /* 1..4 allowed */
      bool clear_value_size_is_12:1;
-      bool src_is_sparse:1;
+      bool src_scalarize_for_sparse:1;
      /* Unaligned clears and copies. */
      unsigned src_align_offset:2; /* how much is the source address unaligned */
      unsigned dst_align_offset:4; /* the first thread shouldn't write this many bytes */
--- a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c
+++ b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c
@ -52,7 +52,7 @@ ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options,
      fprintf(stderr, "   key.is_clear = %u\n", key->is_clear);
      fprintf(stderr, "   key.dwords_per_thread = %u\n", key->dwords_per_thread);
      fprintf(stderr, "   key.clear_value_size_is_12 = %u\n", key->clear_value_size_is_12);
-      fprintf(stderr, "   key.src_is_sparse = %u\n", key->src_is_sparse);
+      fprintf(stderr, "   key.src_scalarize_for_sparse = %u\n", key->src_scalarize_for_sparse);
      fprintf(stderr, "   key.src_align_offset = %u\n", key->src_align_offset);
      fprintf(stderr, "   key.dst_align_offset = %u\n", key->dst_align_offset);
      fprintf(stderr, "   key.dst_last_thread_bytes = %u\n", key->dst_last_thread_bytes);
@ -167,7 +167,7 @@ ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options,
                                         .access = ACCESS_RESTRICT,
                                         .align_mul = 4,
                                         .align_offset = 0
-                                      }, key->src_is_sparse);
+                                      }, key->src_scalarize_for_sparse);

            /* Add the components that we didn't load as undef. */
            nir_def *comps[16];
@ -189,7 +189,7 @@ ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options,
                                  .access = ACCESS_RESTRICT,
                                  .align_mul = 4,
                                  .align_offset = (unsigned)realign_offset % 4
-                               }, key->src_is_sparse);
+                               }, key->src_scalarize_for_sparse);


      if (if_first_thread) {
@ -557,7 +557,18 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op
   assert(dwords_per_thread && dwords_per_thread <= 4);
   out->shader_key.dwords_per_thread = dwords_per_thread;
   out->shader_key.clear_value_size_is_12 = !is_copy && clear_value_size == 12;
-   out->shader_key.src_is_sparse = info->src_is_sparse;
+   /* If the src load size is aligned to 2^n and the src load address in every invocation is aligned
+    * to the load size, loads are guaranteed to never be partially non-resident, so we don't have to
+    * scalarize them. Every sparse buffer is aligned to a page, so we don't need to check whether the
+    * buffer base address is aligned.
+    */
+   out->shader_key.src_scalarize_for_sparse =
+      is_copy && info->src_is_sparse &&
+      /* Each invocation must increment the offset by 2^n. */
+      (!util_is_power_of_two_nonzero(dwords_per_thread) ||
+       /* The buffer range bounds must be divisible by the copy size per invocation. */
+       ((info->src_offset - src_align_offset) % (dwords_per_thread * 4) != 0 ||
+        (info->src_offset + info->size) % (dwords_per_thread * 4) != 0));
   out->shader_key.src_align_offset = src_align_offset;
   out->shader_key.dst_align_offset = dst_align_offset;