From bbcfab9f4f5eedb2c50c6a32cb1bd4ea7b55eed5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 30 Jan 2026 12:45:33 -0500
Subject: [PATCH] ac/nir/meta: don't scalarize sparse loads if the address is
 aligned to load size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This should make copies from sparse buffers faster when the buffer bounds are aligned.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39841>
---
 src/amd/common/nir/ac_nir_meta.h              |  2 +-
 .../nir/ac_nir_meta_cs_clear_copy_buffer.c    | 19 +++++++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/amd/common/nir/ac_nir_meta.h b/src/amd/common/nir/ac_nir_meta.h
index dbb814c69c4..bddb8b4e96c 100644
--- a/src/amd/common/nir/ac_nir_meta.h
+++ b/src/amd/common/nir/ac_nir_meta.h
@@ -144,7 +144,7 @@ union ac_cs_clear_copy_buffer_key {
       bool is_clear:1;
       unsigned dwords_per_thread:3; /* 1..4 allowed */
       bool clear_value_size_is_12:1;
-      bool src_is_sparse:1;
+      bool src_scalarize_for_sparse:1;
       /* Unaligned clears and copies. */
       unsigned src_align_offset:2; /* how much is the source address unaligned */
       unsigned dst_align_offset:4; /* the first thread shouldn't write this many bytes */
diff --git a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c
index 6bba87a7243..ce33ba95dc3 100644
--- a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c
+++ b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c
@@ -52,7 +52,7 @@ ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options,
       fprintf(stderr, "   key.is_clear = %u\n", key->is_clear);
       fprintf(stderr, "   key.dwords_per_thread = %u\n", key->dwords_per_thread);
       fprintf(stderr, "   key.clear_value_size_is_12 = %u\n", key->clear_value_size_is_12);
-      fprintf(stderr, "   key.src_is_sparse = %u\n", key->src_is_sparse);
+      fprintf(stderr, "   key.src_scalarize_for_sparse = %u\n", key->src_scalarize_for_sparse);
       fprintf(stderr, "   key.src_align_offset = %u\n", key->src_align_offset);
       fprintf(stderr, "   key.dst_align_offset = %u\n", key->dst_align_offset);
       fprintf(stderr, "   key.dst_last_thread_bytes = %u\n", key->dst_last_thread_bytes);
@@ -167,7 +167,7 @@ ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options,
                                          .access = ACCESS_RESTRICT,
                                          .align_mul = 4,
                                          .align_offset = 0
-                                      }, key->src_is_sparse);
+                                      }, key->src_scalarize_for_sparse);
 
             /* Add the components that we didn't load as undef. */
             nir_def *comps[16];
@@ -189,7 +189,7 @@ ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options,
                                   .access = ACCESS_RESTRICT,
                                   .align_mul = 4,
                                   .align_offset = (unsigned)realign_offset % 4
-                               }, key->src_is_sparse);
+                               }, key->src_scalarize_for_sparse);
 
 
       if (if_first_thread) {
@@ -557,7 +557,18 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op
    assert(dwords_per_thread && dwords_per_thread <= 4);
    out->shader_key.dwords_per_thread = dwords_per_thread;
    out->shader_key.clear_value_size_is_12 = !is_copy && clear_value_size == 12;
-   out->shader_key.src_is_sparse = info->src_is_sparse;
+   /* If the src load size is a power of two and the src load address in every invocation is
+    * aligned to the load size, loads are guaranteed to never be partially non-resident, so we
+    * don't have to scalarize them. Every sparse buffer is aligned to a page, so we don't need to
+    * check whether the buffer base address is aligned.
+    */
+   out->shader_key.src_scalarize_for_sparse =
+      is_copy && info->src_is_sparse &&
+      /* Each invocation must increment the offset by 2^n. */
+      (!util_is_power_of_two_nonzero(dwords_per_thread) ||
+       /* The buffer range bounds must be divisible by the copy size per invocation. */
+       ((info->src_offset - src_align_offset) % (dwords_per_thread * 4) != 0 ||
+        (info->src_offset + info->size) % (dwords_per_thread * 4) != 0));
    out->shader_key.src_align_offset = src_align_offset;
    out->shader_key.dst_align_offset = dst_align_offset;