From bbcfab9f4f5eedb2c50c6a32cb1bd4ea7b55eed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 30 Jan 2026 12:45:33 -0500 Subject: [PATCH] ac/nir/meta: don't scalarize sparse loads if the address is aligned to load size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should make copying sparse buffers faster if we get aligned buffer bounds. Reviewed-by: Timur Kristóf Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/nir/ac_nir_meta.h | 2 +- .../nir/ac_nir_meta_cs_clear_copy_buffer.c | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/amd/common/nir/ac_nir_meta.h b/src/amd/common/nir/ac_nir_meta.h index dbb814c69c4..bddb8b4e96c 100644 --- a/src/amd/common/nir/ac_nir_meta.h +++ b/src/amd/common/nir/ac_nir_meta.h @@ -144,7 +144,7 @@ union ac_cs_clear_copy_buffer_key { bool is_clear:1; unsigned dwords_per_thread:3; /* 1..4 allowed */ bool clear_value_size_is_12:1; - bool src_is_sparse:1; + bool src_scalarize_for_sparse:1; /* Unaligned clears and copies. 
*/ unsigned src_align_offset:2; /* how much is the source address unaligned */ unsigned dst_align_offset:4; /* the first thread shouldn't write this many bytes */ diff --git a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c index 6bba87a7243..ce33ba95dc3 100644 --- a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c +++ b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c @@ -52,7 +52,7 @@ ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options, fprintf(stderr, " key.is_clear = %u\n", key->is_clear); fprintf(stderr, " key.dwords_per_thread = %u\n", key->dwords_per_thread); fprintf(stderr, " key.clear_value_size_is_12 = %u\n", key->clear_value_size_is_12); - fprintf(stderr, " key.src_is_sparse = %u\n", key->src_is_sparse); + fprintf(stderr, " key.src_scalarize_for_sparse = %u\n", key->src_scalarize_for_sparse); fprintf(stderr, " key.src_align_offset = %u\n", key->src_align_offset); fprintf(stderr, " key.dst_align_offset = %u\n", key->dst_align_offset); fprintf(stderr, " key.dst_last_thread_bytes = %u\n", key->dst_last_thread_bytes); @@ -167,7 +167,7 @@ ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options, .access = ACCESS_RESTRICT, .align_mul = 4, .align_offset = 0 - }, key->src_is_sparse); + }, key->src_scalarize_for_sparse); /* Add the components that we didn't load as undef. 
*/ nir_def *comps[16]; @@ -189,7 +189,7 @@ ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options, .access = ACCESS_RESTRICT, .align_mul = 4, .align_offset = (unsigned)realign_offset % 4 - }, key->src_is_sparse); + }, key->src_scalarize_for_sparse); if (if_first_thread) { @@ -557,7 +557,18 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op assert(dwords_per_thread && dwords_per_thread <= 4); out->shader_key.dwords_per_thread = dwords_per_thread; out->shader_key.clear_value_size_is_12 = !is_copy && clear_value_size == 12; - out->shader_key.src_is_sparse = info->src_is_sparse; + /* If the src load size is aligned to 2^n and the src load address in every invocation is aligned + * to the load size, loads are guaranteed to never be partially non-resident, so we don't have to + * scalarize them. Every sparse buffer is aligned to a page, so we don't need to check whether the + * buffer base address is aligned. + */ + out->shader_key.src_scalarize_for_sparse = + is_copy && info->src_is_sparse && + /* Each invocation must increment the offset by 2^n. */ + (!util_is_power_of_two_nonzero(dwords_per_thread) || + /* The buffer range bounds must be divisible by the copy size per invocation. */ + ((info->src_offset - src_align_offset) % (dwords_per_thread * 4) != 0 || + (info->src_offset + info->size) % (dwords_per_thread * 4) != 0)); out->shader_key.src_align_offset = src_align_offset; out->shader_key.dst_align_offset = dst_align_offset;