From 52cd5f7e69fde6b5eef4a88103e7571d8342577f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 14 Jul 2025 14:24:35 +0200 Subject: [PATCH] ac/nir_lower_mem_access_bit_sizes: Split unsupported shared memory instructions Totals from 1400 (1.75% of 79839) affected shaders: (Navi48) MaxWaves: 38313 -> 38317 (+0.01%); split: +0.06%, -0.05% Instrs: 1162521 -> 1199627 (+3.19%); split: -0.01%, +3.20% CodeSize: 5874288 -> 6146832 (+4.64%); split: -0.01%, +4.65% VGPRs: 79948 -> 79984 (+0.05%); split: -0.12%, +0.17% Latency: 3703961 -> 3741457 (+1.01%); split: -0.02%, +1.04% InvThroughput: 589594 -> 590597 (+0.17%); split: -0.06%, +0.23% VClause: 22561 -> 22564 (+0.01%) SClause: 19615 -> 19611 (-0.02%); split: -0.03%, +0.01% Copies: 70721 -> 71678 (+1.35%); split: -0.25%, +1.60% PreVGPRs: 61068 -> 61101 (+0.05%); split: -0.00%, +0.06% VALU: 651754 -> 651785 (+0.00%); split: -0.07%, +0.07% SALU: 141953 -> 141955 (+0.00%) VOPD: 489 -> 485 (-0.82%); split: +0.41%, -1.23% Part-of: --- .../nir/ac_nir_lower_mem_access_bit_sizes.c | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c b/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c index d6d24e85b2e..4be47d4350a 100644 --- a/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c +++ b/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c @@ -4,6 +4,7 @@ * SPDX-License-Identifier: MIT */ +#include "util/blake3/blake3_impl.h" #include "ac_nir.h" #include "ac_nir_helpers.h" @@ -62,6 +63,20 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui const bool is_load = nir_intrinsic_infos[intrin].has_dest; const bool is_smem = intrin == nir_intrinsic_load_push_constant || (access & ACCESS_SMEM_AMD); const uint32_t combined_align = nir_combined_align(align_mul, align_offset); + nir_mem_access_size_align res; + + if (intrin == nir_intrinsic_load_shared || intrin == nir_intrinsic_store_shared) { + /* Split unsupported shared access. */ + res.bit_size = MIN2(bit_size, combined_align * 8ull); + res.align = res.bit_size / 8; + /* Don't use >64-bit LDS loads for performance reasons. */ + unsigned max_bytes = intrin == nir_intrinsic_store_shared && cb_data->gfx_level >= GFX7 ? 16 : 8; + bytes = MIN3(bytes, combined_align, max_bytes); + bytes = bytes == 12 ? bytes : round_down_to_power_of_2(bytes); + res.num_components = bytes / res.align; + res.shift = nir_mem_access_shift_method_bytealign_amd; + return res; + } /* Make 8-bit accesses 16-bit if possible */ if (is_load && bit_size == 8 && combined_align >= 2 && bytes % 2 == 0) @@ -79,18 +94,11 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui else if (is_smem) max_components = MIN2(512 / bit_size, 16); - nir_mem_access_size_align res; res.num_components = MIN2(DIV_ROUND_UP(bytes, bit_size / 8), max_components); res.bit_size = bit_size; res.align = MIN2(bit_size / 8, 4); /* 64-bit access only requires 4 byte alignment. */ res.shift = nir_mem_access_shift_method_shift64; - if ((intrin == nir_intrinsic_load_shared || intrin == nir_intrinsic_store_shared)) { - /* Split unaligned shared access to create more read2/write2. */ - if (combined_align < 16 && bytes < 16) - res.num_components = MIN2(res.num_components, 64 / bit_size); - } - if (!is_load) return res;