From 957b271a9f6ddeb8ec79690db883ca7abd6f2cf9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?=
Date: Mon, 14 Jul 2025 16:03:00 +0200
Subject: [PATCH] nir/opt_load_store_vectorize: only attempt to vectorize
 shared2 after exhausting other possibilities

Totals from 249 (0.31% of 79839) affected shaders: (Navi48)
Instrs: 276401 -> 275918 (-0.17%); split: -0.29%, +0.11%
CodeSize: 1477072 -> 1474440 (-0.18%); split: -0.26%, +0.08%
VGPRs: 12748 -> 12760 (+0.09%); split: -0.28%, +0.38%
Latency: 1397959 -> 1398846 (+0.06%); split: -0.10%, +0.16%
InvThroughput: 424767 -> 424496 (-0.06%); split: -0.09%, +0.02%
VClause: 5183 -> 5186 (+0.06%); split: -0.10%, +0.15%
SClause: 6537 -> 6538 (+0.02%); split: -0.05%, +0.06%
Copies: 21295 -> 21098 (-0.93%); split: -1.21%, +0.29%
Branches: 4324 -> 4325 (+0.02%)
PreSGPRs: 9719 -> 9717 (-0.02%)
PreVGPRs: 8857 -> 8847 (-0.11%); split: -0.24%, +0.12%
VALU: 144514 -> 144334 (-0.12%); split: -0.20%, +0.07%
SALU: 38970 -> 38944 (-0.07%); split: -0.08%, +0.01%
VOPD: 884 -> 898 (+1.58%); split: +1.92%, -0.34%

Part-of:
---
 .../nir/nir_opt_load_store_vectorize.c | 54 +++++++++++--------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index 9266267cb1b..9245f761251 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -1446,32 +1446,42 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, * callback if needed. Driver callbacks will likely want to * restrict this to a smaller value, say 4 bytes (or none). */ - unsigned max_hole = - first->is_store || - (ctx->options->has_shared2_amd && - get_variable_mode(first) == nir_var_mem_shared) - ? 0 - : 28; + unsigned max_hole = first->is_store ? 
0 : 28; unsigned low_size = get_bit_size(low) / 8u * low->num_components; bool separate = diff > max_hole + low_size; + if (separate) + continue; - if (separate) { - if (!ctx->options->has_shared2_amd || - get_variable_mode(first) != nir_var_mem_shared) - break; + if (try_vectorize(impl, ctx, low, high, first, second)) { + low = low->is_store ? second : first; + *util_dynarray_element(arr, struct entry *, second_idx) = NULL; + progress = true; + } + } + *util_dynarray_element(arr, struct entry *, first_idx) = low; + } - if (try_vectorize_shared2(ctx, low, high, first, second)) { - low = NULL; - *util_dynarray_element(arr, struct entry *, second_idx) = NULL; - progress = true; - break; - } - } else { - if (try_vectorize(impl, ctx, low, high, first, second)) { - low = low->is_store ? second : first; - *util_dynarray_element(arr, struct entry *, second_idx) = NULL; - progress = true; - } + if (!ctx->options->has_shared2_amd) + return progress; + + /* Do a second pass for backends which support load/store shared2. */ + for (unsigned first_idx = 0; first_idx < num_entries; first_idx++) { + struct entry *low = *util_dynarray_element(arr, struct entry *, first_idx); + if (!low || get_variable_mode(low) != nir_var_mem_shared) + continue; + + for (unsigned second_idx = first_idx + 1; second_idx < num_entries; second_idx++) { + struct entry *high = *util_dynarray_element(arr, struct entry *, second_idx); + if (!high || get_variable_mode(high) != nir_var_mem_shared) + continue; + + struct entry *first = low->index < high->index ? low : high; + struct entry *second = low->index < high->index ? high : low; + if (try_vectorize_shared2(ctx, low, high, first, second)) { + low = NULL; + *util_dynarray_element(arr, struct entry *, second_idx) = NULL; + progress = true; + break; } }