nir/opt_load_store_vectorize: only attempt to vectorize shared2 after exhausting other possibilities

Totals from 249 (0.31% of 79839) affected shaders: (Navi48)

Instrs: 276401 -> 275918 (-0.17%); split: -0.29%, +0.11%
CodeSize: 1477072 -> 1474440 (-0.18%); split: -0.26%, +0.08%
VGPRs: 12748 -> 12760 (+0.09%); split: -0.28%, +0.38%
Latency: 1397959 -> 1398846 (+0.06%); split: -0.10%, +0.16%
InvThroughput: 424767 -> 424496 (-0.06%); split: -0.09%, +0.02%
VClause: 5183 -> 5186 (+0.06%); split: -0.10%, +0.15%
SClause: 6537 -> 6538 (+0.02%); split: -0.05%, +0.06%
Copies: 21295 -> 21098 (-0.93%); split: -1.21%, +0.29%
Branches: 4324 -> 4325 (+0.02%)
PreSGPRs: 9719 -> 9717 (-0.02%)
PreVGPRs: 8857 -> 8847 (-0.11%); split: -0.24%, +0.12%
VALU: 144514 -> 144334 (-0.12%); split: -0.20%, +0.07%
SALU: 38970 -> 38944 (-0.07%); split: -0.08%, +0.01%
VOPD: 884 -> 898 (+1.58%); split: +1.92%, -0.34%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36133>
This commit is contained in:
Daniel Schürmann 2025-07-14 16:03:00 +02:00 committed by Marge Bot
parent 148063670d
commit 957b271a9f

View file

@@ -1446,32 +1446,42 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
* callback if needed. Driver callbacks will likely want to
* restrict this to a smaller value, say 4 bytes (or none).
*/
unsigned max_hole =
first->is_store ||
(ctx->options->has_shared2_amd &&
get_variable_mode(first) == nir_var_mem_shared)
? 0
: 28;
unsigned max_hole = first->is_store ? 0 : 28;
unsigned low_size = get_bit_size(low) / 8u * low->num_components;
bool separate = diff > max_hole + low_size;
if (separate)
continue;
if (separate) {
if (!ctx->options->has_shared2_amd ||
get_variable_mode(first) != nir_var_mem_shared)
break;
if (try_vectorize(impl, ctx, low, high, first, second)) {
low = low->is_store ? second : first;
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
progress = true;
}
}
*util_dynarray_element(arr, struct entry *, first_idx) = low;
}
if (try_vectorize_shared2(ctx, low, high, first, second)) {
low = NULL;
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
progress = true;
break;
}
} else {
if (try_vectorize(impl, ctx, low, high, first, second)) {
low = low->is_store ? second : first;
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
progress = true;
}
if (!ctx->options->has_shared2_amd)
return progress;
/* Do a second pass for backends which support load/store shared2. */
for (unsigned first_idx = 0; first_idx < num_entries; first_idx++) {
struct entry *low = *util_dynarray_element(arr, struct entry *, first_idx);
if (!low || get_variable_mode(low) != nir_var_mem_shared)
continue;
for (unsigned second_idx = first_idx + 1; second_idx < num_entries; second_idx++) {
struct entry *high = *util_dynarray_element(arr, struct entry *, second_idx);
if (!high || get_variable_mode(high) != nir_var_mem_shared)
continue;
struct entry *first = low->index < high->index ? low : high;
struct entry *second = low->index < high->index ? high : low;
if (try_vectorize_shared2(ctx, low, high, first, second)) {
low = NULL;
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
progress = true;
break;
}
}