nir/load_store_vectorize: check for interfering shared2 before vectorizing

Only affected shaders in radv_fossils are f1_23.

fossil-db (navi21):
Totals from 3 (0.00% of 79825) affected shaders:
Instrs: 2700 -> 2730 (+1.11%)
CodeSize: 17096 -> 17228 (+0.77%)
Latency: 8424 -> 8726 (+3.58%)
InvThroughput: 3768 -> 3778 (+0.27%); split: -0.05%, +0.32%
Copies: 224 -> 234 (+4.46%)
PreVGPRs: 291 -> 287 (-1.37%)
VALU: 1989 -> 1996 (+0.35%); split: -0.05%, +0.40%

fossil-db (gfx1201):
Totals from 3 (0.00% of 79839) affected shaders:
Instrs: 2862 -> 2908 (+1.61%)
CodeSize: 17868 -> 18064 (+1.10%)
Latency: 7567 -> 7854 (+3.79%)
InvThroughput: 2810 -> 2802 (-0.28%)
Copies: 122 -> 120 (-1.64%)
PreVGPRs: 291 -> 287 (-1.37%)
VALU: 1890 -> 1885 (-0.26%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13616
Backport-to: 25.1
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36442>
(cherry picked from commit eeddf6b9e2)
This commit is contained in:
Rhys Perry 2025-07-29 12:09:25 +01:00 committed by Eric Engestrom
parent 7247c50d44
commit f9295bb5ec
2 changed files with 50 additions and 18 deletions

View file

@ -3494,7 +3494,7 @@
"description": "nir/load_store_vectorize: check for interfering shared2 before vectorizing",
"nominated": true,
"nomination_type": 4,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View file

@ -56,7 +56,7 @@
struct intrinsic_info {
nir_variable_mode mode; /* 0 if the mode is obtained from the deref. */
nir_intrinsic_op op;
bool is_atomic;
bool is_unvectorizable;
/* Indices into nir_intrinsic::src[] or -1 if not applicable. */
int resource_src; /* resource (e.g. from vulkan_resource_index) */
int base_src; /* offset which it loads/stores from */
@ -71,9 +71,9 @@ static const struct intrinsic_info *
get_info(nir_intrinsic_op op)
{
switch (op) {
#define INFO(mode, op, atomic, res, base, deref, val, scale) \
#define INFO(mode, op, unvectorizable, res, base, deref, val, scale) \
case nir_intrinsic_##op: { \
static const struct intrinsic_info op##_info = { mode, nir_intrinsic_##op, atomic, res, base, deref, val, scale }; \
static const struct intrinsic_info op##_info = { mode, nir_intrinsic_##op, unvectorizable, res, base, deref, val, scale }; \
return &op##_info; \
}
#define LOAD(mode, op, res, base, deref, scale) INFO(mode, load_##op, false, res, base, deref, -1, scale)
@ -90,6 +90,8 @@ get_info(nir_intrinsic_op op)
STORE(0, deref, -1, -1, 0, 1, 1)
LOAD(nir_var_mem_shared, shared, -1, 0, -1, 1)
STORE(nir_var_mem_shared, shared, -1, 1, -1, 0, 1)
INFO(nir_var_mem_shared, load_shared2_amd, true, -1, 0, -1, -1, 1);
INFO(nir_var_mem_shared, store_shared2_amd, true, -1, 1, -1, 0, 1)
LOAD(nir_var_mem_global, global, -1, 0, -1, 1)
STORE(nir_var_mem_global, global, -1, 1, -1, 0, 1)
LOAD(nir_var_mem_global, global_constant, -1, 0, -1, 1)
@ -594,6 +596,9 @@ create_entry(void *mem_ctx,
const struct intrinsic_info *info,
nir_intrinsic_instr *intrin)
{
bool is_shared2 = intrin->intrinsic == nir_intrinsic_load_shared2_amd ||
intrin->intrinsic == nir_intrinsic_store_shared2_amd;
struct entry *entry = rzalloc(mem_ctx, struct entry);
entry->intrin = intrin;
entry->instr = &intrin->instr;
@ -601,6 +606,8 @@ create_entry(void *mem_ctx,
entry->is_store = entry->info->value_src >= 0;
entry->num_components =
entry->is_store ? intrin->num_components : nir_def_last_component_read(&intrin->def) + 1;
if (is_shared2)
entry->num_components = 1;
if (entry->info->deref_src >= 0) {
entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]);
@ -1035,11 +1042,37 @@ bindings_different_restrict(nir_shader *shader, struct entry *a, struct entry *b
}
static int64_t
compare_entries(struct entry *a, struct entry *b)
may_alias_internal(struct entry *a, struct entry *b, uint32_t a_offset, uint32_t b_offset)
{
/* use adjacency information */
/* TODO: we can look closer at the entry keys */
if (!entry_key_equals(a->key, b->key))
return INT64_MAX;
return b->offset_signed - a->offset_signed;
return true;
int64_t diff = (b->offset_signed + b_offset) - (a->offset_signed + a_offset);
/* with atomics, nir_intrinsic_instr::num_components can be 0 */
if (diff < 0)
return llabs(diff) < MAX2(b->num_components, 1u) * (get_bit_size(b) / 8u);
else
return diff < MAX2(a->num_components, 1u) * (get_bit_size(a) / 8u);
}
static unsigned
parse_shared2_offsets(struct entry *entry, uint32_t offsets[2])
{
if (entry->intrin->intrinsic != nir_intrinsic_load_shared2_amd &&
entry->intrin->intrinsic != nir_intrinsic_store_shared2_amd) {
offsets[0] = 0;
return 1;
}
uint32_t stride = get_bit_size(entry) / 8u;
if (nir_intrinsic_st64(entry->intrin))
stride *= 64;
offsets[0] = nir_intrinsic_offset0(entry->intrin) * stride;
offsets[1] = nir_intrinsic_offset1(entry->intrin) * stride;
return 2;
}
static bool
@ -1078,20 +1111,19 @@ may_alias(nir_shader *shader, struct entry *a, struct entry *b)
return true;
}
/* use adjacency information */
/* TODO: we can look closer at the entry keys */
int64_t diff = compare_entries(a, b);
if (diff != INT64_MAX) {
/* with atomics, nir_intrinsic_instr::num_components can be 0 */
if (diff < 0)
return llabs(diff) < MAX2(b->num_components, 1u) * (get_bit_size(b) / 8u);
else
return diff < MAX2(a->num_components, 1u) * (get_bit_size(a) / 8u);
uint32_t a_offsets[2], b_offsets[2] = { 0, 0 };
unsigned a_count = parse_shared2_offsets(a, a_offsets);
unsigned b_count = parse_shared2_offsets(b, b_offsets);
for (unsigned i = 0; i < a_count; i++) {
for (unsigned j = 0; j < b_count; j++) {
if (may_alias_internal(a, b, a_offsets[i], b_offsets[j]))
return true;
}
}
/* TODO: we can use deref information */
return true;
return false;
}
static bool
@ -1223,7 +1255,7 @@ can_vectorize(struct vectorize_ctx *ctx, struct entry *first, struct entry *seco
/* we can only vectorize non-volatile loads/stores of the same type and with
* the same access */
if (first->info != second->info || first->access != second->access ||
(first->access & ACCESS_VOLATILE) || first->info->is_atomic)
(first->access & ACCESS_VOLATILE) || first->info->is_unvectorizable)
return false;
if (first->intrin->intrinsic == nir_intrinsic_load_buffer_amd ||