mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 18:18:06 +02:00
nir/opt_load_store_vectorize: create load_shared2_amd/store_shared2_amd
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13778>
This commit is contained in:
parent
dc835626b3
commit
778fc176b1
2 changed files with 106 additions and 15 deletions
|
|
@ -5426,6 +5426,7 @@ typedef struct {
|
|||
nir_variable_mode modes;
|
||||
nir_variable_mode robust_modes;
|
||||
void *cb_data;
|
||||
bool has_shared2_amd;
|
||||
} nir_load_store_vectorize_options;
|
||||
|
||||
bool nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_options *options);
|
||||
|
|
|
|||
|
|
@ -1104,9 +1104,7 @@ is_strided_vector(const struct glsl_type *type)
|
|||
}
|
||||
|
||||
static bool
|
||||
try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
||||
struct entry *low, struct entry *high,
|
||||
struct entry *first, struct entry *second)
|
||||
can_vectorize(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
|
||||
{
|
||||
if (!(get_variable_mode(first) & ctx->options->modes) ||
|
||||
!(get_variable_mode(second) & ctx->options->modes))
|
||||
|
|
@ -1115,16 +1113,27 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
|||
if (check_for_aliasing(ctx, first, second))
|
||||
return false;
|
||||
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
if (check_for_robustness(ctx, low, diff))
|
||||
return false;
|
||||
|
||||
/* we can only vectorize non-volatile loads/stores of the same type and with
|
||||
* the same access */
|
||||
if (first->info != second->info || first->access != second->access ||
|
||||
(first->access & ACCESS_VOLATILE) || first->info->is_atomic)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
||||
struct entry *low, struct entry *high,
|
||||
struct entry *first, struct entry *second)
|
||||
{
|
||||
if (!can_vectorize(ctx, first, second))
|
||||
return false;
|
||||
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
if (check_for_robustness(ctx, low, diff))
|
||||
return false;
|
||||
|
||||
/* don't attempt to vectorize accesses of row-major matrix columns */
|
||||
if (first->deref) {
|
||||
const struct glsl_type *first_type = first->deref->type;
|
||||
|
|
@ -1175,6 +1184,76 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
try_vectorize_shared2(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
||||
struct entry *low, struct entry *high,
|
||||
struct entry *first, struct entry *second)
|
||||
{
|
||||
if (!can_vectorize(ctx, first, second) || first->deref)
|
||||
return false;
|
||||
|
||||
unsigned low_bit_size = get_bit_size(low);
|
||||
unsigned high_bit_size = get_bit_size(high);
|
||||
unsigned low_size = low->intrin->num_components * low_bit_size / 8;
|
||||
unsigned high_size = high->intrin->num_components * high_bit_size / 8;
|
||||
if ((low_size != 4 && low_size != 8) || (high_size != 4 && high_size != 8))
|
||||
return false;
|
||||
if (low_size != high_size)
|
||||
return false;
|
||||
if (low->align_mul % low_size || low->align_offset % low_size)
|
||||
return false;
|
||||
if (high->align_mul % low_size || high->align_offset % low_size)
|
||||
return false;
|
||||
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
bool st64 = diff % (64 * low_size) == 0;
|
||||
unsigned stride = st64 ? 64 * low_size : low_size;
|
||||
if (diff % stride || diff > 255 * stride)
|
||||
return false;
|
||||
|
||||
/* try to avoid creating accesses we can't combine additions/offsets into */
|
||||
if (high->offset > 255 * stride || (st64 && high->offset % stride))
|
||||
return false;
|
||||
|
||||
if (first->is_store) {
|
||||
if (nir_intrinsic_write_mask(low->intrin) != BITFIELD_MASK(low->intrin->num_components))
|
||||
return false;
|
||||
if (nir_intrinsic_write_mask(high->intrin) != BITFIELD_MASK(high->intrin->num_components))
|
||||
return false;
|
||||
}
|
||||
|
||||
/* vectorize the accesses */
|
||||
nir_builder b;
|
||||
nir_builder_init(&b, impl);
|
||||
|
||||
b.cursor = nir_after_instr(first->is_store ? second->instr : first->instr);
|
||||
|
||||
nir_ssa_def *offset = first->intrin->src[first->is_store].ssa;
|
||||
offset = nir_iadd_imm(&b, offset, nir_intrinsic_base(first->intrin));
|
||||
if (first != low)
|
||||
offset = nir_iadd_imm(&b, offset, -(int)diff);
|
||||
|
||||
if (first->is_store) {
|
||||
nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
|
||||
nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
|
||||
nir_ssa_def *val = nir_vec2(&b, nir_bitcast_vector(&b, low_val, low_size * 8u),
|
||||
nir_bitcast_vector(&b, high_val, low_size * 8u));
|
||||
nir_store_shared2_amd(&b, val, offset, .offset1=diff/stride, .st64=st64);
|
||||
} else {
|
||||
nir_ssa_def *new_def = nir_load_shared2_amd(&b, low_size * 8u, offset, .offset1=diff/stride,
|
||||
.st64=st64);
|
||||
nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa,
|
||||
nir_bitcast_vector(&b, nir_channel(&b, new_def, 0), low_bit_size));
|
||||
nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa,
|
||||
nir_bitcast_vector(&b, nir_channel(&b, new_def, 1), high_bit_size));
|
||||
}
|
||||
|
||||
nir_instr_remove(first->instr);
|
||||
nir_instr_remove(second->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
update_align(struct entry *entry)
|
||||
{
|
||||
|
|
@ -1204,17 +1283,28 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
|
|||
if (!high)
|
||||
continue;
|
||||
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
if (diff > get_bit_size(low) / 8u * low->intrin->num_components)
|
||||
break;
|
||||
|
||||
struct entry *first = low->index < high->index ? low : high;
|
||||
struct entry *second = low->index < high->index ? high : low;
|
||||
|
||||
if (try_vectorize(impl, ctx, low, high, first, second)) {
|
||||
low = low->is_store ? second : first;
|
||||
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
|
||||
progress = true;
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
bool separate = diff > get_bit_size(low) / 8u * low->intrin->num_components;
|
||||
if (separate) {
|
||||
if (!ctx->options->has_shared2_amd ||
|
||||
get_variable_mode(first) != nir_var_mem_shared)
|
||||
break;
|
||||
|
||||
if (try_vectorize_shared2(impl, ctx, low, high, first, second)) {
|
||||
low = NULL;
|
||||
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
|
||||
progress = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (try_vectorize(impl, ctx, low, high, first, second)) {
|
||||
low = low->is_store ? second : first;
|
||||
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue