From 8ce43b7765a43c23c7f7f64628534fad51a4ef62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 15 Jun 2024 00:16:08 -0400
Subject: [PATCH] nir/opt_load_store_vectorize: add entry::num_components

We will represent vec6..vec7, vec9..vec15 loads with 8 and 16
components respectively, so we need to track how many components
we really use.

This is a prerequisite for optimal merging up to vec16. Example:
    Step 1: vec4 + vec3 ==> vec7as8 (last component unused)
    Step 2: vec1 + vec7as8 ==> vec8 (last unused component dropped)

Without using the number of components read, the same example would end up
doing:
    Step 1: vec4 + vec3 ==> vec8
    Step 2: vec1 + vec8 ==> vec9 (fail)

Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29398>
---
 src/compiler/nir/nir.h                        |  6 ++++
 .../nir/nir_opt_load_store_vectorize.c        | 30 +++++++++++--------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 9d6801350eb..1b0304d5401 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5068,6 +5068,12 @@ nir_component_mask_t nir_src_components_read(const nir_src *src);
 nir_component_mask_t nir_def_components_read(const nir_def *def);
 bool nir_def_all_uses_are_fsat(const nir_def *def);
 
+static inline int
+nir_def_last_component_read(const nir_def *def)
+{
+    return (int)util_last_bit(nir_def_components_read(def)) - 1; /* index of the highest component read; presumably -1 if none is read -- confirm util_last_bit(0) == 0 */
+}
+
 static inline bool
 nir_def_is_unused(nir_def *ssa)
 {
diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index e095b3189c8..ede9fe8a243 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -151,6 +151,7 @@ struct entry {
 
    nir_instr *instr;
    nir_intrinsic_instr *intrin;
+   unsigned num_components;
    const struct intrinsic_info *info;
    enum gl_access_qualifier access;
    bool is_store;
@@ -553,6 +554,7 @@ create_entry(void *mem_ctx,
    entry->instr = &intrin->instr;
    entry->info = info;
    entry->is_store = entry->info->value_src >= 0;
+   entry->num_components = intrin->num_components;
 
    if (entry->info->deref_src >= 0) {
       entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]);
@@ -646,8 +648,8 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
       return false;
 
    if (low->is_store) {
-      unsigned low_size = low->intrin->num_components * get_bit_size(low);
-      unsigned high_size = high->intrin->num_components * get_bit_size(high);
+      unsigned low_size = low->num_components * get_bit_size(low);
+      unsigned high_size = high->num_components * get_bit_size(high);
 
       if (low_size % new_bit_size != 0)
          return false;
@@ -737,6 +739,7 @@ vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
 
    /* update the intrinsic */
    first->intrin->num_components = new_num_components;
+   first->num_components = nir_def_last_component_read(data) + 1;
 
    const struct intrinsic_info *info = first->info;
 
@@ -795,7 +798,7 @@ vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx,
                  unsigned new_bit_size, unsigned new_num_components,
                  unsigned high_start)
 {
-   ASSERTED unsigned low_size = low->intrin->num_components * get_bit_size(low);
+   ASSERTED unsigned low_size = low->num_components * get_bit_size(low);
    assert(low_size % new_bit_size == 0);
 
    b->cursor = nir_before_instr(second->instr);
@@ -842,6 +845,7 @@ vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx,
    if (nir_intrinsic_has_write_mask(second->intrin))
       nir_intrinsic_set_write_mask(second->intrin, write_mask);
    second->intrin->num_components = data->num_components;
+   second->num_components = data->num_components;
 
    const struct intrinsic_info *info = second->info;
    assert(info->value_src >= 0);
@@ -961,11 +965,11 @@ may_alias(nir_shader *shader, struct entry *a, struct entry *b)
    /* TODO: we can look closer at the entry keys */
    int64_t diff = compare_entries(a, b);
    if (diff != INT64_MAX) {
-      /* with atomics, intrin->num_components can be 0 */
+      /* with atomics, nir_intrinsic_instr::num_components can be 0 */
       if (diff < 0)
-         return llabs(diff) < MAX2(b->intrin->num_components, 1u) * (get_bit_size(b) / 8u);
+         return llabs(diff) < MAX2(b->num_components, 1u) * (get_bit_size(b) / 8u);
       else
-         return diff < MAX2(a->intrin->num_components, 1u) * (get_bit_size(a) / 8u);
+         return diff < MAX2(a->num_components, 1u) * (get_bit_size(a) / 8u);
    }
 
    /* TODO: we can use deref information */
@@ -1131,8 +1135,8 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    /* gather information */
    unsigned low_bit_size = get_bit_size(low);
    unsigned high_bit_size = get_bit_size(high);
-   unsigned low_size = low->intrin->num_components * low_bit_size;
-   unsigned high_size = high->intrin->num_components * high_bit_size;
+   unsigned low_size = low->num_components * low_bit_size;
+   unsigned high_size = high->num_components * high_bit_size;
    unsigned new_size = MAX2(diff * 8u + high_size, low_size);
 
    /* find a good bit size for the new load/store */
@@ -1179,8 +1183,8 @@ try_vectorize_shared2(struct vectorize_ctx *ctx,
 
    unsigned low_bit_size = get_bit_size(low);
    unsigned high_bit_size = get_bit_size(high);
-   unsigned low_size = low->intrin->num_components * low_bit_size / 8;
-   unsigned high_size = high->intrin->num_components * high_bit_size / 8;
+   unsigned low_size = low->num_components * low_bit_size / 8;
+   unsigned high_size = high->num_components * high_bit_size / 8;
    if ((low_size != 4 && low_size != 8) || (high_size != 4 && high_size != 8))
       return false;
    if (low_size != high_size)
@@ -1201,9 +1205,9 @@ try_vectorize_shared2(struct vectorize_ctx *ctx,
       return false;
 
    if (first->is_store) {
-      if (get_write_mask(low->intrin) != BITFIELD_MASK(low->intrin->num_components))
+      if (get_write_mask(low->intrin) != BITFIELD_MASK(low->num_components))
          return false;
-      if (get_write_mask(high->intrin) != BITFIELD_MASK(high->intrin->num_components))
+      if (get_write_mask(high->intrin) != BITFIELD_MASK(high->num_components))
          return false;
    }
 
@@ -1269,7 +1273,7 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
          struct entry *second = low->index < high->index ? high : low;
 
          uint64_t diff = high->offset_signed - low->offset_signed;
-         bool separate = diff > get_bit_size(low) / 8u * low->intrin->num_components;
+         bool separate = diff > get_bit_size(low) / 8u * low->num_components;
          if (separate) {
             if (!ctx->options->has_shared2_amd ||
                 get_variable_mode(first) != nir_var_mem_shared)