From 02923e237d3c1b52ce833eb68e2ff990906645e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sat, 15 Jun 2024 00:36:12 -0400
Subject: [PATCH] nir: add hole_size parameter into the vectorize callback

It will be used to allow merging loads with a hole between them.

Reviewed-by: Qiang Yu
Reviewed-by: Alyssa Rosenzweig
Reviewed-by: Kenneth Graunke
Reviewed-by: Rhys Perry
Part-of:
---
 src/amd/common/ac_shader_util.c                        | 4 ++--
 src/amd/common/ac_shader_util.h                        | 4 ++--
 src/asahi/compiler/agx_compile.c                       | 7 +++++--
 src/broadcom/compiler/nir_to_vir.c                     | 4 ++++
 src/compiler/nir/nir.h                                 | 3 +++
 src/compiler/nir/nir_lower_shader_calls.c              | 3 ++-
 src/compiler/nir/nir_opt_load_store_vectorize.c        | 2 +-
 src/compiler/nir/tests/load_store_vectorizer_tests.cpp | 7 +++++--
 src/freedreno/ir3/ir3_nir.c                            | 5 ++++-
 src/gallium/auxiliary/nir/nir_to_tgsi.c                | 4 +++-
 src/intel/compiler/brw_nir.c                           | 3 ++-
 src/intel/compiler/brw_nir.h                           | 1 +
 src/intel/compiler/elk/elk_nir.c                       | 5 +++--
 src/intel/compiler/elk/elk_nir.h                       | 7 -------
 src/microsoft/compiler/nir_to_dxil.c                   | 3 ++-
 src/nouveau/codegen/nv50_ir_from_nir.cpp               | 5 +++++
 src/nouveau/compiler/nak_nir.c                         | 7 +++++--
 src/panfrost/compiler/bifrost_compile.c                | 8 ++++++--
 18 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c
index ade410ee391..171b6738a53 100644
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -113,10 +113,10 @@ void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
 }
 
 bool ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
-                                   unsigned num_components, nir_intrinsic_instr *low,
+                                   unsigned num_components, unsigned hole_size, nir_intrinsic_instr *low,
                                    nir_intrinsic_instr *high, void *data)
 {
-   if (num_components > 4)
+   if (num_components > 4 || hole_size)
       return false;
 
    bool is_scratch = false;
diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h
index d8b6e9b78c0..d2b024bedef 100644
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@@ -243,8 +243,8 @@ void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
                         nir_shader_compiler_options *options);
 
 bool ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
-                                   unsigned num_components, nir_intrinsic_instr *low,
-                                   nir_intrinsic_instr *high, void *data);
+                                   unsigned num_components, unsigned hole_size,
+                                   nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data);
 
 unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,
                                     bool writes_mrt0_alpha);
diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index d7a9f0ea738..bcb76ab810e 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -2795,9 +2795,12 @@ agx_optimize_loop_nir(nir_shader *nir)
 
 static bool
 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
-                 unsigned num_components, nir_intrinsic_instr *low,
-                 nir_intrinsic_instr *high, void *data)
+                 unsigned num_components, unsigned hole_size,
+                 nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data)
 {
+   if (hole_size)
+      return false;
+
    /* Must be aligned to the size of the load */
    unsigned align = nir_combined_align(align_mul, align_offset);
    if ((bit_size / 8) > align)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 5e38522f5c5..bc225e6dba4 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -2084,10 +2084,14 @@ static bool
 mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
                        unsigned bit_size,
                        unsigned num_components,
+                       unsigned hole_size,
                        nir_intrinsic_instr *low,
                        nir_intrinsic_instr *high,
                        void *data)
 {
+        if (hole_size)
+                return false;
+
         /* TMU general access only supports 32-bit vectors */
         if (bit_size > 32)
                 return false;
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 1b0304d5401..cb4df615e4e 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5896,6 +5896,9 @@ typedef bool (*nir_should_vectorize_mem_func)(unsigned align_mul,
                                               unsigned align_offset,
                                               unsigned bit_size,
                                               unsigned num_components,
+                                              /* The hole between low and
+                                               * high if they are not adjacent. */
+                                              unsigned hole_size,
                                               nir_intrinsic_instr *low,
                                               nir_intrinsic_instr *high,
                                               void *data);
diff --git a/src/compiler/nir/nir_lower_shader_calls.c b/src/compiler/nir/nir_lower_shader_calls.c
index 1e0ecb10a81..245d6dd4c19 100644
--- a/src/compiler/nir/nir_lower_shader_calls.c
+++ b/src/compiler/nir/nir_lower_shader_calls.c
@@ -1920,6 +1920,7 @@ should_vectorize(unsigned align_mul,
                  unsigned align_offset,
                  unsigned bit_size,
                  unsigned num_components,
+                 unsigned hole_size,
                  nir_intrinsic_instr *low,
                  nir_intrinsic_instr *high,
                  void *data)
 {
@@ -1933,7 +1934,7 @@ should_vectorize(unsigned align_mul,
 
    struct stack_op_vectorizer_state *state = data;
 
    return state->driver_callback(align_mul, align_offset,
-                                 bit_size, num_components,
+                                 bit_size, num_components, hole_size,
                                  low, high, state->driver_data);
 }
diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index ede9fe8a243..0b666aa716d 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -642,7 +642,7 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
 
    if (!ctx->options->callback(low->align_mul,
                                low->align_offset,
-                               new_bit_size, new_num_components,
+                               new_bit_size, new_num_components, 0,
                                low->intrin, high->intrin,
                                ctx->options->cb_data))
       return false;
diff --git a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
index ff707f7a6d2..72db7ad0142 100644
--- a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
+++ b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
@@ -71,7 +71,7 @@ protected:
    static bool mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
                                       unsigned bit_size,
-                                      unsigned num_components,
+                                      unsigned num_components, unsigned hole_size,
                                       nir_intrinsic_instr *low,
                                       nir_intrinsic_instr *high, void *data);
    static void shared_type_info(const struct glsl_type *type, unsigned *size,
                                 unsigned *align);
@@ -336,10 +336,13 @@ bool nir_load_store_vectorize_test::test_alu_def(
 
 bool nir_load_store_vectorize_test::mem_vectorize_callback(
    unsigned align_mul, unsigned align_offset, unsigned bit_size,
-   unsigned num_components,
+   unsigned num_components, unsigned hole_size,
    nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data)
 {
+   if (hole_size)
+      return false;
+
    /* Calculate a simple alignment, like how nir_intrinsic_align() does. */
    uint32_t align = align_mul;
    if (align_offset)
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 899b6e9a7ed..cb5e9fdab79 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -110,9 +110,12 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
 
 static bool
 ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size, unsigned num_components,
-                             nir_intrinsic_instr *low,
+                             unsigned hole_size, nir_intrinsic_instr *low,
                              nir_intrinsic_instr *high, void *data)
 {
+   if (hole_size)
+      return false;
+
    struct ir3_compiler *compiler = data;
    unsigned byte_size = bit_size / 8;
diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c
index af3a9920349..0701c0d44c5 100644
--- a/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -3265,13 +3265,15 @@ ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
    return 4;
 }
 
+/* TODO: These parameters are wrong. */
 static bool
 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
                         unsigned num_components, unsigned high_offset,
+                        unsigned hole_size,
                         nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                         void *data)
 {
-   if (bit_size != 32)
+   if (bit_size != 32 || hole_size)
       return false;
 
    /* Our offset alignment should aways be at least 4 bytes */
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 7035c244d98..5f11f53127a 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1411,6 +1411,7 @@ bool
 brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size,
                              unsigned num_components,
+                             unsigned hole_size,
                              nir_intrinsic_instr *low,
                              nir_intrinsic_instr *high,
                              void *data)
@@ -1419,7 +1420,7 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
     * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
     * we don't want to make a mess for the back-end.
     */
-   if (bit_size > 32)
+   if (bit_size > 32 || hole_size)
       return false;
 
    if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index a1f72a28392..169062f1c04 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -233,6 +233,7 @@ enum brw_reg_type brw_type_for_nir_type(const struct intel_device_info *devinfo,
 bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                                   unsigned bit_size,
                                   unsigned num_components,
+                                  unsigned hole_size,
                                   nir_intrinsic_instr *low,
                                   nir_intrinsic_instr *high,
                                   void *data);
diff --git a/src/intel/compiler/elk/elk_nir.c b/src/intel/compiler/elk/elk_nir.c
index d0ab5888900..3be383a85c0 100644
--- a/src/intel/compiler/elk/elk_nir.c
+++ b/src/intel/compiler/elk/elk_nir.c
@@ -1124,10 +1124,11 @@ elk_nir_link_shaders(const struct elk_compiler *compiler,
    }
 }
 
-bool
+static bool
 elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size,
                              unsigned num_components,
+                             unsigned hole_size,
                              nir_intrinsic_instr *low,
                              nir_intrinsic_instr *high,
                              void *data)
@@ -1136,7 +1137,7 @@ elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
     * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
     * we don't want to make a mess for the back-end.
     */
-   if (bit_size > 32)
+   if (bit_size > 32 || hole_size)
       return false;
 
    if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
diff --git a/src/intel/compiler/elk/elk_nir.h b/src/intel/compiler/elk/elk_nir.h
index f621fdcda19..8575a07b8f8 100644
--- a/src/intel/compiler/elk/elk_nir.h
+++ b/src/intel/compiler/elk/elk_nir.h
@@ -248,13 +248,6 @@ void elk_nir_apply_key(nir_shader *nir,
 unsigned elk_nir_api_subgroup_size(const nir_shader *nir,
                                    unsigned hw_subgroup_size);
 
-bool elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
-                                  unsigned bit_size,
-                                  unsigned num_components,
-                                  nir_intrinsic_instr *low,
-                                  nir_intrinsic_instr *high,
-                                  void *data);
-
 void elk_nir_analyze_ubo_ranges(const struct elk_compiler *compiler,
                                 nir_shader *nir,
                                 struct elk_ubo_range out_ranges[4]);
diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c
index edbe0aa81d7..6df5a379805 100644
--- a/src/microsoft/compiler/nir_to_dxil.c
+++ b/src/microsoft/compiler/nir_to_dxil.c
@@ -6229,10 +6229,11 @@ vectorize_filter(
    unsigned align_offset,
    unsigned bit_size,
    unsigned num_components,
+   unsigned hole_size,
    nir_intrinsic_instr *low, nir_intrinsic_instr *high,
    void *data)
 {
-   return util_is_power_of_two_nonzero(num_components);
+   return !hole_size && util_is_power_of_two_nonzero(num_components);
 }
 
 struct lower_mem_bit_sizes_data {
diff --git a/src/nouveau/codegen/nv50_ir_from_nir.cpp b/src/nouveau/codegen/nv50_ir_from_nir.cpp
index 9d8f48fcedf..47d3d650020 100644
--- a/src/nouveau/codegen/nv50_ir_from_nir.cpp
+++ b/src/nouveau/codegen/nv50_ir_from_nir.cpp
@@ -138,6 +138,7 @@ private:
                               unsigned align_offset,
                               unsigned bit_size,
                               unsigned num_components,
+                              unsigned hole_size,
                               nir_intrinsic_instr *low,
                               nir_intrinsic_instr *high,
                               void *cb_data);
@@ -1369,10 +1370,14 @@ Converter::memVectorizeCb(unsigned align_mul,
                           unsigned align_offset,
                           unsigned bit_size,
                           unsigned num_components,
+                          unsigned hole_size,
                           nir_intrinsic_instr *low,
                           nir_intrinsic_instr *high,
                           void *cb_data)
 {
+   if (hole_size)
+      return false;
+
    /*
     * Since we legalize these later with nir_lower_mem_access_bit_sizes,
     * we can optimistically combine anything that might be profitable
diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index 098a60d4ece..7476c932e69 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -799,8 +799,8 @@ nak_nir_remove_barriers(nir_shader *nir)
 static bool
 nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
                      unsigned bit_size, unsigned num_components,
-                     nir_intrinsic_instr *low, nir_intrinsic_instr *high,
-                     void *cb_data)
+                     unsigned hole_size, nir_intrinsic_instr *low,
+                     nir_intrinsic_instr *high, void *cb_data)
 {
    /*
    * Since we legalize these later with nir_lower_mem_access_bit_sizes,
@@ -808,6 +808,9 @@ nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
    */
    assert(util_is_power_of_two_nonzero(align_mul));
 
+   if (hole_size)
+      return false;
+
    unsigned max_bytes = 128u / 8u;
    if (low->intrinsic == nir_intrinsic_ldc_nv ||
        low->intrinsic == nir_intrinsic_ldcx_nv)
diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c
index eaea02848a2..248d0c55b48 100644
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@@ -4632,9 +4632,13 @@ mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
 
 static bool
 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
-                 unsigned num_components, nir_intrinsic_instr *low,
-                 nir_intrinsic_instr *high, void *data)
+                 unsigned num_components, unsigned hole_size,
+                 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
+                 void *data)
 {
+   if (hole_size)
+      return false;
+
    /* Must be aligned to the size of the load */
    unsigned align = nir_combined_align(align_mul, align_offset);
    if ((bit_size / 8) > align)
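
Illustration (not part of the patch): within this change the core pass still
passes 0 for hole_size (see new_bitsize_acceptable above) and every driver
callback rejects non-zero holes; the commit message says holes will only start
being reported later, to allow merging loads with a gap between them. A driver
callback could then opt in along the lines below. The function name and the
thresholds are hypothetical; only the nir_should_vectorize_mem_func signature,
nir_combined_align() and nir_intrinsic_infos come from NIR itself.

#include "nir.h"

/* Hypothetical driver callback for nir_opt_load_store_vectorize(). Loads may
 * be merged across a small hole because the extra lanes are fetched and then
 * ignored; stores never may, since that would also write the bytes in the
 * hole. The pass only pairs loads with loads and stores with stores, so
 * checking "low" is enough: intrinsics with a destination are loads. */
static bool
example_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
                         unsigned bit_size, unsigned num_components,
                         unsigned hole_size, nir_intrinsic_instr *low,
                         nir_intrinsic_instr *high, void *data)
{
   /* Keep the merged access within a 128-bit vector (assumed limit). */
   if (bit_size * num_components > 128)
      return false;

   if (hole_size) {
      /* Reject any hole on stores. */
      if (!nir_intrinsic_infos[low->intrinsic].has_dest)
         return false;

      /* Tolerate at most one dword of over-fetch (assumed threshold). */
      if (hole_size > 4)
         return false;
   }

   /* Require natural alignment of a single component. */
   return nir_combined_align(align_mul, align_offset) >= bit_size / 8u;
}

Such a callback would be plugged in through the callback and cb_data fields of
the vectorizer options, exactly like the per-driver callbacks updated above.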