nir/opt_load_store_vectorize: allow a 4-byte hole between 2 loads

If there is a 4-byte hole between 2 loads, drivers can now optionally
vectorize the loads by including the hole between them, e.g.:
    4B load + 4B hole + 8B load --> 16B load

All vectorize callbacks already reject all holes, but AMD will want to
allow it.

radeonsi+ACO with the new vectorization callback:

TOTALS FROM AFFECTED SHADERS (25248/58918)
  VGPRs: 871116 -> 871872 (0.09 %)
  Spilled SGPRs: 397 -> 407 (2.52 %)
  Code Size: 43074536 -> 42496352 (-1.34 %) bytes

Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29398>
This commit is contained in:
Marek Olšák 2024-05-24 14:47:39 -04:00 committed by Marge Bot
parent 80c156422d
commit a44e5cfccf
2 changed files with 100 additions and 3 deletions

View file

@ -652,9 +652,14 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS)
return false;
unsigned low_size = low->intrin->num_components * get_bit_size(low) / 8;
/* The raw difference can be negative if the low and high instructions
 * overlap, so clamp the hole size to 0. */
unsigned hole_size =
MAX2(high->offset_signed - (low->offset_signed + low_size), 0);
if (!ctx->options->callback(low->align_mul,
low->align_offset,
new_bit_size, new_num_components, 0,
new_bit_size, new_num_components, hole_size,
low->intrin, high->intrin,
ctx->options->cb_data))
return false;
@ -1320,7 +1325,16 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
struct entry *second = low->index < high->index ? high : low;
uint64_t diff = high->offset_signed - low->offset_signed;
bool separate = diff > get_bit_size(low) / 8u * low->num_components;
/* Allow overfetching by 4 bytes, which can be rejected
* by the callback if needed.
*/
unsigned max_hole =
first->is_store ||
(ctx->options->has_shared2_amd &&
get_variable_mode(first) == nir_var_mem_shared) ? 0 : 4;
unsigned low_size = get_bit_size(low) / 8u * low->num_components;
bool separate = diff > max_hole + low_size;
if (separate) {
if (!ctx->options->has_shared2_amd ||
get_variable_mode(first) != nir_var_mem_shared)

View file

@ -83,6 +83,7 @@ protected:
std::map<unsigned, nir_def*> res_map;
unsigned max_components = 4;
bool overfetch = false;
unsigned max_hole_size = 0;
};
std::string
@ -345,7 +346,9 @@ bool nir_load_store_vectorize_test::mem_vectorize_callback(
{
nir_load_store_vectorize_test *test = (nir_load_store_vectorize_test *)data;
if (hole_size ||
assert(hole_size <= 4);
if (hole_size > test->max_hole_size ||
(!test->overfetch && !nir_num_components_valid(num_components)))
return false;
@ -2177,3 +2180,83 @@ TEST_F(nir_load_store_vectorize_test, ubo_vec7as8_vec1)
/* TODO: This is not merged by the pass, but we could implement it. */
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
}
TEST_F(nir_load_store_vectorize_test, ubo_vec2_hole1_vec1_disallowed)
{
/* vec2 at bytes 0-7 and a scalar at bytes 12-15: a 4-byte hole at 8-11. */
create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2);
create_load(nir_var_mem_ubo, 0, 12, 0x2, 32, 1);
nir_validate_shader(b->shader, NULL);
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
/* max_hole_size defaults to 0, so the callback rejects the merge and
 * both loads must survive the pass unchanged. */
EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
}
TEST_F(nir_load_store_vectorize_test, ubo_vec2_hole1_vec1)
{
/* Same layout as the _disallowed variant: vec2 at 0-7, scalar at 12-15,
 * leaving a 4-byte hole at bytes 8-11. */
create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2);
create_load(nir_var_mem_ubo, 0, 12, 0x2, 32, 1);
nir_validate_shader(b->shader, NULL);
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
/* With the callback accepting holes up to 4 bytes, both loads should be
 * merged into a single 16-byte vec4 load spanning the hole. */
this->max_hole_size = 4;
EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
this->max_hole_size = 0;
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1);
nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ubo, 0);
ASSERT_EQ(load->def.bit_size, 32);
ASSERT_EQ(load->def.num_components, 4);
ASSERT_EQ(nir_intrinsic_range_base(load), 0);
ASSERT_EQ(nir_intrinsic_range(load), 16);
/* Components x, y (first load) and w (second load) are read; z is the
 * hole and must stay unused. */
ASSERT_EQ(nir_def_components_read(&load->def), 1 | 2 | 8);
ASSERT_EQ(nir_src_as_uint(load->src[1]), 0);
EXPECT_INSTR_SWIZZLES(movs[0x1], load, "xy");
EXPECT_INSTR_SWIZZLES(movs[0x2], load, "w");
}
TEST_F(nir_load_store_vectorize_test, ubo_vec2_hole2_vec4_disallowed)
{
/* vec2 at bytes 0-7 and a scalar at bytes 16-19: an 8-byte hole at 8-15. */
create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2);
create_load(nir_var_mem_ubo, 0, 16, 0x2, 32, 1);
nir_validate_shader(b->shader, NULL);
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
/* The pass only allows 4-byte holes. */
/* Even with the callback permitting 8 bytes, the pass itself caps holes
 * at 4 bytes, so the loads must remain separate. */
this->max_hole_size = 8;
EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
this->max_hole_size = 0;
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
}
TEST_F(nir_load_store_vectorize_test, ubo_vec3_hole1_vec3)
{
/* vec3 at bytes 0-11 and vec3 at bytes 16-27: a 4-byte hole at 12-15.
 * Merging needs 3 + 1 (hole) + 3 = 7 components, padded to a vec8, so
 * overfetch must also be enabled for the callback to accept it. */
create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 3);
create_load(nir_var_mem_ubo, 0, 16, 0x2, 32, 3);
nir_validate_shader(b->shader, NULL);
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
this->overfetch = true;
this->max_hole_size = 4;
EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
this->max_hole_size = 0;
this->overfetch = false;
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1);
nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ubo, 0);
ASSERT_EQ(load->def.bit_size, 32);
ASSERT_EQ(load->def.num_components, 8);
ASSERT_EQ(nir_intrinsic_range_base(load), 0);
ASSERT_EQ(nir_intrinsic_range(load), 32);
/* 0x77 = components 0-2 (first vec3) and 4-6 (second vec3); component 3
 * is the hole and component 7 is overfetch padding — both unused. */
ASSERT_EQ(nir_def_components_read(&load->def), 0x77);
ASSERT_EQ(nir_src_as_uint(load->src[1]), 0);
EXPECT_INSTR_SWIZZLES(movs[0x1], load, "xyz");
EXPECT_INSTR_SWIZZLES(movs[0x2], load, "efg");
}