ir3: Move load/store vectorization to finalize

Some frontends such as rusticl and turnip call the optimization loop before choosing the shared memory layout, in order to be able to delete variables that turn out to be unused. This means that we can't vectorize them until after the first run of the optimization loop. Other drivers also seem to do something similar. This also has the benefit that by delaying vectorization of UBOs until after they are lowered from derefs, we don't insert casts which remove the ability of nir_lower_explicit_io to insert a range, which was blocking the pushing of vectorized indirect UBO loads. This has a significant positive impact on fossil-db (see the totals below). Only doing vectorization later exposes a bug where vectorization could change the bitsize after we used it to determine which descriptor to use. It happened to work before because vectorization was usually done early. To fix it, move adjusting the descriptor to a new pass that happens after finalizing. Totals: MaxWaves: 2249140 -> 2281068 (+1.42%); split: +1.43%, -0.01% Instrs: 49624230 -> 49143117 (-0.97%); split: -1.14%, +0.17% CodeSize: 103796862 -> 104143744 (+0.33%); split: -0.98%, +1.31% NOPs: 8489860 -> 8512218 (+0.26%); split: -1.55%, +1.81% MOVs: 1531650 -> 1574911 (+2.82%); split: -1.37%, +4.20% Full: 1814334 -> 1748906 (-3.61%); split: -3.64%, +0.03% (ss): 1155395 -> 1128249 (-2.35%); split: -3.48%, +1.13% (sy): 608650 -> 567972 (-6.68%); split: -7.32%, +0.64% (ss)-stall: 4352550 -> 4340473 (-0.28%); split: -2.08%, +1.80% (sy)-stall: 17852259 -> 16943647 (-5.09%); split: -6.25%, +1.16% STPs: 24568 -> 24215 (-1.44%) LDPs: 37799 -> 37468 (-0.88%) Early-preamble: 115698 -> 113694 (-1.73%); split: +0.17%, -1.90% Cat0: 9345228 -> 9367782 (+0.24%); split: -1.41%, +1.65% Cat1: 2445265 -> 2549122 (+4.25%); split: -0.81%, +5.06% Cat2: 18704736 -> 18377519 (-1.75%); split: -1.76%, +0.01% Cat3: 14210303 -> 14130558 (-0.56%); split: -0.56%, +0.00% Cat4: 1346895 -> 1346462 (-0.03%); split: -0.03%, +0.00% Cat5: 1420418 -> 1420417 (-0.00%); 
split: -0.07%, +0.07% Cat6: 745590 -> 549358 (-26.32%); split: -26.66%, +0.34% Cat7: 1405795 -> 1401899 (-0.28%); split: -0.96%, +0.68% Totals from 79089 (48.19% of 164134) affected shaders: MaxWaves: 947648 -> 979576 (+3.37%); split: +3.40%, -0.03% Instrs: 38664140 -> 38183027 (-1.24%); split: -1.47%, +0.22% CodeSize: 80179110 -> 80525992 (+0.43%); split: -1.27%, +1.70% NOPs: 6880907 -> 6903265 (+0.32%); split: -1.91%, +2.23% MOVs: 1183855 -> 1227116 (+3.65%); split: -1.78%, +5.43% Full: 1107056 -> 1041628 (-5.91%); split: -5.96%, +0.05% (ss): 939342 -> 912196 (-2.89%); split: -4.28%, +1.39% (sy): 457959 -> 417281 (-8.88%); split: -9.73%, +0.85% (ss)-stall: 3664495 -> 3652418 (-0.33%); split: -2.47%, +2.14% (sy)-stall: 12266805 -> 11358193 (-7.41%); split: -9.10%, +1.69% STPs: 7494 -> 7141 (-4.71%) LDPs: 7050 -> 6719 (-4.70%) Early-preamble: 46339 -> 44335 (-4.32%); split: +0.43%, -4.75% Cat0: 7548630 -> 7571184 (+0.30%); split: -1.75%, +2.05% Cat1: 1823872 -> 1927729 (+5.69%); split: -1.09%, +6.78% Cat2: 14767716 -> 14440499 (-2.22%); split: -2.22%, +0.01% Cat3: 10630582 -> 10550837 (-0.75%); split: -0.75%, +0.00% Cat4: 1150090 -> 1149657 (-0.04%); split: -0.04%, +0.00% Cat5: 1068913 -> 1068912 (-0.00%); split: -0.09%, +0.09% Cat6: 554910 -> 358678 (-35.36%); split: -35.82%, +0.45% Cat7: 1119427 -> 1115531 (-0.35%); split: -1.20%, +0.86% Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34441>
2026-05-05 20:28:04 +02:00 · 2025-04-09 13:32:37 -04:00 · 2025-04-09 13:32:37 -04:00 · 9977c4d682
commit 9977c4d682
parent 2f93137308
2 changed files with 75 additions and 26 deletions
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@ -380,14 +380,6 @@ ir3_optimize_loop(struct ir3_compiler *compiler,
      };
      progress |= OPT(s, nir_opt_offsets, &offset_options);

-      nir_load_store_vectorize_options vectorize_opts = {
-         .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform,
-         .callback = ir3_nir_should_vectorize_mem,
-         .robust_modes = options->robust_modes,
-         .cb_data = compiler,
-      };
-      progress |= OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);
-
      if (lower_flrp != 0) {
         if (OPT(s, nir_lower_flrp, lower_flrp, false /* always_precise */)) {
            OPT(s, nir_opt_constant_folding);
@ -652,7 +644,20 @@ ir3_finalize_nir(struct ir3_compiler *compiler,
   bool idiv_progress = OPT(s, nir_opt_idiv_const, 8);
   idiv_progress |= OPT(s, nir_lower_idiv, &idiv_options);

-   if (idiv_progress)
+   /* Do load/store vectorization after the first opt loop to give us a chance
+    * to optimize lowered SSBO pointers. Without the first opt loop every
+    * SSBO load/store with a different pointer looks like it has a different
+    * descriptor, even when it doesn't.
+    */
+   nir_load_store_vectorize_options vectorize_opts = {
+      .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform,
+      .callback = ir3_nir_should_vectorize_mem,
+      .robust_modes = options->robust_modes,
+      .cb_data = compiler,
+   };
+   bool vectorize_progress = OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);
+
+   if (idiv_progress || vectorize_progress)
      ir3_optimize_loop(compiler, options, s);

   OPT(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@ -310,23 +310,6 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
      }
   }

-   /* Descriptor index has to be adjusted in the following cases:
-    *  - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
-    *    loads -- next-index descriptor will be able to do that;
-    *  - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
-    *    storage accesses of that size.
-    */
-   if ((dev->physical_device->info->a6xx.storage_16bit &&
-        !dev->physical_device->info->a6xx.has_isam_v &&
-        intrin->intrinsic == nir_intrinsic_load_ssbo &&
-        (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
-        intrin->def.bit_size > 16) ||
-       (dev->physical_device->info->a7xx.storage_8bit &&
-        ((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
-         (intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
-      descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
-   }
-
   nir_def *results[MAX_SETS] = { NULL };

   if (nir_scalar_is_const(scalar_idx)) {
@ -1099,6 +1082,62 @@ tu_nir_lower_fdm(nir_shader *shader, const struct lower_fdm_options *options)
                                        lower_fdm_instr, (void *)options);
 }

+static bool
+lower_ssbo_descriptor_instr(nir_builder *b, nir_intrinsic_instr *intrin,
+                            void *cb_data)
+{
+   struct tu_device *dev = (struct tu_device *)cb_data;
+
+   /* Descriptor index has to be adjusted in the following cases:
+    *  - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
+    *    loads -- next-index descriptor will be able to do that;
+    *  - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
+    *    storage accesses of that size.
+    */
+   if ((dev->physical_device->info->a6xx.storage_16bit &&
+        !dev->physical_device->info->a6xx.has_isam_v &&
+        intrin->intrinsic == nir_intrinsic_load_ssbo &&
+        (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
+        intrin->def.bit_size > 16) ||
+       (dev->physical_device->info->a7xx.storage_8bit &&
+        ((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
+         (intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
+      unsigned buffer_src;
+      if (intrin->intrinsic == nir_intrinsic_store_ssbo) {
+         /* This has the value first */
+         buffer_src = 1;
+      } else {
+         buffer_src = 0;
+      }
+
+      b->cursor = nir_before_instr(&intrin->instr);
+      nir_def *buffer = intrin->src[buffer_src].ssa;
+      assert(buffer->parent_instr->type == nir_instr_type_intrinsic);
+      nir_intrinsic_instr *bindless =
+         nir_instr_as_intrinsic(buffer->parent_instr);
+      assert(bindless->intrinsic == nir_intrinsic_bindless_resource_ir3);
+      nir_def *descriptor_idx = bindless->src[0].ssa;
+      descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
+      nir_def *new_buffer =
+         nir_bindless_resource_ir3(b, 32, descriptor_idx,
+                                   .desc_set = nir_intrinsic_desc_set(bindless));
+      nir_src_rewrite(&intrin->src[buffer_src], new_buffer);
+
+      return true;
+   }
+
+   return false;
+}
+
+static bool
+tu_nir_lower_ssbo_descriptor(nir_shader *shader,
+                             struct tu_device *dev)
+{
+   return nir_shader_intrinsics_pass(shader, lower_ssbo_descriptor_instr,
+                                     nir_metadata_control_flow,
+                                     (void *)dev);
+}
+
 static void
 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
 {
@ -2620,6 +2659,11 @@ tu_shader_create(struct tu_device *dev,

   ir3_finalize_nir(dev->compiler, &nir_options, nir);

+   /* This has to happen after finalizing, so that we know the final bitsize
+    * after vectorizing.
+    */
+   NIR_PASS(_, nir, tu_nir_lower_ssbo_descriptor, dev);
+
   const struct ir3_shader_options options = {
      .api_wavesize = key->api_wavesize,
      .real_wavesize = key->real_wavesize,