nir/load_store_vectorize: hoist base addr instead of subtracting

Totals from 3130 (3.92% of 79839) affected shaders: (Navi48)
Instrs: 2634316 -> 2633652 (-0.03%); split: -0.06%, +0.04%
CodeSize: 13999784 -> 13996888 (-0.02%); split: -0.05%, +0.03%
SpillSGPRs: 1771 -> 1778 (+0.40%)
Latency: 27233464 -> 27230934 (-0.01%); split: -0.02%, +0.01%
InvThroughput: 4234587 -> 4234550 (-0.00%); split: -0.00%, +0.00%
VClause: 54684 -> 54689 (+0.01%)
SClause: 62743 -> 62912 (+0.27%); split: -0.08%, +0.35%
Copies: 162594 -> 163729 (+0.70%); split: -0.22%, +0.91%
PreSGPRs: 146909 -> 146914 (+0.00%); split: -0.01%, +0.01%
VALU: 1558771 -> 1558778 (+0.00%)
SALU: 337715 -> 338168 (+0.13%); split: -0.30%, +0.44%

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37163>
This commit is contained in:
Daniel Schürmann 2025-07-19 12:49:57 +02:00 committed by Marge Bot
parent cfba417316
commit a53190a426

View file

@ -892,6 +892,31 @@ subtract_deref(nir_builder *b, nir_deref_instr *deref, int64_t offset)
b, deref, nir_imm_intN_t(b, -offset, deref->def.bit_size));
}
/* Moves the instruction chain that computes a base address ("to_hoist" and,
 * recursively, its ALU sources) to just before "instr", so that the address
 * computation dominates the (earlier) vectorized load it gets attached to.
 *
 * Only ALU and load_const instructions are expected here: the vectorizer
 * guarantees that just the offset calculation differs between the loads
 * being merged.
 */
static void
hoist_base_addr(nir_instr *instr, nir_instr *to_hoist)
{
   /* Return if this instruction already dominates the first load. */
   if (to_hoist->block != instr->block || to_hoist->index <= instr->index)
      return;

   /* Only the offset calculation (consisting of ALU and load_const)
    * differs between the vectorized loads.
    */
   assert(to_hoist->type == nir_instr_type_load_const ||
          to_hoist->type == nir_instr_type_alu);

   if (to_hoist->type == nir_instr_type_alu) {
      /* For ALU, recursively hoist the sources first so that they still
       * dominate to_hoist after it is moved. */
      nir_alu_instr *alu = nir_instr_as_alu(to_hoist);
      for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
         hoist_base_addr(instr, alu->src[i].src.ssa->parent_instr);
   }

   nir_instr_move(nir_before_instr(instr), to_hoist);
   /* Keep the index consistent so later dominance checks (the "<=" test
    * above) treat the hoisted instruction as dominating "instr". */
   to_hoist->index = instr->index;
}
static void
vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
struct entry *low, struct entry *high,
@ -958,21 +983,16 @@ vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
/* update the offset */
if (first != low && info->base_src >= 0) {
/* let nir_opt_algebraic() remove this addition. this doesn't have much
* issues with subtracting 16 from expressions like "(i + 1) * 16" because
* nir_opt_algebraic() turns them into "i * 16 + 16" */
b->cursor = nir_before_instr(first->instr);
/* Hoist low base addr before first intrinsic. */
nir_def *base = low->intrin->src[info->base_src].ssa;
hoist_base_addr(first->instr, base->parent_instr);
nir_src_rewrite(&first->intrin->src[info->base_src], base);
if (nir_intrinsic_has_offset_shift(first->intrin)) {
nir_add_io_offset(b, first->intrin, -(int)(high_start / 8u));
} else {
/* TODO once all intrinsics that need a scaled offset use
* offset_shift, this old path can be removed.
*/
nir_def *new_base = first->intrin->src[info->base_src].ssa;
new_base = nir_iadd_imm(
b, new_base, -(int)(high_start / 8u / get_offset_scale(first)));
nir_src_rewrite(&first->intrin->src[info->base_src], new_base);
nir_intrinsic_set_offset_shift(first->intrin,
nir_intrinsic_offset_shift(low->intrin));
}
}
@ -1013,7 +1033,7 @@ vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
nir_intrinsic_set_range_base(first->intrin, old_low_range_base);
nir_intrinsic_set_range(first->intrin, range);
} else if (nir_intrinsic_has_base(first->intrin) && info->base_src == -1 && info->deref_src == -1) {
} else if (nir_intrinsic_has_base(first->intrin) && info->deref_src == -1) {
nir_intrinsic_set_base(first->intrin, nir_intrinsic_base(low->intrin));
}
@ -1737,6 +1757,8 @@ process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *blo
unsigned next_index = 0;
nir_foreach_instr_safe(instr, block) {
instr->index = next_index++;
if (handle_barrier(ctx, &progress, impl, instr))
continue;
@ -1760,7 +1782,7 @@ process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *blo
/* create entry */
struct entry *entry = create_entry(ctx, ctx, info, intrin);
entry->index = next_index++;
entry->index = next_index;
list_addtail(&entry->head, &ctx->entries[mode_index]);