nak: optimize iadds with a uniform operand in address calculations

Instead of doing the iadd manually, we can use the uniform slot of the
ld/st/atom instruction, getting rid of the iadd altogether.
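
As a rough model (illustrative C only, not NAK or NIR code; the function
and parameter names are made up): before the pass the shader performs the
iadd itself and leaves the instruction's uniform slot at 0, afterwards the
uniform operand sits in the slot and the add happens in the memory
instruction's address path, leaving the old iadd dead:

   #include <stdint.h>

   // Before: explicit iadd in the shader; the ld's uniform slot is 0.
   static inline uint32_t load_before(uint64_t ubase, uint64_t off)
   {
      uint64_t addr = ubase + off;               // the iadd we want to kill
      return *(const uint32_t *)(uintptr_t)addr; // ld, uniform slot unused
   }

   // After: the uniform operand moves into the uniform slot and the add
   // is folded into the ld's address path; DCE removes the old iadd.
   static inline uint32_t load_after(uint64_t ubase, uint64_t off)
   {
      return *(const uint32_t *)(uintptr_t)(ubase + off);
   }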

Additionally, for global memory we can also consume a 32-bit offset instead
of requiring it to be 64-bit.
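
Sketching the same thing for the 32-bit case (again made-up C, assuming the
uniform slot carries the full 64-bit base): once the base is uniform, the
divergent part no longer has to be widened, so the u2u64 feeding the old
iadd can go away too:

   // With a uniform 64-bit base, a 32-bit per-thread offset suffices;
   // the widening happens implicitly in the address path.
   static inline uint32_t load_global_32off(uint64_t ubase, uint32_t off32)
   {
      return *(const uint32_t *)(uintptr_t)(ubase + (uint64_t)off32);
   }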

Totals from 158539 (13.07% of 1212873) affected shaders:
CodeSize: 2308216336 -> 2242231136 (-2.86%); split: -2.86%, +0.00%
Number of GPRs: 8682436 -> 8662675 (-0.23%); split: -0.26%, +0.04%
SLM Size: 238816 -> 238604 (-0.09%)
Static cycle count: 2169063422 -> 2147747544 (-0.98%); split: -0.99%, +0.01%
Spills to memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02%
Fills from memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02%
Spills to reg: 45053 -> 45273 (+0.49%); split: -0.04%, +0.53%
Fills from reg: 36385 -> 36757 (+1.02%); split: -0.04%, +1.06%
Max warps/SM: 6027232 -> 6034616 (+0.12%); split: +0.12%, -0.00%
Karol Herbst, 2026-03-15 21:16:30 +01:00 (committed by Karol Herbst)
parent eeadd23c09
commit 0b4705ec95

@@ -1130,6 +1130,113 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
   return progress;
}

static bool
is_divergent_phi(nir_instr *instr)
{
   if (instr->type != nir_instr_type_phi)
      return false;

   nir_phi_instr *phi = nir_instr_as_phi(instr);
   return nak_nir_phi_is_divergent(phi);
}
static bool
nak_nir_opt_uniform_address_impl(struct nir_builder *b,
                                 nir_intrinsic_instr *intr, void *cb_data)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_cmat_load_shared_nv:
   case nir_intrinsic_global_atomic_nv:
   case nir_intrinsic_load_global_nv:
   case nir_intrinsic_load_scratch_nv:
   case nir_intrinsic_load_shared_nv:
   case nir_intrinsic_shared_atomic_nv:
   case nir_intrinsic_store_global_nv:
   case nir_intrinsic_store_scratch_nv:
   case nir_intrinsic_store_shared_nv: {
      nir_src *offset_src = nir_get_io_offset_src(intr);
      nir_def *offset = offset_src->ssa;
      nir_src *uniform_offset_src = nir_get_io_uniform_offset_src(intr);
      nir_def *uniform_offset = uniform_offset_src->ssa;
      nir_block *use_block = intr->instr.block;

      assert(nir_src_as_uint(*uniform_offset_src) == 0);

      /* NAK can't collect vectors in non-uniform control flow, so don't
       * even try.
       */
      if (offset->bit_size == 64 && nak_block_is_divergent(use_block))
         return false;

      /* We ignore any constant offset. */
      if (nir_src_is_const(*offset_src))
         return false;

      /* If the source is already uniform, just swap the two sources, as the
       * uniform slot should be 0.
       */
      if (!nir_def_is_divergent_at_use_block(offset, use_block)) {
         if (is_divergent_phi(nir_def_instr(offset)))
            return false;

         nir_src_rewrite(uniform_offset_src, offset);
         nir_src_rewrite(offset_src, uniform_offset);
         return true;
      }

      nir_alu_instr *iadd = nir_def_as_alu_or_null(offset_src->ssa);
      if (!iadd || iadd->op != nir_op_iadd)
         return false;

      unsigned src0_div =
         nir_def_is_divergent_at_use_block(iadd->src[0].src.ssa, use_block);
      unsigned src1_div =
         nir_def_is_divergent_at_use_block(iadd->src[1].src.ssa, use_block);
      if (src0_div && src1_div)
         return false;

      b->cursor = nir_before_instr(&intr->instr);

      nir_def *addr, *uaddr;
      if (src0_div) {
         assert(!src1_div);
         addr = nir_ssa_for_alu_src(b, iadd, 0);
         uaddr = nir_ssa_for_alu_src(b, iadd, 1);
      } else {
         assert(src1_div);
         addr = nir_ssa_for_alu_src(b, iadd, 1);
         uaddr = nir_ssa_for_alu_src(b, iadd, 0);
      }

      if (is_divergent_phi(nir_def_instr(uaddr)))
         return false;

      /* We can remove a u2u64 on the non-uniform source. */
      if (addr->bit_size == 64) {
         nir_alu_instr *u2u64 = nir_def_as_alu_or_null(addr);
         if (u2u64 && u2u64->op == nir_op_u2u64)
            addr = nir_ssa_for_alu_src(b, u2u64, 0);
      }

      nir_src_rewrite(offset_src, addr);
      nir_src_rewrite(uniform_offset_src, uaddr);
      return true;
   }
   default:
      return false;
   }
}
/** This pass assumes it is run after nir_opt_offsets. */
static bool
nak_nir_opt_uniform_address(nir_shader *nir)
{
   if (nak_debug_no_ugpr())
      return false;

   nir_divergence_analysis(nir);

   return nir_shader_intrinsics_pass(nir, nak_nir_opt_uniform_address_impl,
                                     nir_metadata_control_flow, NULL);
}

static bool
nak_nir_opt_offset_shift_nv_impl(struct nir_builder *b,
                                 nir_intrinsic_instr *intrin, void *data)
@@ -1333,6 +1440,12 @@ nak_postprocess_nir(nir_shader *nir,
      .cb_data = nak,
   };
   OPT(nir, nir_opt_offsets, &nak_offset_options);

   if (nak->sm >= 73) {
      OPT(nir, nak_nir_opt_uniform_address);
      /* TODO: as we eliminate u2u64s we could fold more offsets in, however
       * this would require us to verify it doesn't overflow, which we can't.
       */
      /* OPT(nir, nir_opt_offsets, &nak_offset_options); */
   }

   /* Should run after nir_opt_offsets, because nir_opt_algebraic will move
    * iadds down the chain