From 0fc02a8b69a0a7e9265c795778cc0db8916fcdce Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 15 Mar 2026 20:47:04 +0100 Subject: [PATCH 1/7] nir: add nir_intrinsic_cmat_load_shared_nv to nir_get_io_offset_src_number --- src/compiler/nir/nir_lower_io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 6bb281e6cba..86f7a3591fd 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -1018,6 +1018,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr) case nir_intrinsic_load_push_data_intel: case nir_intrinsic_vild_nv: case nir_intrinsic_load_shader_indirect_data_intel: + case nir_intrinsic_cmat_load_shared_nv: return 0; case nir_intrinsic_load_ubo: case nir_intrinsic_load_ubo_vec4: From 584ba918a1c809f9d5cd3c377111a65a15e32ecd Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Tue, 17 Mar 2026 00:37:22 +0100 Subject: [PATCH 2/7] nak: add nak_nir_phi_is_divergent helper --- src/nouveau/compiler/nak_nir_lower_cf.c | 61 ++++++++++++++----------- src/nouveau/compiler/nak_private.h | 1 + 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/src/nouveau/compiler/nak_nir_lower_cf.c b/src/nouveau/compiler/nak_nir_lower_cf.c index 817f6de4d73..8b146f1b0ae 100644 --- a/src/nouveau/compiler/nak_nir_lower_cf.c +++ b/src/nouveau/compiler/nak_nir_lower_cf.c @@ -376,6 +376,39 @@ lower_cf_list(nir_builder *b, nir_def *esc_reg, struct scope *parent_scope, } } +bool +nak_nir_phi_is_divergent(nir_phi_instr *phi) +{ + bool divergent = false; + nir_foreach_phi_src(phi_src, phi) { + /* There is a tricky case we need to care about here where a + * convergent block has a divergent dominator. This can happen + * if, for instance, you have the following loop: + * + * loop { + * if (div) { + * %20 = load_ubo(0, 0); + * } else { + * terminate; + * } + * } + * use(%20); + * + * In this case, the load_ubo() dominates the use() even though + * the load_ubo() exists in divergent control-flow. In this + * case, we simply flag the whole phi divergent because we + * don't want to deal with inserting a r2ur somewhere. + */ + if (phi_src->pred->divergent || phi_src->src.ssa->divergent || + nir_def_block(phi_src->src.ssa)->divergent) { + divergent = true; + break; + } + } + + return divergent; +} + static void recompute_phi_divergence_impl(nir_function_impl *impl) { @@ -388,33 +421,7 @@ recompute_phi_divergence_impl(nir_function_impl *impl) break; nir_phi_instr *phi = nir_instr_as_phi(instr); - - bool divergent = false; - nir_foreach_phi_src(phi_src, phi) { - /* There is a tricky case we need to care about here where a - * convergent block has a divergent dominator. This can happen - * if, for instance, you have the following loop: - * - * loop { - * if (div) { - * %20 = load_ubo(0, 0); - * } else { - * terminate; - * } - * } - * use(%20); - * - * In this case, the load_ubo() dominates the use() even though - * the load_ubo() exists in divergent control-flow. In this - * case, we simply flag the whole phi divergent because we - * don't want to deal with inserting a r2ur somewhere. 
- */ - if (phi_src->pred->divergent || phi_src->src.ssa->divergent || - nir_def_block(phi_src->src.ssa)->divergent) { - divergent = true; - break; - } - } + bool divergent = nak_nir_phi_is_divergent(phi); if (divergent != phi->def.divergent) { phi->def.divergent = divergent; diff --git a/src/nouveau/compiler/nak_private.h b/src/nouveau/compiler/nak_private.h index 588eb897eb4..896f00d279c 100644 --- a/src/nouveau/compiler/nak_private.h +++ b/src/nouveau/compiler/nak_private.h @@ -370,6 +370,7 @@ bool nak_nir_lower_cmat(nir_shader *shader, const struct nak_compiler *nak); * writing uregs from these blocks. */ bool nak_block_is_divergent(const nir_block *block); +bool nak_nir_phi_is_divergent(nir_phi_instr *phi); void nak_optimize_nir(nir_shader *nir, const struct nak_compiler *nak); From 53bfdb400c2fe6f451648c9c344e3d318e26c96d Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 14 Jan 2026 17:59:20 +0100 Subject: [PATCH 3/7] nak/sm70: add helper for memory load store addresses This also makes the selection of 32 vs 64 bit addresses based on the actual source in the IR. --- src/nouveau/compiler/nak/nvdisasm_tests.rs | 2 +- src/nouveau/compiler/nak/sm70_encode.rs | 62 +++++++++++++++------- 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/src/nouveau/compiler/nak/nvdisasm_tests.rs b/src/nouveau/compiler/nak/nvdisasm_tests.rs index fbcf7c82013..e055bd136ab 100644 --- a/src/nouveau/compiler/nak/nvdisasm_tests.rs +++ b/src/nouveau/compiler/nak/nvdisasm_tests.rs @@ -288,7 +288,7 @@ pub fn test_ldc() { #[test] pub fn test_ld_st_atom() { let r0 = RegRef::new(RegFile::GPR, 0, 1); - let r1 = RegRef::new(RegFile::GPR, 1, 1); + let r1 = RegRef::new(RegFile::GPR, 1, 2); let r2 = RegRef::new(RegFile::GPR, 2, 1); let r3 = RegRef::new(RegFile::GPR, 3, 1); let p4 = RegRef::new(RegFile::Pred, 4, 1); diff --git a/src/nouveau/compiler/nak/sm70_encode.rs b/src/nouveau/compiler/nak/sm70_encode.rs index 2f7694024df..b279a1f37ad 100644 --- a/src/nouveau/compiler/nak/sm70_encode.rs +++ b/src/nouveau/compiler/nak/sm70_encode.rs @@ -108,6 +108,29 @@ impl SM70Encoder<'_> { } } + fn set_reg_addr( + &mut self, + range: Range, + src: &Src, + size_bit: usize, + ) { + assert!(src.is_unmodified()); + match src.src_ref { + SrcRef::Zero => { + self.set_reg(range, self.zero_reg(RegFile::GPR)); + // We always treat a zero GPR as 32 bits, so the UGPR source + // can be 32 bits. + self.set_bit(size_bit, false); + } + SrcRef::Reg(reg) => { + self.set_reg(range, reg); + assert!(reg.comps() <= 2); + self.set_bit(size_bit, reg.comps() == 2); + } + _ => panic!("Not a register"), + } + } + fn set_ureg_src(&mut self, start: usize, src: &Src) { assert!(src.src_mod.is_none()); match src.src_ref { @@ -117,6 +140,24 @@ impl SM70Encoder<'_> { } } + fn set_ureg_addr(&mut self, start: usize, src: &Src, size_bit: usize) { + assert!(src.src_mod.is_none()); + match src.src_ref { + SrcRef::Zero => { + self.set_ureg(start, self.zero_reg(RegFile::UGPR)); + // We always treat a zero UGPR as 64 bits, so the GPR source + // can be 64 bit. 
+ self.set_bit(size_bit, true); + } + SrcRef::Reg(reg) => { + self.set_ureg(start, reg); + assert!(reg.comps() <= 2); + self.set_bit(size_bit, reg.comps() == 2); + } + _ => panic!("Not a register"), + } + } + fn set_pred_dst(&mut self, range: Range, dst: &Dst) { match dst { Dst::None => self.set_pred_reg(range, self.true_reg(RegFile::Pred)), @@ -3009,13 +3050,6 @@ impl SM70Encoder<'_> { } fn set_mem_access(&mut self, access: &MemAccess) { - self.set_field( - 72..73, - match access.space.addr_type() { - MemAddrType::A32 => 0_u8, - MemAddrType::A64 => 1_u8, - }, - ); self.set_mem_type(73..76, access.mem_type); self.set_mem_order(&access.order); self.set_eviction_priority(&access.eviction_priority); @@ -3179,7 +3213,7 @@ impl SM70Op for OpLd { } e.set_dst(&self.dst); - e.set_reg_src(24..32, &self.addr); + e.set_reg_addr(24..32, &self.addr, 72); e.set_field(40..64, self.offset); } } @@ -3314,7 +3348,7 @@ impl SM70Op for OpSt { } } - e.set_reg_src(24..32, &self.addr); + e.set_reg_addr(24..32, &self.addr, 72); e.set_reg_src(32..40, &self.data); e.set_field(40..64, self.offset); } @@ -3421,14 +3455,6 @@ impl SM70Op for OpAtom { e.set_atom_op(87..91, self.atom_op); } - e.set_field( - 72..73, - match self.mem_space.addr_type() { - MemAddrType::A32 => 0_u8, - MemAddrType::A64 => 1_u8, - }, - ); - e.set_mem_order(&self.mem_order); e.set_eviction_priority(&self.mem_eviction_priority); assert_eq!(self.addr_stride, OffsetStride::X1); @@ -3468,7 +3494,7 @@ impl SM70Op for OpAtom { } e.set_dst(&self.dst); - e.set_reg_src(24..32, &self.addr); + e.set_reg_addr(24..32, &self.addr, 72); e.set_field(40..64, self.addr_offset); e.set_atom_type(self.atom_type, false); } From e639aa342d43aa561832b487b82cba358b1f9e85 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 3 May 2026 11:46:12 +0200 Subject: [PATCH 4/7] nak: wire up UGPR Ld/St/Atom encoding --- src/nouveau/compiler/nak/from_nir.rs | 11 ++ src/nouveau/compiler/nak/hw_tests.rs | 4 + src/nouveau/compiler/nak/ir.rs | 32 +++++- src/nouveau/compiler/nak/lower_copy_swap.rs | 2 + src/nouveau/compiler/nak/nvdisasm_tests.rs | 36 +++++-- src/nouveau/compiler/nak/sm70_encode.rs | 108 +++++++++++++++++--- 6 files changed, 170 insertions(+), 23 deletions(-) diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index 3bc4aaf0e08..a7665ed4ec8 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -2992,6 +2992,7 @@ impl<'a> ShaderFromNir<'a> { dst.clone().into() }, addr: addr, + uniform_address: Src::ZERO, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3018,6 +3019,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpAtom { dst: dst.clone().into(), addr: addr, + uniform_address: Src::ZERO, cmpr: cmpr, data: data, atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate), @@ -3224,6 +3226,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpLd { dst: dst.clone().into(), addr: addr, + uniform_addr: Src::ZERO, pred: pred, offset: intrin.base(), stride: OffsetStride::X1, @@ -3335,6 +3338,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpLd { dst: dst.clone().into(), addr: addr, + uniform_addr: Src::ZERO, pred: true.into(), offset: intrin.base(), stride: OffsetStride::X1, @@ -3358,6 +3362,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpLd { dst: dst.clone().into(), addr: addr, + uniform_addr: Src::ZERO, pred: true.into(), offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), @@ -3678,6 +3683,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpAtom { dst: dst.clone().into(), addr: addr, + uniform_address: 
Src::ZERO, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3704,6 +3710,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpAtom { dst: dst.clone().into(), addr: addr, + uniform_address: Src::ZERO, cmpr: cmpr, data: data, atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate), @@ -3736,6 +3743,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpSt { addr: addr, + uniform_addr: Src::ZERO, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3767,6 +3775,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpSt { addr: addr, + uniform_addr: Src::ZERO, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3788,6 +3797,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpSt { addr: addr, + uniform_addr: Src::ZERO, data: data, offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), @@ -3907,6 +3917,7 @@ impl<'a> ShaderFromNir<'a> { mat_size, mat_count, addr, + uniform_addr: Src::ZERO, offset: intrin.base(), }); self.set_dst(&intrin.def, dst); diff --git a/src/nouveau/compiler/nak/hw_tests.rs b/src/nouveau/compiler/nak/hw_tests.rs index bb6e6d8d5aa..09e4a0744de 100644 --- a/src/nouveau/compiler/nak/hw_tests.rs +++ b/src/nouveau/compiler/nak/hw_tests.rs @@ -154,6 +154,7 @@ impl<'a> TestShaderBuilder<'a> { self.push_op(OpLd { dst: dst.clone().into(), addr: self.data_addr.clone().into(), + uniform_addr: Src::ZERO, pred: true.into(), offset: offset.into(), access: access, @@ -178,6 +179,7 @@ impl<'a> TestShaderBuilder<'a> { assert!(data.comps() == comps); self.push_op(OpSt { addr: self.data_addr.clone().into(), + uniform_addr: Src::ZERO, data: data.into(), offset: offset.into(), access: access, @@ -1734,6 +1736,7 @@ fn test_op_ldsm() { let offset = b.imul(lane_id.into(), 16.into()); b.push_op(OpSt { addr: offset.into(), + uniform_addr: Src::ZERO, data: input.into(), offset: 0, access: MemAccess { @@ -1755,6 +1758,7 @@ fn test_op_ldsm() { mat_size: LdsmSize::M8N8, mat_count: 4, addr: addr.into(), + uniform_addr: Src::ZERO, offset: 0, }); b.st_test_data(16, MemType::B128, res); diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 8d5feffb9c7..78febca91c6 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -6502,6 +6502,9 @@ pub struct OpLd { #[src_type(GPR)] pub addr: Src, + #[src_type(GPR)] + pub uniform_addr: Src, + /// On false the load returns 0 #[src_type(Pred)] pub pred: Src, @@ -6513,7 +6516,11 @@ pub struct OpLd { impl DisplayOp for OpLd { fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "ld{} [{}{}", self.access, self.addr, self.stride)?; + write!( + f, + "ld{} [{}{}+{}", + self.access, self.addr, self.stride, self.uniform_addr + )?; if self.offset > 0 { write!(f, "+{:#x}", self.offset)?; } @@ -6602,6 +6609,9 @@ pub struct OpLdsm { #[src_type(SSA)] pub addr: Src, + #[src_type(SSA)] + pub uniform_addr: Src, + pub offset: i32, } @@ -6658,6 +6668,9 @@ pub struct OpSt { #[src_type(SSA)] pub data: Src, + #[src_type(GPR)] + pub uniform_addr: Src, + pub offset: i32, pub stride: OffsetStride, pub access: MemAccess, @@ -6665,7 +6678,11 @@ pub struct OpSt { impl DisplayOp for OpSt { fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "st{} [{}{}", self.access, self.addr, self.stride)?; + write!( + f, + "st{} [{}{}+{}", + self.access, self.addr, self.stride, self.uniform_addr + )?; if self.offset > 0 { write!(f, "+{:#x}", self.offset)?; } @@ -6711,6 +6728,9 @@ pub struct OpAtom { #[src_type(GPR)] pub addr: Src, + #[src_type(GPR)] + pub uniform_address: Src, + #[src_type(GPR)] pub cmpr: 
Src, @@ -6743,10 +6763,16 @@ impl DisplayOp for OpAtom { if !self.addr.is_zero() { write!(f, "{}{}", self.addr, self.addr_stride)?; } - if self.addr_offset > 0 { + if !self.uniform_address.is_zero() { if !self.addr.is_zero() { write!(f, "+")?; } + write!(f, "{}", self.uniform_address)?; + } + if self.addr_offset > 0 { + if !self.addr.is_zero() || !self.uniform_address.is_zero() { + write!(f, "+")?; + } write!(f, "{:#x}", self.addr_offset)?; } write!(f, "]")?; diff --git a/src/nouveau/compiler/nak/lower_copy_swap.rs b/src/nouveau/compiler/nak/lower_copy_swap.rs index 19e7f7178fe..f1f34978849 100644 --- a/src/nouveau/compiler/nak/lower_copy_swap.rs +++ b/src/nouveau/compiler/nak/lower_copy_swap.rs @@ -95,6 +95,7 @@ impl LowerCopySwap { b.push_op(OpLd { dst: copy.dst, addr: Src::ZERO, + uniform_addr: Src::ZERO, pred: true.into(), offset: addr.try_into().unwrap(), stride: OffsetStride::X1, @@ -175,6 +176,7 @@ impl LowerCopySwap { self.slm_size = max(self.slm_size, addr + 4); b.push_op(OpSt { addr: Src::ZERO, + uniform_addr: Src::ZERO, data: copy.src, offset: addr.try_into().unwrap(), stride: OffsetStride::X1, diff --git a/src/nouveau/compiler/nak/nvdisasm_tests.rs b/src/nouveau/compiler/nak/nvdisasm_tests.rs index e055bd136ab..c845298a73f 100644 --- a/src/nouveau/compiler/nak/nvdisasm_tests.rs +++ b/src/nouveau/compiler/nak/nvdisasm_tests.rs @@ -292,6 +292,7 @@ pub fn test_ld_st_atom() { let r2 = RegRef::new(RegFile::GPR, 2, 1); let r3 = RegRef::new(RegFile::GPR, 3, 1); let p4 = RegRef::new(RegFile::Pred, 4, 1); + let ur2 = RegRef::new(RegFile::UGPR, 2, 2); let order = MemOrder::Strong(MemScope::CTA); @@ -318,6 +319,18 @@ pub fn test_ld_st_atom() { { for addr_stride in [OffsetStride::X1, OffsetStride::X8] { let cta = if sm >= 80 { "sm" } else { "cta" }; + let r1_str = + if sm >= 75 && matches!(space, MemSpace::Global(_)) { + "r1.64" + } else { + "r1" + }; + let urz = if sm >= 73 { + SrcRef::Reg(ur2).into() + } else { + Src::ZERO + }; + let urz_str = if sm >= 73 { "+ur2" } else { "" }; let pri = match space { MemSpace::Global(_) => MemEvictionPriority::First, @@ -339,6 +352,7 @@ pub fn test_ld_st_atom() { let instr = OpLd { dst: Dst::Reg(r0), addr: SrcRef::Reg(r1).into(), + uniform_addr: urz.clone(), pred: if matches!(space, MemSpace::Global(_)) && sm >= 73 { @@ -353,7 +367,7 @@ pub fn test_ld_st_atom() { let expected = match space { MemSpace::Global(_) if sm >= 73 => { format!( - "ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}], p4;" + "ldg.e.ef.strong.{cta} r0, [{r1_str}{urz_str}+{addr_offset_str}], p4;" ) } MemSpace::Global(_) => { @@ -363,17 +377,20 @@ pub fn test_ld_st_atom() { } MemSpace::Shared => { format!( - "lds r0, [r1{addr_stride}+{addr_offset_str}];" + "lds r0, [{r1_str}{addr_stride}{urz_str}+{addr_offset_str}];" ) } MemSpace::Local => { - format!("ldl r0, [r1+{addr_offset_str}];") + format!( + "ldl r0, [{r1_str}{urz_str}+{addr_offset_str}];" + ) } }; c.push(instr, expected); let instr = OpSt { addr: SrcRef::Reg(r1).into(), + uniform_addr: urz.clone(), data: SrcRef::Reg(r2).into(), offset: addr_offset, access: access.clone(), @@ -382,16 +399,18 @@ pub fn test_ld_st_atom() { let expected = match space { MemSpace::Global(_) => { format!( - "stg.e.ef.strong.{cta} [r1+{addr_offset_str}], r2;" + "stg.e.ef.strong.{cta} [{r1_str}{urz_str}+{addr_offset_str}], r2;" ) } MemSpace::Shared => { format!( - "sts [r1{addr_stride}+{addr_offset_str}], r2;" + "sts [{r1_str}{addr_stride}{urz_str}+{addr_offset_str}], r2;" ) } MemSpace::Local => { - format!("stl [r1+{addr_offset_str}], r2;") + format!( 
+ "stl [{r1_str}{urz_str}+{addr_offset_str}], r2;" + ) } }; c.push(instr, expected); @@ -405,6 +424,7 @@ pub fn test_ld_st_atom() { Dst::None }, addr: SrcRef::Reg(r1).into(), + uniform_address: urz.clone(), data: SrcRef::Reg(r2).into(), atom_op: AtomOp::Add, cmpr: SrcRef::Reg(r3).into(), @@ -429,7 +449,7 @@ pub fn test_ld_st_atom() { }; let dst = if use_dst { "pt, r0, " } else { "" }; - format!("{op}.e.add.ef{atom_type_str}.strong.{cta} {dst}[r1+{addr_offset_str}], r2;") + format!("{op}.e.add.ef{atom_type_str}.strong.{cta} {dst}[{r1_str}{urz_str}+{addr_offset_str}], r2;") } MemSpace::Shared => { if atom_type.is_float() { @@ -439,7 +459,7 @@ pub fn test_ld_st_atom() { continue; } let dst = if use_dst { "r0" } else { "rz" }; - format!("atoms.add{atom_type_str} {dst}, [r1{addr_stride}+{addr_offset_str}], r2;") + format!("atoms.add{atom_type_str} {dst}, [{r1_str}{addr_stride}{urz_str}+{addr_offset_str}], r2;") } MemSpace::Local => continue, }; diff --git a/src/nouveau/compiler/nak/sm70_encode.rs b/src/nouveau/compiler/nak/sm70_encode.rs index b279a1f37ad..07736e039b6 100644 --- a/src/nouveau/compiler/nak/sm70_encode.rs +++ b/src/nouveau/compiler/nak/sm70_encode.rs @@ -3170,15 +3170,21 @@ impl SM70Op for OpLd { } fn encode(&self, e: &mut SM70Encoder<'_>) { + let has_ugpr = e.sm >= 73; match self.access.space { MemSpace::Global(_) => { - e.set_opcode(0x381); assert_eq!(self.stride, OffsetStride::X1); - if e.sm >= 73 { + + if has_ugpr { + e.set_opcode(0x981); + e.set_reg_addr(24..32, &self.addr, 90); + e.set_ureg_addr(32, &self.uniform_addr, 72); e.set_rev_pred_src(64..67, 67, &self.pred); } else { - assert!(self.pred.is_true()); + e.set_opcode(0x381); + e.set_reg_addr(24..32, &self.addr, 72); } + e.set_pred_dst(81..84, &Dst::None); e.set_mem_access(&self.access); } @@ -3186,6 +3192,10 @@ impl SM70Op for OpLd { assert!(self.pred.is_true()); assert_eq!(self.stride, OffsetStride::X1); e.set_opcode(0x983); + e.set_reg_src(24..32, &self.addr); + if has_ugpr { + e.set_ureg_src(32, &self.uniform_addr); + } e.set_field(84..87, 1_u8); e.set_mem_type(73..76, self.access.mem_type); @@ -3199,6 +3209,10 @@ impl SM70Op for OpLd { e.set_opcode(0x984); assert!(self.pred.is_true()); + e.set_reg_src(24..32, &self.addr); + if has_ugpr { + e.set_ureg_src(32, &self.uniform_addr); + } e.set_mem_type(73..76, self.access.mem_type); assert!(self.access.order == MemOrder::Strong(MemScope::CTA)); assert!( @@ -3213,8 +3227,11 @@ impl SM70Op for OpLd { } e.set_dst(&self.dst); - e.set_reg_addr(24..32, &self.addr, 72); e.set_field(40..64, self.offset); + // We always enable UGPR mode, because the .E bit changes + // which source it applies to depending on it. + // This way it always applies to the UGPR. 
+ e.set_bit(91, has_ugpr); } } @@ -3315,15 +3332,30 @@ impl SM70Op for OpSt { } fn encode(&self, e: &mut SM70Encoder<'_>) { + let has_ugpr = e.sm >= 75; match self.access.space { MemSpace::Global(_) => { - e.set_opcode(0x386); assert_eq!(self.stride, OffsetStride::X1); + if has_ugpr { + e.set_opcode(0x986); + e.set_reg_addr(24..32, &self.addr, 90); + e.set_ureg_addr(64, &self.uniform_addr, 72); + } else { + e.set_opcode(0x386); + e.set_reg_addr(24..32, &self.addr, 72); + } e.set_mem_access(&self.access); } MemSpace::Local => { - e.set_opcode(0x387); assert_eq!(self.stride, OffsetStride::X1); + if has_ugpr { + e.set_opcode(0x987); + e.set_reg_src(24..32, &self.addr); + e.set_ureg_src(64, &self.uniform_addr); + } else { + e.set_opcode(0x387); + e.set_reg_src(24..32, &self.addr); + } e.set_field(84..87, 1_u8); e.set_mem_type(73..76, self.access.mem_type); @@ -3334,7 +3366,14 @@ impl SM70Op for OpSt { ); } MemSpace::Shared => { - e.set_opcode(0x388); + if has_ugpr { + e.set_opcode(0x988); + e.set_reg_src(24..32, &self.addr); + e.set_ureg_src(64, &self.uniform_addr); + } else { + e.set_opcode(0x388); + e.set_reg_src(24..32, &self.addr); + } e.set_mem_type(73..76, self.access.mem_type); assert!(self.access.order == MemOrder::Strong(MemScope::CTA)); @@ -3348,9 +3387,12 @@ impl SM70Op for OpSt { } } - e.set_reg_addr(24..32, &self.addr, 72); e.set_reg_src(32..40, &self.data); e.set_field(40..64, self.offset); + // We always enable UGPR mode, because the .E bit changes + // which source it applies to depending on it. + // This way it always applies to the UGPR. + e.set_bit(91, has_ugpr); } } @@ -3425,6 +3467,7 @@ impl SM70Op for OpAtom { } fn encode(&self, e: &mut SM70Encoder<'_>) { + let has_ugpr = e.sm >= 75; match self.mem_space { MemSpace::Global(_) => { if self.dst.is_none() { @@ -3435,24 +3478,56 @@ impl SM70Op for OpAtom { } e.set_reg_src(32..40, &self.data); + e.set_field(40..64, self.addr_offset); e.set_atom_op(87..90, self.atom_op); + if has_ugpr { + e.set_reg_addr(24..32, &self.addr, 90); + e.set_ureg_addr(64, &self.uniform_address, 72); + e.set_bit(91, true); + } else { + e.set_reg_addr(24..32, &self.addr, 72); + assert!(self.uniform_address.is_zero()); + } } else if let AtomOp::CmpExch(cmp_src) = self.atom_op { e.set_opcode(0x3a9); assert!(cmp_src == AtomCmpSrc::Separate); + assert!(self.uniform_address.is_zero()); + e.set_reg_addr(24..32, &self.addr, 72); e.set_reg_src(32..40, &self.cmpr); + e.set_field(40..64, self.addr_offset); e.set_reg_src(64..72, &self.data); e.set_pred_dst(81..84, &Dst::None); } else { if e.sm >= 90 && self.atom_type.is_float() { - e.set_opcode(0x3a3); + e.set_opcode(0x9a3); + } else if has_ugpr { + e.set_opcode(0x9a8); } else { e.set_opcode(0x3a8); } + if e.sm >= 100 { + e.set_reg_addr(24..32, &self.addr, 63); + e.set_ureg_addr(64, &self.uniform_address, 72); + } else if has_ugpr { + e.set_reg_addr(24..32, &self.addr, 70); + e.set_ureg_addr(64, &self.uniform_address, 72); + } else { + e.set_reg_addr(24..32, &self.addr, 72); + assert!(self.uniform_address.is_zero()); + }; + + if e.sm >= 100 { + e.set_field(40..63, self.addr_offset); + } else { + e.set_field(40..64, self.addr_offset); + }; + e.set_reg_src(32..40, &self.data); e.set_pred_dst(81..84, &Dst::None); e.set_atom_op(87..91, self.atom_op); + e.set_bit(91, has_ugpr); } e.set_mem_order(&self.mem_order); @@ -3465,10 +3540,17 @@ impl SM70Op for OpAtom { e.set_opcode(0x38d); assert!(cmp_src == AtomCmpSrc::Separate); + assert!(self.uniform_address.is_zero()); e.set_reg_src(32..40, &self.cmpr); 
e.set_reg_src(64..72, &self.data); } else { - e.set_opcode(0x38c); + if has_ugpr { + e.set_opcode(0x98c); + e.set_ureg_src(64, &self.uniform_address); + e.set_bit(91, true); + } else { + e.set_opcode(0x38c); + } e.set_reg_src(32..40, &self.data); assert!( @@ -3483,6 +3565,8 @@ impl SM70Op for OpAtom { e.set_atom_op(87..91, self.atom_op); } + e.set_reg_src(24..32, &self.addr); + e.set_field(40..64, self.addr_offset); assert!(e.sm >= 75 || self.addr_stride == OffsetStride::X1); e.set_field(78..80, self.addr_stride.encode_sm75()); @@ -3494,8 +3578,6 @@ impl SM70Op for OpAtom { } e.set_dst(&self.dst); - e.set_reg_addr(24..32, &self.addr, 72); - e.set_field(40..64, self.addr_offset); e.set_atom_type(self.atom_type, false); } } @@ -4218,6 +4300,7 @@ impl SM70Op for OpLdsm { e.set_opcode(0x83b); e.set_dst(&self.dst); e.set_reg_src(24..32, &self.addr); + e.set_ureg_src(32, &self.uniform_addr); e.set_field(40..64, self.offset); e.set_field( 72..74, @@ -4238,6 +4321,7 @@ impl SM70Op for OpLdsm { // LdsmSize::M8N32 => 3, }, ); + e.set_bit(91, !self.uniform_addr.is_zero()); } } From 24b725a5d26b337b1f15bec86297b9a9ad088e1c Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 29 Jan 2026 22:36:17 +0100 Subject: [PATCH 5/7] nir: add uniform address to nvidia IO intrinsics Adding the zero constants have a minor impact on stats Totals from 61 (0.01% of 1212873) affected shaders: CodeSize: 1044720 -> 1047472 (+0.26%); split: -0.00%, +0.27% Static cycle count: 1198932 -> 1198490 (-0.04%); split: -0.07%, +0.04% --- src/compiler/nir/nir.h | 6 +++- src/compiler/nir/nir_intrinsics.py | 26 +++++++------- src/compiler/nir/nir_lower_io.c | 33 +++++++++++++++++ src/compiler/nir/nir_opt_offsets.c | 8 ++++- src/compiler/nir/nir_validate.c | 12 +++++-- src/nouveau/compiler/nak/from_nir.rs | 34 +++++++++++------- src/nouveau/compiler/nak_nir.c | 35 +++++++++++++------ src/nouveau/compiler/nak_nir_lower_cmat.c | 3 +- .../compiler/nak_nir_lower_non_uniform_ldcx.c | 4 ++- 9 files changed, 121 insertions(+), 40 deletions(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index cfa9a6e8a73..279c57e2241 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -5787,11 +5787,13 @@ nir_lower_shader_calls(nir_shader *shader, void *mem_ctx); int nir_get_io_offset_src_number(const nir_intrinsic_instr *instr); +int nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr); int nir_get_io_index_src_number(const nir_intrinsic_instr *instr); int nir_get_io_data_src_number(const nir_intrinsic_instr *instr); int nir_get_io_arrayed_index_src_number(const nir_intrinsic_instr *instr); nir_src *nir_get_io_offset_src(nir_intrinsic_instr *instr); +nir_src *nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_index_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_data_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_arrayed_index_src(nir_intrinsic_instr *instr); @@ -5801,7 +5803,6 @@ static inline unsigned nir_get_io_base_size_nv(const nir_intrinsic_instr *intr) { switch (intr->intrinsic) { - case nir_intrinsic_global_atomic_nv: case nir_intrinsic_global_atomic_swap_nv: case nir_intrinsic_shared_atomic_nv: case nir_intrinsic_shared_atomic_swap_nv: @@ -5814,6 +5815,9 @@ nir_get_io_base_size_nv(const nir_intrinsic_instr *intr) case nir_intrinsic_store_shared_nv: case nir_intrinsic_store_shared_unlock_nv: return 24; + case nir_intrinsic_global_atomic_nv: + /* TODO: SM100+ only has 23 bits for the UGPR + GPR form */ + return 23; case nir_intrinsic_ldc_nv: case 
nir_intrinsic_ldcx_nv: return 16; diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 1a7b029a3b6..daad6d6a305 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -941,7 +941,8 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0, # The offset is sign-extended or zero-extended based on the SIGN_EXTEND index. # # NV variants all come with a 24 bit base, that is unsigned with a constant 0 address, -# signed otherwise. +# signed otherwise. Non swap atomic also comes with an additional uniform address source +# right after the non uniform memory address. # # PCO global variants use a vec3 for the memory address and data, where component X # has the low 32 address bits, component Y has the high 32 address bits, and component Z @@ -950,13 +951,13 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0, intrinsic("deref_atomic", src_comp=[-1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) intrinsic("ssbo_atomic", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP, OFFSET_SHIFT]) intrinsic("shared_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) -intrinsic("shared_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV]) +intrinsic("shared_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV]) intrinsic("task_payload_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic", src_comp=[1, 1], dest_comp=1, indices=[ATOMIC_OP]) intrinsic("global_atomic_2x32", src_comp=[2, 1], dest_comp=1, indices=[ATOMIC_OP]) intrinsic("global_atomic_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic_agx", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMIC_OP, SIGN_EXTEND]) -intrinsic("global_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) +intrinsic("global_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic_pco", src_comp=[3], dest_comp=1, indices=[ATOMIC_OP], bit_sizes=[32]) intrinsic("deref_atomic_swap", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) @@ -1920,15 +1921,15 @@ load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flag # src[] = { value, address, unsigned 32-bit offset }. store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRITE_MASK]) -# src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given, -# signed otherwise. +# src[] = { address, uniform_address }. BASE is a 24 bit unsigned offset if a constant 0 address and +# a constant 0 uniform_address is given, signed otherwise. # load_global_nv has an additional boolean input that makes the load return 0 on false. 
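+# With the uniform address added, the resulting source layouts are, e.g.:
+#   load_global_nv:   src[] = { address, uniform_address, predicate }
+#   store_global_nv:  src[] = { value, address, uniform_address }
+#   global_atomic_nv: src[] = { address, uniform_address, data }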
-load("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) -store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) -load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) -store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET]) -load("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) -store("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) +load("global_nv", [1, 1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +store("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) +load("scratch_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +store("scratch_nv", [1, 1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET]) +load("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +store("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) # Same as shared_atomic_add, but with GDS. src[] = {store_val, gds_addr, m0} intrinsic("gds_atomic_add_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE]) @@ -2942,7 +2943,8 @@ intrinsic("ssa_bar_nv", src_comp=[1]) intrinsic("cmat_muladd_nv", src_comp=[-1, -1, -1], dest_comp=0, bit_sizes=src2, indices=[FLAGS], flags=[CAN_ELIMINATE]) -intrinsic("cmat_load_shared_nv", src_comp=[1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE]) +# src[] = { address, uniform_address } +intrinsic("cmat_load_shared_nv", src_comp=[1, 1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE]) # Moves a 8x8 16bit matrix with transposition within a subgroup intrinsic("cmat_mov_transpose_nv", src_comp=[2], dest_comp=2, bit_sizes=[16], flags=[CAN_ELIMINATE, CAN_REORDER, SUBGROUP]) diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 86f7a3591fd..4ae7cc7ca1a 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -1106,6 +1106,39 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr) case nir_intrinsic_bindless_image_##name: \ case nir_intrinsic_image_heap_##name +/** + * Return the uniform offset source number for a load/store intrinsic or -1 if there's no offset. + */ +int +nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_cmat_load_shared_nv: + case nir_intrinsic_global_atomic_nv: + case nir_intrinsic_load_global_nv: + case nir_intrinsic_load_scratch_nv: + case nir_intrinsic_load_shared_nv: + case nir_intrinsic_shared_atomic_nv: + return 1; + case nir_intrinsic_store_global_nv: + case nir_intrinsic_store_scratch_nv: + case nir_intrinsic_store_shared_nv: + return 2; + default: + return -1; + } +} + +/** + * Return the uniform offset source for a load/store intrinsic. + */ +nir_src * +nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr) +{ + const int idx = nir_get_io_uniform_offset_src_number(instr); + return idx >= 0 ? &instr->src[idx] : NULL; +} + /** * Return the index or handle source number for a load/store intrinsic or -1 * if there's no index or handle. 
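
Note (illustration only, not part of the patch): a backend pass can fetch the new
uniform source the same way it already fetches the regular offset. The helper names
come from this series; the caller below is a hypothetical sketch.

   static bool
   addr_sources_are_const(nir_intrinsic_instr *intr)
   {
      nir_src *off = nir_get_io_offset_src(intr);
      nir_src *uoff = nir_get_io_uniform_offset_src(intr);

      if (off == NULL || !nir_src_is_const(*off))
         return false;

      /* Intrinsics without a uniform address slot return NULL here; treat
       * that the same as a constant-zero uniform source. */
      return uoff == NULL || nir_src_is_const(*uoff);
   }
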
diff --git a/src/compiler/nir/nir_opt_offsets.c b/src/compiler/nir/nir_opt_offsets.c index 5e53ac297c2..70a47461b84 100644 --- a/src/compiler/nir/nir_opt_offsets.c +++ b/src/compiler/nir/nir_opt_offsets.c @@ -193,11 +193,12 @@ try_fold_load_store_nv(nir_builder *b, assert(offset_idx >= 0); nir_src src = intrin->src[offset_idx]; + nir_src *uniform_src = nir_get_io_uniform_offset_src(intrin); int32_t min = 0; uint32_t max = BITFIELD_MASK(offset_bits); - if (!nir_src_is_const(src)) { + if (!nir_src_is_const(src) || (uniform_src && !nir_src_is_const(*uniform_src))) { max >>= 1; min = ~max; } @@ -211,6 +212,11 @@ try_fold_load_store_nv(nir_builder *b, return false; } + /* We don't try to fold the offset for the uniform source on purpose, + * because we rely on running nir_opt_offsets before moving in the uniform + * source. However, we might run this pass again _after_ that, because we + * can eliminate a u2u64 on the _non uniform_ source and therefore might be + * able to fold in more constants into base. */ return try_fold_load_store(b, intrin, state, offset_idx, min, max, false); } diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index 20116c51a2e..59d9e9a92dc 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -761,9 +761,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) case nir_intrinsic_vild_nv: { int base = nir_intrinsic_base(instr); nir_src src = *nir_get_io_offset_src(instr); + nir_src *uniform_src = nir_get_io_uniform_offset_src(instr); unsigned const_bits = nir_get_io_base_size_nv(instr); - if (nir_src_is_const(src) && nir_src_as_int(src) == 0) { + if (nir_src_is_const(src) && nir_src_as_int(src) == 0 && + (!uniform_src || (nir_src_is_const(*uniform_src) && nir_src_as_int(*uniform_src) == 0))) { validate_assert(state, base >= 0 && base < BITFIELD_MASK(const_bits)); } else { int32_t max = BITFIELD_MASK(const_bits - 1); @@ -771,8 +773,14 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) validate_assert(state, base >= min && base < max); } + if (uniform_src) { + validate_assert(state, uniform_src->ssa->bit_size >= src.ssa->bit_size); + if (state->impl->valid_metadata & nir_metadata_divergence) + validate_assert(state, !uniform_src->ssa->divergent); + } + if (instr->intrinsic == nir_intrinsic_load_global_nv) { - validate_assert(state, instr->src[1].ssa->bit_size == 1); + validate_assert(state, instr->src[2].ssa->bit_size == 1); } break; diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index a7665ed4ec8..fdc457b8685 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -2975,7 +2975,8 @@ impl<'a> ShaderFromNir<'a> { nir_intrinsic_global_atomic_nv => { let bit_size = intrin.def.bit_size(); let addr = self.get_src(&srcs[0]); - let data = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[1]); + let data = self.get_src(&srcs[2]); let atom_type = self.get_atomic_type(intrin); let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate); @@ -2992,7 +2993,7 @@ impl<'a> ShaderFromNir<'a> { dst.clone().into() }, addr: addr, - uniform_address: Src::ZERO, + uniform_address: uaddr, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3220,13 +3221,14 @@ impl<'a> ShaderFromNir<'a> { .get_eviction_priority(intrin.access()), }; let addr = self.get_src(&srcs[0]); - let pred = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[1]); + let pred = self.get_src(&srcs[2]); let dst = 
b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, pred: pred, offset: intrin.base(), stride: OffsetStride::X1, @@ -3333,12 +3335,13 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[0]); + let uaddr = self.get_src(&srcs[1]); let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, pred: true.into(), offset: intrin.base(), stride: OffsetStride::X1, @@ -3357,12 +3360,14 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[0]); + let uaddr = self.get_src(&srcs[1]); + let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, pred: true.into(), offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), @@ -3673,7 +3678,8 @@ impl<'a> ShaderFromNir<'a> { nir_intrinsic_shared_atomic_nv => { let bit_size = intrin.def.bit_size(); let addr = self.get_src(&srcs[0]); - let data = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[1]); + let data = self.get_src(&srcs[2]); let atom_type = self.get_atomic_type(intrin); let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate); @@ -3683,7 +3689,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpAtom { dst: dst.clone().into(), addr: addr, - uniform_address: Src::ZERO, + uniform_address: uaddr, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3740,10 +3746,11 @@ impl<'a> ShaderFromNir<'a> { .get_eviction_priority(intrin.access()), }; let addr = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[2]); b.push_op(OpSt { addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3772,10 +3779,11 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[2]); b.push_op(OpSt { addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3794,10 +3802,11 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[2]); b.push_op(OpSt { addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, data: data, offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), @@ -3912,12 +3921,13 @@ impl<'a> ShaderFromNir<'a> { }; let dst = b.alloc_ssa_vec(RegFile::GPR, comps); let addr = self.get_src(&srcs[0]); + let uaddr = self.get_src(&srcs[1]); b.push_op(OpLdsm { dst: dst.clone().into(), mat_size, mat_count, addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, offset: intrin.base(), }); self.set_dst(&intrin.def, dst); diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c index 9c129859c63..c6a85525473 100644 --- a/src/nouveau/compiler/nak_nir.c +++ b/src/nouveau/compiler/nak_nir.c @@ -1019,8 +1019,23 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) continue; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_src *addr; + + switch (intr->intrinsic) { + case nir_intrinsic_load_global_bounded: + case nir_intrinsic_load_global_constant_bounded: { + addr = &intr->src[0]; + break; + } + default: + addr = 
nir_get_io_offset_src(intr); + break; + } + if (!addr) + continue; + b.cursor = nir_before_instr(instr); - nir_src *addr = nir_get_io_offset_src(intr); + nir_def *uaddr = nir_imm_zero(&b, 1, addr->ssa->bit_size); nir_def *res = NULL; nir_intrinsic_instr *new = NULL; @@ -1028,7 +1043,7 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) case nir_intrinsic_load_global: case nir_intrinsic_load_global_constant: { nir_def *nir_true = nir_imm_bool(&b, true); - res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true); + res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr, nir_true); break; } case nir_intrinsic_load_global_bounded: @@ -1044,32 +1059,32 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) nir_def *addr = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa)); nir_def *last_byte = nir_iadd_imm(&b, offset->ssa, load_size - 1); nir_def *cond = nir_ult(&b, last_byte, size->ssa); - res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, cond); + res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, uaddr, cond); break; } case nir_intrinsic_load_scratch: - res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa); + res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr); break; case nir_intrinsic_load_shared: - res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa); + res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr); break; case nir_intrinsic_store_global: - new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa); + new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa, uaddr); break; case nir_intrinsic_store_scratch: - new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa); + new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa, uaddr); break; case nir_intrinsic_store_shared: - new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa); + new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa, uaddr); break; case nir_intrinsic_global_atomic: - res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa); + res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa); break; case nir_intrinsic_global_atomic_swap: res = nir_global_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa); break; case nir_intrinsic_shared_atomic: - res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa); + res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa); break; case nir_intrinsic_shared_atomic_swap: res = nir_shared_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa); diff --git a/src/nouveau/compiler/nak_nir_lower_cmat.c b/src/nouveau/compiler/nak_nir_lower_cmat.c index 0490d21fd7b..e5c10171734 100644 --- a/src/nouveau/compiler/nak_nir_lower_cmat.c +++ b/src/nouveau/compiler/nak_nir_lower_cmat.c @@ -723,6 +723,7 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr) nir_def *base = intr->src[1].ssa; offset = nir_u2uN(b, offset, base->bit_size); nir_def *addr = nir_iadd(b, base, offset); + nir_def *zero = nir_imm_zero(b, addr->num_components, addr->bit_size); /* flip the layout for B matrices */ if (desc.use == GLSL_CMAT_USE_B) { @@ -734,7 +735,7 @@ 
try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr) /* Each thread loads 32 bits per matrix */ assert(length * bit_size == 32 * ldsm_count); - return nir_cmat_load_shared_nv(b, length, bit_size, addr, + return nir_cmat_load_shared_nv(b, length, bit_size, addr, zero, .num_matrices = ldsm_count, .matrix_layout = layout); } diff --git a/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c b/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c index 10507233910..7fd64e13b98 100644 --- a/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c +++ b/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c @@ -56,10 +56,12 @@ lower_ldcx_to_global(nir_builder *b, nir_intrinsic_instr *load, * simple less-than check here. */ nir_def *cond = nir_ilt(b, offset, size); + nir_def *zero_addr = nir_imm_zero(b, addr->num_components, + addr->bit_size); nir_def *val = nir_load_global_nv(b, load->def.num_components, load->def.bit_size, nir_iadd(b, addr, nir_u2u64(b, offset)), - cond, + zero_addr, cond, .align_mul = nir_intrinsic_align_mul(load), .align_offset = nir_intrinsic_align_offset(load), .access = ACCESS_CAN_REORDER, From eeadd23c091f4eb64a386b5eca6c9389ae7616d3 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Tue, 24 Feb 2026 04:32:57 +0100 Subject: [PATCH 6/7] nak: add UGPR/GPR lowering for load/store/atom instructions This tries to handle all combinations we might run into to. We should rely on previous optimizations that the more difficult cases never happend. As a side benefit instead of lowering a UGPR to a GPR, it will now be moved to the UGPR slot. Totals from 258010 (21.27% of 1212873) affected shaders: CodeSize: 3742700224 -> 3576740928 (-4.43%); split: -4.44%, +0.01% Number of GPRs: 13606055 -> 13496463 (-0.81%); split: -0.86%, +0.05% SLM Size: 589740 -> 589660 (-0.01%) Static cycle count: 3271547493 -> 3272550831 (+0.03%); split: -0.47%, +0.50% Spills to memory: 56180 -> 56136 (-0.08%) Fills from memory: 56180 -> 56136 (-0.08%) Spills to reg: 108211 -> 110013 (+1.67%); split: -0.63%, +2.30% Fills from reg: 99216 -> 100471 (+1.26%); split: -0.30%, +1.56% Max warps/SM: 9921228 -> 9972060 (+0.51%); split: +0.52%, -0.00% --- src/nouveau/compiler/nak/ir.rs | 11 +++ src/nouveau/compiler/nak/sm70_encode.rs | 90 +++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 5 deletions(-) diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 78febca91c6..f8756d4a80b 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -6468,6 +6468,17 @@ pub enum OffsetStride { X16 = 4, } +impl OffsetStride { + pub fn shift(&self) -> u32 { + match self { + Self::X1 => 0, + Self::X4 => 2, + Self::X8 => 3, + Self::X16 => 4, + } + } +} + impl fmt::Display for OffsetStride { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let s = match self { diff --git a/src/nouveau/compiler/nak/sm70_encode.rs b/src/nouveau/compiler/nak/sm70_encode.rs index 07736e039b6..1fd267c256e 100644 --- a/src/nouveau/compiler/nak/sm70_encode.rs +++ b/src/nouveau/compiler/nak/sm70_encode.rs @@ -10,6 +10,7 @@ use crate::sm70::ShaderModel70; use bitview::*; use rustc_hash::FxHashMap; +use std::mem; use std::ops::Range; /// A per-op trait that implements Volta+ opcode semantics @@ -774,6 +775,60 @@ fn op_gpr(op: &impl DstsAsSlice) -> RegFile { } } +fn legalize_load_store_address( + b: &mut LegalizeBuilder, + addr: &mut Src, + uniform_addr: &mut Src, + stride: Option<&mut OffsetStride>, +) { + let stride_x1_or_none = matches!(stride, Some(OffsetStride::X1) | None); 
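+
+    // Summary of the cases handled below: a UGPR sitting in the GPR address
+    // slot is moved into the free uniform slot (or copied to a GPR when the
+    // uniform slot is taken or a stride is involved), while a GPR sitting in
+    // the uniform slot is either moved back to an empty GPR slot or folded
+    // into the GPR address with an explicit shift/iadd so that only the GPR
+    // slot ends up being used.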
+ if addr.is_ugpr_reg() { + if stride_x1_or_none && uniform_addr.is_zero() { + *uniform_addr = mem::replace(addr, Src::ZERO); + } else { + b.copy_src_if_uniform(addr); + } + } + + if uniform_addr.is_gpr_reg() { + if addr.is_zero() { + assert!(stride_x1_or_none); + *addr = mem::replace(uniform_addr, Src::ZERO); + } else { + let uniform_ssa = uniform_addr.as_ssa().unwrap(); + let mut ssa = addr.as_ssa().unwrap(); + + let addr_comps = ssa.comps(); + if let Some(stride) = stride { + if *stride != OffsetStride::X1 { + assert_eq!(addr_comps, 1); + let shift = stride.shift(); + let shift = b.copy(shift.into()); + *addr = b.shl(addr.clone(), shift.into()).into(); + ssa = addr.as_ssa().unwrap(); + *stride = OffsetStride::X1; + } + } + + if uniform_ssa.comps() == 2 { + // In case the non uniform address is 32 bits and the uniform one 64, + // we need convert it to 64 bits. + if uniform_ssa.comps() != addr_comps { + let zero = b.copy(0.into()); + *addr = [ssa[0], zero].into(); + } + *addr = b + .iadd64(addr.clone(), uniform_addr.clone(), Src::ZERO) + .into() + } else { + *addr = + b.iadd(addr.clone(), uniform_addr.clone(), Src::ZERO).into() + } + *uniform_addr = 0.into(); + } + } +} + // // Implementations of SM70Op for each op we support on Volta+ // @@ -3165,7 +3220,12 @@ impl SM70Op for OpSuAtom { impl SM70Op for OpLd { fn legalize(&mut self, b: &mut LegalizeBuilder) { - b.copy_src_if_uniform(&mut self.addr); + legalize_load_store_address( + b, + &mut self.addr, + &mut self.uniform_addr, + Some(&mut self.stride), + ); b.copy_src_if_uniform(&mut self.pred); } @@ -3327,8 +3387,13 @@ impl SM70Op for OpLdc { impl SM70Op for OpSt { fn legalize(&mut self, b: &mut LegalizeBuilder) { - b.copy_src_if_uniform(&mut self.addr); b.copy_src_if_uniform(&mut self.data); + legalize_load_store_address( + b, + &mut self.addr, + &mut self.uniform_addr, + Some(&mut self.stride), + ); } fn encode(&self, e: &mut SM70Encoder<'_>) { @@ -3461,9 +3526,19 @@ impl SM70Encoder<'_> { impl SM70Op for OpAtom { fn legalize(&mut self, b: &mut LegalizeBuilder) { - b.copy_src_if_uniform(&mut self.addr); - b.copy_src_if_uniform(&mut self.cmpr); b.copy_src_if_uniform(&mut self.data); + + if matches!(self.atom_op, AtomOp::CmpExch(_)) { + b.copy_src_if_uniform(&mut self.addr); + b.copy_src_if_uniform(&mut self.cmpr); + } else { + legalize_load_store_address( + b, + &mut self.addr, + &mut self.uniform_address, + Some(&mut self.addr_stride), + ); + } } fn encode(&self, e: &mut SM70Encoder<'_>) { @@ -4291,7 +4366,12 @@ impl SM70Op for OpHmma { impl SM70Op for OpLdsm { fn legalize(&mut self, b: &mut LegalizeBuilder) { - b.copy_src_if_uniform(&mut self.addr); + legalize_load_store_address( + b, + &mut self.addr, + &mut self.uniform_addr, + None, + ); } fn encode(&self, e: &mut SM70Encoder<'_>) { From 0b4705ec956b69cc5fdf6c66200987e5992a3bed Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 15 Mar 2026 21:16:30 +0100 Subject: [PATCH 7/7] nak: optimize iadds with an uniform operand in iadds of address calculations Instead of doing the iadd manually we can use the uniform slot of the ld/st/atom instruction getting rid of the iadd altogether. Additionally for global memory we can also consume a 32 bit offset instead of requiring it to be 64 bit. 
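
For illustration (hypothetical nir_builder snippets, not taken from the series;
div_off stands for a divergent 32-bit offset and ugpr_base for a convergent
64-bit base), the address reaching the intrinsic before this pass is a single
iadd with an all-zero uniform source:

   nir_def *addr = nir_iadd(b, nir_u2u64(b, div_off), ugpr_base);
   nir_load_global_nv(b, 1, 32, addr, nir_imm_zero(b, 1, 64), nir_imm_true(b));

and afterwards the uniform term sits in the dedicated source while the iadd (and
the u2u64 on the divergent side) can go away:

   nir_load_global_nv(b, 1, 32, div_off, ugpr_base, nir_imm_true(b));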
Totals from 158539 (13.07% of 1212873) affected shaders: CodeSize: 2308216336 -> 2242231136 (-2.86%); split: -2.86%, +0.00% Number of GPRs: 8682436 -> 8662675 (-0.23%); split: -0.26%, +0.04% SLM Size: 238816 -> 238604 (-0.09%) Static cycle count: 2169063422 -> 2147747544 (-0.98%); split: -0.99%, +0.01% Spills to memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02% Fills from memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02% Spills to reg: 45053 -> 45273 (+0.49%); split: -0.04%, +0.53% Fills from reg: 36385 -> 36757 (+1.02%); split: -0.04%, +1.06% Max warps/SM: 6027232 -> 6034616 (+0.12%); split: +0.12%, -0.00% --- src/nouveau/compiler/nak_nir.c | 113 +++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c index c6a85525473..372000f67d2 100644 --- a/src/nouveau/compiler/nak_nir.c +++ b/src/nouveau/compiler/nak_nir.c @@ -1130,6 +1130,113 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) return progress; } +static bool +is_divergent_phi(nir_instr *instr) +{ + if (instr->type != nir_instr_type_phi) + return false; + nir_phi_instr *phi = nir_instr_as_phi(instr); + return nak_nir_phi_is_divergent(phi); +} + +static bool +nak_nir_opt_uniform_address_impl(struct nir_builder *b, + nir_intrinsic_instr *intr, void *cb_data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_cmat_load_shared_nv: + case nir_intrinsic_global_atomic_nv: + case nir_intrinsic_load_global_nv: + case nir_intrinsic_load_scratch_nv: + case nir_intrinsic_load_shared_nv: + case nir_intrinsic_shared_atomic_nv: + case nir_intrinsic_store_global_nv: + case nir_intrinsic_store_scratch_nv: + case nir_intrinsic_store_shared_nv: { + nir_src *offset_src = nir_get_io_offset_src(intr); + nir_def *offset = offset_src->ssa; + nir_src *uniform_offset_src = nir_get_io_uniform_offset_src(intr); + nir_def *uniform_offset = uniform_offset_src->ssa; + nir_block *use_block = intr->instr.block; + + assert(nir_src_as_uint(*uniform_offset_src) == 0); + + /* Nak can't collect vectors in non uniform control flow, so don't + * even try */ + if (offset->bit_size == 64 && nak_block_is_divergent(use_block)) + return false; + + /* We ignore any constant offset */ + if (nir_src_is_const(*offset_src)) + return false; + + /* If the source is already uniform, just swap them as the uniform slot + * should be 0 */ + if (!nir_def_is_divergent_at_use_block(offset, use_block)) { + if (is_divergent_phi(nir_def_instr(offset))) + return false; + nir_src_rewrite(uniform_offset_src, offset); + nir_src_rewrite(offset_src, uniform_offset); + return true; + } + + nir_alu_instr *iadd = nir_def_as_alu_or_null(offset_src->ssa); + if (!iadd || iadd->op != nir_op_iadd) + return false; + + unsigned src0_div = nir_def_is_divergent_at_use_block(iadd->src[0].src.ssa, use_block); + unsigned src1_div = nir_def_is_divergent_at_use_block(iadd->src[1].src.ssa, use_block); + if (src0_div && src1_div) + return false; + + b->cursor = nir_before_instr(&intr->instr); + + nir_def *addr, *uaddr; + if (src0_div) { + assert(!src1_div); + addr = nir_ssa_for_alu_src(b, iadd, 0); + uaddr = nir_ssa_for_alu_src(b, iadd, 1); + } else { + assert(src1_div); + addr = nir_ssa_for_alu_src(b, iadd, 1); + uaddr = nir_ssa_for_alu_src(b, iadd, 0); + } + + if (is_divergent_phi(nir_def_instr(uaddr))) + return false; + + /* We can remove a u2u64 on the non uniform src */ + if (addr->bit_size == 64) { + nir_alu_instr *u2u64 = nir_def_as_alu_or_null(addr); + if (u2u64 && u2u64->op == 
nir_op_u2u64)
+            addr = nir_ssa_for_alu_src(b, u2u64, 0);
+      }
+
+      nir_src_rewrite(offset_src, addr);
+      nir_src_rewrite(uniform_offset_src, uaddr);
+      return true;
+   }
+   default:
+      return false;
+   }
+}
+
+/** This pass assumes it is run after nir_opt_offsets. */
+static bool
+nak_nir_opt_uniform_address(nir_shader *nir)
+{
+   if (nak_debug_no_ugpr())
+      return false;
+   nir_divergence_analysis(nir);
+   return nir_shader_intrinsics_pass(
+      nir,
+      nak_nir_opt_uniform_address_impl,
+      nir_metadata_control_flow,
+      NULL
+   );
+}
+
+
 static bool
 nak_nir_opt_offset_shift_nv_impl(struct nir_builder *b,
                                  nir_intrinsic_instr *intrin, void *data)
@@ -1333,6 +1440,12 @@ nak_postprocess_nir(nir_shader *nir,
       .cb_data = nak,
    };
    OPT(nir, nir_opt_offsets, &nak_offset_options);
+   if (nak->sm >= 73) {
+      OPT(nir, nak_nir_opt_uniform_address);
+      /* TODO: as we eliminate u2u64s we could fold more offsets in; however,
+       * this would require us to verify it doesn't overflow, which we can't. */
+      /* OPT(nir, nir_opt_offsets, &nak_offset_options); */
+   }
 
    /* Should run after nir_opt_offsets, because nir_opt_algebraic will move
     * iadds down the chain */
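
Note (illustration only, not part of the series; register numbers made up): taken
together, these patches let an address of the form iadd(divergent GPR, uniform
UGPR) be folded straight into the memory instruction. Going by the expected
strings in nvdisasm_tests.rs, a predicated global load then disassembles to
something along the lines of

   ldg.e.strong.sm r0, [r2.64+ur4+0x10], p4

instead of first copying the uniform value into a GPR pair and adding it to the
divergent address with a separate instruction.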