From 11b8c8b8e672e6521e627f45c328a6533c23f77d Mon Sep 17 00:00:00 2001
From: Mel Henning <mhenning@darkrefraction.com>
Date: Mon, 10 Feb 2025 18:56:26 -0500
Subject: [PATCH] nak,nir: Add 64-bit lea_nv

Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32517>
---
 src/compiler/nir/nir_opcodes.py           |  2 +-
 src/nouveau/compiler/nak/builder.rs       | 48 ++++++++++++++++++++++
 src/nouveau/compiler/nak/from_nir.rs      |  6 ++-
 src/nouveau/compiler/nak/hw_tests.rs      | 49 +++++++++++++++++++++++
 src/nouveau/compiler/nak_nir_algebraic.py |  2 +
 5 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index ec4f6667f3e..619efaf64ad 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1414,7 +1414,7 @@ opcode("prmt_nv", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
 
 # Address arithmetic instructions: shift and add
 # Shift must be a constant.
-opcode("lea_nv", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], False,
+opcode("lea_nv", 0, tuint, [0, 0, 0], [tuint, tuint, tuint32], False,
        "", "src0 + (src1 << (src2 % bit_size))")
 
 # 24b multiply into 32b result (with sign extension)
diff --git a/src/nouveau/compiler/nak/builder.rs b/src/nouveau/compiler/nak/builder.rs
index c9fb98a1be7..54a1febb8b0 100644
--- a/src/nouveau/compiler/nak/builder.rs
+++ b/src/nouveau/compiler/nak/builder.rs
@@ -630,6 +630,72 @@ pub trait SSABuilder: Builder {
         dst
     }
 
+    /// Emits a 64-bit shift-and-add, `(a << shift) + b`, with wrapping
+    /// arithmetic.
+    ///
+    /// `a` and `b` are read as two-component SSA values, low word first,
+    /// and `shift` is reduced modulo 64. Returns a freshly allocated
+    /// two-component GPR value holding the result.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the target is older than SM70, if either source carries a
+    /// source modifier, or if either source is not an SSA value.
+    fn lea64(&mut self, a: Src, b: Src, shift: u8) -> SSARef {
+        assert!(self.sm() >= 70);
+        assert!(a.src_mod.is_none());
+        assert!(b.src_mod.is_none());
+
+        let a = a.as_ssa().unwrap();
+        let b = b.as_ssa().unwrap();
+        let dst = self.alloc_ssa(RegFile::GPR, 2);
+        let shift = shift % 64;
+        if shift >= 32 {
+            // With shift >= 32 the low word of `a << shift` is zero and
+            // `a[1]` is shifted out entirely: the low result word is `b[0]`
+            // and the high word is `(a[0] << (shift - 32)) + b[1]`.
+            self.copy_to(dst[0].into(), b[0].into());
+            self.push_op(OpLea {
+                dst: dst[1].into(),
+                overflow: Dst::None,
+                a: a[0].into(),
+                b: b[1].into(),
+                a_high: 0.into(),
+                dst_high: false,
+                shift: shift - 32,
+                intermediate_mod: SrcMod::None,
+            });
+        } else {
+            // Low word: `(a[0] << shift) + b[0]`, with the overflow written
+            // to a predicate so it can be fed to the high-word op below.
+            let carry = self.alloc_ssa(RegFile::Pred, 1);
+            self.push_op(OpLea {
+                dst: dst[0].into(),
+                overflow: carry.into(),
+                a: a[0].into(),
+                b: b[0].into(),
+                a_high: 0.into(),
+                dst_high: false,
+                shift,
+                intermediate_mod: SrcMod::None,
+            });
+            // High word: OpLeaX takes both halves of `a`, `b[1]` and the
+            // carry from the low-word op.
+            self.push_op(OpLeaX {
+                dst: dst[1].into(),
+                overflow: Dst::None,
+                a: a[0].into(),
+                b: b[1].into(),
+                a_high: a[1].into(),
+                carry: carry.into(),
+                dst_high: true,
+                shift,
+                intermediate_mod: SrcMod::None,
+            });
+        }
+        dst
+    }
+
     fn lop2(&mut self, op: LogicOp2, x: Src, y: Src) -> SSARef {
         let dst = if x.is_predicate() {
             self.alloc_ssa(RegFile::Pred, 1)
diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs
index 909a366d990..f1945f92d80 100644
--- a/src/nouveau/compiler/nak/from_nir.rs
+++ b/src/nouveau/compiler/nak/from_nir.rs
@@ -1482,7 +1482,11 @@ impl<'a> ShaderFromNir<'a> {
                 let src_a = srcs[1];
                 let src_b = srcs[0];
                 let shift = nir_srcs[2].comp_as_uint(0).unwrap() as u8;
-                b.lea(src_a, src_b, shift)
+                match alu.def.bit_size {
+                    32 => b.lea(src_a, src_b, shift),
+                    64 => b.lea64(src_a, src_b, shift),
+                    x => panic!("unsupported bit size for nir_op_lea_nv: {x}"),
+                }
             }
             nir_op_isub => match alu.def.bit_size {
                 32 => b.iadd(srcs[0], srcs[1].ineg(), 0.into()),
diff --git a/src/nouveau/compiler/nak/hw_tests.rs b/src/nouveau/compiler/nak/hw_tests.rs
index 06fd4ce7215..140bdd1d291 100644
--- a/src/nouveau/compiler/nak/hw_tests.rs
+++ b/src/nouveau/compiler/nak/hw_tests.rs
@@ -788,6 +788,64 @@ fn test_op_leax() {
     }
 }
 
+/// Hardware test for `lea64`: for every shift in 0..64, runs the op on
+/// operands produced by `get_iadd_int` and checks a host-side reference.
+#[test]
+fn test_lea64() {
+    // lea64 asserts SM70+, so skip rather than panic on older hardware.
+    if RunSingleton::get().sm.sm() < 70 {
+        return;
+    }
+
+    let run = RunSingleton::get();
+    let invocations = 100;
+
+    for shift in 0..64 {
+        let mut b = TestShaderBuilder::new(run.sm.as_ref());
+
+        // Each operand is assembled from two 32-bit loads, low word first.
+        let x = Src::from([
+            b.ld_test_data(0, MemType::B32)[0],
+            b.ld_test_data(4, MemType::B32)[0],
+        ]);
+
+        let y = Src::from([
+            b.ld_test_data(8, MemType::B32)[0],
+            b.ld_test_data(12, MemType::B32)[0],
+        ]);
+
+        let dst = b.lea64(x, y, shift);
+        b.st_test_data(16, MemType::B32, dst[0].into());
+        b.st_test_data(20, MemType::B32, dst[1].into());
+
+        let bin = b.compile();
+
+        // Per invocation: x.lo, x.hi, y.lo, y.hi, then two result slots.
+        let mut a = Acorn::new();
+        let mut data = Vec::new();
+        for _ in 0..invocations {
+            data.push([
+                get_iadd_int(&mut a),
+                get_iadd_int(&mut a),
+                get_iadd_int(&mut a),
+                get_iadd_int(&mut a),
+                0,
+                0,
+            ]);
+        }
+
+        run.run.run(&bin, &mut data).unwrap();
+
+        for d in &data {
+            let x = u64::from(d[0]) | (u64::from(d[1]) << 32);
+            let y = u64::from(d[2]) | (u64::from(d[3]) << 32);
+            let dst = (x << shift).wrapping_add(y);
+            assert_eq!(d[4], dst as u32);
+            assert_eq!(d[5], (dst >> 32) as u32);
+        }
+    }
+}
+
 #[test]
 fn test_op_lop2() {
     if RunSingleton::get().sm.sm() < 70 {
diff --git a/src/nouveau/compiler/nak_nir_algebraic.py b/src/nouveau/compiler/nak_nir_algebraic.py
index 4f8a4004917..f1c58247c35 100644
--- a/src/nouveau/compiler/nak_nir_algebraic.py
+++ b/src/nouveau/compiler/nak_nir_algebraic.py
@@ -42,6 +42,8 @@ algebraic_lowering = [
 
     (('iadd(is_used_by_non_ldc_nv)', 'a@32', ('ishl', 'b@32', '#s@32')),
         ('lea_nv', a, b, s), 'nak->sm >= 70'),
+    (('iadd', 'a@64', ('ishl', 'b@64', '#s@32')),
+        ('lea_nv', a, b, s), 'nak->sm >= 70'),
 ]
 
 def main():