From f85069ed57039b66dde7eef2bd9af8dea3d35eff Mon Sep 17 00:00:00 2001
From: Faith Ekstrand
Date: Tue, 3 Oct 2023 13:19:07 -0500
Subject: [PATCH] nak: Use nak_nir_lower_vtg_io

This massively simplifies nak_from_nir.rs because it lets us do all
the annoying NIR fiddling in NIR and not in the back-end.

Part-of:
---
 src/nouveau/compiler/nak_encode_sm75.rs |  13 ++
 src/nouveau/compiler/nak_from_nir.rs    | 295 +++++++++++-------------
 src/nouveau/compiler/nak_ir.rs          |  30 +++
 src/nouveau/compiler/nak_nir.c          |  47 +---
 4 files changed, 184 insertions(+), 201 deletions(-)

diff --git a/src/nouveau/compiler/nak_encode_sm75.rs b/src/nouveau/compiler/nak_encode_sm75.rs
index 264fe8dcfbe..2f70d196f9a 100644
--- a/src/nouveau/compiler/nak_encode_sm75.rs
+++ b/src/nouveau/compiler/nak_encode_sm75.rs
@@ -1478,6 +1478,18 @@ impl SM75Instr {
         }
     }
 
+    fn encode_al2p(&mut self, op: &OpAL2P) {
+        self.set_opcode(0x920);
+
+        self.set_dst(op.dst);
+        self.set_reg_src(24..32, op.offset);
+
+        self.set_field(40..50, op.access.addr);
+        self.set_field(74..76, 0_u8); // comps
+        assert!(!op.access.patch);
+        self.set_bit(79, op.access.output);
+    }
+
     fn encode_ald(&mut self, op: &OpALd) {
         self.set_opcode(0x321);
 
@@ -1774,6 +1786,7 @@ impl SM75Instr {
             Op::St(op) => si.encode_st(&op),
             Op::Atom(op) => si.encode_atom(&op),
             Op::AtomCas(op) => si.encode_atom_cas(&op),
+            Op::AL2P(op) => si.encode_al2p(&op),
             Op::ALd(op) => si.encode_ald(&op),
             Op::ASt(op) => si.encode_ast(&op),
             Op::Ipa(op) => si.encode_ipa(&op),
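
For reference, encode_al2p() above is pure bit-packing: opcode 0x920, the
attribute address in bits 40..50, and the output flag at bit 79 of the
128-bit instruction word. The sketch below shows the general shape of such
set_field()/set_bit() helpers. EncodeSketch, its [u32; 4] layout, and the
12-bit opcode field position are assumptions for illustration, not NAK's
actual definitions.

    use std::ops::Range;

    // A 128-bit SM75-style instruction word, stored as four dwords.
    struct EncodeSketch {
        inst: [u32; 4],
    }

    impl EncodeSketch {
        // OR `val` into the bit range `range` of the 128-bit word.
        fn set_field(&mut self, range: Range<usize>, val: u64) {
            let bits = range.end - range.start;
            assert!(bits == 64 || val < (1u64 << bits), "value too wide");
            for (i, bit) in range.enumerate() {
                if (val >> i) & 1 != 0 {
                    self.inst[bit / 32] |= 1 << (bit % 32);
                }
            }
        }

        fn set_bit(&mut self, bit: usize, b: bool) {
            self.set_field(bit..bit + 1, u64::from(b));
        }
    }

    fn main() {
        // Mirrors encode_al2p(): opcode 0x920, attribute address in bits
        // 40..50, output flag at bit 79. The opcode field position (0..12)
        // is a placeholder, not the real SM75 opcode field.
        let mut e = EncodeSketch { inst: [0; 4] };
        e.set_field(0..12, 0x920);
        e.set_field(40..50, 0x84); // access.addr
        e.set_bit(79, true); // access.output
        println!("{:08x?}", e.inst);
    }
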
diff --git a/src/nouveau/compiler/nak_from_nir.rs b/src/nouveau/compiler/nak_from_nir.rs
index d45efa13013..3d5d53e41c5 100644
--- a/src/nouveau/compiler/nak_from_nir.rs
+++ b/src/nouveau/compiler/nak_from_nir.rs
@@ -1200,6 +1200,105 @@ impl<'a> ShaderFromNir<'a> {
     ) {
         let srcs = intrin.srcs_as_slice();
         match intrin.intrinsic {
+            nir_intrinsic_al2p_nv => {
+                let offset = self.get_src(&srcs[0]);
+                let addr = u16::try_from(intrin.base()).unwrap();
+
+                let flags = intrin.flags();
+                let flags: nak_nir_attr_io_flags =
+                    unsafe { std::mem::transmute_copy(&flags) };
+
+                let access = AttrAccess {
+                    addr: addr,
+                    comps: 1,
+                    patch: flags.patch(),
+                    output: flags.output(),
+                    flags: 0,
+                };
+
+                let dst = b.alloc_ssa(RegFile::GPR, 1);
+                b.push_op(OpAL2P {
+                    dst: dst.into(),
+                    offset: offset,
+                    access: access,
+                });
+                self.set_dst(&intrin.def, dst);
+            }
+            nir_intrinsic_ald_nv | nir_intrinsic_ast_nv => {
+                let addr = u16::try_from(intrin.base()).unwrap();
+                let base = u16::try_from(intrin.range_base()).unwrap();
+                let range = u16::try_from(intrin.range()).unwrap();
+                let range = base..(base + range);
+
+                let flags = intrin.flags();
+                let flags: nak_nir_attr_io_flags =
+                    unsafe { std::mem::transmute_copy(&flags) };
+                assert!(!flags.patch() || !flags.phys());
+
+                if let ShaderIoInfo::Vtg(io) = &mut self.info.io {
+                    if flags.patch() {
+                        match &mut self.info.stage {
+                            ShaderStageInfo::TessellationInit(stage) => {
+                                assert!(flags.output());
+                                stage.per_patch_attribute_count = max(
+                                    stage.per_patch_attribute_count,
+                                    (range.end / 4).try_into().unwrap(),
+                                );
+                            }
+                            ShaderStageInfo::Tessellation => (),
+                            _ => panic!("Patch I/O not supported"),
+                        }
+                    } else {
+                        if flags.output() {
+                            if intrin.intrinsic == nir_intrinsic_ast_nv {
+                                io.mark_store_req(range.clone());
+                            }
+                            io.mark_attrs_written(range);
+                        } else {
+                            io.mark_attrs_read(range);
+                        }
+                    }
+                } else {
+                    panic!("Must be a VTG stage");
+                }
+
+                let access = AttrAccess {
+                    addr: addr,
+                    comps: intrin.num_components,
+                    patch: flags.patch(),
+                    output: flags.output(),
+                    flags: flags.phys().into(),
+                };
+
+                if intrin.intrinsic == nir_intrinsic_ald_nv {
+                    let vtx = self.get_src(&srcs[0]);
+                    let offset = self.get_src(&srcs[1]);
+
+                    assert!(intrin.def.bit_size() == 32);
+                    let dst = b.alloc_ssa(RegFile::GPR, access.comps);
+                    b.push_op(OpALd {
+                        dst: dst.into(),
+                        vtx: vtx,
+                        offset: offset,
+                        access: access,
+                    });
+                    self.set_dst(&intrin.def, dst);
+                } else if intrin.intrinsic == nir_intrinsic_ast_nv {
+                    assert!(srcs[0].bit_size() == 32);
+                    let data = self.get_src(&srcs[0]);
+                    let vtx = self.get_src(&srcs[1]);
+                    let offset = self.get_src(&srcs[2]);
+
+                    b.push_op(OpASt {
+                        data: data,
+                        vtx: vtx,
+                        offset: offset,
+                        access: access,
+                    });
+                } else {
+                    panic!("Invalid VTG I/O intrinsic");
+                }
+            }
             nir_intrinsic_bindless_image_atomic => {
                 let handle = self.get_src(&srcs[0]);
                 let dim = self.get_image_dim(intrin);
@@ -1366,173 +1465,33 @@ impl<'a> ShaderFromNir<'a> {
                 });
                 self.set_dst(&intrin.def, dst);
             }
-            nir_intrinsic_load_input
-            | nir_intrinsic_load_output
-            | nir_intrinsic_load_per_vertex_input
-            | nir_intrinsic_load_per_vertex_output
-            | nir_intrinsic_store_output
-            | nir_intrinsic_store_per_vertex_output => {
-                let comps = intrin.num_components;
-
-                let store_data = match intrin.intrinsic {
-                    nir_intrinsic_load_input
-                    | nir_intrinsic_load_output
-                    | nir_intrinsic_load_per_vertex_input
-                    | nir_intrinsic_load_per_vertex_output => {
-                        assert!(intrin.def.bit_size() == 32);
-                        assert!(intrin.def.num_components() == comps);
-                        None
-                    }
-                    nir_intrinsic_store_output
-                    | nir_intrinsic_store_per_vertex_output => {
-                        assert!(srcs[0].bit_size() == 32);
-                        assert!(srcs[0].num_components() == comps);
-                        Some(self.get_src(&srcs[0]))
-                    }
-                    _ => panic!("Unhandled intrinsic"),
+            nir_intrinsic_load_input => {
+                let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
+                    panic!("load_input is only used for fragment shaders");
                 };
 
-                let (vtx, offset, offset_as_u32) = match intrin.intrinsic {
-                    nir_intrinsic_load_input | nir_intrinsic_load_output => (
-                        Src::new_zero(),
-                        self.get_src(&srcs[0]),
-                        srcs[0].as_uint(),
-                    ),
-                    nir_intrinsic_load_per_vertex_input
-                    | nir_intrinsic_load_per_vertex_output => (
-                        self.get_src(&srcs[0]),
-                        self.get_src(&srcs[1]),
-                        srcs[1].as_uint(),
-                    ),
-                    nir_intrinsic_store_output => (
-                        Src::new_zero(),
-                        self.get_src(&srcs[1]),
-                        srcs[1].as_uint(),
-                    ),
-                    nir_intrinsic_store_per_vertex_output => (
-                        self.get_src(&srcs[1]),
-                        self.get_src(&srcs[2]),
-                        srcs[2].as_uint(),
-                    ),
-                    _ => panic!("Unhandled intrinsic"),
-                };
+                assert!(intrin.def.bit_size() == 32);
+                let comps = intrin.def.num_components;
 
-                let base = u16::try_from(intrin.base()).unwrap();
-                let range = u16::try_from(intrin.range()).unwrap();
-                let comp = u16::try_from(intrin.component()).unwrap();
+                let addr = u16::try_from(intrin.base()).unwrap()
+                    + u16::try_from(srcs[0].as_uint().unwrap()).unwrap()
+                    + 4 * u16::try_from(intrin.component()).unwrap();
 
-                let (range, addr, offset) = match offset_as_u32 {
-                    Some(imm) => {
-                        let imm = u16::try_from(imm).unwrap();
-                        let addr = base + imm + 4 * comp;
-                        let range = addr..(addr + 4 * u16::from(comps));
-                        (range, addr, Src::new_zero())
-                    }
-                    None => {
-                        let range = base..(base + range);
-                        (range, base + 4 * comp, offset)
-                    }
-                };
+                let dst = b.alloc_ssa(RegFile::GPR, comps);
+                for c in 0..comps {
+                    let c_addr = addr + 4 * u16::from(c);
 
-                let stage = self.nir.info.stage();
-                let (output, patch) = match intrin.intrinsic {
-                    nir_intrinsic_load_input => {
-                        (false, stage == MESA_SHADER_TESS_EVAL)
-                    }
-                    nir_intrinsic_load_output | nir_intrinsic_store_output => {
-                        (true, stage == MESA_SHADER_TESS_CTRL)
-                    }
-                    nir_intrinsic_load_per_vertex_input => (false, false),
-                    nir_intrinsic_load_per_vertex_output
-                    | nir_intrinsic_store_per_vertex_output => (true, false),
-                    _ => panic!("Unhandled intrinsic"),
-                };
+                    io.mark_attr_read(c_addr, PixelImap::Constant);
 
-                match &mut self.info.io {
-                    ShaderIoInfo::None => {
-                        panic!("Stage does not support load_input")
-                    }
-                    ShaderIoInfo::Fragment(io) => {
-                        if let Some(data) = store_data {
-                            // We assume these only ever happen in the
-                            // last block.  This is ensured by
-                            // nir_lower_io_to_temporaries()
-                            assert!(offset_as_u32 == Some(0));
-                            assert!(addr % 4 == 0);
-                            let data = data.as_ssa().unwrap();
-                            for c in 0..usize::from(comps) {
-                                let idx =
-                                    usize::from(addr / 4) + usize::from(c);
-                                self.fs_out_regs[idx] = data[c];
-                            }
-                        } else {
-                            let dst = b.alloc_ssa(RegFile::GPR, comps);
-                            for c in 0..comps {
-                                let c_addr = addr + 4 * u16::from(c);
-
-                                io.mark_attr_read(c_addr, PixelImap::Constant);
-
-                                b.push_op(OpIpa {
-                                    dst: dst[usize::from(c)].into(),
-                                    addr: c_addr,
-                                    freq: InterpFreq::Constant,
-                                    loc: InterpLoc::Default,
-                                    offset: SrcRef::Zero.into(),
-                                });
-                            }
-                            self.set_dst(&intrin.def, dst);
-                        }
-                    }
-                    ShaderIoInfo::Vtg(io) => {
-                        if patch {
-                            match &mut self.info.stage {
-                                ShaderStageInfo::TessellationInit(stage) => {
-                                    stage.per_patch_attribute_count = max(
-                                        stage.per_patch_attribute_count,
-                                        (range.end / 4).try_into().unwrap(),
-                                    );
-                                }
-                                ShaderStageInfo::Tessellation => (),
-                                _ => panic!("Patch I/O not supported"),
-                            }
-                        } else {
-                            if output {
-                                if store_data.is_none() {
-                                    io.mark_store_req(range.clone());
-                                }
-                                io.mark_attrs_written(range);
-                            } else {
-                                io.mark_attrs_read(range);
-                            }
-                        }
-
-                        let access = AttrAccess {
-                            addr: addr,
-                            comps: comps,
-                            patch: patch,
-                            output: output,
-                            flags: 0,
-                        };
-
-                        if let Some(data) = store_data {
-                            b.push_op(OpASt {
-                                vtx: vtx,
-                                offset: offset,
-                                data: data,
-                                access: access,
-                            });
-                        } else {
-                            let dst = b.alloc_ssa(RegFile::GPR, comps);
-                            b.push_op(OpALd {
-                                dst: dst.into(),
-                                vtx: vtx,
-                                offset: offset,
-                                access: access,
-                            });
-                            self.set_dst(&intrin.def, dst);
-                        }
-                    }
+                    b.push_op(OpIpa {
+                        dst: dst[usize::from(c)].into(),
+                        addr: c_addr,
+                        freq: InterpFreq::Constant,
+                        loc: InterpLoc::Default,
+                        offset: SrcRef::Zero.into(),
+                    });
                 }
+                self.set_dst(&intrin.def, dst);
             }
             nir_intrinsic_load_interpolated_input => {
                 let bary =
@@ -1851,6 +1810,22 @@ impl<'a> ShaderFromNir<'a> {
                     access: access,
                 });
             }
+            nir_intrinsic_store_output => {
+                let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
+                    panic!("store_output is only used for fragment shaders");
+                };
+                let data = self.get_src(&srcs[0]);
+
+                let addr = u16::try_from(intrin.base()).unwrap()
+                    + u16::try_from(srcs[1].as_uint().unwrap()).unwrap()
+                    + 4 * u16::try_from(intrin.component()).unwrap();
+                assert!(addr % 4 == 0);
+
+                for c in 0..usize::from(intrin.num_components) {
+                    let idx = usize::from(addr / 4) + usize::from(c);
+                    self.fs_out_regs[idx] = data.as_ssa().unwrap()[c];
+                }
+            }
             nir_intrinsic_store_scratch => {
                 let data = self.get_src(&srcs[0]);
                 let size_B =
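
The al2p_nv and ald_nv/ast_nv arms above recover nak_nir_attr_io_flags from
the intrinsic's 32-bit flags() value with an unsafe transmute_copy. A rough
sketch of that decoding pattern follows; the bit layout of AttrIoFlagsSketch
is invented for illustration, while the real layout is whatever the shared C
header defines for nak_nir_attr_io_flags.

    // Invented layout for illustration; the real nak_nir_attr_io_flags is
    // the C bitfield struct shared with the NIR lowering pass.
    #[repr(C)]
    #[derive(Clone, Copy)]
    struct AttrIoFlagsSketch(u32);

    impl AttrIoFlagsSketch {
        fn output(&self) -> bool { self.0 & (1 << 0) != 0 }
        fn patch(&self)  -> bool { self.0 & (1 << 1) != 0 }
        fn phys(&self)   -> bool { self.0 & (1 << 2) != 0 }
    }

    fn decode_flags(raw: u32) -> AttrIoFlagsSketch {
        // Sound only because both sides are 32 bits and #[repr(C)]; this
        // is the invariant the transmute_copy in the back-end relies on.
        unsafe { std::mem::transmute_copy(&raw) }
    }

    fn main() {
        let flags = decode_flags(0b011);
        assert!(flags.output() && flags.patch() && !flags.phys());
    }
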
diff --git a/src/nouveau/compiler/nak_ir.rs b/src/nouveau/compiler/nak_ir.rs
index ddd9d82cdbc..50c658ae961 100644
--- a/src/nouveau/compiler/nak_ir.rs
+++ b/src/nouveau/compiler/nak_ir.rs
@@ -3313,6 +3313,34 @@ impl fmt::Display for OpAtomCas {
     }
 }
 
+#[repr(C)]
+#[derive(SrcsAsSlice, DstsAsSlice)]
+pub struct OpAL2P {
+    pub dst: Dst,
+
+    #[src_type(GPR)]
+    pub offset: Src,
+
+    pub access: AttrAccess,
+}
+
+impl fmt::Display for OpAL2P {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "AL2P")?;
+        if self.access.output {
+            write!(f, ".O")?;
+        }
+        if self.access.patch {
+            write!(f, ".P")?;
+        }
+        write!(f, " {} a[{:#x}", self.dst, self.access.addr)?;
+        if !self.offset.is_zero() {
+            write!(f, "+{}", self.offset)?;
+        }
+        write!(f, "]")
+    }
+}
+
 #[repr(C)]
 #[derive(SrcsAsSlice, DstsAsSlice)]
 pub struct OpALd {
@@ -4019,6 +4047,7 @@ pub enum Op {
     St(OpSt),
     Atom(OpAtom),
     AtomCas(OpAtomCas),
+    AL2P(OpAL2P),
     ALd(OpALd),
     ASt(OpASt),
     Ipa(OpIpa),
@@ -4446,6 +4475,7 @@ impl Instr {
             | Op::St(_)
             | Op::Atom(_)
            | Op::AtomCas(_)
+            | Op::AL2P(_)
             | Op::ALd(_)
             | Op::ASt(_)
             | Op::Ipa(_)
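
For reference, the Display impl above prints an assembly-like form for
OpAL2P. A standalone mimic follows, using string stand-ins for NAK's
Dst/Src types; the register names are hypothetical.

    use std::fmt;

    struct Al2pSketch {
        dst: &'static str,
        offset: Option<&'static str>, // None when the offset Src is zero
        addr: u16,
        output: bool,
        patch: bool,
    }

    impl fmt::Display for Al2pSketch {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "AL2P")?;
            if self.output {
                write!(f, ".O")?;
            }
            if self.patch {
                write!(f, ".P")?;
            }
            write!(f, " {} a[{:#x}", self.dst, self.addr)?;
            if let Some(off) = self.offset {
                write!(f, "+{}", off)?;
            }
            write!(f, "]")
        }
    }

    fn main() {
        let op = Al2pSketch {
            dst: "r0",
            offset: Some("r2"),
            addr: 0x84,
            output: true,
            patch: false,
        };
        assert_eq!(op.to_string(), "AL2P.O r0 a[0x84+r2]");
    }
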
diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index b4a00d19c57..facc7fb4473 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -304,18 +304,6 @@ nak_sysval_sysval_idx(gl_system_value sysval)
    }
 }
 
-static nir_def *
-nak_nir_isberd(nir_builder *b, nir_def *vertex)
-{
-   nir_def *info = nir_load_sysval_nv(b, 32, .base = NAK_SV_INVOCATION_INFO,
-                                      .access = ACCESS_CAN_REORDER);
-   nir_def *lo = nir_extract_u8_imm(b, info, 0);
-   nir_def *hi = nir_extract_u8_imm(b, info, 2);
-   nir_def *idx = nir_iadd(b, nir_imul(b, lo, hi), vertex);
-
-   return nir_isberd_nv(b, idx);
-}
-
 static bool
 nak_nir_lower_system_value_instr(nir_builder *b, nir_instr *instr, void *data)
 {
@@ -338,8 +326,8 @@ nak_nir_lower_system_value_instr(nir_builder *b, nir_instr *instr, void *data)
    case nir_intrinsic_load_primitive_id: {
       assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL ||
              b->shader->info.stage == MESA_SHADER_TESS_EVAL);
-      nir_def *idx = nak_nir_isberd(b, nir_imm_int(b, 0));
-      val = nir_load_per_vertex_input(b, 1, 32, idx, nir_imm_int(b, 0),
+      val = nir_load_per_vertex_input(b, 1, 32, nir_imm_int(b, 0),
+                                      nir_imm_int(b, 0),
                                       .base = NAK_ATTR_PRIMITIVE_ID,
                                       .dest_type = nir_type_int32);
       break;
@@ -423,23 +411,6 @@ nak_nir_lower_system_values(nir_shader *nir)
                                         NULL);
 }
 
-static bool
-lower_per_vertex_io_intrin(nir_builder *b,
-                           nir_intrinsic_instr *intrin,
-                           void *data)
-{
-   if (intrin->intrinsic != nir_intrinsic_load_per_vertex_input)
-      return false;
-
-   b->cursor = nir_before_instr(&intrin->instr);
-
-   nir_src *vertex = &intrin->src[0];
-   nir_def *idx = nak_nir_isberd(b, vertex->ssa);
-   nir_src_rewrite(vertex, idx);
-
-   return true;
-}
-
 static bool
 nak_nir_lower_varyings(nir_shader *nir, nir_variable_mode modes)
 {
@@ -452,16 +423,6 @@ nak_nir_lower_varyings(nir_shader *nir, nir_variable_mode modes)
 
    OPT(nir, nir_lower_io, modes, type_size_vec4_bytes, 0);
 
-   switch (nir->info.stage) {
-   case MESA_SHADER_TESS_CTRL:
-   case MESA_SHADER_TESS_EVAL:
-   case MESA_SHADER_GEOMETRY:
-      OPT(nir, nir_shader_intrinsics_pass, lower_per_vertex_io_intrin,
-          nir_metadata_block_index | nir_metadata_dominance, NULL);
-   default:
-      break;
-   }
-
    return progress;
 }
 
@@ -782,12 +743,16 @@ nak_postprocess_nir(nir_shader *nir,
    case MESA_SHADER_VERTEX:
       OPT(nir, nak_nir_lower_vs_inputs);
      OPT(nir, nak_nir_lower_varyings, nir_var_shader_out);
+      OPT(nir, nir_opt_constant_folding);
+      OPT(nir, nak_nir_lower_vtg_io, nak);
       break;
    case MESA_SHADER_TESS_CTRL:
    case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY:
       OPT(nir, nak_nir_lower_varyings, nir_var_shader_in | nir_var_shader_out);
+      OPT(nir, nir_opt_constant_folding);
+      OPT(nir, nak_nir_lower_vtg_io, nak);
       break;
    case MESA_SHADER_FRAGMENT:
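
Note that nir_opt_constant_folding now runs immediately before
nak_nir_lower_vtg_io in both VTG cases above. The fragment-shader paths in
nak_from_nir.rs likewise unwrap as_uint() on the NIR offset source, so they
rely on I/O offsets having been folded to constants by this point. A rough
sketch of that contract, with a made-up SrcSketch type standing in for a NIR
scalar source:

    // Made-up stand-in for a scalar source; only the constant case
    // matters here.
    #[derive(Clone, Copy)]
    enum SrcSketch {
        Imm(u32),
        Gpr(u8),
    }

    impl SrcSketch {
        fn as_uint(self) -> Option<u32> {
            match self {
                SrcSketch::Imm(v) => Some(v),
                SrcSketch::Gpr(_) => None,
            }
        }
    }

    // Mirrors the addr computation in the fragment load_input and
    // store_output arms: base + constant offset + 4 bytes per component.
    fn fs_attr_addr(base: u16, offset: SrcSketch, component: u16) -> u16 {
        let off = offset.as_uint().expect("offset must be constant-folded");
        base + u16::try_from(off).unwrap() + 4 * component
    }

    fn main() {
        assert_eq!(fs_attr_addr(0x80, SrcSketch::Imm(16), 1), 0x94);
    }
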