From 0fc02a8b69a0a7e9265c795778cc0db8916fcdce Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 15 Mar 2026 20:47:04 +0100 Subject: [PATCH 1/7] nir: add nir_intrinsic_cmat_load_shared_nv to nir_get_io_offset_src_number --- src/compiler/nir/nir_lower_io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 6bb281e6cba..86f7a3591fd 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -1018,6 +1018,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr) case nir_intrinsic_load_push_data_intel: case nir_intrinsic_vild_nv: case nir_intrinsic_load_shader_indirect_data_intel: + case nir_intrinsic_cmat_load_shared_nv: return 0; case nir_intrinsic_load_ubo: case nir_intrinsic_load_ubo_vec4: From 584ba918a1c809f9d5cd3c377111a65a15e32ecd Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Tue, 17 Mar 2026 00:37:22 +0100 Subject: [PATCH 2/7] nak: add nak_nir_phi_is_divergent helper --- src/nouveau/compiler/nak_nir_lower_cf.c | 61 ++++++++++++++----------- src/nouveau/compiler/nak_private.h | 1 + 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/src/nouveau/compiler/nak_nir_lower_cf.c b/src/nouveau/compiler/nak_nir_lower_cf.c index 817f6de4d73..8b146f1b0ae 100644 --- a/src/nouveau/compiler/nak_nir_lower_cf.c +++ b/src/nouveau/compiler/nak_nir_lower_cf.c @@ -376,6 +376,39 @@ lower_cf_list(nir_builder *b, nir_def *esc_reg, struct scope *parent_scope, } } +bool +nak_nir_phi_is_divergent(nir_phi_instr *phi) +{ + bool divergent = false; + nir_foreach_phi_src(phi_src, phi) { + /* There is a tricky case we need to care about here where a + * convergent block has a divergent dominator. This can happen + * if, for instance, you have the following loop: + * + * loop { + * if (div) { + * %20 = load_ubo(0, 0); + * } else { + * terminate; + * } + * } + * use(%20); + * + * In this case, the load_ubo() dominates the use() even though + * the load_ubo() exists in divergent control-flow. In this + * case, we simply flag the whole phi divergent because we + * don't want to deal with inserting a r2ur somewhere. + */ + if (phi_src->pred->divergent || phi_src->src.ssa->divergent || + nir_def_block(phi_src->src.ssa)->divergent) { + divergent = true; + break; + } + } + + return divergent; +} + static void recompute_phi_divergence_impl(nir_function_impl *impl) { @@ -388,33 +421,7 @@ recompute_phi_divergence_impl(nir_function_impl *impl) break; nir_phi_instr *phi = nir_instr_as_phi(instr); - - bool divergent = false; - nir_foreach_phi_src(phi_src, phi) { - /* There is a tricky case we need to care about here where a - * convergent block has a divergent dominator. This can happen - * if, for instance, you have the following loop: - * - * loop { - * if (div) { - * %20 = load_ubo(0, 0); - * } else { - * terminate; - * } - * } - * use(%20); - * - * In this case, the load_ubo() dominates the use() even though - * the load_ubo() exists in divergent control-flow. In this - * case, we simply flag the whole phi divergent because we - * don't want to deal with inserting a r2ur somewhere. 
- */ - if (phi_src->pred->divergent || phi_src->src.ssa->divergent || - nir_def_block(phi_src->src.ssa)->divergent) { - divergent = true; - break; - } - } + bool divergent = nak_nir_phi_is_divergent(phi); if (divergent != phi->def.divergent) { phi->def.divergent = divergent; diff --git a/src/nouveau/compiler/nak_private.h b/src/nouveau/compiler/nak_private.h index 588eb897eb4..896f00d279c 100644 --- a/src/nouveau/compiler/nak_private.h +++ b/src/nouveau/compiler/nak_private.h @@ -370,6 +370,7 @@ bool nak_nir_lower_cmat(nir_shader *shader, const struct nak_compiler *nak); * writing uregs from these blocks. */ bool nak_block_is_divergent(const nir_block *block); +bool nak_nir_phi_is_divergent(nir_phi_instr *phi); void nak_optimize_nir(nir_shader *nir, const struct nak_compiler *nak); From 53bfdb400c2fe6f451648c9c344e3d318e26c96d Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 14 Jan 2026 17:59:20 +0100 Subject: [PATCH 3/7] nak/sm70: add helper for memory load store addresses This also makes the selection of 32 vs 64 bit addresses based on the actual source in the IR. --- src/nouveau/compiler/nak/nvdisasm_tests.rs | 2 +- src/nouveau/compiler/nak/sm70_encode.rs | 62 +++++++++++++++------- 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/src/nouveau/compiler/nak/nvdisasm_tests.rs b/src/nouveau/compiler/nak/nvdisasm_tests.rs index fbcf7c82013..e055bd136ab 100644 --- a/src/nouveau/compiler/nak/nvdisasm_tests.rs +++ b/src/nouveau/compiler/nak/nvdisasm_tests.rs @@ -288,7 +288,7 @@ pub fn test_ldc() { #[test] pub fn test_ld_st_atom() { let r0 = RegRef::new(RegFile::GPR, 0, 1); - let r1 = RegRef::new(RegFile::GPR, 1, 1); + let r1 = RegRef::new(RegFile::GPR, 1, 2); let r2 = RegRef::new(RegFile::GPR, 2, 1); let r3 = RegRef::new(RegFile::GPR, 3, 1); let p4 = RegRef::new(RegFile::Pred, 4, 1); diff --git a/src/nouveau/compiler/nak/sm70_encode.rs b/src/nouveau/compiler/nak/sm70_encode.rs index 2f7694024df..b279a1f37ad 100644 --- a/src/nouveau/compiler/nak/sm70_encode.rs +++ b/src/nouveau/compiler/nak/sm70_encode.rs @@ -108,6 +108,29 @@ impl SM70Encoder<'_> { } } + fn set_reg_addr( + &mut self, + range: Range, + src: &Src, + size_bit: usize, + ) { + assert!(src.is_unmodified()); + match src.src_ref { + SrcRef::Zero => { + self.set_reg(range, self.zero_reg(RegFile::GPR)); + // We always treat a zero GPR as 32 bits, so the UGPR source + // can be 32 bits. + self.set_bit(size_bit, false); + } + SrcRef::Reg(reg) => { + self.set_reg(range, reg); + assert!(reg.comps() <= 2); + self.set_bit(size_bit, reg.comps() == 2); + } + _ => panic!("Not a register"), + } + } + fn set_ureg_src(&mut self, start: usize, src: &Src) { assert!(src.src_mod.is_none()); match src.src_ref { @@ -117,6 +140,24 @@ impl SM70Encoder<'_> { } } + fn set_ureg_addr(&mut self, start: usize, src: &Src, size_bit: usize) { + assert!(src.src_mod.is_none()); + match src.src_ref { + SrcRef::Zero => { + self.set_ureg(start, self.zero_reg(RegFile::UGPR)); + // We always treat a zero UGPR as 64 bits, so the GPR source + // can be 64 bit. 
+ self.set_bit(size_bit, true); + } + SrcRef::Reg(reg) => { + self.set_ureg(start, reg); + assert!(reg.comps() <= 2); + self.set_bit(size_bit, reg.comps() == 2); + } + _ => panic!("Not a register"), + } + } + fn set_pred_dst(&mut self, range: Range, dst: &Dst) { match dst { Dst::None => self.set_pred_reg(range, self.true_reg(RegFile::Pred)), @@ -3009,13 +3050,6 @@ impl SM70Encoder<'_> { } fn set_mem_access(&mut self, access: &MemAccess) { - self.set_field( - 72..73, - match access.space.addr_type() { - MemAddrType::A32 => 0_u8, - MemAddrType::A64 => 1_u8, - }, - ); self.set_mem_type(73..76, access.mem_type); self.set_mem_order(&access.order); self.set_eviction_priority(&access.eviction_priority); @@ -3179,7 +3213,7 @@ impl SM70Op for OpLd { } e.set_dst(&self.dst); - e.set_reg_src(24..32, &self.addr); + e.set_reg_addr(24..32, &self.addr, 72); e.set_field(40..64, self.offset); } } @@ -3314,7 +3348,7 @@ impl SM70Op for OpSt { } } - e.set_reg_src(24..32, &self.addr); + e.set_reg_addr(24..32, &self.addr, 72); e.set_reg_src(32..40, &self.data); e.set_field(40..64, self.offset); } @@ -3421,14 +3455,6 @@ impl SM70Op for OpAtom { e.set_atom_op(87..91, self.atom_op); } - e.set_field( - 72..73, - match self.mem_space.addr_type() { - MemAddrType::A32 => 0_u8, - MemAddrType::A64 => 1_u8, - }, - ); - e.set_mem_order(&self.mem_order); e.set_eviction_priority(&self.mem_eviction_priority); assert_eq!(self.addr_stride, OffsetStride::X1); @@ -3468,7 +3494,7 @@ impl SM70Op for OpAtom { } e.set_dst(&self.dst); - e.set_reg_src(24..32, &self.addr); + e.set_reg_addr(24..32, &self.addr, 72); e.set_field(40..64, self.addr_offset); e.set_atom_type(self.atom_type, false); } From e639aa342d43aa561832b487b82cba358b1f9e85 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 3 May 2026 11:46:12 +0200 Subject: [PATCH 4/7] nak: wire up UGPR Ld/St/Atom encoding --- src/nouveau/compiler/nak/from_nir.rs | 11 ++ src/nouveau/compiler/nak/hw_tests.rs | 4 + src/nouveau/compiler/nak/ir.rs | 32 +++++- src/nouveau/compiler/nak/lower_copy_swap.rs | 2 + src/nouveau/compiler/nak/nvdisasm_tests.rs | 36 +++++-- src/nouveau/compiler/nak/sm70_encode.rs | 108 +++++++++++++++++--- 6 files changed, 170 insertions(+), 23 deletions(-) diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index 3bc4aaf0e08..a7665ed4ec8 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -2992,6 +2992,7 @@ impl<'a> ShaderFromNir<'a> { dst.clone().into() }, addr: addr, + uniform_address: Src::ZERO, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3018,6 +3019,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpAtom { dst: dst.clone().into(), addr: addr, + uniform_address: Src::ZERO, cmpr: cmpr, data: data, atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate), @@ -3224,6 +3226,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpLd { dst: dst.clone().into(), addr: addr, + uniform_addr: Src::ZERO, pred: pred, offset: intrin.base(), stride: OffsetStride::X1, @@ -3335,6 +3338,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpLd { dst: dst.clone().into(), addr: addr, + uniform_addr: Src::ZERO, pred: true.into(), offset: intrin.base(), stride: OffsetStride::X1, @@ -3358,6 +3362,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpLd { dst: dst.clone().into(), addr: addr, + uniform_addr: Src::ZERO, pred: true.into(), offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), @@ -3678,6 +3683,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpAtom { dst: dst.clone().into(), addr: addr, + uniform_address: 
Src::ZERO, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3704,6 +3710,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpAtom { dst: dst.clone().into(), addr: addr, + uniform_address: Src::ZERO, cmpr: cmpr, data: data, atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate), @@ -3736,6 +3743,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpSt { addr: addr, + uniform_addr: Src::ZERO, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3767,6 +3775,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpSt { addr: addr, + uniform_addr: Src::ZERO, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3788,6 +3797,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpSt { addr: addr, + uniform_addr: Src::ZERO, data: data, offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), @@ -3907,6 +3917,7 @@ impl<'a> ShaderFromNir<'a> { mat_size, mat_count, addr, + uniform_addr: Src::ZERO, offset: intrin.base(), }); self.set_dst(&intrin.def, dst); diff --git a/src/nouveau/compiler/nak/hw_tests.rs b/src/nouveau/compiler/nak/hw_tests.rs index bb6e6d8d5aa..09e4a0744de 100644 --- a/src/nouveau/compiler/nak/hw_tests.rs +++ b/src/nouveau/compiler/nak/hw_tests.rs @@ -154,6 +154,7 @@ impl<'a> TestShaderBuilder<'a> { self.push_op(OpLd { dst: dst.clone().into(), addr: self.data_addr.clone().into(), + uniform_addr: Src::ZERO, pred: true.into(), offset: offset.into(), access: access, @@ -178,6 +179,7 @@ impl<'a> TestShaderBuilder<'a> { assert!(data.comps() == comps); self.push_op(OpSt { addr: self.data_addr.clone().into(), + uniform_addr: Src::ZERO, data: data.into(), offset: offset.into(), access: access, @@ -1734,6 +1736,7 @@ fn test_op_ldsm() { let offset = b.imul(lane_id.into(), 16.into()); b.push_op(OpSt { addr: offset.into(), + uniform_addr: Src::ZERO, data: input.into(), offset: 0, access: MemAccess { @@ -1755,6 +1758,7 @@ fn test_op_ldsm() { mat_size: LdsmSize::M8N8, mat_count: 4, addr: addr.into(), + uniform_addr: Src::ZERO, offset: 0, }); b.st_test_data(16, MemType::B128, res); diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 8d5feffb9c7..78febca91c6 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -6502,6 +6502,9 @@ pub struct OpLd { #[src_type(GPR)] pub addr: Src, + #[src_type(GPR)] + pub uniform_addr: Src, + /// On false the load returns 0 #[src_type(Pred)] pub pred: Src, @@ -6513,7 +6516,11 @@ pub struct OpLd { impl DisplayOp for OpLd { fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "ld{} [{}{}", self.access, self.addr, self.stride)?; + write!( + f, + "ld{} [{}{}+{}", + self.access, self.addr, self.stride, self.uniform_addr + )?; if self.offset > 0 { write!(f, "+{:#x}", self.offset)?; } @@ -6602,6 +6609,9 @@ pub struct OpLdsm { #[src_type(SSA)] pub addr: Src, + #[src_type(SSA)] + pub uniform_addr: Src, + pub offset: i32, } @@ -6658,6 +6668,9 @@ pub struct OpSt { #[src_type(SSA)] pub data: Src, + #[src_type(GPR)] + pub uniform_addr: Src, + pub offset: i32, pub stride: OffsetStride, pub access: MemAccess, @@ -6665,7 +6678,11 @@ pub struct OpSt { impl DisplayOp for OpSt { fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "st{} [{}{}", self.access, self.addr, self.stride)?; + write!( + f, + "st{} [{}{}+{}", + self.access, self.addr, self.stride, self.uniform_addr + )?; if self.offset > 0 { write!(f, "+{:#x}", self.offset)?; } @@ -6711,6 +6728,9 @@ pub struct OpAtom { #[src_type(GPR)] pub addr: Src, + #[src_type(GPR)] + pub uniform_address: Src, + #[src_type(GPR)] pub cmpr: 
Src, @@ -6743,10 +6763,16 @@ impl DisplayOp for OpAtom { if !self.addr.is_zero() { write!(f, "{}{}", self.addr, self.addr_stride)?; } - if self.addr_offset > 0 { + if !self.uniform_address.is_zero() { if !self.addr.is_zero() { write!(f, "+")?; } + write!(f, "{}", self.uniform_address)?; + } + if self.addr_offset > 0 { + if !self.addr.is_zero() || !self.uniform_address.is_zero() { + write!(f, "+")?; + } write!(f, "{:#x}", self.addr_offset)?; } write!(f, "]")?; diff --git a/src/nouveau/compiler/nak/lower_copy_swap.rs b/src/nouveau/compiler/nak/lower_copy_swap.rs index 19e7f7178fe..f1f34978849 100644 --- a/src/nouveau/compiler/nak/lower_copy_swap.rs +++ b/src/nouveau/compiler/nak/lower_copy_swap.rs @@ -95,6 +95,7 @@ impl LowerCopySwap { b.push_op(OpLd { dst: copy.dst, addr: Src::ZERO, + uniform_addr: Src::ZERO, pred: true.into(), offset: addr.try_into().unwrap(), stride: OffsetStride::X1, @@ -175,6 +176,7 @@ impl LowerCopySwap { self.slm_size = max(self.slm_size, addr + 4); b.push_op(OpSt { addr: Src::ZERO, + uniform_addr: Src::ZERO, data: copy.src, offset: addr.try_into().unwrap(), stride: OffsetStride::X1, diff --git a/src/nouveau/compiler/nak/nvdisasm_tests.rs b/src/nouveau/compiler/nak/nvdisasm_tests.rs index e055bd136ab..c845298a73f 100644 --- a/src/nouveau/compiler/nak/nvdisasm_tests.rs +++ b/src/nouveau/compiler/nak/nvdisasm_tests.rs @@ -292,6 +292,7 @@ pub fn test_ld_st_atom() { let r2 = RegRef::new(RegFile::GPR, 2, 1); let r3 = RegRef::new(RegFile::GPR, 3, 1); let p4 = RegRef::new(RegFile::Pred, 4, 1); + let ur2 = RegRef::new(RegFile::UGPR, 2, 2); let order = MemOrder::Strong(MemScope::CTA); @@ -318,6 +319,18 @@ pub fn test_ld_st_atom() { { for addr_stride in [OffsetStride::X1, OffsetStride::X8] { let cta = if sm >= 80 { "sm" } else { "cta" }; + let r1_str = + if sm >= 75 && matches!(space, MemSpace::Global(_)) { + "r1.64" + } else { + "r1" + }; + let urz = if sm >= 73 { + SrcRef::Reg(ur2).into() + } else { + Src::ZERO + }; + let urz_str = if sm >= 73 { "+ur2" } else { "" }; let pri = match space { MemSpace::Global(_) => MemEvictionPriority::First, @@ -339,6 +352,7 @@ pub fn test_ld_st_atom() { let instr = OpLd { dst: Dst::Reg(r0), addr: SrcRef::Reg(r1).into(), + uniform_addr: urz.clone(), pred: if matches!(space, MemSpace::Global(_)) && sm >= 73 { @@ -353,7 +367,7 @@ pub fn test_ld_st_atom() { let expected = match space { MemSpace::Global(_) if sm >= 73 => { format!( - "ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}], p4;" + "ldg.e.ef.strong.{cta} r0, [{r1_str}{urz_str}+{addr_offset_str}], p4;" ) } MemSpace::Global(_) => { @@ -363,17 +377,20 @@ pub fn test_ld_st_atom() { } MemSpace::Shared => { format!( - "lds r0, [r1{addr_stride}+{addr_offset_str}];" + "lds r0, [{r1_str}{addr_stride}{urz_str}+{addr_offset_str}];" ) } MemSpace::Local => { - format!("ldl r0, [r1+{addr_offset_str}];") + format!( + "ldl r0, [{r1_str}{urz_str}+{addr_offset_str}];" + ) } }; c.push(instr, expected); let instr = OpSt { addr: SrcRef::Reg(r1).into(), + uniform_addr: urz.clone(), data: SrcRef::Reg(r2).into(), offset: addr_offset, access: access.clone(), @@ -382,16 +399,18 @@ pub fn test_ld_st_atom() { let expected = match space { MemSpace::Global(_) => { format!( - "stg.e.ef.strong.{cta} [r1+{addr_offset_str}], r2;" + "stg.e.ef.strong.{cta} [{r1_str}{urz_str}+{addr_offset_str}], r2;" ) } MemSpace::Shared => { format!( - "sts [r1{addr_stride}+{addr_offset_str}], r2;" + "sts [{r1_str}{addr_stride}{urz_str}+{addr_offset_str}], r2;" ) } MemSpace::Local => { - format!("stl [r1+{addr_offset_str}], r2;") + format!( 
+ "stl [{r1_str}{urz_str}+{addr_offset_str}], r2;" + ) } }; c.push(instr, expected); @@ -405,6 +424,7 @@ pub fn test_ld_st_atom() { Dst::None }, addr: SrcRef::Reg(r1).into(), + uniform_address: urz.clone(), data: SrcRef::Reg(r2).into(), atom_op: AtomOp::Add, cmpr: SrcRef::Reg(r3).into(), @@ -429,7 +449,7 @@ pub fn test_ld_st_atom() { }; let dst = if use_dst { "pt, r0, " } else { "" }; - format!("{op}.e.add.ef{atom_type_str}.strong.{cta} {dst}[r1+{addr_offset_str}], r2;") + format!("{op}.e.add.ef{atom_type_str}.strong.{cta} {dst}[{r1_str}{urz_str}+{addr_offset_str}], r2;") } MemSpace::Shared => { if atom_type.is_float() { @@ -439,7 +459,7 @@ pub fn test_ld_st_atom() { continue; } let dst = if use_dst { "r0" } else { "rz" }; - format!("atoms.add{atom_type_str} {dst}, [r1{addr_stride}+{addr_offset_str}], r2;") + format!("atoms.add{atom_type_str} {dst}, [{r1_str}{addr_stride}{urz_str}+{addr_offset_str}], r2;") } MemSpace::Local => continue, }; diff --git a/src/nouveau/compiler/nak/sm70_encode.rs b/src/nouveau/compiler/nak/sm70_encode.rs index b279a1f37ad..07736e039b6 100644 --- a/src/nouveau/compiler/nak/sm70_encode.rs +++ b/src/nouveau/compiler/nak/sm70_encode.rs @@ -3170,15 +3170,21 @@ impl SM70Op for OpLd { } fn encode(&self, e: &mut SM70Encoder<'_>) { + let has_ugpr = e.sm >= 73; match self.access.space { MemSpace::Global(_) => { - e.set_opcode(0x381); assert_eq!(self.stride, OffsetStride::X1); - if e.sm >= 73 { + + if has_ugpr { + e.set_opcode(0x981); + e.set_reg_addr(24..32, &self.addr, 90); + e.set_ureg_addr(32, &self.uniform_addr, 72); e.set_rev_pred_src(64..67, 67, &self.pred); } else { - assert!(self.pred.is_true()); + e.set_opcode(0x381); + e.set_reg_addr(24..32, &self.addr, 72); } + e.set_pred_dst(81..84, &Dst::None); e.set_mem_access(&self.access); } @@ -3186,6 +3192,10 @@ impl SM70Op for OpLd { assert!(self.pred.is_true()); assert_eq!(self.stride, OffsetStride::X1); e.set_opcode(0x983); + e.set_reg_src(24..32, &self.addr); + if has_ugpr { + e.set_ureg_src(32, &self.uniform_addr); + } e.set_field(84..87, 1_u8); e.set_mem_type(73..76, self.access.mem_type); @@ -3199,6 +3209,10 @@ impl SM70Op for OpLd { e.set_opcode(0x984); assert!(self.pred.is_true()); + e.set_reg_src(24..32, &self.addr); + if has_ugpr { + e.set_ureg_src(32, &self.uniform_addr); + } e.set_mem_type(73..76, self.access.mem_type); assert!(self.access.order == MemOrder::Strong(MemScope::CTA)); assert!( @@ -3213,8 +3227,11 @@ impl SM70Op for OpLd { } e.set_dst(&self.dst); - e.set_reg_addr(24..32, &self.addr, 72); e.set_field(40..64, self.offset); + // We always enable UGPR mode, because the .E bit changes + // which source it applies to depending on it. + // This way it always applies to the UGPR. 
+ e.set_bit(91, has_ugpr); } } @@ -3315,15 +3332,30 @@ impl SM70Op for OpSt { } fn encode(&self, e: &mut SM70Encoder<'_>) { + let has_ugpr = e.sm >= 75; match self.access.space { MemSpace::Global(_) => { - e.set_opcode(0x386); assert_eq!(self.stride, OffsetStride::X1); + if has_ugpr { + e.set_opcode(0x986); + e.set_reg_addr(24..32, &self.addr, 90); + e.set_ureg_addr(64, &self.uniform_addr, 72); + } else { + e.set_opcode(0x386); + e.set_reg_addr(24..32, &self.addr, 72); + } e.set_mem_access(&self.access); } MemSpace::Local => { - e.set_opcode(0x387); assert_eq!(self.stride, OffsetStride::X1); + if has_ugpr { + e.set_opcode(0x987); + e.set_reg_src(24..32, &self.addr); + e.set_ureg_src(64, &self.uniform_addr); + } else { + e.set_opcode(0x387); + e.set_reg_src(24..32, &self.addr); + } e.set_field(84..87, 1_u8); e.set_mem_type(73..76, self.access.mem_type); @@ -3334,7 +3366,14 @@ impl SM70Op for OpSt { ); } MemSpace::Shared => { - e.set_opcode(0x388); + if has_ugpr { + e.set_opcode(0x988); + e.set_reg_src(24..32, &self.addr); + e.set_ureg_src(64, &self.uniform_addr); + } else { + e.set_opcode(0x388); + e.set_reg_src(24..32, &self.addr); + } e.set_mem_type(73..76, self.access.mem_type); assert!(self.access.order == MemOrder::Strong(MemScope::CTA)); @@ -3348,9 +3387,12 @@ impl SM70Op for OpSt { } } - e.set_reg_addr(24..32, &self.addr, 72); e.set_reg_src(32..40, &self.data); e.set_field(40..64, self.offset); + // We always enable UGPR mode, because the .E bit changes + // which source it applies to depending on it. + // This way it always applies to the UGPR. + e.set_bit(91, has_ugpr); } } @@ -3425,6 +3467,7 @@ impl SM70Op for OpAtom { } fn encode(&self, e: &mut SM70Encoder<'_>) { + let has_ugpr = e.sm >= 75; match self.mem_space { MemSpace::Global(_) => { if self.dst.is_none() { @@ -3435,24 +3478,56 @@ impl SM70Op for OpAtom { } e.set_reg_src(32..40, &self.data); + e.set_field(40..64, self.addr_offset); e.set_atom_op(87..90, self.atom_op); + if has_ugpr { + e.set_reg_addr(24..32, &self.addr, 90); + e.set_ureg_addr(64, &self.uniform_address, 72); + e.set_bit(91, true); + } else { + e.set_reg_addr(24..32, &self.addr, 72); + assert!(self.uniform_address.is_zero()); + } } else if let AtomOp::CmpExch(cmp_src) = self.atom_op { e.set_opcode(0x3a9); assert!(cmp_src == AtomCmpSrc::Separate); + assert!(self.uniform_address.is_zero()); + e.set_reg_addr(24..32, &self.addr, 72); e.set_reg_src(32..40, &self.cmpr); + e.set_field(40..64, self.addr_offset); e.set_reg_src(64..72, &self.data); e.set_pred_dst(81..84, &Dst::None); } else { if e.sm >= 90 && self.atom_type.is_float() { - e.set_opcode(0x3a3); + e.set_opcode(0x9a3); + } else if has_ugpr { + e.set_opcode(0x9a8); } else { e.set_opcode(0x3a8); } + if e.sm >= 100 { + e.set_reg_addr(24..32, &self.addr, 63); + e.set_ureg_addr(64, &self.uniform_address, 72); + } else if has_ugpr { + e.set_reg_addr(24..32, &self.addr, 70); + e.set_ureg_addr(64, &self.uniform_address, 72); + } else { + e.set_reg_addr(24..32, &self.addr, 72); + assert!(self.uniform_address.is_zero()); + }; + + if e.sm >= 100 { + e.set_field(40..63, self.addr_offset); + } else { + e.set_field(40..64, self.addr_offset); + }; + e.set_reg_src(32..40, &self.data); e.set_pred_dst(81..84, &Dst::None); e.set_atom_op(87..91, self.atom_op); + e.set_bit(91, has_ugpr); } e.set_mem_order(&self.mem_order); @@ -3465,10 +3540,17 @@ impl SM70Op for OpAtom { e.set_opcode(0x38d); assert!(cmp_src == AtomCmpSrc::Separate); + assert!(self.uniform_address.is_zero()); e.set_reg_src(32..40, &self.cmpr); 
e.set_reg_src(64..72, &self.data); } else { - e.set_opcode(0x38c); + if has_ugpr { + e.set_opcode(0x98c); + e.set_ureg_src(64, &self.uniform_address); + e.set_bit(91, true); + } else { + e.set_opcode(0x38c); + } e.set_reg_src(32..40, &self.data); assert!( @@ -3483,6 +3565,8 @@ impl SM70Op for OpAtom { e.set_atom_op(87..91, self.atom_op); } + e.set_reg_src(24..32, &self.addr); + e.set_field(40..64, self.addr_offset); assert!(e.sm >= 75 || self.addr_stride == OffsetStride::X1); e.set_field(78..80, self.addr_stride.encode_sm75()); @@ -3494,8 +3578,6 @@ impl SM70Op for OpAtom { } e.set_dst(&self.dst); - e.set_reg_addr(24..32, &self.addr, 72); - e.set_field(40..64, self.addr_offset); e.set_atom_type(self.atom_type, false); } } @@ -4218,6 +4300,7 @@ impl SM70Op for OpLdsm { e.set_opcode(0x83b); e.set_dst(&self.dst); e.set_reg_src(24..32, &self.addr); + e.set_ureg_src(32, &self.uniform_addr); e.set_field(40..64, self.offset); e.set_field( 72..74, @@ -4238,6 +4321,7 @@ impl SM70Op for OpLdsm { // LdsmSize::M8N32 => 3, }, ); + e.set_bit(91, !self.uniform_addr.is_zero()); } } From 24b725a5d26b337b1f15bec86297b9a9ad088e1c Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 29 Jan 2026 22:36:17 +0100 Subject: [PATCH 5/7] nir: add uniform address to nvidia IO intrinsics Adding the zero constants have a minor impact on stats Totals from 61 (0.01% of 1212873) affected shaders: CodeSize: 1044720 -> 1047472 (+0.26%); split: -0.00%, +0.27% Static cycle count: 1198932 -> 1198490 (-0.04%); split: -0.07%, +0.04% --- src/compiler/nir/nir.h | 6 +++- src/compiler/nir/nir_intrinsics.py | 26 +++++++------- src/compiler/nir/nir_lower_io.c | 33 +++++++++++++++++ src/compiler/nir/nir_opt_offsets.c | 8 ++++- src/compiler/nir/nir_validate.c | 12 +++++-- src/nouveau/compiler/nak/from_nir.rs | 34 +++++++++++------- src/nouveau/compiler/nak_nir.c | 35 +++++++++++++------ src/nouveau/compiler/nak_nir_lower_cmat.c | 3 +- .../compiler/nak_nir_lower_non_uniform_ldcx.c | 4 ++- 9 files changed, 121 insertions(+), 40 deletions(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index cfa9a6e8a73..279c57e2241 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -5787,11 +5787,13 @@ nir_lower_shader_calls(nir_shader *shader, void *mem_ctx); int nir_get_io_offset_src_number(const nir_intrinsic_instr *instr); +int nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr); int nir_get_io_index_src_number(const nir_intrinsic_instr *instr); int nir_get_io_data_src_number(const nir_intrinsic_instr *instr); int nir_get_io_arrayed_index_src_number(const nir_intrinsic_instr *instr); nir_src *nir_get_io_offset_src(nir_intrinsic_instr *instr); +nir_src *nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_index_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_data_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_arrayed_index_src(nir_intrinsic_instr *instr); @@ -5801,7 +5803,6 @@ static inline unsigned nir_get_io_base_size_nv(const nir_intrinsic_instr *intr) { switch (intr->intrinsic) { - case nir_intrinsic_global_atomic_nv: case nir_intrinsic_global_atomic_swap_nv: case nir_intrinsic_shared_atomic_nv: case nir_intrinsic_shared_atomic_swap_nv: @@ -5814,6 +5815,9 @@ nir_get_io_base_size_nv(const nir_intrinsic_instr *intr) case nir_intrinsic_store_shared_nv: case nir_intrinsic_store_shared_unlock_nv: return 24; + case nir_intrinsic_global_atomic_nv: + /* TODO: SM100+ only has 23 bits for the UGPR + GPR form */ + return 23; case nir_intrinsic_ldc_nv: case 
nir_intrinsic_ldcx_nv: return 16; diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 1a7b029a3b6..daad6d6a305 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -941,7 +941,8 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0, # The offset is sign-extended or zero-extended based on the SIGN_EXTEND index. # # NV variants all come with a 24 bit base, that is unsigned with a constant 0 address, -# signed otherwise. +# signed otherwise. Non swap atomic also comes with an additional uniform address source +# right after the non uniform memory address. # # PCO global variants use a vec3 for the memory address and data, where component X # has the low 32 address bits, component Y has the high 32 address bits, and component Z @@ -950,13 +951,13 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0, intrinsic("deref_atomic", src_comp=[-1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) intrinsic("ssbo_atomic", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP, OFFSET_SHIFT]) intrinsic("shared_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) -intrinsic("shared_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV]) +intrinsic("shared_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV]) intrinsic("task_payload_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic", src_comp=[1, 1], dest_comp=1, indices=[ATOMIC_OP]) intrinsic("global_atomic_2x32", src_comp=[2, 1], dest_comp=1, indices=[ATOMIC_OP]) intrinsic("global_atomic_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic_agx", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMIC_OP, SIGN_EXTEND]) -intrinsic("global_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) +intrinsic("global_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic_pco", src_comp=[3], dest_comp=1, indices=[ATOMIC_OP], bit_sizes=[32]) intrinsic("deref_atomic_swap", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) @@ -1920,15 +1921,15 @@ load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flag # src[] = { value, address, unsigned 32-bit offset }. store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRITE_MASK]) -# src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given, -# signed otherwise. +# src[] = { address, uniform_address }. BASE is a 24 bit unsigned offset if a constant 0 address and +# a constant 0 uniform_address is given, signed otherwise. # load_global_nv has an additional boolean input that makes the load return 0 on false. 
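+# With the uniform address added, the resulting source layouts are, e.g.:
+#   load_global_nv:   src[] = { address, uniform_address, predicate }
+#   store_global_nv:  src[] = { value, address, uniform_address }
+#   global_atomic_nv: src[] = { address, uniform_address, data }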
-load("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) -store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) -load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) -store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET]) -load("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) -store("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) +load("global_nv", [1, 1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +store("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) +load("scratch_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +store("scratch_nv", [1, 1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET]) +load("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +store("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) # Same as shared_atomic_add, but with GDS. src[] = {store_val, gds_addr, m0} intrinsic("gds_atomic_add_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE]) @@ -2942,7 +2943,8 @@ intrinsic("ssa_bar_nv", src_comp=[1]) intrinsic("cmat_muladd_nv", src_comp=[-1, -1, -1], dest_comp=0, bit_sizes=src2, indices=[FLAGS], flags=[CAN_ELIMINATE]) -intrinsic("cmat_load_shared_nv", src_comp=[1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE]) +# src[] = { address, uniform_address } +intrinsic("cmat_load_shared_nv", src_comp=[1, 1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE]) # Moves a 8x8 16bit matrix with transposition within a subgroup intrinsic("cmat_mov_transpose_nv", src_comp=[2], dest_comp=2, bit_sizes=[16], flags=[CAN_ELIMINATE, CAN_REORDER, SUBGROUP]) diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 86f7a3591fd..4ae7cc7ca1a 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -1106,6 +1106,39 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr) case nir_intrinsic_bindless_image_##name: \ case nir_intrinsic_image_heap_##name +/** + * Return the uniform offset source number for a load/store intrinsic or -1 if there's no offset. + */ +int +nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_cmat_load_shared_nv: + case nir_intrinsic_global_atomic_nv: + case nir_intrinsic_load_global_nv: + case nir_intrinsic_load_scratch_nv: + case nir_intrinsic_load_shared_nv: + case nir_intrinsic_shared_atomic_nv: + return 1; + case nir_intrinsic_store_global_nv: + case nir_intrinsic_store_scratch_nv: + case nir_intrinsic_store_shared_nv: + return 2; + default: + return -1; + } +} + +/** + * Return the uniform offset source for a load/store intrinsic. + */ +nir_src * +nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr) +{ + const int idx = nir_get_io_uniform_offset_src_number(instr); + return idx >= 0 ? &instr->src[idx] : NULL; +} + /** * Return the index or handle source number for a load/store intrinsic or -1 * if there's no index or handle. 
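
Note (illustration only, not part of the patch): a backend pass can fetch the new
uniform source the same way it already fetches the regular offset. The helper names
come from this series; the caller below is a hypothetical sketch.

   static bool
   addr_sources_are_const(nir_intrinsic_instr *intr)
   {
      nir_src *off = nir_get_io_offset_src(intr);
      nir_src *uoff = nir_get_io_uniform_offset_src(intr);

      if (off == NULL || !nir_src_is_const(*off))
         return false;

      /* Intrinsics without a uniform address slot return NULL here; treat
       * that the same as a constant-zero uniform source. */
      return uoff == NULL || nir_src_is_const(*uoff);
   }
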
diff --git a/src/compiler/nir/nir_opt_offsets.c b/src/compiler/nir/nir_opt_offsets.c index 5e53ac297c2..70a47461b84 100644 --- a/src/compiler/nir/nir_opt_offsets.c +++ b/src/compiler/nir/nir_opt_offsets.c @@ -193,11 +193,12 @@ try_fold_load_store_nv(nir_builder *b, assert(offset_idx >= 0); nir_src src = intrin->src[offset_idx]; + nir_src *uniform_src = nir_get_io_uniform_offset_src(intrin); int32_t min = 0; uint32_t max = BITFIELD_MASK(offset_bits); - if (!nir_src_is_const(src)) { + if (!nir_src_is_const(src) || (uniform_src && !nir_src_is_const(*uniform_src))) { max >>= 1; min = ~max; } @@ -211,6 +212,11 @@ try_fold_load_store_nv(nir_builder *b, return false; } + /* We don't try to fold the offset for the uniform source on purpose, + * because we rely on running nir_opt_offsets before moving in the uniform + * source. However, we might run this pass again _after_ that, because we + * can eliminate a u2u64 on the _non uniform_ source and therefore might be + * able to fold in more constants into base. */ return try_fold_load_store(b, intrin, state, offset_idx, min, max, false); } diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index 20116c51a2e..59d9e9a92dc 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -761,9 +761,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) case nir_intrinsic_vild_nv: { int base = nir_intrinsic_base(instr); nir_src src = *nir_get_io_offset_src(instr); + nir_src *uniform_src = nir_get_io_uniform_offset_src(instr); unsigned const_bits = nir_get_io_base_size_nv(instr); - if (nir_src_is_const(src) && nir_src_as_int(src) == 0) { + if (nir_src_is_const(src) && nir_src_as_int(src) == 0 && + (!uniform_src || (nir_src_is_const(*uniform_src) && nir_src_as_int(*uniform_src) == 0))) { validate_assert(state, base >= 0 && base < BITFIELD_MASK(const_bits)); } else { int32_t max = BITFIELD_MASK(const_bits - 1); @@ -771,8 +773,14 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) validate_assert(state, base >= min && base < max); } + if (uniform_src) { + validate_assert(state, uniform_src->ssa->bit_size >= src.ssa->bit_size); + if (state->impl->valid_metadata & nir_metadata_divergence) + validate_assert(state, !uniform_src->ssa->divergent); + } + if (instr->intrinsic == nir_intrinsic_load_global_nv) { - validate_assert(state, instr->src[1].ssa->bit_size == 1); + validate_assert(state, instr->src[2].ssa->bit_size == 1); } break; diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index a7665ed4ec8..fdc457b8685 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -2975,7 +2975,8 @@ impl<'a> ShaderFromNir<'a> { nir_intrinsic_global_atomic_nv => { let bit_size = intrin.def.bit_size(); let addr = self.get_src(&srcs[0]); - let data = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[1]); + let data = self.get_src(&srcs[2]); let atom_type = self.get_atomic_type(intrin); let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate); @@ -2992,7 +2993,7 @@ impl<'a> ShaderFromNir<'a> { dst.clone().into() }, addr: addr, - uniform_address: Src::ZERO, + uniform_address: uaddr, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3220,13 +3221,14 @@ impl<'a> ShaderFromNir<'a> { .get_eviction_priority(intrin.access()), }; let addr = self.get_src(&srcs[0]); - let pred = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[1]); + let pred = self.get_src(&srcs[2]); let dst = 
b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, pred: pred, offset: intrin.base(), stride: OffsetStride::X1, @@ -3333,12 +3335,13 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[0]); + let uaddr = self.get_src(&srcs[1]); let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, pred: true.into(), offset: intrin.base(), stride: OffsetStride::X1, @@ -3357,12 +3360,14 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[0]); + let uaddr = self.get_src(&srcs[1]); + let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, pred: true.into(), offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), @@ -3673,7 +3678,8 @@ impl<'a> ShaderFromNir<'a> { nir_intrinsic_shared_atomic_nv => { let bit_size = intrin.def.bit_size(); let addr = self.get_src(&srcs[0]); - let data = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[1]); + let data = self.get_src(&srcs[2]); let atom_type = self.get_atomic_type(intrin); let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate); @@ -3683,7 +3689,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpAtom { dst: dst.clone().into(), addr: addr, - uniform_address: Src::ZERO, + uniform_address: uaddr, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3740,10 +3746,11 @@ impl<'a> ShaderFromNir<'a> { .get_eviction_priority(intrin.access()), }; let addr = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[2]); b.push_op(OpSt { addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3772,10 +3779,11 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[2]); b.push_op(OpSt { addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3794,10 +3802,11 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[2]); b.push_op(OpSt { addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, data: data, offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), @@ -3912,12 +3921,13 @@ impl<'a> ShaderFromNir<'a> { }; let dst = b.alloc_ssa_vec(RegFile::GPR, comps); let addr = self.get_src(&srcs[0]); + let uaddr = self.get_src(&srcs[1]); b.push_op(OpLdsm { dst: dst.clone().into(), mat_size, mat_count, addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, offset: intrin.base(), }); self.set_dst(&intrin.def, dst); diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c index 9c129859c63..c6a85525473 100644 --- a/src/nouveau/compiler/nak_nir.c +++ b/src/nouveau/compiler/nak_nir.c @@ -1019,8 +1019,23 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) continue; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_src *addr; + + switch (intr->intrinsic) { + case nir_intrinsic_load_global_bounded: + case nir_intrinsic_load_global_constant_bounded: { + addr = &intr->src[0]; + break; + } + default: + addr = 
nir_get_io_offset_src(intr); + break; + } + if (!addr) + continue; + b.cursor = nir_before_instr(instr); - nir_src *addr = nir_get_io_offset_src(intr); + nir_def *uaddr = nir_imm_zero(&b, 1, addr->ssa->bit_size); nir_def *res = NULL; nir_intrinsic_instr *new = NULL; @@ -1028,7 +1043,7 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) case nir_intrinsic_load_global: case nir_intrinsic_load_global_constant: { nir_def *nir_true = nir_imm_bool(&b, true); - res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true); + res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr, nir_true); break; } case nir_intrinsic_load_global_bounded: @@ -1044,32 +1059,32 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) nir_def *addr = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa)); nir_def *last_byte = nir_iadd_imm(&b, offset->ssa, load_size - 1); nir_def *cond = nir_ult(&b, last_byte, size->ssa); - res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, cond); + res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, uaddr, cond); break; } case nir_intrinsic_load_scratch: - res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa); + res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr); break; case nir_intrinsic_load_shared: - res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa); + res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr); break; case nir_intrinsic_store_global: - new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa); + new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa, uaddr); break; case nir_intrinsic_store_scratch: - new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa); + new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa, uaddr); break; case nir_intrinsic_store_shared: - new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa); + new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa, uaddr); break; case nir_intrinsic_global_atomic: - res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa); + res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa); break; case nir_intrinsic_global_atomic_swap: res = nir_global_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa); break; case nir_intrinsic_shared_atomic: - res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa); + res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa); break; case nir_intrinsic_shared_atomic_swap: res = nir_shared_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa); diff --git a/src/nouveau/compiler/nak_nir_lower_cmat.c b/src/nouveau/compiler/nak_nir_lower_cmat.c index 0490d21fd7b..e5c10171734 100644 --- a/src/nouveau/compiler/nak_nir_lower_cmat.c +++ b/src/nouveau/compiler/nak_nir_lower_cmat.c @@ -723,6 +723,7 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr) nir_def *base = intr->src[1].ssa; offset = nir_u2uN(b, offset, base->bit_size); nir_def *addr = nir_iadd(b, base, offset); + nir_def *zero = nir_imm_zero(b, addr->num_components, addr->bit_size); /* flip the layout for B matrices */ if (desc.use == GLSL_CMAT_USE_B) { @@ -734,7 +735,7 @@ 
try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr) /* Each thread loads 32 bits per matrix */ assert(length * bit_size == 32 * ldsm_count); - return nir_cmat_load_shared_nv(b, length, bit_size, addr, + return nir_cmat_load_shared_nv(b, length, bit_size, addr, zero, .num_matrices = ldsm_count, .matrix_layout = layout); } diff --git a/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c b/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c index 10507233910..7fd64e13b98 100644 --- a/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c +++ b/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c @@ -56,10 +56,12 @@ lower_ldcx_to_global(nir_builder *b, nir_intrinsic_instr *load, * simple less-than check here. */ nir_def *cond = nir_ilt(b, offset, size); + nir_def *zero_addr = nir_imm_zero(b, addr->num_components, + addr->bit_size); nir_def *val = nir_load_global_nv(b, load->def.num_components, load->def.bit_size, nir_iadd(b, addr, nir_u2u64(b, offset)), - cond, + zero_addr, cond, .align_mul = nir_intrinsic_align_mul(load), .align_offset = nir_intrinsic_align_offset(load), .access = ACCESS_CAN_REORDER, From eeadd23c091f4eb64a386b5eca6c9389ae7616d3 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Tue, 24 Feb 2026 04:32:57 +0100 Subject: [PATCH 6/7] nak: add UGPR/GPR lowering for load/store/atom instructions This tries to handle all combinations we might run into to. We should rely on previous optimizations that the more difficult cases never happend. As a side benefit instead of lowering a UGPR to a GPR, it will now be moved to the UGPR slot. Totals from 258010 (21.27% of 1212873) affected shaders: CodeSize: 3742700224 -> 3576740928 (-4.43%); split: -4.44%, +0.01% Number of GPRs: 13606055 -> 13496463 (-0.81%); split: -0.86%, +0.05% SLM Size: 589740 -> 589660 (-0.01%) Static cycle count: 3271547493 -> 3272550831 (+0.03%); split: -0.47%, +0.50% Spills to memory: 56180 -> 56136 (-0.08%) Fills from memory: 56180 -> 56136 (-0.08%) Spills to reg: 108211 -> 110013 (+1.67%); split: -0.63%, +2.30% Fills from reg: 99216 -> 100471 (+1.26%); split: -0.30%, +1.56% Max warps/SM: 9921228 -> 9972060 (+0.51%); split: +0.52%, -0.00% --- src/nouveau/compiler/nak/ir.rs | 11 +++ src/nouveau/compiler/nak/sm70_encode.rs | 90 +++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 5 deletions(-) diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 78febca91c6..f8756d4a80b 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -6468,6 +6468,17 @@ pub enum OffsetStride { X16 = 4, } +impl OffsetStride { + pub fn shift(&self) -> u32 { + match self { + Self::X1 => 0, + Self::X4 => 2, + Self::X8 => 3, + Self::X16 => 4, + } + } +} + impl fmt::Display for OffsetStride { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let s = match self { diff --git a/src/nouveau/compiler/nak/sm70_encode.rs b/src/nouveau/compiler/nak/sm70_encode.rs index 07736e039b6..1fd267c256e 100644 --- a/src/nouveau/compiler/nak/sm70_encode.rs +++ b/src/nouveau/compiler/nak/sm70_encode.rs @@ -10,6 +10,7 @@ use crate::sm70::ShaderModel70; use bitview::*; use rustc_hash::FxHashMap; +use std::mem; use std::ops::Range; /// A per-op trait that implements Volta+ opcode semantics @@ -774,6 +775,60 @@ fn op_gpr(op: &impl DstsAsSlice) -> RegFile { } } +fn legalize_load_store_address( + b: &mut LegalizeBuilder, + addr: &mut Src, + uniform_addr: &mut Src, + stride: Option<&mut OffsetStride>, +) { + let stride_x1_or_none = matches!(stride, Some(OffsetStride::X1) | None); 
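+
+    // Summary of the cases handled below: a UGPR sitting in the GPR address
+    // slot is moved into the free uniform slot (or copied to a GPR when the
+    // uniform slot is taken or a stride is involved), while a GPR sitting in
+    // the uniform slot is either moved back to an empty GPR slot or folded
+    // into the GPR address with an explicit shift/iadd so that only the GPR
+    // slot ends up being used.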
+ if addr.is_ugpr_reg() { + if stride_x1_or_none && uniform_addr.is_zero() { + *uniform_addr = mem::replace(addr, Src::ZERO); + } else { + b.copy_src_if_uniform(addr); + } + } + + if uniform_addr.is_gpr_reg() { + if addr.is_zero() { + assert!(stride_x1_or_none); + *addr = mem::replace(uniform_addr, Src::ZERO); + } else { + let uniform_ssa = uniform_addr.as_ssa().unwrap(); + let mut ssa = addr.as_ssa().unwrap(); + + let addr_comps = ssa.comps(); + if let Some(stride) = stride { + if *stride != OffsetStride::X1 { + assert_eq!(addr_comps, 1); + let shift = stride.shift(); + let shift = b.copy(shift.into()); + *addr = b.shl(addr.clone(), shift.into()).into(); + ssa = addr.as_ssa().unwrap(); + *stride = OffsetStride::X1; + } + } + + if uniform_ssa.comps() == 2 { + // In case the non uniform address is 32 bits and the uniform one 64, + // we need convert it to 64 bits. + if uniform_ssa.comps() != addr_comps { + let zero = b.copy(0.into()); + *addr = [ssa[0], zero].into(); + } + *addr = b + .iadd64(addr.clone(), uniform_addr.clone(), Src::ZERO) + .into() + } else { + *addr = + b.iadd(addr.clone(), uniform_addr.clone(), Src::ZERO).into() + } + *uniform_addr = 0.into(); + } + } +} + // // Implementations of SM70Op for each op we support on Volta+ // @@ -3165,7 +3220,12 @@ impl SM70Op for OpSuAtom { impl SM70Op for OpLd { fn legalize(&mut self, b: &mut LegalizeBuilder) { - b.copy_src_if_uniform(&mut self.addr); + legalize_load_store_address( + b, + &mut self.addr, + &mut self.uniform_addr, + Some(&mut self.stride), + ); b.copy_src_if_uniform(&mut self.pred); } @@ -3327,8 +3387,13 @@ impl SM70Op for OpLdc { impl SM70Op for OpSt { fn legalize(&mut self, b: &mut LegalizeBuilder) { - b.copy_src_if_uniform(&mut self.addr); b.copy_src_if_uniform(&mut self.data); + legalize_load_store_address( + b, + &mut self.addr, + &mut self.uniform_addr, + Some(&mut self.stride), + ); } fn encode(&self, e: &mut SM70Encoder<'_>) { @@ -3461,9 +3526,19 @@ impl SM70Encoder<'_> { impl SM70Op for OpAtom { fn legalize(&mut self, b: &mut LegalizeBuilder) { - b.copy_src_if_uniform(&mut self.addr); - b.copy_src_if_uniform(&mut self.cmpr); b.copy_src_if_uniform(&mut self.data); + + if matches!(self.atom_op, AtomOp::CmpExch(_)) { + b.copy_src_if_uniform(&mut self.addr); + b.copy_src_if_uniform(&mut self.cmpr); + } else { + legalize_load_store_address( + b, + &mut self.addr, + &mut self.uniform_address, + Some(&mut self.addr_stride), + ); + } } fn encode(&self, e: &mut SM70Encoder<'_>) { @@ -4291,7 +4366,12 @@ impl SM70Op for OpHmma { impl SM70Op for OpLdsm { fn legalize(&mut self, b: &mut LegalizeBuilder) { - b.copy_src_if_uniform(&mut self.addr); + legalize_load_store_address( + b, + &mut self.addr, + &mut self.uniform_addr, + None, + ); } fn encode(&self, e: &mut SM70Encoder<'_>) { From 0b4705ec956b69cc5fdf6c66200987e5992a3bed Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 15 Mar 2026 21:16:30 +0100 Subject: [PATCH 7/7] nak: optimize iadds with an uniform operand in iadds of address calculations Instead of doing the iadd manually we can use the uniform slot of the ld/st/atom instruction getting rid of the iadd altogether. Additionally for global memory we can also consume a 32 bit offset instead of requiring it to be 64 bit. 
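
For illustration (hypothetical nir_builder snippets, not taken from the series;
div_off stands for a divergent 32-bit offset and ugpr_base for a convergent
64-bit base), the address reaching the intrinsic before this pass is a single
iadd with an all-zero uniform source:

   nir_def *addr = nir_iadd(b, nir_u2u64(b, div_off), ugpr_base);
   nir_load_global_nv(b, 1, 32, addr, nir_imm_zero(b, 1, 64), nir_imm_true(b));

and afterwards the uniform term sits in the dedicated source while the iadd (and
the u2u64 on the divergent side) can go away:

   nir_load_global_nv(b, 1, 32, div_off, ugpr_base, nir_imm_true(b));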
Totals from 158539 (13.07% of 1212873) affected shaders: CodeSize: 2308216336 -> 2242231136 (-2.86%); split: -2.86%, +0.00% Number of GPRs: 8682436 -> 8662675 (-0.23%); split: -0.26%, +0.04% SLM Size: 238816 -> 238604 (-0.09%) Static cycle count: 2169063422 -> 2147747544 (-0.98%); split: -0.99%, +0.01% Spills to memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02% Fills from memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02% Spills to reg: 45053 -> 45273 (+0.49%); split: -0.04%, +0.53% Fills from reg: 36385 -> 36757 (+1.02%); split: -0.04%, +1.06% Max warps/SM: 6027232 -> 6034616 (+0.12%); split: +0.12%, -0.00% --- src/nouveau/compiler/nak_nir.c | 113 +++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c index c6a85525473..372000f67d2 100644 --- a/src/nouveau/compiler/nak_nir.c +++ b/src/nouveau/compiler/nak_nir.c @@ -1130,6 +1130,113 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) return progress; } +static bool +is_divergent_phi(nir_instr *instr) +{ + if (instr->type != nir_instr_type_phi) + return false; + nir_phi_instr *phi = nir_instr_as_phi(instr); + return nak_nir_phi_is_divergent(phi); +} + +static bool +nak_nir_opt_uniform_address_impl(struct nir_builder *b, + nir_intrinsic_instr *intr, void *cb_data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_cmat_load_shared_nv: + case nir_intrinsic_global_atomic_nv: + case nir_intrinsic_load_global_nv: + case nir_intrinsic_load_scratch_nv: + case nir_intrinsic_load_shared_nv: + case nir_intrinsic_shared_atomic_nv: + case nir_intrinsic_store_global_nv: + case nir_intrinsic_store_scratch_nv: + case nir_intrinsic_store_shared_nv: { + nir_src *offset_src = nir_get_io_offset_src(intr); + nir_def *offset = offset_src->ssa; + nir_src *uniform_offset_src = nir_get_io_uniform_offset_src(intr); + nir_def *uniform_offset = uniform_offset_src->ssa; + nir_block *use_block = intr->instr.block; + + assert(nir_src_as_uint(*uniform_offset_src) == 0); + + /* Nak can't collect vectors in non uniform control flow, so don't + * even try */ + if (offset->bit_size == 64 && nak_block_is_divergent(use_block)) + return false; + + /* We ignore any constant offset */ + if (nir_src_is_const(*offset_src)) + return false; + + /* If the source is already uniform, just swap them as the uniform slot + * should be 0 */ + if (!nir_def_is_divergent_at_use_block(offset, use_block)) { + if (is_divergent_phi(nir_def_instr(offset))) + return false; + nir_src_rewrite(uniform_offset_src, offset); + nir_src_rewrite(offset_src, uniform_offset); + return true; + } + + nir_alu_instr *iadd = nir_def_as_alu_or_null(offset_src->ssa); + if (!iadd || iadd->op != nir_op_iadd) + return false; + + unsigned src0_div = nir_def_is_divergent_at_use_block(iadd->src[0].src.ssa, use_block); + unsigned src1_div = nir_def_is_divergent_at_use_block(iadd->src[1].src.ssa, use_block); + if (src0_div && src1_div) + return false; + + b->cursor = nir_before_instr(&intr->instr); + + nir_def *addr, *uaddr; + if (src0_div) { + assert(!src1_div); + addr = nir_ssa_for_alu_src(b, iadd, 0); + uaddr = nir_ssa_for_alu_src(b, iadd, 1); + } else { + assert(src1_div); + addr = nir_ssa_for_alu_src(b, iadd, 1); + uaddr = nir_ssa_for_alu_src(b, iadd, 0); + } + + if (is_divergent_phi(nir_def_instr(uaddr))) + return false; + + /* We can remove a u2u64 on the non uniform src */ + if (addr->bit_size == 64) { + nir_alu_instr *u2u64 = nir_def_as_alu_or_null(addr); + if (u2u64 && u2u64->op == 
nir_op_u2u64)
+            addr = nir_ssa_for_alu_src(b, u2u64, 0);
+      }
+
+      nir_src_rewrite(offset_src, addr);
+      nir_src_rewrite(uniform_offset_src, uaddr);
+      return true;
+   }
+   default:
+      return false;
+   }
+}
+
+/** This pass assumes it is run after nir_opt_offsets. */
+static bool
+nak_nir_opt_uniform_address(nir_shader *nir)
+{
+   if (nak_debug_no_ugpr())
+      return false;
+   nir_divergence_analysis(nir);
+   return nir_shader_intrinsics_pass(
+      nir,
+      nak_nir_opt_uniform_address_impl,
+      nir_metadata_control_flow,
+      NULL
+   );
+}
+
+
 static bool
 nak_nir_opt_offset_shift_nv_impl(struct nir_builder *b,
                                  nir_intrinsic_instr *intrin, void *data)
@@ -1333,6 +1440,12 @@ nak_postprocess_nir(nir_shader *nir,
       .cb_data = nak,
    };
    OPT(nir, nir_opt_offsets, &nak_offset_options);
+   if (nak->sm >= 73) {
+      OPT(nir, nak_nir_opt_uniform_address);
+      /* TODO: as we eliminate u2u64s we could fold more offsets in; however,
+       * this would require us to verify it doesn't overflow, which we can't. */
+      /* OPT(nir, nir_opt_offsets, &nak_offset_options); */
+   }
 
    /* Should run after nir_opt_offsets, because nir_opt_algebraic will move
     * iadds down the chain */
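
Note (illustration only, not part of the series; register numbers made up): taken
together, these patches let an address of the form iadd(divergent GPR, uniform
UGPR) be folded straight into the memory instruction. Going by the expected
strings in nvdisasm_tests.rs, a predicated global load then disassembles to
something along the lines of

   ldg.e.strong.sm r0, [r2.64+ur4+0x10], p4

instead of first copying the uniform value into a GPR pair and adding it to the
divergent address with a separate instruction.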