diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 2afd086a5fa..9e22044e999 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1845,7 +1845,8 @@ store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRIT # src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given, # signed otherwise. -load("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +# load_global_nv takes src[] = { address, predicate }: the predicate is a 1-bit boolean, and the load returns 0 when it is false. +load("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET]) diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index 8b86da89e12..52b0809bfb9 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -756,6 +756,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) int32_t min = ~BITFIELD_MASK(const_bits - 1); validate_assert(state, base >= min && base < max); } + + if (instr->intrinsic == nir_intrinsic_load_global_nv) { + validate_assert(state, instr->src[1].ssa->bit_size == 1); + } + break; } diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index e02326b58d9..bdec2e0fb58 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -3071,11 +3071,13 @@ impl<'a> ShaderFromNir<'a> { .get_eviction_priority(intrin.access()), }; let addr = self.get_src(&srcs[0]); + let pred = self.get_src(&srcs[1]); let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, + pred: pred, offset: intrin.base(), 
stride: OffsetStride::X1, access: access, @@ -3186,6 +3188,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpLd { dst: dst.clone().into(), addr: addr, + pred: true.into(), offset: intrin.base(), stride: OffsetStride::X1, access: access, @@ -3208,6 +3211,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpLd { dst: dst.clone().into(), addr: addr, + pred: true.into(), offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), access: access, diff --git a/src/nouveau/compiler/nak/hw_tests.rs b/src/nouveau/compiler/nak/hw_tests.rs index 76310559a8e..dce2aa6d9e8 100644 --- a/src/nouveau/compiler/nak/hw_tests.rs +++ b/src/nouveau/compiler/nak/hw_tests.rs @@ -154,6 +154,7 @@ impl<'a> TestShaderBuilder<'a> { self.push_op(OpLd { dst: dst.clone().into(), addr: self.data_addr.clone().into(), + pred: true.into(), offset: offset.into(), access: access, stride: OffsetStride::X1, diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 9159aa8eb65..0ab8d7cc28b 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -6459,6 +6459,10 @@ pub struct OpLd { #[src_type(GPR)] pub addr: Src, + /// On false the load returns 0 + #[src_type(Pred)] + pub pred: Src, + pub offset: i32, pub stride: OffsetStride, pub access: MemAccess, @@ -6470,7 +6474,7 @@ impl DisplayOp for OpLd { if self.offset > 0 { write!(f, "+{:#x}", self.offset)?; } - write!(f, "]") + write!(f, "], {}", self.pred) } } impl_display_for_op!(OpLd); diff --git a/src/nouveau/compiler/nak/lower_copy_swap.rs b/src/nouveau/compiler/nak/lower_copy_swap.rs index b63ba42e47d..19e7f7178fe 100644 --- a/src/nouveau/compiler/nak/lower_copy_swap.rs +++ b/src/nouveau/compiler/nak/lower_copy_swap.rs @@ -95,6 +95,7 @@ impl LowerCopySwap { b.push_op(OpLd { dst: copy.dst, addr: Src::ZERO, + pred: true.into(), offset: addr.try_into().unwrap(), stride: OffsetStride::X1, access: access, diff --git a/src/nouveau/compiler/nak/nvdisasm_tests.rs 
b/src/nouveau/compiler/nak/nvdisasm_tests.rs index ccb32801d38..8d574383d7c 100644 --- a/src/nouveau/compiler/nak/nvdisasm_tests.rs +++ b/src/nouveau/compiler/nak/nvdisasm_tests.rs @@ -291,6 +291,7 @@ pub fn test_ld_st_atom() { let r1 = RegRef::new(RegFile::GPR, 1, 1); let r2 = RegRef::new(RegFile::GPR, 2, 1); let r3 = RegRef::new(RegFile::GPR, 3, 1); + let p4 = RegRef::new(RegFile::Pred, 4, 1); let order = MemOrder::Strong(MemScope::CTA); @@ -338,11 +339,23 @@ pub fn test_ld_st_atom() { let instr = OpLd { dst: Dst::Reg(r0), addr: SrcRef::Reg(r1).into(), + pred: if matches!(space, MemSpace::Global(_)) + && sm >= 73 + { + SrcRef::Reg(p4).into() + } else { + true.into() + }, offset: addr_offset, access: access.clone(), stride: addr_stride, }; let expected = match space { + MemSpace::Global(_) if sm >= 73 => { + format!( + "ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}], p4;" + ) + } MemSpace::Global(_) => { format!( "ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}];" diff --git a/src/nouveau/compiler/nak/sm20.rs b/src/nouveau/compiler/nak/sm20.rs index f55065206b6..24c138e0a09 100644 --- a/src/nouveau/compiler/nak/sm20.rs +++ b/src/nouveau/compiler/nak/sm20.rs @@ -2310,6 +2310,7 @@ impl SM20Op for OpLd { fn encode(&self, e: &mut SM20Encoder<'_>) { assert_eq!(self.stride, OffsetStride::X1); + assert!(self.pred.is_true()); match self.access.space { MemSpace::Global(addr_type) => { e.set_opcode(SM20Unit::Mem, 0x20); diff --git a/src/nouveau/compiler/nak/sm32.rs b/src/nouveau/compiler/nak/sm32.rs index 13458b60adb..f5fadf1dccb 100644 --- a/src/nouveau/compiler/nak/sm32.rs +++ b/src/nouveau/compiler/nak/sm32.rs @@ -2550,6 +2550,7 @@ impl SM32Op for OpLd { fn encode(&self, e: &mut SM32Encoder<'_>) { assert_eq!(self.stride, OffsetStride::X1); + assert!(self.pred.is_true()); // Missing: // 0x7c8 for indirect const load match self.access.space { diff --git a/src/nouveau/compiler/nak/sm50.rs b/src/nouveau/compiler/nak/sm50.rs index cbd972f17df..f9aa923454d 100644 --- 
a/src/nouveau/compiler/nak/sm50.rs +++ b/src/nouveau/compiler/nak/sm50.rs @@ -2598,6 +2598,7 @@ impl SM50Op for OpLd { fn encode(&self, e: &mut SM50Encoder<'_>) { assert_eq!(self.stride, OffsetStride::X1); + assert!(self.pred.is_true()); e.set_opcode(match self.access.space { MemSpace::Global(_) => 0xeed0, MemSpace::Local => 0xef40, diff --git a/src/nouveau/compiler/nak/sm70_encode.rs b/src/nouveau/compiler/nak/sm70_encode.rs index 540b9fd659d..8b9a5c24b78 100644 --- a/src/nouveau/compiler/nak/sm70_encode.rs +++ b/src/nouveau/compiler/nak/sm70_encode.rs @@ -150,13 +150,13 @@ impl SM70Encoder<'_> { self.set_pred_src_file(range, not_bit, src, RegFile::UPred); } - fn set_rev_upred_src( + fn set_rev_pred_src_file( &mut self, range: Range, not_bit: usize, src: &Src, + file: RegFile, ) { - let file = RegFile::UPred; let (not, reg) = match src.src_ref { SrcRef::True => (false, self.true_reg(file)), SrcRef::False => (true, self.true_reg(file)), @@ -177,6 +177,24 @@ impl SM70Encoder<'_> { self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod)); } + fn set_rev_pred_src( + &mut self, + range: Range, + not_bit: usize, + src: &Src, + ) { + self.set_rev_pred_src_file(range, not_bit, src, RegFile::Pred) + } + + fn set_rev_upred_src( + &mut self, + range: Range, + not_bit: usize, + src: &Src, + ) { + self.set_rev_pred_src_file(range, not_bit, src, RegFile::UPred) + } + fn set_src_cb(&mut self, range: Range, cx_bit: usize, cb: &CBufRef) { let mut v = BitMutView::new_subset(self, range); v.set_field(6..22, cb.offset); @@ -3019,6 +3037,7 @@ impl SM70Op for OpSuAtom { impl SM70Op for OpLd { fn legalize(&mut self, b: &mut LegalizeBuilder) { b.copy_src_if_uniform(&mut self.addr); + b.copy_src_if_uniform(&mut self.pred); } fn encode(&self, e: &mut SM70Encoder<'_>) { @@ -3026,10 +3045,16 @@ impl SM70Op for OpLd { MemSpace::Global(_) => { e.set_opcode(0x381); assert_eq!(self.stride, OffsetStride::X1); + if e.sm >= 73 { + e.set_rev_pred_src(64..67, 67, &self.pred); + } else { + 
assert!(self.pred.is_true()); + } e.set_pred_dst(81..84, &Dst::None); e.set_mem_access(&self.access); } MemSpace::Local => { + assert!(self.pred.is_true()); assert_eq!(self.stride, OffsetStride::X1); e.set_opcode(0x983); e.set_field(84..87, 1_u8); @@ -3043,6 +3068,7 @@ impl SM70Op for OpLd { } MemSpace::Shared => { e.set_opcode(0x984); + assert!(self.pred.is_true()); e.set_mem_type(73..76, self.access.mem_type); assert!(self.access.order == MemOrder::Strong(MemScope::CTA)); diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c index 1bb8ae2517e..2bd45879170 100644 --- a/src/nouveau/compiler/nak_nir.c +++ b/src/nouveau/compiler/nak_nir.c @@ -1014,9 +1014,11 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) switch (intr->intrinsic) { case nir_intrinsic_load_global: - case nir_intrinsic_load_global_constant: - res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa); + case nir_intrinsic_load_global_constant: { + nir_def *nir_true = nir_imm_bool(&b, true); + res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true); break; + } case nir_intrinsic_load_scratch: res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa); break; diff --git a/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c b/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c index 4f240740272..04058ec07da 100644 --- a/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c +++ b/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c @@ -63,6 +63,7 @@ lower_ldcx_to_global(nir_builder *b, nir_intrinsic_instr *load, val = nir_load_global_nv(b, load->def.num_components, load->def.bit_size, nir_iadd(b, addr, nir_u2u64(b, offset)), + nir_imm_bool(b, true), .align_mul = nir_intrinsic_align_mul(load), .align_offset = nir_intrinsic_align_offset(load), .access = ACCESS_CAN_REORDER,