From ab84cf11c7b180232c089cc21e851c6ab7142385 Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Tue, 11 Jun 2024 10:14:28 -0500 Subject: [PATCH] nak/copy_prop: Don't propagate bindless cbufs into non-uniform blocks We can propagate within a non-uniform block just fine but not across them because that might change live registers in unpredictable ways. The real boundary here is that we can't propagate across an OpPin but that's a lot harder to express. Part-of: --- src/nouveau/compiler/nak/opt_copy_prop.rs | 199 +++++++++++++++------- 1 file changed, 138 insertions(+), 61 deletions(-) diff --git a/src/nouveau/compiler/nak/opt_copy_prop.rs b/src/nouveau/compiler/nak/opt_copy_prop.rs index 645aa6819fc..9fd4ae72270 100644 --- a/src/nouveau/compiler/nak/opt_copy_prop.rs +++ b/src/nouveau/compiler/nak/opt_copy_prop.rs @@ -5,12 +5,38 @@ use crate::ir::*; use std::collections::HashMap; +enum CBufRule { + Yes, + No, + BindlessRequiresBlock(usize), +} + +impl CBufRule { + fn allows_src(&self, src_bi: usize, src: &Src) -> bool { + let SrcRef::CBuf(cb) = &src.src_ref else { + return true; + }; + + match self { + CBufRule::Yes => true, + CBufRule::No => false, + CBufRule::BindlessRequiresBlock(bi) => match cb.buf { + CBuf::Binding(_) => true, + CBuf::BindlessSSA(_) => src_bi == *bi, + CBuf::BindlessUGPR(_) => panic!("Not in SSA form"), + }, + } + } +} + struct CopyEntry { + bi: usize, src_type: SrcType, src: Src, } struct PrmtEntry { + bi: usize, srcs: [Src; 2], selection: u16, } @@ -31,27 +57,45 @@ impl CopyPropPass { } } - fn add_copy(&mut self, dst: SSAValue, src_type: SrcType, src: Src) { + fn add_copy( + &mut self, + bi: usize, + dst: SSAValue, + src_type: SrcType, + src: Src, + ) { assert!(src.src_ref.get_reg().is_none()); self.ssa_map - .insert(dst, CopyPropEntry::Copy(CopyEntry { src_type, src })); + .insert(dst, CopyPropEntry::Copy(CopyEntry { bi, src_type, src })); } - fn add_prmt(&mut self, dst: SSAValue, srcs: [Src; 2], selection: u16) { + fn add_prmt( + &mut self, + bi: usize, + dst: SSAValue, + srcs: [Src; 2], + selection: u16, + ) { assert!( srcs[0].src_ref.get_reg().is_none() && srcs[1].src_ref.get_reg().is_none() ); - self.ssa_map - .insert(dst, CopyPropEntry::Prmt(PrmtEntry { srcs, selection })); + self.ssa_map.insert( + dst, + CopyPropEntry::Prmt(PrmtEntry { + bi, + srcs, + selection, + }), + ); } - fn add_fp64_copy(&mut self, dst: &SSARef, src: Src) { + fn add_fp64_copy(&mut self, bi: usize, dst: &SSARef, src: Src) { assert!(dst.comps() == 2); match src.src_ref { SrcRef::Zero | SrcRef::Imm32(_) => { - self.add_copy(dst[0], SrcType::ALU, Src::new_zero()); - self.add_copy(dst[1], SrcType::F64, src); + self.add_copy(bi, dst[0], SrcType::ALU, Src::new_zero()); + self.add_copy(bi, dst[1], SrcType::F64, src); } SrcRef::CBuf(cb) => { let lo32 = Src::from(SrcRef::CBuf(cb)); @@ -60,8 +104,8 @@ impl CopyPropPass { src_mod: src.src_mod, src_swizzle: src.src_swizzle, }; - self.add_copy(dst[0], SrcType::ALU, lo32); - self.add_copy(dst[1], SrcType::F64, hi32); + self.add_copy(bi, dst[0], SrcType::ALU, lo32); + self.add_copy(bi, dst[1], SrcType::F64, hi32); } SrcRef::SSA(ssa) => { assert!(ssa.comps() == 2); @@ -71,8 +115,8 @@ impl CopyPropPass { src_mod: src.src_mod, src_swizzle: src.src_swizzle, }; - self.add_copy(dst[0], SrcType::ALU, lo32); - self.add_copy(dst[1], SrcType::F64, hi32); + self.add_copy(bi, dst[0], SrcType::ALU, lo32); + self.add_copy(bi, dst[1], SrcType::F64, hi32); } _ => (), } @@ -185,7 +229,7 @@ impl CopyPropPass { fn prop_to_scalar_src( &self, src_type: SrcType, - allow_cbuf: bool, + cbuf_rule: &CBufRule, src: &mut Src, ) { loop { @@ -202,9 +246,7 @@ impl CopyPropPass { match entry { CopyPropEntry::Copy(entry) => { - if matches!(&entry.src.src_ref, SrcRef::CBuf(_)) - && !allow_cbuf - { + if !cbuf_rule.allows_src(entry.bi, &entry.src) { return; } @@ -259,9 +301,7 @@ impl CopyPropPass { let entry_src_idx = usize::from(entry_src_idx.unwrap()); let entry_src = entry.srcs[entry_src_idx]; - if matches!(&entry_src.src_ref, SrcRef::CBuf(_)) - && !allow_cbuf - { + if !cbuf_rule.allows_src(entry.bi, &entry_src) { return; } @@ -295,7 +335,7 @@ impl CopyPropPass { } } - fn prop_to_f64_src(&self, src: &mut Src) { + fn prop_to_f64_src(&self, cbuf_rule: &CBufRule, src: &mut Src) { loop { let src_ssa = match &mut src.src_ref { SrcRef::SSA(ssa) => ssa, @@ -349,6 +389,12 @@ impl CopyPropPass { return; } + if !cbuf_rule.allows_src(hi_entry.bi, &hi_entry.src) + || !cbuf_rule.allows_src(lo_entry.bi, &lo_entry.src) + { + return; + } + let new_src_ref = match hi_entry.src.src_ref { SrcRef::Zero => match lo_entry.src.src_ref { SrcRef::Zero | SrcRef::Imm32(0) => SrcRef::Zero, @@ -386,7 +432,12 @@ impl CopyPropPass { } } - fn prop_to_src(&self, src_type: SrcType, allow_cbuf: bool, src: &mut Src) { + fn prop_to_src( + &self, + src_type: SrcType, + cbuf_rule: &CBufRule, + src: &mut Src, + ) { match src_type { SrcType::SSA => { self.prop_to_ssa_src(src); @@ -401,17 +452,16 @@ impl CopyPropPass { | SrcType::I32 | SrcType::B32 | SrcType::Pred => { - self.prop_to_scalar_src(src_type, allow_cbuf, src); + self.prop_to_scalar_src(src_type, cbuf_rule, src); } SrcType::F64 => { - debug_assert!(allow_cbuf); - self.prop_to_f64_src(src); + self.prop_to_f64_src(cbuf_rule, src); } SrcType::Bar => (), } } - fn try_add_instr(&mut self, instr: &Instr) { + fn try_add_instr(&mut self, bi: usize, instr: &Instr) { match &instr.op { Op::HAdd2(add) => { let dst = add.dst.as_ssa().unwrap(); @@ -420,9 +470,9 @@ impl CopyPropPass { if !add.saturate { if add.srcs[0].is_fneg_zero(SrcType::F16v2) { - self.add_copy(dst, SrcType::F16v2, add.srcs[1]); + self.add_copy(bi, dst, SrcType::F16v2, add.srcs[1]); } else if add.srcs[1].is_fneg_zero(SrcType::F16v2) { - self.add_copy(dst, SrcType::F16v2, add.srcs[0]); + self.add_copy(bi, dst, SrcType::F16v2, add.srcs[0]); } } } @@ -433,18 +483,18 @@ impl CopyPropPass { if !add.saturate { if add.srcs[0].is_fneg_zero(SrcType::F32) { - self.add_copy(dst, SrcType::F32, add.srcs[1]); + self.add_copy(bi, dst, SrcType::F32, add.srcs[1]); } else if add.srcs[1].is_fneg_zero(SrcType::F32) { - self.add_copy(dst, SrcType::F32, add.srcs[0]); + self.add_copy(bi, dst, SrcType::F32, add.srcs[0]); } } } Op::DAdd(add) => { let dst = add.dst.as_ssa().unwrap(); if add.srcs[0].is_fneg_zero(SrcType::F64) { - self.add_fp64_copy(dst, add.srcs[1]); + self.add_fp64_copy(bi, dst, add.srcs[1]); } else if add.srcs[1].is_fneg_zero(SrcType::F64) { - self.add_fp64_copy(dst, add.srcs[0]); + self.add_fp64_copy(bi, dst, add.srcs[0]); } } Op::Lop3(lop) => { @@ -454,9 +504,10 @@ impl CopyPropPass { let op = lop.op; if op.lut == 0 { - self.add_copy(dst, SrcType::ALU, SrcRef::Zero.into()); + self.add_copy(bi, dst, SrcType::ALU, SrcRef::Zero.into()); } else if op.lut == !0 { self.add_copy( + bi, dst, SrcType::ALU, SrcRef::Imm32(u32::MAX).into(), @@ -464,7 +515,7 @@ impl CopyPropPass { } else { for s in 0..3 { if op.lut == LogicOp3::SRC_MASKS[s] { - self.add_copy(dst, SrcType::ALU, lop.srcs[s]); + self.add_copy(bi, dst, SrcType::ALU, lop.srcs[s]); } } } @@ -481,15 +532,31 @@ impl CopyPropPass { let op = lop.ops[i]; if op.lut == 0 { - self.add_copy(dst, SrcType::Pred, SrcRef::False.into()); + self.add_copy( + bi, + dst, + SrcType::Pred, + SrcRef::False.into(), + ); } else if op.lut == !0 { - self.add_copy(dst, SrcType::Pred, SrcRef::True.into()); + self.add_copy( + bi, + dst, + SrcType::Pred, + SrcRef::True.into(), + ); } else { for s in 0..3 { if op.lut == LogicOp3::SRC_MASKS[s] { - self.add_copy(dst, SrcType::Pred, lop.srcs[s]); + self.add_copy( + bi, + dst, + SrcType::Pred, + lop.srcs[s], + ); } else if op.lut == !LogicOp3::SRC_MASKS[s] { self.add_copy( + bi, dst, SrcType::Pred, lop.srcs[s].bnot(), @@ -506,9 +573,9 @@ impl CopyPropPass { if add.carry_in.is_zero() { if add.srcs[0].is_zero() { - self.add_copy(dst, SrcType::I32, add.srcs[1]); + self.add_copy(bi, dst, SrcType::I32, add.srcs[1]); } else if add.srcs[1].is_zero() { - self.add_copy(dst, SrcType::I32, add.srcs[0]); + self.add_copy(bi, dst, SrcType::I32, add.srcs[0]); } } } @@ -519,12 +586,12 @@ impl CopyPropPass { if add.srcs[0].is_zero() { if add.srcs[1].is_zero() { - self.add_copy(dst, SrcType::I32, add.srcs[2]); + self.add_copy(bi, dst, SrcType::I32, add.srcs[2]); } else if add.srcs[2].is_zero() { - self.add_copy(dst, SrcType::I32, add.srcs[1]); + self.add_copy(bi, dst, SrcType::I32, add.srcs[1]); } } else if add.srcs[1].is_zero() && add.srcs[2].is_zero() { - self.add_copy(dst, SrcType::I32, add.srcs[0]); + self.add_copy(bi, dst, SrcType::I32, add.srcs[0]); } } Op::Prmt(prmt) => { @@ -538,9 +605,9 @@ impl CopyPropPass { }; if sel == 0x3210 { - self.add_copy(dst[0], SrcType::GPR, prmt.srcs[0]); + self.add_copy(bi, dst[0], SrcType::GPR, prmt.srcs[0]); } else if sel == 0x7654 { - self.add_copy(dst[0], SrcType::GPR, prmt.srcs[1]); + self.add_copy(bi, dst[0], SrcType::GPR, prmt.srcs[1]); } else { let mut is_imm = true; let mut imm = 0_u32; @@ -559,9 +626,10 @@ impl CopyPropPass { } } if is_imm { - self.add_copy(dst[0], SrcType::GPR, imm.into()); + self.add_copy(bi, dst[0], SrcType::GPR, imm.into()); } else { self.add_prmt( + bi, dst[0], prmt.srcs, sel.try_into().unwrap(), @@ -574,19 +642,19 @@ impl CopyPropPass { if r2ur.src.is_uniform() { let dst = r2ur.dst.as_ssa().unwrap(); assert!(dst.comps() == 1); - self.add_copy(dst[0], SrcType::GPR, r2ur.src); + self.add_copy(bi, dst[0], SrcType::GPR, r2ur.src); } } Op::Copy(copy) => { let dst = copy.dst.as_ssa().unwrap(); assert!(dst.comps() == 1); - self.add_copy(dst[0], SrcType::GPR, copy.src); + self.add_copy(bi, dst[0], SrcType::GPR, copy.src); } Op::ParCopy(pcopy) => { for (dst, src) in pcopy.dsts_srcs.iter() { let dst = dst.as_ssa().unwrap(); assert!(dst.comps() == 1); - self.add_copy(dst[0], SrcType::GPR, *src); + self.add_copy(bi, dst[0], SrcType::GPR, *src); } } _ => (), @@ -594,25 +662,33 @@ impl CopyPropPass { } pub fn run(&mut self, f: &mut Function) { - for b in &mut f.blocks { + for (bi, b) in f.blocks.iter_mut().enumerate() { + let b_uniform = b.uniform; for instr in &mut b.instrs { - self.try_add_instr(instr); + self.try_add_instr(bi, instr); self.prop_to_pred(&mut instr.pred); - let allow_cbuf = !instr.is_uniform(); + let cbuf_rule = if instr.is_uniform() { + CBufRule::No + } else if !b_uniform { + CBufRule::BindlessRequiresBlock(bi) + } else { + CBufRule::Yes + }; match &mut instr.op { Op::IAdd2(add) => { // Carry-out interacts funny with SrcMod::INeg so we can // only propagate with modifiers if no carry is written. use SrcType::{ALU, I32}; + let [src0, src1] = &mut add.srcs; if add.carry_out.is_none() { - self.prop_to_src(I32, allow_cbuf, &mut add.srcs[0]); - self.prop_to_src(I32, allow_cbuf, &mut add.srcs[1]); + self.prop_to_src(I32, &cbuf_rule, src0); + self.prop_to_src(I32, &cbuf_rule, src1); } else { - self.prop_to_src(ALU, allow_cbuf, &mut add.srcs[0]); - self.prop_to_src(ALU, allow_cbuf, &mut add.srcs[1]); + self.prop_to_src(ALU, &cbuf_rule, src0); + self.prop_to_src(ALU, &cbuf_rule, src1); } } Op::IAdd3(add) => { @@ -620,23 +696,24 @@ impl CopyPropPass { // only propagate with modifiers if no overflow values // are written. use SrcType::{ALU, I32}; + let [src0, src1, src2] = &mut add.srcs; if add.overflow[0].is_none() && add.overflow[0].is_none() { - self.prop_to_src(I32, allow_cbuf, &mut add.srcs[0]); - self.prop_to_src(I32, allow_cbuf, &mut add.srcs[1]); - self.prop_to_src(I32, allow_cbuf, &mut add.srcs[2]); + self.prop_to_src(I32, &cbuf_rule, src0); + self.prop_to_src(I32, &cbuf_rule, src1); + self.prop_to_src(I32, &cbuf_rule, src2); } else { - self.prop_to_src(ALU, allow_cbuf, &mut add.srcs[0]); - self.prop_to_src(ALU, allow_cbuf, &mut add.srcs[1]); - self.prop_to_src(ALU, allow_cbuf, &mut add.srcs[2]); + self.prop_to_src(ALU, &cbuf_rule, src0); + self.prop_to_src(ALU, &cbuf_rule, src1); + self.prop_to_src(ALU, &cbuf_rule, src2); } } _ => { let src_types = instr.src_types(); for (i, src) in instr.srcs_mut().iter_mut().enumerate() { - self.prop_to_src(src_types[i], allow_cbuf, src); + self.prop_to_src(src_types[i], &cbuf_rule, src); } } }