From 3c32ff7fa9dd30242f6342cfab0fb8e8c3455ac6 Mon Sep 17 00:00:00 2001 From: Mel Henning Date: Thu, 11 Sep 2025 00:36:38 -0400 Subject: [PATCH] nak: Place most Op structs in Box<> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Between this and the previous few commits, the Box<> has moved from outside of Instr to inside the Op enum. This provides a few benefits: 1. We no longer need to allocate for the worst-case Op on every instruction. For example, OpIAdd3X is 232 bytes and OpBra is 40 bytes, which means we can save some memory on OpBra if we only allocate those 40 bytes. 2. The Op discriminant is available without chasing a pointer. The type of op is probably the most frequently used field, and this should benefit most passes that care about what type of Op they're handling. 3. Small Ops don't need any indirection at all. For example, OpPBk is only 4 bytes, which means we can just store it directly in less space than a pointer. Compared to Box, this is around a 1.4% shader compile time improvement. Acked-by: Faith Ekstrand Reviewed-by: Seán de Búrca Part-of: --- src/nouveau/compiler/nak/assign_regs.rs | 8 +- src/nouveau/compiler/nak/from_nir.rs | 11 +- src/nouveau/compiler/nak/ir.rs | 251 ++++++++++--------- src/nouveau/compiler/nak/legalize.rs | 23 +- src/nouveau/compiler/nak/lower_copy_swap.rs | 6 +- src/nouveau/compiler/nak/lower_par_copies.rs | 2 +- src/nouveau/compiler/nak/opt_jump_thread.rs | 17 +- 7 files changed, 165 insertions(+), 153 deletions(-) diff --git a/src/nouveau/compiler/nak/assign_regs.rs b/src/nouveau/compiler/nak/assign_regs.rs index 1bef9a56b02..42d5c99ebbc 100644 --- a/src/nouveau/compiler/nak/assign_regs.rs +++ b/src/nouveau/compiler/nak/assign_regs.rs @@ -1134,9 +1134,15 @@ impl AssignRegsBlock { Some(instr) } } - Op::Pin(OpPin { src, dst }) | Op::Unpin(OpUnpin { src, dst }) => { + Op::Pin(_) | Op::Unpin(_) => { assert!(instr.pred.is_true()); + let (src, dst) = match &instr.op { + Op::Pin(pin) => (&pin.src, &pin.dst), + Op::Unpin(unpin) => (&unpin.src, &unpin.dst), + _ => unreachable!(), + }; + // These basically act as a vector version of OpCopy except that // they only work on SSA values and we pin the destination if // it's OpPin. diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index fb0f8d76218..a07cca69fc9 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -3969,10 +3969,13 @@ impl<'a> ShaderFromNir<'a> { Op::Exit(OpExit {}) } else { self.cfg.add_edge(nb.index, target.index); - Op::Bra(OpBra { - target: self.get_block_label(target), - cond: true.into(), - }) + Op::Bra( + OpBra { + target: self.get_block_label(target), + cond: true.into(), + } + .into(), + ) }; b.predicate(pred).push_op(op); } diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 667eb7bbae3..5f98a3d6634 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -7839,101 +7839,101 @@ impl fmt::Display for OpAnnotate { #[derive(DisplayOp, DstsAsSlice, SrcsAsSlice, FromVariants)] pub enum Op { - FAdd(OpFAdd), - FFma(OpFFma), - FMnMx(OpFMnMx), - FMul(OpFMul), - Rro(OpRro), - MuFu(OpMuFu), - FSet(OpFSet), - FSetP(OpFSetP), - FSwzAdd(OpFSwzAdd), - FSwz(OpFSwz), - DAdd(OpDAdd), - DFma(OpDFma), - DMnMx(OpDMnMx), - DMul(OpDMul), - DSetP(OpDSetP), - HAdd2(OpHAdd2), - HFma2(OpHFma2), - HMul2(OpHMul2), - HSet2(OpHSet2), - HSetP2(OpHSetP2), - Imma(OpImma), - Hmma(OpHmma), - Ldsm(OpLdsm), - HMnMx2(OpHMnMx2), - BMsk(OpBMsk), - BRev(OpBRev), - Bfe(OpBfe), - Flo(OpFlo), - IAbs(OpIAbs), - IAdd2(OpIAdd2), - IAdd2X(OpIAdd2X), - IAdd3(OpIAdd3), - IAdd3X(OpIAdd3X), - IDp4(OpIDp4), - IMad(OpIMad), - IMad64(OpIMad64), - IMul(OpIMul), - IMnMx(OpIMnMx), - ISetP(OpISetP), - Lea(OpLea), - LeaX(OpLeaX), - Lop2(OpLop2), - Lop3(OpLop3), - PopC(OpPopC), - Shf(OpShf), - Shl(OpShl), - Shr(OpShr), - F2F(OpF2F), - F2FP(OpF2FP), - F2I(OpF2I), - I2F(OpI2F), - I2I(OpI2I), - FRnd(OpFRnd), - Mov(OpMov), - Prmt(OpPrmt), - Sel(OpSel), - Shfl(OpShfl), - PLop3(OpPLop3), - PSetP(OpPSetP), - R2UR(OpR2UR), - Redux(OpRedux), - Tex(OpTex), - Tld(OpTld), - Tld4(OpTld4), - Tmml(OpTmml), - Txd(OpTxd), - Txq(OpTxq), - SuLd(OpSuLd), - SuSt(OpSuSt), - SuAtom(OpSuAtom), - SuClamp(OpSuClamp), - SuBfm(OpSuBfm), - SuEau(OpSuEau), - IMadSp(OpIMadSp), - SuLdGa(OpSuLdGa), - SuStGa(OpSuStGa), - Ld(OpLd), - Ldc(OpLdc), - LdSharedLock(OpLdSharedLock), - St(OpSt), - StSCheckUnlock(OpStSCheckUnlock), - Atom(OpAtom), - AL2P(OpAL2P), - ALd(OpALd), - ASt(OpASt), - Ipa(OpIpa), - LdTram(OpLdTram), - CCtl(OpCCtl), - MemBar(OpMemBar), - BClear(OpBClear), - BMov(OpBMov), - Break(OpBreak), - BSSy(OpBSSy), - BSync(OpBSync), - Bra(OpBra), + FAdd(Box), + FFma(Box), + FMnMx(Box), + FMul(Box), + Rro(Box), + MuFu(Box), + FSet(Box), + FSetP(Box), + FSwzAdd(Box), + FSwz(Box), + DAdd(Box), + DFma(Box), + DMnMx(Box), + DMul(Box), + DSetP(Box), + HAdd2(Box), + HFma2(Box), + HMul2(Box), + HSet2(Box), + HSetP2(Box), + Imma(Box), + Hmma(Box), + Ldsm(Box), + HMnMx2(Box), + BMsk(Box), + BRev(Box), + Bfe(Box), + Flo(Box), + IAbs(Box), + IAdd2(Box), + IAdd2X(Box), + IAdd3(Box), + IAdd3X(Box), + IDp4(Box), + IMad(Box), + IMad64(Box), + IMul(Box), + IMnMx(Box), + ISetP(Box), + Lea(Box), + LeaX(Box), + Lop2(Box), + Lop3(Box), + PopC(Box), + Shf(Box), + Shl(Box), + Shr(Box), + F2F(Box), + F2FP(Box), + F2I(Box), + I2F(Box), + I2I(Box), + FRnd(Box), + Mov(Box), + Prmt(Box), + Sel(Box), + Shfl(Box), + PLop3(Box), + PSetP(Box), + R2UR(Box), + Redux(Box), + Tex(Box), + Tld(Box), + Tld4(Box), + Tmml(Box), + Txd(Box), + Txq(Box), + SuLd(Box), + SuSt(Box), + SuAtom(Box), + SuClamp(Box), + SuBfm(Box), + SuEau(Box), + IMadSp(Box), + SuLdGa(Box), + SuStGa(Box), + Ld(Box), + Ldc(Box), + LdSharedLock(Box), + St(Box), + StSCheckUnlock(Box), + Atom(Box), + AL2P(Box), + ALd(Box), + ASt(Box), + Ipa(Box), + LdTram(Box), + CCtl(Box), + MemBar(Box), + BClear(Box), + BMov(Box), + Break(Box), + BSSy(Box), + BSync(Box), + Bra(Box), SSy(OpSSy), Sync(OpSync), Brk(OpBrk), @@ -7941,34 +7941,39 @@ pub enum Op { Cont(OpCont), PCnt(OpPCnt), Exit(OpExit), - WarpSync(OpWarpSync), - Bar(OpBar), - TexDepBar(OpTexDepBar), - CS2R(OpCS2R), - Isberd(OpIsberd), - ViLd(OpViLd), - Kill(OpKill), + WarpSync(Box), + Bar(Box), + TexDepBar(Box), + CS2R(Box), + Isberd(Box), + ViLd(Box), + Kill(Box), Nop(OpNop), - PixLd(OpPixLd), - S2R(OpS2R), - Vote(OpVote), - Match(OpMatch), - Undef(OpUndef), - SrcBar(OpSrcBar), - PhiSrcs(OpPhiSrcs), - PhiDsts(OpPhiDsts), - Copy(OpCopy), - Pin(OpPin), - Unpin(OpUnpin), - Swap(OpSwap), - ParCopy(OpParCopy), - RegOut(OpRegOut), - Out(OpOut), - OutFinal(OpOutFinal), - Annotate(OpAnnotate), + PixLd(Box), + S2R(Box), + Vote(Box), + Match(Box), + Undef(Box), + SrcBar(Box), + PhiSrcs(Box), + PhiDsts(Box), + Copy(Box), + Pin(Box), + Unpin(Box), + Swap(Box), + ParCopy(Box), + RegOut(Box), + Out(Box), + OutFinal(Box), + Annotate(Box), } impl_display_for_op!(Op); +#[cfg(target_arch = "x86_64")] +const _: () = { + debug_assert!(size_of::() == 16); +}; + impl Op { pub fn is_branch(&self) -> bool { match self { @@ -8620,7 +8625,7 @@ impl BasicBlock { pub fn phi_dsts(&self) -> Option<&OpPhiDsts> { self.phi_dsts_ip().map(|ip| match &self.instrs[ip].op { - Op::PhiDsts(phi) => phi, + Op::PhiDsts(phi) => phi.deref(), _ => panic!("Expected to find the phi"), }) } @@ -8628,7 +8633,7 @@ impl BasicBlock { #[allow(dead_code)] pub fn phi_dsts_mut(&mut self) -> Option<&mut OpPhiDsts> { self.phi_dsts_ip().map(|ip| match &mut self.instrs[ip].op { - Op::PhiDsts(phi) => phi, + Op::PhiDsts(phi) => phi.deref_mut(), _ => panic!("Expected to find the phi"), }) } @@ -8646,14 +8651,14 @@ impl BasicBlock { } pub fn phi_srcs(&self) -> Option<&OpPhiSrcs> { self.phi_srcs_ip().map(|ip| match &self.instrs[ip].op { - Op::PhiSrcs(phi) => phi, + Op::PhiSrcs(phi) => phi.deref(), _ => panic!("Expected to find the phi"), }) } pub fn phi_srcs_mut(&mut self) -> Option<&mut OpPhiSrcs> { self.phi_srcs_ip().map(|ip| match &mut self.instrs[ip].op { - Op::PhiSrcs(phi) => phi, + Op::PhiSrcs(phi) => phi.deref_mut(), _ => panic!("Expected to find the phi"), }) } diff --git a/src/nouveau/compiler/nak/legalize.rs b/src/nouveau/compiler/nak/legalize.rs index 64a017b3133..98a4153138b 100644 --- a/src/nouveau/compiler/nak/legalize.rs +++ b/src/nouveau/compiler/nak/legalize.rs @@ -459,20 +459,17 @@ fn legalize_instr( } // OpBreak and OpBSsy impose additional RA constraints - match &mut instr.op { - Op::Break(OpBreak { - bar_in, bar_out, .. - }) - | Op::BSSy(OpBSSy { - bar_in, bar_out, .. - }) => { - let bar_in_ssa = bar_in.src_ref.as_ssa().unwrap(); - if !bar_out.is_none() && bl.is_live_after_ip(&bar_in_ssa[0], ip) { - let gpr = b.bmov_to_gpr(bar_in.clone()); - let tmp = b.bmov_to_bar(gpr.into()); - *bar_in = tmp.into(); - } + let mut legalize_break_bssy = |bar_in: &mut Src, bar_out: &mut Dst| { + let bar_in_ssa = bar_in.src_ref.as_ssa().unwrap(); + if !bar_out.is_none() && bl.is_live_after_ip(&bar_in_ssa[0], ip) { + let gpr = b.bmov_to_gpr(bar_in.clone()); + let tmp = b.bmov_to_bar(gpr.into()); + *bar_in = tmp.into(); } + }; + match &mut instr.op { + Op::Break(op) => legalize_break_bssy(&mut op.bar_in, &mut op.bar_out), + Op::BSSy(op) => legalize_break_bssy(&mut op.bar_in, &mut op.bar_out), _ => (), } diff --git a/src/nouveau/compiler/nak/lower_copy_swap.rs b/src/nouveau/compiler/nak/lower_copy_swap.rs index 00272b0f417..c4e243e8223 100644 --- a/src/nouveau/compiler/nak/lower_copy_swap.rs +++ b/src/nouveau/compiler/nak/lower_copy_swap.rs @@ -260,7 +260,7 @@ impl LowerCopySwap { .into(), })); } - self.lower_r2ur(&mut b, r2ur); + self.lower_r2ur(&mut b, *r2ur); b.into_mapped_instrs() } Op::Copy(copy) => { @@ -272,7 +272,7 @@ impl LowerCopySwap { .into(), })); } - self.lower_copy(&mut b, copy); + self.lower_copy(&mut b, *copy); b.into_mapped_instrs() } Op::Swap(swap) => { @@ -284,7 +284,7 @@ impl LowerCopySwap { .into(), })); } - self.lower_swap(&mut b, swap); + self.lower_swap(&mut b, *swap); b.into_mapped_instrs() } _ => MappedInstrs::One(instr), diff --git a/src/nouveau/compiler/nak/lower_par_copies.rs b/src/nouveau/compiler/nak/lower_par_copies.rs index e7562871259..fc16adaf56a 100644 --- a/src/nouveau/compiler/nak/lower_par_copies.rs +++ b/src/nouveau/compiler/nak/lower_par_copies.rs @@ -265,7 +265,7 @@ impl Shader<'_> { .into(), })); } - match lower_par_copy(pc, sm) { + match lower_par_copy(*pc, sm) { MappedInstrs::None => { if let Some(instr) = instrs.pop() { MappedInstrs::One(instr) diff --git a/src/nouveau/compiler/nak/opt_jump_thread.rs b/src/nouveau/compiler/nak/opt_jump_thread.rs index 8a114a18051..1b734258ea8 100644 --- a/src/nouveau/compiler/nak/opt_jump_thread.rs +++ b/src/nouveau/compiler/nak/opt_jump_thread.rs @@ -87,10 +87,13 @@ fn jump_thread(func: &mut Function) -> bool { .get(&target_label) .map(clone_branch) .unwrap_or_else(|| { - Op::Bra(OpBra { - target: target_label, - cond: true.into(), - }) + Op::Bra( + OpBra { + target: target_label, + cond: true.into(), + } + .into(), + ) }); replacements.insert(block_label, replacement); } @@ -139,10 +142,8 @@ fn rewrite_cfg(func: &mut Function) { fn opt_fall_through(func: &mut Function) { for i in 0..func.blocks.len() - 1 { let remove_last_instr = match func.blocks[i].branch() { - Some(b) => match b.op { - Op::Bra(OpBra { target, .. }) => { - target == func.blocks[i + 1].label - } + Some(b) => match &b.op { + Op::Bra(bra) => bra.target == func.blocks[i + 1].label, _ => false, }, None => false,