From eda940c8557dd68d60e085d8d2df5590ee3cf4f8 Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Mon, 4 Dec 2023 09:03:14 -0600 Subject: [PATCH] nak: Make barriers SSA-friendly The NIR intrinsics now take and return a barrier whenever one is modified instead of modifying in-place. In NAK, we give the internal instructions the same treatment and convert everything to use barrier SSA values and RegRefs. In nak_from_nir, we move all barriers to/from GPRs. We'll clean up the massive pile of OpBMov later. Part-of: --- src/compiler/nir/nir_intrinsics.py | 5 +- src/nouveau/compiler/nak_encode_sm70.rs | 10 +- src/nouveau/compiler/nak_from_nir.rs | 133 +++++++------------- src/nouveau/compiler/nak_ir.rs | 46 +++---- src/nouveau/compiler/nak_nir_add_barriers.c | 16 ++- 5 files changed, 77 insertions(+), 133 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 183c6e2c989..380c153f0dd 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2036,8 +2036,9 @@ intrinsic("end_primitive_nv", dest_comp=1, src_comp=[1], indices=[STREAM_ID]) intrinsic("final_primitive_nv", src_comp=[1]) intrinsic("bar_set_nv", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE]) -intrinsic("bar_break_nv", src_comp=[1]) -intrinsic("bar_sync_nv", src_comp=[1]) +intrinsic("bar_break_nv", dest_comp=1, bit_sizes=[32], src_comp=[1]) +# src[] = { bar, bar_set } +intrinsic("bar_sync_nv", src_comp=[1, 1]) # In order to deal with flipped render targets, gl_PointCoord may be flipped # in the shader requiring a shader key or extra instructions or it may be diff --git a/src/nouveau/compiler/nak_encode_sm70.rs b/src/nouveau/compiler/nak_encode_sm70.rs index f5922845ce6..f1c15575a18 100644 --- a/src/nouveau/compiler/nak_encode_sm70.rs +++ b/src/nouveau/compiler/nak_encode_sm70.rs @@ -1692,7 +1692,7 @@ impl SM70Instr { self.set_opcode(0x355); self.set_dst(Dst::None); - self.set_field(24..28, op.dst.idx()); + self.set_bar_dst(24..28, op.dst); self.set_bit(84, true); // .CLEAR } @@ -1717,7 +1717,8 @@ impl SM70Instr { fn encode_break(&mut self, op: &OpBreak) { self.set_opcode(0x942); - self.set_field(16..20, op.bar.idx()); + assert!(op.bar_in.src_ref.as_reg() == op.bar_out.as_reg()); + self.set_bar_dst(16..20, op.bar_out); self.set_pred_src(87..90, 90, op.cond); } @@ -1728,14 +1729,15 @@ impl SM70Instr { labels: &HashMap, ) { self.set_opcode(0x945); - self.set_field(16..20, op.bar.idx()); + assert!(op.bar_in.src_ref.as_reg() == op.bar_out.as_reg()); + self.set_bar_dst(16..20, op.bar_out); self.set_rel_offset(34..64, &op.target, ip, labels); self.set_pred_src(87..90, 90, op.cond); } fn encode_bsync(&mut self, op: &OpBSync) { self.set_opcode(0x941); - self.set_field(16..20, op.bar.idx()); + self.set_bar_src(16..20, op.bar); self.set_pred_src(87..90, 90, op.cond); } diff --git a/src/nouveau/compiler/nak_from_nir.rs b/src/nouveau/compiler/nak_from_nir.rs index c7fd2523c21..1e58d8cc54a 100644 --- a/src/nouveau/compiler/nak_from_nir.rs +++ b/src/nouveau/compiler/nak_from_nir.rs @@ -144,53 +144,13 @@ impl<'a> PhiAllocMap<'a> { } } -struct BarAlloc { - used: u16, - num_bars: u8, -} - -impl BarAlloc { - pub fn new() -> BarAlloc { - BarAlloc { - used: 0, - num_bars: 0, - } - } - - pub fn num_bars(&self) -> u8 { - self.num_bars - } - - pub fn reserve(&mut self, idx: u8) { - self.num_bars = max(self.num_bars, idx + 1); - let bit = 1 << idx; - assert!(self.used & bit == 0); - self.used |= bit; - } - - pub fn alloc(&mut self) -> BarRef { - let idx = self.used.trailing_ones(); - assert!(idx < 16); - let idx = idx as u8; - self.reserve(idx); - BarRef::new(idx) - } - - pub fn free(&mut self, bar: BarRef) { - let bit = 1 << bar.idx(); - assert!(self.used & bit != 0); - self.used &= !bit; - } -} - struct ShaderFromNir<'a> { nir: &'a nir_shader, info: ShaderInfo, cfg: CFGBuilder, label_alloc: LabelAllocator, block_label: HashMap, - bar_alloc: BarAlloc, - bar_ref_label: HashMap, + bar_label: HashMap, fs_out_regs: [SSAValue; 34], end_block_id: u32, ssa_map: HashMap>, @@ -205,8 +165,7 @@ impl<'a> ShaderFromNir<'a> { cfg: CFGBuilder::new(), label_alloc: LabelAllocator::new(), block_label: HashMap::new(), - bar_alloc: BarAlloc::new(), - bar_ref_label: HashMap::new(), + bar_label: HashMap::new(), fs_out_regs: [SSAValue::NONE; 34], end_block_id: 0, ssa_map: HashMap::new(), @@ -1614,50 +1573,53 @@ impl<'a> ShaderFromNir<'a> { self.set_dst(&intrin.def, dst); } nir_intrinsic_bar_break_nv => { - let idx = &srcs[0].as_def().index; - let (bar, _) = self.bar_ref_label.get(idx).unwrap(); + let src = self.get_src(&srcs[0]); + let bar_in = b.bmov_to_bar(src); - let brk = b.push_op(OpBreak { - bar: *bar, + let bar_out = b.alloc_ssa(RegFile::Bar, 1); + b.push_op(OpBreak { + bar_out: bar_out.into(), + bar_in: bar_in.into(), cond: SrcRef::True.into(), }); - brk.deps.yld = true; + + self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into())); } nir_intrinsic_bar_set_nv => { let label = self.label_alloc.alloc(); - let bar = self.bar_alloc.alloc(); + let old = self.bar_label.insert(intrin.def.index, label); + assert!(old.is_none()); - let bmov = b.push_op(OpBClear { - dst: bar, + let bar_clear = b.alloc_ssa(RegFile::Bar, 1); + b.push_op(OpBClear { + dst: bar_clear.into(), }); - bmov.deps.yld = true; - let bssy = b.push_op(OpBSSy { - bar: bar, + let bar_out = b.alloc_ssa(RegFile::Bar, 1); + b.push_op(OpBSSy { + bar_out: bar_out.into(), + bar_in: bar_clear.into(), cond: SrcRef::True.into(), target: label, }); - bssy.deps.yld = true; - let old = - self.bar_ref_label.insert(intrin.def.index, (bar, label)); - assert!(old.is_none()); + self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into())); } nir_intrinsic_bar_sync_nv => { - let idx = &srcs[0].as_def().index; - let (bar, label) = self.bar_ref_label.get(idx).unwrap(); + let src = self.get_src(&srcs[0]); - let bsync = b.push_op(OpBSync { - bar: *bar, + let bar = b.bmov_to_bar(src); + b.push_op(OpBSync { + bar: bar.into(), cond: SrcRef::True.into(), }); - bsync.deps.yld = true; - self.bar_alloc.free(*bar); - - b.push_op(OpNop { - label: Some(*label), - }); + let bar_set_idx = &srcs[1].as_def().index; + if let Some(label) = self.bar_label.get(bar_set_idx) { + b.push_op(OpNop { + label: Some(*label), + }); + } } nir_intrinsic_bindless_image_atomic | nir_intrinsic_bindless_image_atomic_swap => { @@ -2126,6 +2088,9 @@ impl<'a> ShaderFromNir<'a> { SCOPE_NONE => (), SCOPE_WORKGROUP => { if self.nir.info.stage() == MESA_SHADER_COMPUTE { + // OpBar needs num_barriers > 0 but, as far as we + // know, it doesn't actually use a barrier. + self.info.num_barriers = 1; b.push_op(OpBar {}); b.push_op(OpNop { label: None }); } @@ -2538,27 +2503,24 @@ impl<'a> ShaderFromNir<'a> { // memory. Perhaps this is to ensure that our allocation is // actually available and not in use by another thread? let label = self.label_alloc.alloc(); - let bar = self.bar_alloc.alloc(); + let bar_clear = b.alloc_ssa(RegFile::Bar, 1); - let bmov = b.push_op(OpBClear { - dst: bar, + b.push_op(OpBClear { + dst: bar_clear.into(), }); - bmov.deps.yld = true; - let bssy = b.push_op(OpBSSy { - bar: bar, + let bar = b.alloc_ssa(RegFile::Bar, 1); + b.push_op(OpBSSy { + bar_out: bar.into(), + bar_in: bar_clear.into(), cond: SrcRef::True.into(), target: label, }); - bssy.deps.yld = true; - let bsync = b.push_op(OpBSync { - bar: bar, + b.push_op(OpBSync { + bar: bar.into(), cond: SrcRef::True.into(), }); - bsync.deps.yld = true; - - self.bar_alloc.free(bar); b.push_op(OpNop { label: Some(label) }); } @@ -2743,15 +2705,6 @@ impl<'a> ShaderFromNir<'a> { } pub fn parse_shader(mut self) -> Shader { - if self.nir.info.stage() == MESA_SHADER_COMPUTE - && self.nir.info.uses_control_barrier() - { - // We know OpBar uses a barrier but we don't know which one. Assume - // it implicitly uses B0 and reserve it so it doesn't stomp any - // other barriers - self.bar_alloc.reserve(0); - } - let mut functions = Vec::new(); for nf in self.nir.iter_functions() { if let Some(nfi) = nf.get_impl() { @@ -2760,8 +2713,6 @@ impl<'a> ShaderFromNir<'a> { } } - self.info.num_barriers = self.bar_alloc.num_bars(); - // Tessellation evaluation shaders MUST claim to read gl_TessCoord or // the hardware will throw an SPH error. match &self.info.stage { diff --git a/src/nouveau/compiler/nak_ir.rs b/src/nouveau/compiler/nak_ir.rs index 272f1da4000..ec6350d201a 100644 --- a/src/nouveau/compiler/nak_ir.rs +++ b/src/nouveau/compiler/nak_ir.rs @@ -642,28 +642,6 @@ impl fmt::Display for RegRef { } } -#[derive(Clone, Copy, Eq, Hash, PartialEq)] -pub struct BarRef { - idx: u8, -} - -impl BarRef { - pub fn new(idx: u8) -> BarRef { - assert!(idx < 16); - BarRef { idx: idx } - } - - pub fn idx(&self) -> u8 { - self.idx - } -} - -impl fmt::Display for BarRef { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "B{}", self.idx()) - } -} - #[derive(Clone, Copy)] pub enum Dst { None, @@ -3633,7 +3611,7 @@ impl_display_for_op!(OpMemBar); #[repr(C)] #[derive(SrcsAsSlice, DstsAsSlice)] pub struct OpBClear { - pub dst: BarRef, + pub dst: Dst, } impl DisplayOp for OpBClear { @@ -3665,7 +3643,10 @@ impl_display_for_op!(OpBMov); #[repr(C)] #[derive(SrcsAsSlice, DstsAsSlice)] pub struct OpBreak { - pub bar: BarRef, + pub bar_out: Dst, + + #[src_type(Bar)] + pub bar_in: Src, #[src_type(Pred)] pub cond: Src, @@ -3673,7 +3654,7 @@ pub struct OpBreak { impl DisplayOp for OpBreak { fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "break {} {}", self.cond, self.bar) + write!(f, "break {} {}", self.bar_in, self.cond) } } impl_display_for_op!(OpBreak); @@ -3681,7 +3662,10 @@ impl_display_for_op!(OpBreak); #[repr(C)] #[derive(SrcsAsSlice, DstsAsSlice)] pub struct OpBSSy { - pub bar: BarRef, + pub bar_out: Dst, + + #[src_type(Pred)] + pub bar_in: Src, #[src_type(Pred)] pub cond: Src, @@ -3691,7 +3675,7 @@ pub struct OpBSSy { impl DisplayOp for OpBSSy { fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "bssy {} {} {}", self.cond, self.bar, self.target) + write!(f, "bssy {} {} {}", self.bar_in, self.cond, self.target) } } impl_display_for_op!(OpBSSy); @@ -3699,7 +3683,8 @@ impl_display_for_op!(OpBSSy); #[repr(C)] #[derive(SrcsAsSlice, DstsAsSlice)] pub struct OpBSync { - pub bar: BarRef, + #[src_type(Bar)] + pub bar: Src, #[src_type(Pred)] pub cond: Src, @@ -3707,7 +3692,7 @@ pub struct OpBSync { impl DisplayOp for OpBSync { fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "bsync {} {}", self.cond, self.bar) + write!(f, "bsync {} {}", self.bar, self.cond) } } impl_display_for_op!(OpBSync); @@ -4704,9 +4689,6 @@ impl Instr { | Op::MemBar(_) | Op::Kill(_) | Op::Nop(_) - | Op::BClear(_) - | Op::Break(_) - | Op::BSSy(_) | Op::BSync(_) | Op::Bra(_) | Op::Exit(_) diff --git a/src/nouveau/compiler/nak_nir_add_barriers.c b/src/nouveau/compiler/nak_nir_add_barriers.c index c8b644ae7f3..b3147313126 100644 --- a/src/nouveau/compiler/nak_nir_add_barriers.c +++ b/src/nouveau/compiler/nak_nir_add_barriers.c @@ -10,7 +10,8 @@ struct barrier { nir_cf_node *node; - nir_def *bar; + nir_def *bar_set; + nir_def *bar_reg; }; struct add_barriers_state { @@ -30,13 +31,16 @@ add_bar_cf_node(nir_cf_node *node, struct add_barriers_state *state) b->cursor = nir_after_block(before); nir_def *bar = nir_bar_set_nv(b); + nir_def *bar_reg = nir_decl_reg(b, 1, 32, 0); + nir_store_reg(b, bar, bar_reg); b->cursor = nir_before_block_after_phis(after); - nir_bar_sync_nv(b, bar); + nir_bar_sync_nv(b, nir_load_reg(b, bar_reg), bar); struct barrier barrier = { .node = node, - .bar = bar, + .bar_set = bar, + .bar_reg = bar_reg, }; util_dynarray_append(&state->barriers, struct barrier, barrier); @@ -72,7 +76,9 @@ break_loop_bars(nir_block *block, struct add_barriers_state *state) const struct barrier *bar = util_dynarray_element(&state->barriers, struct barrier, idx); if (bar->node == p) { - nir_bar_break_nv(b, bar->bar); + nir_def *bar_val = nir_load_reg(b, bar->bar_reg); + bar_val = nir_bar_break_nv(b, bar_val); + nir_store_reg(b, bar_val, bar->bar_reg); idx--; } } @@ -281,6 +287,8 @@ nak_nir_add_barriers_impl(nir_function_impl *impl, nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis); + + nir_lower_reg_intrinsics_to_ssa_impl(impl); } else { nir_metadata_preserve(impl, nir_metadata_all); }