From 467980079d83d484c0eaf4962ee22ecae4917bb8 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 29 Apr 2026 14:27:24 +0200 Subject: [PATCH] nak/instr_sched_prepass: take shared mem and block size into account There is a bug in prepass that sometimes hurt occupancy. Totals from 6865 (0.57% of 1212873) affected shaders: CodeSize: 117726624 -> 116684032 (-0.89%); split: -1.03%, +0.15% Number of GPRs: 326680 -> 377264 (+15.48%); split: -0.01%, +15.49% Static cycle count: 55771616 -> 55292902 (-0.86%); split: -1.45%, +0.59% Spills to reg: 14283 -> 14611 (+2.30%); split: -0.19%, +2.49% Fills from reg: 12409 -> 12708 (+2.41%); split: -0.15%, +2.55% Max warps/SM: 215877 -> 215753 (-0.06%) --- .../compiler/nak/opt_instr_sched_prepass.rs | 43 ++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/src/nouveau/compiler/nak/opt_instr_sched_prepass.rs b/src/nouveau/compiler/nak/opt_instr_sched_prepass.rs index 3893a3fcf93..ae16e31f532 100644 --- a/src/nouveau/compiler/nak/opt_instr_sched_prepass.rs +++ b/src/nouveau/compiler/nak/opt_instr_sched_prepass.rs @@ -26,9 +26,14 @@ const TARGET_FREE: i32 = 4; /// one more register causes occupancy to plummet. This function figures out how /// many GPRs you can use without costing occupancy, assuming you always need at /// least `x` GPRs. -fn next_occupancy_cliff(sm: &ShaderModelInfo, x: u32) -> u32 { +fn next_occupancy_cliff( + sm: &ShaderModelInfo, + x: u32, + shared_mem: u16, + block_size: u16, +) -> u32 { let total_regs: u32 = 65536; - let threads = max_warps_per_sm(sm, x, 0, 0) * 32; + let threads = max_warps_per_sm(sm, x, shared_mem, block_size) * 32; // This function doesn't actually model the maximum number of registers // correctly - callers need to worry about that separately. 
We do, @@ -44,7 +49,7 @@ fn test_next_occupancy_cliff() { for max_hw_warps in [32, 48, 64] { let sm = ShaderModelInfo::new(75, max_hw_warps, 0, 100 * 1024); for x in 0..255 { - let y = next_occupancy_cliff(&sm, x); + let y = next_occupancy_cliff(&sm, x, 0, 0); assert!(y >= x); assert_eq!( max_warps_per_sm(&sm, x, 0, 0), @@ -62,10 +67,14 @@ fn next_occupancy_cliff_with_reserved( sm: &ShaderModelInfo, gprs: i32, reserved: i32, + shared_mem: u16, + block_size: u16, ) -> i32 { i32::try_from(next_occupancy_cliff( sm, (gprs + reserved).try_into().unwrap(), + shared_mem, + block_size, )) .unwrap() - reserved @@ -757,11 +766,18 @@ fn get_schedule_types( min_gpr_target: i32, max_gpr_target: i32, reserved_gprs: i32, + shared_mem: u16, + block_size: u16, ) -> Vec<ScheduleType> { let mut out = Vec::new(); - let mut gpr_target = - next_occupancy_cliff_with_reserved(sm, min_gpr_target, reserved_gprs); + let mut gpr_target = next_occupancy_cliff_with_reserved( + sm, + min_gpr_target, + reserved_gprs, + shared_mem, + block_size, + ); while gpr_target < max_regs[RegFile::GPR] { out.push(ScheduleType::RegLimit(gpr_target.try_into().unwrap())); @@ -776,6 +792,8 @@ fn get_schedule_types( sm, gpr_target + 1, reserved_gprs, + shared_mem, + block_size, ); } @@ -798,6 +816,8 @@ impl Function { &mut self, sm: &ShaderModelInfo, max_regs: PerRegFile<i32>, + shared_mem: u16, + block_size: u16, ) { let liveness = SimpleLiveness::for_function(self); let mut live_out_sets: Vec<LiveSet> = Vec::new(); @@ -918,6 +938,8 @@ impl Function { min_gpr_target, max_gpr_target, reserved_gprs, + shared_mem, + block_size, ); schedule_types.reverse(); @@ -1031,8 +1053,17 @@ impl Shader<'_> { } max_regs[RegFile::GPR] -= SW_RESERVED_GPRS; + let mut shared_mem = 0; + let mut block_size = 0; + if let ShaderStageInfo::Compute(compute) = &self.info.stage { + shared_mem = compute.smem_size; + block_size = compute.local_size.iter().product(); + } + for f in &mut self.functions { - f.opt_instr_sched_prepass(self.sm, max_regs); + 
f.opt_instr_sched_prepass( + self.sm, max_regs, shared_mem, block_size, + ); } } }