nak/instr_sched_prepass: take shared mem and block size into account

There is a bug in prepass that sometimes hurts occupancy.

Totals from 6865 (0.57% of 1212873) affected shaders:
CodeSize: 117726624 -> 116684032 (-0.89%); split: -1.03%, +0.15%
Number of GPRs: 326680 -> 377264 (+15.48%); split: -0.01%, +15.49%
Static cycle count: 55771616 -> 55292902 (-0.86%); split: -1.45%, +0.59%
Spills to reg: 14283 -> 14611 (+2.30%); split: -0.19%, +2.49%
Fills from reg: 12409 -> 12708 (+2.41%); split: -0.15%, +2.55%
Max warps/SM: 215877 -> 215753 (-0.06%)
This commit is contained in:
Karol Herbst 2026-04-29 14:27:24 +02:00 committed by Karol Herbst
parent a09af6ce29
commit 467980079d

View file

@@ -26,9 +26,14 @@ const TARGET_FREE: i32 = 4;
/// one more register causes occupancy to plummet. This function figures out how
/// many GPRs you can use without costing occupancy, assuming you always need at
/// least `x` GPRs.
fn next_occupancy_cliff(sm: &ShaderModelInfo, x: u32) -> u32 {
fn next_occupancy_cliff(
sm: &ShaderModelInfo,
x: u32,
shared_mem: u16,
block_size: u16,
) -> u32 {
let total_regs: u32 = 65536;
let threads = max_warps_per_sm(sm, x, 0, 0) * 32;
let threads = max_warps_per_sm(sm, x, shared_mem, block_size) * 32;
// This function doesn't actually model the maximum number of registers
// correctly - callers need to worry about that separately. We do,
@@ -44,7 +49,7 @@ fn test_next_occupancy_cliff() {
for max_hw_warps in [32, 48, 64] {
let sm = ShaderModelInfo::new(75, max_hw_warps, 0, 100 * 1024);
for x in 0..255 {
let y = next_occupancy_cliff(&sm, x);
let y = next_occupancy_cliff(&sm, x, 0, 0);
assert!(y >= x);
assert_eq!(
max_warps_per_sm(&sm, x, 0, 0),
@@ -62,10 +67,14 @@ fn next_occupancy_cliff_with_reserved(
sm: &ShaderModelInfo,
gprs: i32,
reserved: i32,
shared_mem: u16,
block_size: u16,
) -> i32 {
i32::try_from(next_occupancy_cliff(
sm,
(gprs + reserved).try_into().unwrap(),
shared_mem,
block_size,
))
.unwrap()
- reserved
@@ -757,11 +766,18 @@ fn get_schedule_types(
min_gpr_target: i32,
max_gpr_target: i32,
reserved_gprs: i32,
shared_mem: u16,
block_size: u16,
) -> Vec<ScheduleType> {
let mut out = Vec::new();
let mut gpr_target =
next_occupancy_cliff_with_reserved(sm, min_gpr_target, reserved_gprs);
let mut gpr_target = next_occupancy_cliff_with_reserved(
sm,
min_gpr_target,
reserved_gprs,
shared_mem,
block_size,
);
while gpr_target < max_regs[RegFile::GPR] {
out.push(ScheduleType::RegLimit(gpr_target.try_into().unwrap()));
@@ -776,6 +792,8 @@ fn get_schedule_types(
sm,
gpr_target + 1,
reserved_gprs,
shared_mem,
block_size,
);
}
@@ -798,6 +816,8 @@ impl Function {
&mut self,
sm: &ShaderModelInfo,
max_regs: PerRegFile<i32>,
shared_mem: u16,
block_size: u16,
) {
let liveness = SimpleLiveness::for_function(self);
let mut live_out_sets: Vec<LiveSet> = Vec::new();
@@ -918,6 +938,8 @@ impl Function {
min_gpr_target,
max_gpr_target,
reserved_gprs,
shared_mem,
block_size,
);
schedule_types.reverse();
@@ -1031,8 +1053,17 @@ impl Shader<'_> {
}
max_regs[RegFile::GPR] -= SW_RESERVED_GPRS;
let mut shared_mem = 0;
let mut block_size = 0;
if let ShaderStageInfo::Compute(compute) = &self.info.stage {
shared_mem = compute.smem_size;
block_size = compute.local_size.iter().product();
}
for f in &mut self.functions {
f.opt_instr_sched_prepass(self.sm, max_regs);
f.opt_instr_sched_prepass(
self.sm, max_regs, shared_mem, block_size,
);
}
}
}