nak/instr_sched_prepass: take shared mem and block size into account

There is a bug in prepass that sometimes hurts occupancy.

Totals from 6865 (0.57% of 1212873) affected shaders:
CodeSize: 117726624 -> 116684032 (-0.89%); split: -1.03%, +0.15%
Number of GPRs: 326680 -> 377264 (+15.48%); split: -0.01%, +15.49%
Static cycle count: 55771616 -> 55292902 (-0.86%); split: -1.45%, +0.59%
Spills to reg: 14283 -> 14611 (+2.30%); split: -0.19%, +2.49%
Fills from reg: 12409 -> 12708 (+2.41%); split: -0.15%, +2.55%
Max warps/SM: 215877 -> 215753 (-0.06%)
This commit is contained in:
Karol Herbst 2026-04-29 14:27:24 +02:00 committed by Karol Herbst
parent a09af6ce29
commit 467980079d

View file

@@ -26,9 +26,14 @@ const TARGET_FREE: i32 = 4;
/// one more register causes occupancy to plummet. This function figures out how
/// many GPRs you can use without costing occupancy, assuming you always need at
/// least `x` GPRs.
fn next_occupancy_cliff(sm: &ShaderModelInfo, x: u32) -> u32 {
fn next_occupancy_cliff(
sm: &ShaderModelInfo,
x: u32,
shared_mem: u16,
block_size: u16,
) -> u32 {
let total_regs: u32 = 65536;
let threads = max_warps_per_sm(sm, x, 0, 0) * 32;
let threads = max_warps_per_sm(sm, x, shared_mem, block_size) * 32;
// This function doesn't actually model the maximum number of registers
// correctly - callers need to worry about that separately. We do,
@@ -44,7 +49,7 @@ fn test_next_occupancy_cliff() {
for max_hw_warps in [32, 48, 64] {
let sm = ShaderModelInfo::new(75, max_hw_warps, 0, 100 * 1024);
for x in 0..255 {
let y = next_occupancy_cliff(&sm, x);
let y = next_occupancy_cliff(&sm, x, 0, 0);
assert!(y >= x);
assert_eq!(
max_warps_per_sm(&sm, x, 0, 0),
@@ -62,10 +67,14 @@ fn next_occupancy_cliff_with_reserved(
sm: &ShaderModelInfo,
gprs: i32,
reserved: i32,
shared_mem: u16,
block_size: u16,
) -> i32 {
i32::try_from(next_occupancy_cliff(
sm,
(gprs + reserved).try_into().unwrap(),
shared_mem,
block_size,
))
.unwrap()
- reserved
@@ -757,11 +766,18 @@ fn get_schedule_types(
min_gpr_target: i32,
max_gpr_target: i32,
reserved_gprs: i32,
shared_mem: u16,
block_size: u16,
) -> Vec<ScheduleType> {
let mut out = Vec::new();
let mut gpr_target =
next_occupancy_cliff_with_reserved(sm, min_gpr_target, reserved_gprs);
let mut gpr_target = next_occupancy_cliff_with_reserved(
sm,
min_gpr_target,
reserved_gprs,
shared_mem,
block_size,
);
while gpr_target < max_regs[RegFile::GPR] {
out.push(ScheduleType::RegLimit(gpr_target.try_into().unwrap()));
@@ -776,6 +792,8 @@ fn get_schedule_types(
sm,
gpr_target + 1,
reserved_gprs,
shared_mem,
block_size,
);
}
@@ -798,6 +816,8 @@ impl Function {
&mut self,
sm: &ShaderModelInfo,
max_regs: PerRegFile<i32>,
shared_mem: u16,
block_size: u16,
) {
let liveness = SimpleLiveness::for_function(self);
let mut live_out_sets: Vec<LiveSet> = Vec::new();
@@ -918,6 +938,8 @@ impl Function {
min_gpr_target,
max_gpr_target,
reserved_gprs,
shared_mem,
block_size,
);
schedule_types.reverse();
@@ -1031,8 +1053,17 @@ impl Shader<'_> {
}
max_regs[RegFile::GPR] -= SW_RESERVED_GPRS;
let mut shared_mem = 0;
let mut block_size = 0;
if let ShaderStageInfo::Compute(compute) = &self.info.stage {
shared_mem = compute.smem_size;
block_size = compute.local_size.iter().product();
}
for f in &mut self.functions {
f.opt_instr_sched_prepass(self.sm, max_regs);
f.opt_instr_sched_prepass(
self.sm, max_regs, shared_mem, block_size,
);
}
}
}