diff --git a/src/nouveau/compiler/nak/assign_regs.rs b/src/nouveau/compiler/nak/assign_regs.rs index 82de4014b12..644aeb587af 100644 --- a/src/nouveau/compiler/nak/assign_regs.rs +++ b/src/nouveau/compiler/nak/assign_regs.rs @@ -1486,6 +1486,28 @@ impl Shader<'_> { ); } + // We try to check if allocating a few fewer registers would allow us + // to run 12 instead of 8 warps concurrently. + // The assumption is that running +50% threads will give us more + // performance than we lose with a bit of spilling. + let actual_gprs = (total_gprs + hw_reserved_gprs).min(max_gprs); + let warps_per_sm = max_warps_per_sm(self.sm, actual_gprs); + if warps_per_sm == 8 { + let new_max = if max_warps_per_sm(self.sm, actual_gprs - 8) > 8 { + total_gprs - 8 + } else if max_warps_per_sm(self.sm, actual_gprs - 16) > 8 { + // This gives us +15% performance in pixmark_piano + total_gprs - 16 + } else { + 0 + }; + + if new_max != 0 { + max_gprs = (new_max.next_multiple_of(8) - hw_reserved_gprs) + .min(max_gprs); + } + } + if total_gprs > max_gprs { // If we're spilling GPRs, we need to reserve 2 GPRs for OpParCopy // lowering because it needs to be able lower Mem copies which