diff --git a/src/nouveau/compiler/nak/api.rs b/src/nouveau/compiler/nak/api.rs index 527579c5ce5..6e25244912b 100644 --- a/src/nouveau/compiler/nak/api.rs +++ b/src/nouveau/compiler/nak/api.rs @@ -200,6 +200,10 @@ pub extern "C" fn nak_compiler_create( let nak = Box::new(nak_compiler { sm: dev.sm, warps_per_sm: dev.max_warps_per_mp, + blocks_per_sm: dev.max_blocks_per_mp, + max_shared_mem: u32::from( + dev.sm_smem_sizes_kB[usize::from(dev.sm_smem_size_count) - 1], + ) * 1024, nir_options: nir_options(dev), }); @@ -243,6 +247,7 @@ impl ShaderBin { let asm = CString::new(asm) .expect("NAK assembly has unexpected null characters"); + let mut shared_mem = 0; let c_info = nak_shader_info { stage: match info.stage { ShaderStageInfo::Compute(_) => MESA_SHADER_COMPUTE, @@ -272,6 +277,7 @@ impl ShaderBin { crs_size: sm.crs_size(info.max_crs_depth), __bindgen_anon_1: match &info.stage { ShaderStageInfo::Compute(cs_info) => { + shared_mem = cs_info.smem_size; nak_shader_info__bindgen_ty_1 { cs: nak_shader_info__bindgen_ty_1__bindgen_ty_1 { local_size: [ @@ -368,6 +374,7 @@ impl ShaderBin { eprintln!("Fills from reg: {}", c_info.num_fills_from_reg); eprintln!("Num GPRs: {}", c_info.num_gprs); eprintln!("SLM size: {}", c_info.slm_size); + eprintln!("Shared size: {shared_mem}"); if c_info.stage != MESA_SHADER_COMPUTE { eprint_hex("Header", &c_info.hdr); @@ -446,7 +453,12 @@ fn nak_compile_shader_internal( Some(unsafe { &*fs_key }) }; - let sm = ShaderModelInfo::new(nak.sm, nak.warps_per_sm); + let sm = ShaderModelInfo::new( + nak.sm, + nak.warps_per_sm, + nak.blocks_per_sm, + nak.max_shared_mem, + ); let mut s = nak_shader_from_nir(nak, nir, &sm); if DEBUG.print() { diff --git a/src/nouveau/compiler/nak/hw_tests.rs b/src/nouveau/compiler/nak/hw_tests.rs index bb6e6d8d5aa..a1c234389e1 100644 --- a/src/nouveau/compiler/nak/hw_tests.rs +++ b/src/nouveau/compiler/nak/hw_tests.rs @@ -37,8 +37,12 @@ impl RunSingleton { let run = Runner::new(dev_id); let sm_nr = run.dev_info().sm; - 
let sm = - ShaderModelInfo::new(sm_nr, run.dev_info().max_warps_per_mp); + let sm = ShaderModelInfo::new( + sm_nr, + run.dev_info().max_warps_per_mp, + 0, + 0, + ); RunSingleton { sm, run } }) } diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 8d5feffb9c7..80dff25e0df 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -9774,11 +9774,23 @@ pub trait ShaderModel { pub struct ShaderModelInfo { sm: u8, warps_per_sm: u8, + blocks_per_sm: u8, + shared_mem_per_sm: u32, } impl ShaderModelInfo { - pub fn new(sm: u8, warps_per_sm: u8) -> Self { - ShaderModelInfo { sm, warps_per_sm } + pub fn new( + sm: u8, + warps_per_sm: u8, + blocks_per_sm: u8, + shared_mem_per_sm: u32, + ) -> Self { + ShaderModelInfo { + sm, + warps_per_sm, + blocks_per_sm, + shared_mem_per_sm, + } } } @@ -9899,13 +9911,31 @@ pub fn gpr_limit_from_local_size(local_size: &[u16; 3]) -> u32 { min(out, 255) } -pub fn max_warps_per_sm(sm: &ShaderModelInfo, gprs: u32) -> u32 { - // TODO: Take local_size and shared mem limit into account for compute +pub fn max_warps_per_sm( + sm: &ShaderModelInfo, + gprs: u32, + shared_mem: u16, + block_size: u16, +) -> u32 { let total_regs: u32 = 65536; // GPRs are allocated in multiples of 8 let gprs = max(gprs, 1); let gprs = gprs.next_multiple_of(8); - let max_warps = prev_multiple_of((total_regs / 32) / gprs, 4); + let mut max_warps = prev_multiple_of((total_regs / 32) / gprs, 4); + let block_size = u32::from(block_size.next_multiple_of(32)); + + // Next we limit the warps according to our available blocks + if block_size != 0 { + max_warps = + max_warps.min((block_size * u32::from(sm.blocks_per_sm)) / 32); + } + + // Next we limit the warps according to our available shared memory + if shared_mem != 0 && block_size != 0 { + let max_blocks = sm.shared_mem_per_sm / u32::from(shared_mem); + max_warps = max_warps.min((max_blocks * block_size) / 32) + } + min(max_warps, sm.warps_per_sm.into()) } @@ -10071,10 
+10101,21 @@ impl Shader<'_> { self.info.writes_global_mem = writes_global_mem; self.info.uses_fp64 = uses_fp64; - self.info.max_warps_per_sm = max_warps_per_sm( - self.sm, - self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(), - ); + if let ShaderStageInfo::Compute(compute) = &self.info.stage { + self.info.max_warps_per_sm = max_warps_per_sm( + self.sm, + self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(), + compute.smem_size, + compute.local_size.iter().product(), + ); + } else { + self.info.max_warps_per_sm = max_warps_per_sm( + self.sm, + self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(), + 0, + 0, + ); + } if self.sm.sm() >= 50 { if let ShaderStageInfo::Vertex(vertex_info) = &mut self.info.stage { diff --git a/src/nouveau/compiler/nak/nvdisasm_tests.rs b/src/nouveau/compiler/nak/nvdisasm_tests.rs index fbcf7c82013..1281e50ad7d 100644 --- a/src/nouveau/compiler/nak/nvdisasm_tests.rs +++ b/src/nouveau/compiler/nak/nvdisasm_tests.rs @@ -87,7 +87,7 @@ fn disassemble_instrs(instrs: Vec, sm: u8) -> Vec { io: ShaderIoInfo::None, }; - let sm = ShaderModelInfo::new(sm, 0); + let sm = ShaderModelInfo::new(sm, 0, 0, 0); let s = Shader { sm: &sm, info: info, diff --git a/src/nouveau/compiler/nak/opt_instr_sched_prepass.rs b/src/nouveau/compiler/nak/opt_instr_sched_prepass.rs index 7c880773367..ae16e31f532 100644 --- a/src/nouveau/compiler/nak/opt_instr_sched_prepass.rs +++ b/src/nouveau/compiler/nak/opt_instr_sched_prepass.rs @@ -26,9 +26,14 @@ const TARGET_FREE: i32 = 4; /// one more register causes occupancy to plummet. This function figures out how /// many GPRs you can use without costing occupancy, assuming you always need at /// least `x` GPRs. 
-fn next_occupancy_cliff(sm: &ShaderModelInfo, x: u32) -> u32 { +fn next_occupancy_cliff( + sm: &ShaderModelInfo, + x: u32, + shared_mem: u16, + block_size: u16, +) -> u32 { let total_regs: u32 = 65536; - let threads = max_warps_per_sm(sm, x) * 32; + let threads = max_warps_per_sm(sm, x, shared_mem, block_size) * 32; // This function doesn't actually model the maximum number of registers // correctly - callers need to worry about that separately. We do, @@ -42,12 +47,18 @@ fn next_occupancy_cliff(sm: &ShaderModelInfo, x: u32) -> u32 { #[test] fn test_next_occupancy_cliff() { for max_hw_warps in [32, 48, 64] { - let sm = ShaderModelInfo::new(75, max_hw_warps); + let sm = ShaderModelInfo::new(75, max_hw_warps, 0, 100 * 1024); for x in 0..255 { - let y = next_occupancy_cliff(&sm, x); + let y = next_occupancy_cliff(&sm, x, 0, 0); assert!(y >= x); - assert_eq!(max_warps_per_sm(&sm, x), max_warps_per_sm(&sm, y)); - assert!(max_warps_per_sm(&sm, y) > max_warps_per_sm(&sm, y + 1)); + assert_eq!( + max_warps_per_sm(&sm, x, 0, 0), + max_warps_per_sm(&sm, y, 0, 0) + ); + assert!( + max_warps_per_sm(&sm, y, 0, 0) + > max_warps_per_sm(&sm, y + 1, 0, 0) + ); } } } @@ -56,10 +67,14 @@ fn next_occupancy_cliff_with_reserved( sm: &ShaderModelInfo, gprs: i32, reserved: i32, + shared_mem: u16, + block_size: u16, ) -> i32 { i32::try_from(next_occupancy_cliff( sm, (gprs + reserved).try_into().unwrap(), + shared_mem, + block_size, )) .unwrap() - reserved @@ -751,11 +766,18 @@ fn get_schedule_types( min_gpr_target: i32, max_gpr_target: i32, reserved_gprs: i32, + shared_mem: u16, + block_size: u16, ) -> Vec { let mut out = Vec::new(); - let mut gpr_target = - next_occupancy_cliff_with_reserved(sm, min_gpr_target, reserved_gprs); + let mut gpr_target = next_occupancy_cliff_with_reserved( + sm, + min_gpr_target, + reserved_gprs, + shared_mem, + block_size, + ); while gpr_target < max_regs[RegFile::GPR] { out.push(ScheduleType::RegLimit(gpr_target.try_into().unwrap())); @@ -770,6 +792,8 @@ fn 
get_schedule_types( sm, gpr_target + 1, reserved_gprs, + shared_mem, + block_size, ); } @@ -792,6 +816,8 @@ impl Function { &mut self, sm: &ShaderModelInfo, max_regs: PerRegFile, + shared_mem: u16, + block_size: u16, ) { let liveness = SimpleLiveness::for_function(self); let mut live_out_sets: Vec = Vec::new(); @@ -912,6 +938,8 @@ impl Function { min_gpr_target, max_gpr_target, reserved_gprs, + shared_mem, + block_size, ); schedule_types.reverse(); @@ -1025,8 +1053,17 @@ impl Shader<'_> { } max_regs[RegFile::GPR] -= SW_RESERVED_GPRS; + let mut shared_mem = 0; + let mut block_size = 0; + if let ShaderStageInfo::Compute(compute) = &self.info.stage { + shared_mem = compute.smem_size; + block_size = compute.local_size.iter().product(); + } + for f in &mut self.functions { - f.opt_instr_sched_prepass(self.sm, max_regs); + f.opt_instr_sched_prepass( + self.sm, max_regs, shared_mem, block_size, + ); } } } diff --git a/src/nouveau/compiler/nak_private.h b/src/nouveau/compiler/nak_private.h index 588eb897eb4..29930f4f2f3 100644 --- a/src/nouveau/compiler/nak_private.h +++ b/src/nouveau/compiler/nak_private.h @@ -20,6 +20,8 @@ bool nak_debug_no_ugpr(void); struct nak_compiler { uint8_t sm; uint8_t warps_per_sm; + uint8_t blocks_per_sm; + uint32_t max_shared_mem; struct nir_shader_compiler_options nir_options; }; diff --git a/src/nouveau/headers/nv_device_info.h b/src/nouveau/headers/nv_device_info.h index 1532983025c..ee0afb68ea9 100644 --- a/src/nouveau/headers/nv_device_info.h +++ b/src/nouveau/headers/nv_device_info.h @@ -55,6 +55,7 @@ struct nv_device_info { uint16_t tpc_count; uint8_t mp_per_tpc; uint8_t max_warps_per_mp; + uint8_t max_blocks_per_mp; bool has_transfer_queue; diff --git a/src/nouveau/vulkan/nvk_shader.c b/src/nouveau/vulkan/nvk_shader.c index 8f7f781034b..da907f1feef 100644 --- a/src/nouveau/vulkan/nvk_shader.c +++ b/src/nouveau/vulkan/nvk_shader.c @@ -1309,6 +1309,18 @@ nvk_shader_get_executable_statistics( stat->value.u64 = shader->info.slm_size; } + 
uint16_t smem_size = 0; + if (shader->info.stage == MESA_SHADER_COMPUTE) + smem_size = shader->info.cs.smem_size; + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Shared size"); + WRITE_STR(stat->description, + "Size of shader shared memory, in bytes"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = smem_size; + } + return vk_outarray_status(&out); } diff --git a/src/nouveau/winsys/nouveau_device.c b/src/nouveau/winsys/nouveau_device.c index f5ad9d5db0f..ae7cbd3f34b 100644 --- a/src/nouveau/winsys/nouveau_device.c +++ b/src/nouveau/winsys/nouveau_device.c @@ -159,6 +159,49 @@ max_warps_per_mp_for_sm(uint8_t sm) } } +static uint8_t +max_blocks_per_mp_for_sm(uint8_t sm) +{ + /* Values taken from CUDA programming guide section "Compute Capabilities" */ + switch (sm) { + case 10: + case 11: + case 12: + case 13: + case 20: + case 21: + return 8; + case 30: + case 32: + case 35: + case 37: + case 75: + case 86: + case 87: + return 16; + case 89: + case 110: + case 120: + return 24; + case 50: + case 52: + case 53: + case 60: + case 61: + case 62: + case 70: + case 72: + case 80: + case 90: + case 100: + return 32; + default: + assert(!"unknown SM version"); + /* return the smallest known value */ + return 8; + } +} + static uint8_t mp_per_tpc_for_chipset(uint16_t chipset) { @@ -538,6 +581,7 @@ nouveau_ws_device_new(drmDevicePtr drm_device) // for now we hardcode those values, but in the future Nouveau could provide that information to // us instead. device->info.max_warps_per_mp = max_warps_per_mp_for_sm(device->info.sm); + device->info.max_blocks_per_mp = max_blocks_per_mp_for_sm(device->info.sm); device->info.mp_per_tpc = mp_per_tpc_for_chipset(device->info.chipset); /* Transfer queues require two kernel fixes: