Merge branch 'nak/shared_mem_tracking' into 'main'

nak: report shared memory usage and adjust SM/warp metric

See merge request mesa/mesa!41249
This commit is contained in:
Karol Herbst 2026-05-08 00:15:51 +00:00
commit 161bedcbcb
9 changed files with 175 additions and 22 deletions

View file

@ -200,6 +200,10 @@ pub extern "C" fn nak_compiler_create(
let nak = Box::new(nak_compiler {
sm: dev.sm,
warps_per_sm: dev.max_warps_per_mp,
blocks_per_sm: dev.max_blocks_per_mp,
max_shared_mem: u32::from(
dev.sm_smem_sizes_kB[usize::from(dev.sm_smem_size_count) - 1],
) * 1024,
nir_options: nir_options(dev),
});
@ -243,6 +247,7 @@ impl ShaderBin {
let asm = CString::new(asm)
.expect("NAK assembly has unexpected null characters");
let mut shared_mem = 0;
let c_info = nak_shader_info {
stage: match info.stage {
ShaderStageInfo::Compute(_) => MESA_SHADER_COMPUTE,
@ -272,6 +277,7 @@ impl ShaderBin {
crs_size: sm.crs_size(info.max_crs_depth),
__bindgen_anon_1: match &info.stage {
ShaderStageInfo::Compute(cs_info) => {
shared_mem = cs_info.smem_size;
nak_shader_info__bindgen_ty_1 {
cs: nak_shader_info__bindgen_ty_1__bindgen_ty_1 {
local_size: [
@ -368,6 +374,7 @@ impl ShaderBin {
eprintln!("Fills from reg: {}", c_info.num_fills_from_reg);
eprintln!("Num GPRs: {}", c_info.num_gprs);
eprintln!("SLM size: {}", c_info.slm_size);
eprintln!("Shared size: {shared_mem}");
if c_info.stage != MESA_SHADER_COMPUTE {
eprint_hex("Header", &c_info.hdr);
@ -446,7 +453,12 @@ fn nak_compile_shader_internal(
Some(unsafe { &*fs_key })
};
let sm = ShaderModelInfo::new(nak.sm, nak.warps_per_sm);
let sm = ShaderModelInfo::new(
nak.sm,
nak.warps_per_sm,
nak.blocks_per_sm,
nak.max_shared_mem,
);
let mut s = nak_shader_from_nir(nak, nir, &sm);
if DEBUG.print() {

View file

@ -37,8 +37,12 @@ impl RunSingleton {
let run = Runner::new(dev_id);
let sm_nr = run.dev_info().sm;
let sm =
ShaderModelInfo::new(sm_nr, run.dev_info().max_warps_per_mp);
let sm = ShaderModelInfo::new(
sm_nr,
run.dev_info().max_warps_per_mp,
0,
0,
);
RunSingleton { sm, run }
})
}

View file

@ -9774,11 +9774,23 @@ pub trait ShaderModel {
/// Static per-SM hardware limits used for occupancy calculations.
///
/// NOTE(review): the diff rendering left the removed two-argument `new()`
/// interleaved with the added four-argument one; this is the resolved
/// post-merge version.
pub struct ShaderModelInfo {
    // SM version (e.g. 75 for Turing; see max_blocks_per_mp_for_sm())
    sm: u8,
    // Maximum resident warps per SM (from dev.max_warps_per_mp)
    warps_per_sm: u8,
    // Maximum resident blocks (CTAs) per SM (from dev.max_blocks_per_mp);
    // callers may pass 0 to disable the block-based limit
    blocks_per_sm: u8,
    // Shared memory available per SM, in bytes (computed as kB * 1024 at
    // compiler creation); 0 disables the shared-memory-based limit
    shared_mem_per_sm: u32,
}

impl ShaderModelInfo {
    /// Creates a `ShaderModelInfo` from per-device limits.
    pub fn new(
        sm: u8,
        warps_per_sm: u8,
        blocks_per_sm: u8,
        shared_mem_per_sm: u32,
    ) -> Self {
        ShaderModelInfo {
            sm,
            warps_per_sm,
            blocks_per_sm,
            shared_mem_per_sm,
        }
    }
}
@ -9899,13 +9911,31 @@ pub fn gpr_limit_from_local_size(local_size: &[u16; 3]) -> u32 {
min(out, 255)
}
pub fn max_warps_per_sm(sm: &ShaderModelInfo, gprs: u32) -> u32 {
// TODO: Take local_size and shared mem limit into account for compute
pub fn max_warps_per_sm(
sm: &ShaderModelInfo,
gprs: u32,
shared_mem: u16,
block_size: u16,
) -> u32 {
let total_regs: u32 = 65536;
// GPRs are allocated in multiples of 8
let gprs = max(gprs, 1);
let gprs = gprs.next_multiple_of(8);
let max_warps = prev_multiple_of((total_regs / 32) / gprs, 4);
let mut max_warps = prev_multiple_of((total_regs / 32) / gprs, 4);
let block_size = u32::from(block_size.next_multiple_of(32));
// Next we limit the warps according to our available blocks
if block_size != 0 {
max_warps =
max_warps.min((block_size * u32::from(sm.blocks_per_sm)) / 32);
}
// Next we limit the warps according to our available shared memory
if shared_mem != 0 && block_size != 0 {
let max_blocks = sm.shared_mem_per_sm / u32::from(shared_mem);
max_warps = max_warps.min((max_blocks * block_size) / 32)
}
min(max_warps, sm.warps_per_sm.into())
}
@ -10071,10 +10101,21 @@ impl Shader<'_> {
self.info.writes_global_mem = writes_global_mem;
self.info.uses_fp64 = uses_fp64;
self.info.max_warps_per_sm = max_warps_per_sm(
self.sm,
self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(),
);
if let ShaderStageInfo::Compute(compute) = &self.info.stage {
self.info.max_warps_per_sm = max_warps_per_sm(
self.sm,
self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(),
compute.smem_size,
compute.local_size.iter().product(),
);
} else {
self.info.max_warps_per_sm = max_warps_per_sm(
self.sm,
self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(),
0,
0,
);
}
if self.sm.sm() >= 50 {
if let ShaderStageInfo::Vertex(vertex_info) = &mut self.info.stage {

View file

@ -87,7 +87,7 @@ fn disassemble_instrs(instrs: Vec<Instr>, sm: u8) -> Vec<String> {
io: ShaderIoInfo::None,
};
let sm = ShaderModelInfo::new(sm, 0);
let sm = ShaderModelInfo::new(sm, 0, 0, 0);
let s = Shader {
sm: &sm,
info: info,

View file

@ -26,9 +26,14 @@ const TARGET_FREE: i32 = 4;
/// one more register causes occupancy to plummet. This function figures out how
/// many GPRs you can use without costing occupancy, assuming you always need at
/// least `x` GPRs.
fn next_occupancy_cliff(sm: &ShaderModelInfo, x: u32) -> u32 {
fn next_occupancy_cliff(
sm: &ShaderModelInfo,
x: u32,
shared_mem: u16,
block_size: u16,
) -> u32 {
let total_regs: u32 = 65536;
let threads = max_warps_per_sm(sm, x) * 32;
let threads = max_warps_per_sm(sm, x, shared_mem, block_size) * 32;
// This function doesn't actually model the maximum number of registers
// correctly - callers need to worry about that separately. We do,
@ -42,12 +47,18 @@ fn next_occupancy_cliff(sm: &ShaderModelInfo, x: u32) -> u32 {
#[test]
fn test_next_occupancy_cliff() {
for max_hw_warps in [32, 48, 64] {
let sm = ShaderModelInfo::new(75, max_hw_warps);
let sm = ShaderModelInfo::new(75, max_hw_warps, 0, 100 * 1024);
for x in 0..255 {
let y = next_occupancy_cliff(&sm, x);
let y = next_occupancy_cliff(&sm, x, 0, 0);
assert!(y >= x);
assert_eq!(max_warps_per_sm(&sm, x), max_warps_per_sm(&sm, y));
assert!(max_warps_per_sm(&sm, y) > max_warps_per_sm(&sm, y + 1));
assert_eq!(
max_warps_per_sm(&sm, x, 0, 0),
max_warps_per_sm(&sm, y, 0, 0)
);
assert!(
max_warps_per_sm(&sm, y, 0, 0)
> max_warps_per_sm(&sm, y + 1, 0, 0)
);
}
}
}
@ -56,10 +67,14 @@ fn next_occupancy_cliff_with_reserved(
sm: &ShaderModelInfo,
gprs: i32,
reserved: i32,
shared_mem: u16,
block_size: u16,
) -> i32 {
i32::try_from(next_occupancy_cliff(
sm,
(gprs + reserved).try_into().unwrap(),
shared_mem,
block_size,
))
.unwrap()
- reserved
@ -751,11 +766,18 @@ fn get_schedule_types(
min_gpr_target: i32,
max_gpr_target: i32,
reserved_gprs: i32,
shared_mem: u16,
block_size: u16,
) -> Vec<ScheduleType> {
let mut out = Vec::new();
let mut gpr_target =
next_occupancy_cliff_with_reserved(sm, min_gpr_target, reserved_gprs);
let mut gpr_target = next_occupancy_cliff_with_reserved(
sm,
min_gpr_target,
reserved_gprs,
shared_mem,
block_size,
);
while gpr_target < max_regs[RegFile::GPR] {
out.push(ScheduleType::RegLimit(gpr_target.try_into().unwrap()));
@ -770,6 +792,8 @@ fn get_schedule_types(
sm,
gpr_target + 1,
reserved_gprs,
shared_mem,
block_size,
);
}
@ -792,6 +816,8 @@ impl Function {
&mut self,
sm: &ShaderModelInfo,
max_regs: PerRegFile<i32>,
shared_mem: u16,
block_size: u16,
) {
let liveness = SimpleLiveness::for_function(self);
let mut live_out_sets: Vec<LiveSet> = Vec::new();
@ -912,6 +938,8 @@ impl Function {
min_gpr_target,
max_gpr_target,
reserved_gprs,
shared_mem,
block_size,
);
schedule_types.reverse();
@ -1025,8 +1053,17 @@ impl Shader<'_> {
}
max_regs[RegFile::GPR] -= SW_RESERVED_GPRS;
let mut shared_mem = 0;
let mut block_size = 0;
if let ShaderStageInfo::Compute(compute) = &self.info.stage {
shared_mem = compute.smem_size;
block_size = compute.local_size.iter().product();
}
for f in &mut self.functions {
f.opt_instr_sched_prepass(self.sm, max_regs);
f.opt_instr_sched_prepass(
self.sm, max_regs, shared_mem, block_size,
);
}
}
}

View file

@ -20,6 +20,8 @@ bool nak_debug_no_ugpr(void);
/* Per-device NAK compiler state, created once per physical device. */
struct nak_compiler {
/* SM version of the target GPU */
uint8_t sm;
/* Maximum resident warps per SM (from nv_device_info::max_warps_per_mp) */
uint8_t warps_per_sm;
/* Maximum resident blocks per SM (from nv_device_info::max_blocks_per_mp) */
uint8_t blocks_per_sm;
/* Largest shared-memory configuration, in bytes (largest sm_smem_sizes_kB
 * entry * 1024)
 */
uint32_t max_shared_mem;
/* NIR compiler options handed to the frontend */
struct nir_shader_compiler_options nir_options;
};

View file

@ -55,6 +55,7 @@ struct nv_device_info {
uint16_t tpc_count;
uint8_t mp_per_tpc;
uint8_t max_warps_per_mp;
uint8_t max_blocks_per_mp;
bool has_transfer_queue;

View file

@ -1309,6 +1309,18 @@ nvk_shader_get_executable_statistics(
stat->value.u64 = shader->info.slm_size;
}
uint16_t smem_size = 0;
if (shader->info.stage == MESA_SHADER_COMPUTE)
smem_size = shader->info.cs.smem_size;
vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Shared size");
WRITE_STR(stat->description,
"Size of shader shared memory, in bytes");
stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
stat->value.u64 = smem_size;
}
return vk_outarray_status(&out);
}

View file

@ -159,6 +159,49 @@ max_warps_per_mp_for_sm(uint8_t sm)
}
}
/* Returns the maximum number of resident blocks (CTAs) per multiprocessor
 * for the given SM version.
 *
 * Values taken from CUDA programming guide section "Compute Capabilities".
 * Unknown SM versions assert and fall back to the smallest known value so
 * occupancy estimates stay conservative.
 *
 * Fix: corrected the typo "unkown" in the assert message.
 */
static uint8_t
max_blocks_per_mp_for_sm(uint8_t sm)
{
   /* Values taken from CUDA programming guide section "Compute Capabilities" */
   switch (sm) {
   case 10:
   case 11:
   case 12:
   case 13:
   case 20:
   case 21:
      return 8;
   case 30:
   case 32:
   case 35:
   case 37:
   case 75:
   case 86:
   case 87:
      return 16;
   case 89:
   case 110:
   case 120:
      return 24;
   case 50:
   case 52:
   case 53:
   case 60:
   case 61:
   case 62:
   case 70:
   case 72:
   case 80:
   case 90:
   case 100:
      return 32;
   default:
      assert(!"unknown SM version");
      /* return the smallest known value */
      return 8;
   }
}
static uint8_t
mp_per_tpc_for_chipset(uint16_t chipset)
{
@ -538,6 +581,7 @@ nouveau_ws_device_new(drmDevicePtr drm_device)
// for now we hardcode those values, but in the future Nouveau could provide that information to
// us instead.
device->info.max_warps_per_mp = max_warps_per_mp_for_sm(device->info.sm);
device->info.max_blocks_per_mp = max_blocks_per_mp_for_sm(device->info.sm);
device->info.mp_per_tpc = mp_per_tpc_for_chipset(device->info.chipset);
/* Transfer queues require two kernel fixes: