mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 02:38:04 +02:00
Merge branch 'nak/shared_mem_tracking' into 'main'
nak: report shared memory usage and adjust SM/warp metric. See merge request mesa/mesa!41249.
This commit is contained in:
commit
161bedcbcb
9 changed files with 175 additions and 22 deletions
|
|
@ -200,6 +200,10 @@ pub extern "C" fn nak_compiler_create(
|
|||
let nak = Box::new(nak_compiler {
|
||||
sm: dev.sm,
|
||||
warps_per_sm: dev.max_warps_per_mp,
|
||||
blocks_per_sm: dev.max_blocks_per_mp,
|
||||
max_shared_mem: u32::from(
|
||||
dev.sm_smem_sizes_kB[usize::from(dev.sm_smem_size_count) - 1],
|
||||
) * 1024,
|
||||
nir_options: nir_options(dev),
|
||||
});
|
||||
|
||||
|
|
@ -243,6 +247,7 @@ impl ShaderBin {
|
|||
let asm = CString::new(asm)
|
||||
.expect("NAK assembly has unexpected null characters");
|
||||
|
||||
let mut shared_mem = 0;
|
||||
let c_info = nak_shader_info {
|
||||
stage: match info.stage {
|
||||
ShaderStageInfo::Compute(_) => MESA_SHADER_COMPUTE,
|
||||
|
|
@ -272,6 +277,7 @@ impl ShaderBin {
|
|||
crs_size: sm.crs_size(info.max_crs_depth),
|
||||
__bindgen_anon_1: match &info.stage {
|
||||
ShaderStageInfo::Compute(cs_info) => {
|
||||
shared_mem = cs_info.smem_size;
|
||||
nak_shader_info__bindgen_ty_1 {
|
||||
cs: nak_shader_info__bindgen_ty_1__bindgen_ty_1 {
|
||||
local_size: [
|
||||
|
|
@ -368,6 +374,7 @@ impl ShaderBin {
|
|||
eprintln!("Fills from reg: {}", c_info.num_fills_from_reg);
|
||||
eprintln!("Num GPRs: {}", c_info.num_gprs);
|
||||
eprintln!("SLM size: {}", c_info.slm_size);
|
||||
eprintln!("Shared size: {shared_mem}");
|
||||
|
||||
if c_info.stage != MESA_SHADER_COMPUTE {
|
||||
eprint_hex("Header", &c_info.hdr);
|
||||
|
|
@ -446,7 +453,12 @@ fn nak_compile_shader_internal(
|
|||
Some(unsafe { &*fs_key })
|
||||
};
|
||||
|
||||
let sm = ShaderModelInfo::new(nak.sm, nak.warps_per_sm);
|
||||
let sm = ShaderModelInfo::new(
|
||||
nak.sm,
|
||||
nak.warps_per_sm,
|
||||
nak.blocks_per_sm,
|
||||
nak.max_shared_mem,
|
||||
);
|
||||
let mut s = nak_shader_from_nir(nak, nir, &sm);
|
||||
|
||||
if DEBUG.print() {
|
||||
|
|
|
|||
|
|
@ -37,8 +37,12 @@ impl RunSingleton {
|
|||
|
||||
let run = Runner::new(dev_id);
|
||||
let sm_nr = run.dev_info().sm;
|
||||
let sm =
|
||||
ShaderModelInfo::new(sm_nr, run.dev_info().max_warps_per_mp);
|
||||
let sm = ShaderModelInfo::new(
|
||||
sm_nr,
|
||||
run.dev_info().max_warps_per_mp,
|
||||
0,
|
||||
0,
|
||||
);
|
||||
RunSingleton { sm, run }
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9774,11 +9774,23 @@ pub trait ShaderModel {
|
|||
/// Per-device limits of the shader model (SM) that a shader is compiled for.
pub struct ShaderModelInfo {
    /// Shader model number (callers pass values such as 75).
    sm: u8,
    /// Maximum number of warps per SM.
    warps_per_sm: u8,
    /// Maximum number of blocks per SM.
    blocks_per_sm: u8,
    /// Shared memory available per SM, in bytes.
    shared_mem_per_sm: u32,
}

impl ShaderModelInfo {
    /// Builds a `ShaderModelInfo` from the raw device limits.
    ///
    /// * `sm` - shader model number
    /// * `warps_per_sm` - maximum number of warps per SM
    /// * `blocks_per_sm` - maximum number of blocks per SM
    /// * `shared_mem_per_sm` - shared memory per SM, in bytes
    ///
    /// The values are stored as-is; no validation is performed.
    pub fn new(
        sm: u8,
        warps_per_sm: u8,
        blocks_per_sm: u8,
        shared_mem_per_sm: u32,
    ) -> Self {
        ShaderModelInfo {
            sm,
            warps_per_sm,
            blocks_per_sm,
            shared_mem_per_sm,
        }
    }
}
|
||||
|
||||
|
|
@ -9899,13 +9911,31 @@ pub fn gpr_limit_from_local_size(local_size: &[u16; 3]) -> u32 {
|
|||
min(out, 255)
|
||||
}
|
||||
|
||||
pub fn max_warps_per_sm(sm: &ShaderModelInfo, gprs: u32) -> u32 {
|
||||
// TODO: Take local_size and shared mem limit into account for compute
|
||||
pub fn max_warps_per_sm(
|
||||
sm: &ShaderModelInfo,
|
||||
gprs: u32,
|
||||
shared_mem: u16,
|
||||
block_size: u16,
|
||||
) -> u32 {
|
||||
let total_regs: u32 = 65536;
|
||||
// GPRs are allocated in multiples of 8
|
||||
let gprs = max(gprs, 1);
|
||||
let gprs = gprs.next_multiple_of(8);
|
||||
let max_warps = prev_multiple_of((total_regs / 32) / gprs, 4);
|
||||
let mut max_warps = prev_multiple_of((total_regs / 32) / gprs, 4);
|
||||
let block_size = u32::from(block_size.next_multiple_of(32));
|
||||
|
||||
// Next we limit the warps according to our available blocks
|
||||
if block_size != 0 {
|
||||
max_warps =
|
||||
max_warps.min((block_size * u32::from(sm.blocks_per_sm)) / 32);
|
||||
}
|
||||
|
||||
// Next we limit the warps according to our available shared memory
|
||||
if shared_mem != 0 && block_size != 0 {
|
||||
let max_blocks = sm.shared_mem_per_sm / u32::from(shared_mem);
|
||||
max_warps = max_warps.min((max_blocks * block_size) / 32)
|
||||
}
|
||||
|
||||
min(max_warps, sm.warps_per_sm.into())
|
||||
}
|
||||
|
||||
|
|
@ -10071,10 +10101,21 @@ impl Shader<'_> {
|
|||
self.info.writes_global_mem = writes_global_mem;
|
||||
self.info.uses_fp64 = uses_fp64;
|
||||
|
||||
self.info.max_warps_per_sm = max_warps_per_sm(
|
||||
self.sm,
|
||||
self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(),
|
||||
);
|
||||
if let ShaderStageInfo::Compute(compute) = &self.info.stage {
|
||||
self.info.max_warps_per_sm = max_warps_per_sm(
|
||||
self.sm,
|
||||
self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(),
|
||||
compute.smem_size,
|
||||
compute.local_size.iter().product(),
|
||||
);
|
||||
} else {
|
||||
self.info.max_warps_per_sm = max_warps_per_sm(
|
||||
self.sm,
|
||||
self.info.num_gprs as u32 + self.sm.hw_reserved_gprs(),
|
||||
0,
|
||||
0,
|
||||
);
|
||||
}
|
||||
|
||||
if self.sm.sm() >= 50 {
|
||||
if let ShaderStageInfo::Vertex(vertex_info) = &mut self.info.stage {
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ fn disassemble_instrs(instrs: Vec<Instr>, sm: u8) -> Vec<String> {
|
|||
io: ShaderIoInfo::None,
|
||||
};
|
||||
|
||||
let sm = ShaderModelInfo::new(sm, 0);
|
||||
let sm = ShaderModelInfo::new(sm, 0, 0, 0);
|
||||
let s = Shader {
|
||||
sm: &sm,
|
||||
info: info,
|
||||
|
|
|
|||
|
|
@ -26,9 +26,14 @@ const TARGET_FREE: i32 = 4;
|
|||
/// one more register causes occupancy to plummet. This function figures out how
|
||||
/// many GPRs you can use without costing occupancy, assuming you always need at
|
||||
/// least `x` GPRs.
|
||||
fn next_occupancy_cliff(sm: &ShaderModelInfo, x: u32) -> u32 {
|
||||
fn next_occupancy_cliff(
|
||||
sm: &ShaderModelInfo,
|
||||
x: u32,
|
||||
shared_mem: u16,
|
||||
block_size: u16,
|
||||
) -> u32 {
|
||||
let total_regs: u32 = 65536;
|
||||
let threads = max_warps_per_sm(sm, x) * 32;
|
||||
let threads = max_warps_per_sm(sm, x, shared_mem, block_size) * 32;
|
||||
|
||||
// This function doesn't actually model the maximum number of registers
|
||||
// correctly - callers need to worry about that separately. We do,
|
||||
|
|
@ -42,12 +47,18 @@ fn next_occupancy_cliff(sm: &ShaderModelInfo, x: u32) -> u32 {
|
|||
/// Checks `next_occupancy_cliff` against `max_warps_per_sm` for every GPR
/// count below 255 and several per-SM warp limits: the returned count `y`
/// must be at least `x`, must give the same occupancy as `x`, and `y + 1`
/// must give strictly lower occupancy.
///
/// Shared memory and block size are passed as 0 so only the register budget
/// is exercised.
#[test]
fn test_next_occupancy_cliff() {
    for max_hw_warps in [32, 48, 64] {
        let sm = ShaderModelInfo::new(75, max_hw_warps, 0, 100 * 1024);
        for x in 0..255 {
            let y = next_occupancy_cliff(&sm, x, 0, 0);
            assert!(y >= x);
            assert_eq!(
                max_warps_per_sm(&sm, x, 0, 0),
                max_warps_per_sm(&sm, y, 0, 0)
            );
            assert!(
                max_warps_per_sm(&sm, y, 0, 0)
                    > max_warps_per_sm(&sm, y + 1, 0, 0)
            );
        }
    }
}
|
||||
|
|
@ -56,10 +67,14 @@ fn next_occupancy_cliff_with_reserved(
|
|||
sm: &ShaderModelInfo,
|
||||
gprs: i32,
|
||||
reserved: i32,
|
||||
shared_mem: u16,
|
||||
block_size: u16,
|
||||
) -> i32 {
|
||||
i32::try_from(next_occupancy_cliff(
|
||||
sm,
|
||||
(gprs + reserved).try_into().unwrap(),
|
||||
shared_mem,
|
||||
block_size,
|
||||
))
|
||||
.unwrap()
|
||||
- reserved
|
||||
|
|
@ -751,11 +766,18 @@ fn get_schedule_types(
|
|||
min_gpr_target: i32,
|
||||
max_gpr_target: i32,
|
||||
reserved_gprs: i32,
|
||||
shared_mem: u16,
|
||||
block_size: u16,
|
||||
) -> Vec<ScheduleType> {
|
||||
let mut out = Vec::new();
|
||||
|
||||
let mut gpr_target =
|
||||
next_occupancy_cliff_with_reserved(sm, min_gpr_target, reserved_gprs);
|
||||
let mut gpr_target = next_occupancy_cliff_with_reserved(
|
||||
sm,
|
||||
min_gpr_target,
|
||||
reserved_gprs,
|
||||
shared_mem,
|
||||
block_size,
|
||||
);
|
||||
while gpr_target < max_regs[RegFile::GPR] {
|
||||
out.push(ScheduleType::RegLimit(gpr_target.try_into().unwrap()));
|
||||
|
||||
|
|
@ -770,6 +792,8 @@ fn get_schedule_types(
|
|||
sm,
|
||||
gpr_target + 1,
|
||||
reserved_gprs,
|
||||
shared_mem,
|
||||
block_size,
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -792,6 +816,8 @@ impl Function {
|
|||
&mut self,
|
||||
sm: &ShaderModelInfo,
|
||||
max_regs: PerRegFile<i32>,
|
||||
shared_mem: u16,
|
||||
block_size: u16,
|
||||
) {
|
||||
let liveness = SimpleLiveness::for_function(self);
|
||||
let mut live_out_sets: Vec<LiveSet> = Vec::new();
|
||||
|
|
@ -912,6 +938,8 @@ impl Function {
|
|||
min_gpr_target,
|
||||
max_gpr_target,
|
||||
reserved_gprs,
|
||||
shared_mem,
|
||||
block_size,
|
||||
);
|
||||
schedule_types.reverse();
|
||||
|
||||
|
|
@ -1025,8 +1053,17 @@ impl Shader<'_> {
|
|||
}
|
||||
max_regs[RegFile::GPR] -= SW_RESERVED_GPRS;
|
||||
|
||||
let mut shared_mem = 0;
|
||||
let mut block_size = 0;
|
||||
if let ShaderStageInfo::Compute(compute) = &self.info.stage {
|
||||
shared_mem = compute.smem_size;
|
||||
block_size = compute.local_size.iter().product();
|
||||
}
|
||||
|
||||
for f in &mut self.functions {
|
||||
f.opt_instr_sched_prepass(self.sm, max_regs);
|
||||
f.opt_instr_sched_prepass(
|
||||
self.sm, max_regs, shared_mem, block_size,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@ bool nak_debug_no_ugpr(void);
|
|||
struct nak_compiler {
|
||||
uint8_t sm;
|
||||
uint8_t warps_per_sm;
|
||||
uint8_t blocks_per_sm;
|
||||
uint32_t max_shared_mem;
|
||||
|
||||
struct nir_shader_compiler_options nir_options;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ struct nv_device_info {
|
|||
uint16_t tpc_count;
|
||||
uint8_t mp_per_tpc;
|
||||
uint8_t max_warps_per_mp;
|
||||
uint8_t max_blocks_per_mp;
|
||||
|
||||
bool has_transfer_queue;
|
||||
|
||||
|
|
|
|||
|
|
@ -1309,6 +1309,18 @@ nvk_shader_get_executable_statistics(
|
|||
stat->value.u64 = shader->info.slm_size;
|
||||
}
|
||||
|
||||
uint16_t smem_size = 0;
|
||||
if (shader->info.stage == MESA_SHADER_COMPUTE)
|
||||
smem_size = shader->info.cs.smem_size;
|
||||
|
||||
vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
|
||||
WRITE_STR(stat->name, "Shared size");
|
||||
WRITE_STR(stat->description,
|
||||
"Size of shader shared memory, in bytes");
|
||||
stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
|
||||
stat->value.u64 = smem_size;
|
||||
}
|
||||
|
||||
return vk_outarray_status(&out);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -159,6 +159,49 @@ max_warps_per_mp_for_sm(uint8_t sm)
|
|||
}
|
||||
}
|
||||
|
||||
/* Returns the maximum number of blocks per multiprocessor for shader model
 * `sm`.
 *
 * Unknown shader models trip an assertion; when assertions are compiled out
 * the smallest known value (8) is returned instead.
 */
static uint8_t
max_blocks_per_mp_for_sm(uint8_t sm)
{
   /* Values taken from CUDA programming guide section "Compute Capabilities" */
   switch (sm) {
   case 10:
   case 11:
   case 12:
   case 13:
   case 20:
   case 21:
      return 8;
   case 30:
   case 32:
   case 35:
   case 37:
   case 75:
   case 86:
   case 87:
      return 16;
   case 89:
   case 110:
   case 120:
      return 24;
   case 50:
   case 52:
   case 53:
   case 60:
   case 61:
   case 62:
   case 70:
   case 72:
   case 80:
   case 90:
   case 100:
      return 32;
   default:
      assert(!"unknown SM version");
      /* return the smallest known value */
      return 8;
   }
}
|
||||
|
||||
static uint8_t
|
||||
mp_per_tpc_for_chipset(uint16_t chipset)
|
||||
{
|
||||
|
|
@ -538,6 +581,7 @@ nouveau_ws_device_new(drmDevicePtr drm_device)
|
|||
// for now we hardcode those values, but in the future Nouveau could provide that information to
|
||||
// us instead.
|
||||
device->info.max_warps_per_mp = max_warps_per_mp_for_sm(device->info.sm);
|
||||
device->info.max_blocks_per_mp = max_blocks_per_mp_for_sm(device->info.sm);
|
||||
device->info.mp_per_tpc = mp_per_tpc_for_chipset(device->info.chipset);
|
||||
|
||||
/* Transfer queues require two kernel fixes:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue