rusticl/kernel: make use of cso info

Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19855>
Karol Herbst <kherbst@redhat.com> authored 2022-11-18 15:51:18 +01:00, committed by Marge Bot
parent c7dd3677dc
commit ac993ae828
4 changed files with 76 additions and 50 deletions
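
For context: the "cso info" used here is the pipe_compute_state_object_info struct a gallium driver fills in for a compiled compute state object (CSO). Below is a minimal illustrative sketch, not code from the tree, of the resulting behaviour: per-kernel limits now come from the driver-reported info rather than from device-wide defaults such as dev.subgroups(). ComputeStateInfo and KernelDevLimits are hypothetical stand-in names.

// Hypothetical stand-in for the generated pipe_compute_state_object_info binding;
// only the fields this commit consumes are shown.
#[derive(Default, Clone, Copy)]
struct ComputeStateInfo {
    max_threads: u32,
    preferred_simd_size: u32,
    private_memory: u32,
}

// Hypothetical per-device kernel state: the CSO info is queried once after the
// compute state is compiled and then answers the CL work-group queries.
struct KernelDevLimits {
    info: ComputeStateInfo,
}

impl KernelDevLimits {
    fn max_threads_per_block(&self) -> usize {
        self.info.max_threads as usize // feeds CL_KERNEL_WORK_GROUP_SIZE
    }
    fn preferred_simd_size(&self) -> usize {
        self.info.preferred_simd_size as usize // feeds CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
    }
    fn priv_mem_size(&self) -> u64 {
        u64::from(self.info.private_memory) // feeds CL_KERNEL_PRIVATE_MEM_SIZE
    }
}

fn main() {
    let limits = KernelDevLimits {
        info: ComputeStateInfo { max_threads: 1024, preferred_simd_size: 32, private_memory: 256 },
    };
    assert_eq!(limits.max_threads_per_block(), 1024);
    assert_eq!(limits.preferred_simd_size(), 32);
    assert_eq!(limits.priv_mem_size(), 256);
}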

View file

@@ -89,11 +89,10 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
             CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size),
             CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(&dev)),
             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
-                cl_prop::<usize>(dev.subgroups() as usize)
+                cl_prop::<usize>(kernel.preferred_simd_size(&dev))
             }
             CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(&dev)),
-            // TODO
-            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(dev.subgroups() as usize),
+            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(&dev)),
             // CL_INVALID_VALUE if param_name is not one of the supported values
             _ => return Err(CL_INVALID_VALUE),
         })

View file

@@ -75,6 +75,7 @@ pub trait HelperContextWrapper {
     fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
     fn delete_compute_state(&self, cso: *mut c_void);
+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;

     fn unmap(&self, tx: PipeTransfer);
 }
@@ -159,6 +160,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
         self.lock.delete_compute_state(cso)
     }

+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        self.lock.compute_state_info(state)
+    }
+
     fn unmap(&self, tx: PipeTransfer) {
         tx.with_ctx(&self.lock);
     }

View file

@@ -258,6 +258,7 @@ struct KernelDevStateInner {
     nir: NirShader,
     constant_buffer: Option<Arc<PipeResource>>,
     cso: *mut c_void,
+    info: pipe_compute_state_object_info,
 }

 struct KernelDevState {
@@ -279,21 +280,25 @@ impl KernelDevState {
         let states = nirs
             .into_iter()
             .map(|(dev, nir)| {
-                let cso = if dev.shareable_shaders() {
-                    dev.helper_ctx()
-                        .create_compute_state(&nir, nir.shared_size())
-                } else {
-                    ptr::null_mut()
-                };
+                let mut cso = dev
+                    .helper_ctx()
+                    .create_compute_state(&nir, nir.shared_size());
+                let info = dev.helper_ctx().compute_state_info(cso);
                 let cb = Self::create_nir_constant_buffer(&dev, &nir);

+                // if we can't share the cso between threads, destroy it now.
+                if !dev.shareable_shaders() {
+                    dev.helper_ctx().delete_compute_state(cso);
+                    cso = ptr::null_mut();
+                };
+
                 (
                     dev,
                     KernelDevStateInner {
                         nir: nir,
                         constant_buffer: cb,
                         cso: cso,
+                        info: info,
                     },
                 )
             })
@@ -829,44 +834,6 @@ fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] {
     val.try_into().unwrap()
 }

-fn optimize_local_size(d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
-    let mut threads = d.max_threads_per_block() as u32;
-    let dim_threads = d.max_block_sizes();
-    let subgroups = d.subgroups();
-
-    if !block.contains(&0) {
-        for i in 0..3 {
-            // we already made sure everything is fine
-            grid[i] /= block[i];
-        }
-        return;
-    }
-
-    for i in 0..3 {
-        let t = cmp::min(threads, dim_threads[i] as u32);
-        let gcd = gcd(t, grid[i]);
-
-        block[i] = gcd;
-        grid[i] /= gcd;
-
-        // update limits
-        threads /= block[i];
-    }
-
-    // if we didn't fill the subgroup we can do a bit better if we have threads remaining
-    let total_threads = block[0] * block[1] * block[2];
-    if threads != 1 && total_threads < subgroups {
-        for i in 0..3 {
-            if grid[i] * total_threads < threads {
-                block[i] *= grid[i];
-                grid[i] = 1;
-                // can only do it once as nothing is cleanly divisible
-                break;
-            }
-        }
-    }
-}
-
 impl Kernel {
     pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
         let (mut nirs, args, internal_args, attributes_string) =
@@ -895,6 +862,44 @@ impl Kernel {
         })
     }

+    fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
+        let mut threads = self.max_threads_per_block(d) as u32;
+        let dim_threads = d.max_block_sizes();
+        let subgroups = self.preferred_simd_size(d) as u32;
+
+        if !block.contains(&0) {
+            for i in 0..3 {
+                // we already made sure everything is fine
+                grid[i] /= block[i];
+            }
+            return;
+        }
+
+        for i in 0..3 {
+            let t = cmp::min(threads, dim_threads[i] as u32);
+            let gcd = gcd(t, grid[i]);
+
+            block[i] = gcd;
+            grid[i] /= gcd;
+
+            // update limits
+            threads /= block[i];
+        }
+
+        // if we didn't fill the subgroup we can do a bit better if we have threads remaining
+        let total_threads = block[0] * block[1] * block[2];
+        if threads != 1 && total_threads < subgroups {
+            for i in 0..3 {
+                if grid[i] * total_threads < threads {
+                    block[i] *= grid[i];
+                    grid[i] = 1;
+                    // can only do it once as nothing is cleanly divisible
+                    break;
+                }
+            }
+        }
+    }
+
     // the painful part is, that host threads are allowed to modify the kernel object once it was
     // enqueued, so return a closure with all req data included.
     pub fn launch(
@@ -928,7 +933,7 @@
             &[0; 4]
         };

-        optimize_local_size(&q.device, &mut grid, &mut block);
+        self.optimize_local_size(&q.device, &mut grid, &mut block);

         for (arg, val) in self.args.iter().zip(&self.values) {
             if arg.dead {
@@ -1225,7 +1230,15 @@
     }

     pub fn priv_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
-        self.dev_state.get(dev).nir.scratch_size() as cl_ulong
+        self.dev_state.get(dev).info.private_memory.into()
     }

+    pub fn max_threads_per_block(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.max_threads as usize
+    }
+
+    pub fn preferred_simd_size(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.preferred_simd_size as usize
+    }
+
     pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
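
As an illustrative aside: a standalone sketch of the local-size heuristic added above (optimize_local_size), with the kernel and device limits replaced by hard-coded example values (max_threads_per_block = 256, per-dimension limits [256, 256, 64], preferred SIMD width 32). optimize_local_size_example and the main driver are hypothetical names for this sketch only, not code from the tree.

// Euclidean gcd, as used by the heuristic.
fn gcd(mut a: u32, mut b: u32) -> u32 {
    while b != 0 {
        let t = b;
        b = a % b;
        a = t;
    }
    a
}

fn optimize_local_size_example(grid: &mut [u32; 3], block: &mut [u32; 3]) {
    let mut threads = 256u32;            // stand-in for kernel.max_threads_per_block(dev)
    let dim_threads = [256u32, 256, 64]; // stand-in for dev.max_block_sizes()
    let subgroups = 32u32;               // stand-in for kernel.preferred_simd_size(dev)

    // a fully specified local size is taken as-is; only convert the global size to block counts
    if !block.contains(&0) {
        for i in 0..3 {
            grid[i] /= block[i];
        }
        return;
    }

    // per-dimension gcd pass: pick the largest block size that divides the grid
    // and still fits the remaining thread budget
    for i in 0..3 {
        let t = std::cmp::min(threads, dim_threads[i]);
        let g = gcd(t, grid[i]);
        block[i] = g;
        grid[i] /= g;
        threads /= block[i];
    }

    // if the block underfills a subgroup and budget remains, fold one whole grid
    // dimension into the block
    let total_threads = block[0] * block[1] * block[2];
    if threads != 1 && total_threads < subgroups {
        for i in 0..3 {
            if grid[i] * total_threads < threads {
                block[i] *= grid[i];
                grid[i] = 1;
                break;
            }
        }
    }
}

fn main() {
    // grid = [6, 3, 1]: the gcd pass only yields block = [2, 1, 1], which underfills
    // the 32-wide subgroup, so the tail loop folds the whole x dimension into the block.
    let (mut grid, mut block) = ([6u32, 3, 1], [0u32; 3]);
    optimize_local_size_example(&mut grid, &mut block);
    assert_eq!(block, [6, 1, 1]);
    assert_eq!(grid, [1, 3, 1]);
}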

View file

@@ -319,6 +319,14 @@ impl PipeContext {
         unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
     }

+    pub fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        let mut info = pipe_compute_state_object_info::default();
+        unsafe {
+            self.pipe.as_ref().get_compute_state_info.unwrap()(self.pipe.as_ptr(), state, &mut info)
+        }
+        info
+    }
+
     pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
         unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
     }
@@ -530,6 +538,7 @@ fn has_required_cbs(context: &pipe_context) -> bool {
         & has_required_feature!(context, delete_compute_state)
         & has_required_feature!(context, delete_sampler_state)
         & has_required_feature!(context, flush)
+        & has_required_feature!(context, get_compute_state_info)
         & has_required_feature!(context, launch_grid)
         & has_required_feature!(context, memory_barrier)
         & has_required_feature!(context, resource_copy_region)