rusticl/kernel: make use of cso info

Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19855>

parent c7dd3677dc
commit ac993ae828

4 changed files with 76 additions and 50 deletions
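
This change replaces rusticl's device-level guesses (dev.subgroups() for the work-group queries, the NIR scratch size for private memory) with values queried from the compiled compute state object (CSO). The queries land in a pipe_compute_state_object_info; as orientation, the fields the diff consumes look roughly like the sketch below. The authoritative definition lives in gallium's p_state.h and reaches rusticl through generated bindings, so the names match the diff but the u32 types and the omitted rest of the struct are assumptions.

    // Illustrative sketch only: the real struct is defined in gallium's
    // p_state.h and bindgen-generated for rusticl. Only the fields this
    // diff reads are shown; the u32 types are an assumption.
    #[allow(non_camel_case_types)]
    #[derive(Default, Clone, Copy)]
    #[repr(C)]
    pub struct pipe_compute_state_object_info {
        pub max_threads: u32,         // max threads per block the driver accepts for this CSO
        pub preferred_simd_size: u32, // SIMD width the compiled kernel prefers
        pub private_memory: u32,      // private (scratch) memory per invocation, in bytes
        // ... remaining fields omitted
    }
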
@@ -89,11 +89,10 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
             CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size),
             CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(&dev)),
             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
-                cl_prop::<usize>(dev.subgroups() as usize)
+                cl_prop::<usize>(kernel.preferred_simd_size(&dev))
             }
             CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(&dev)),
-            // TODO
-            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(dev.subgroups() as usize),
+            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(&dev)),
             // CL_INVALID_VALUE if param_name is not one of the supported values
             _ => return Err(CL_INVALID_VALUE),
         })

@@ -75,6 +75,7 @@ pub trait HelperContextWrapper {
     fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
     fn delete_compute_state(&self, cso: *mut c_void);
+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;

     fn unmap(&self, tx: PipeTransfer);
 }

@@ -159,6 +160,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
         self.lock.delete_compute_state(cso)
     }
+
+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        self.lock.compute_state_info(state)
+    }

     fn unmap(&self, tx: PipeTransfer) {
         tx.with_ctx(&self.lock);
     }

@@ -258,6 +258,7 @@ struct KernelDevStateInner {
     nir: NirShader,
     constant_buffer: Option<Arc<PipeResource>>,
     cso: *mut c_void,
+    info: pipe_compute_state_object_info,
 }

 struct KernelDevState {

@@ -279,21 +280,25 @@ impl KernelDevState {
         let states = nirs
             .into_iter()
             .map(|(dev, nir)| {
-                let cso = if dev.shareable_shaders() {
-                    dev.helper_ctx()
-                        .create_compute_state(&nir, nir.shared_size())
-                } else {
-                    ptr::null_mut()
-                };
+                let mut cso = dev
+                    .helper_ctx()
+                    .create_compute_state(&nir, nir.shared_size());
+                let info = dev.helper_ctx().compute_state_info(cso);
                 let cb = Self::create_nir_constant_buffer(&dev, &nir);
+
+                // if we can't share the cso between threads, destroy it now.
+                if !dev.shareable_shaders() {
+                    dev.helper_ctx().delete_compute_state(cso);
+                    cso = ptr::null_mut();
+                };
+
                 (
                     dev,
                     KernelDevStateInner {
                         nir: nir,
                         constant_buffer: cb,
                         cso: cso,
+                        info: info,
                     },
                 )
             })

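A note on the restructuring above (inferred from the diff, not stated in the commit message): compute_state_info() needs a live CSO to query, so the CSO is now created unconditionally at kernel-build time. On drivers whose shaders cannot be shared between threads, the freshly built CSO is deleted again right after the query and the field is left null, presumably to be recreated on the queue thread at launch.
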
@@ -829,44 +834,6 @@ fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] {
     val.try_into().unwrap()
 }

-fn optimize_local_size(d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
-    let mut threads = d.max_threads_per_block() as u32;
-    let dim_threads = d.max_block_sizes();
-    let subgroups = d.subgroups();
-
-    if !block.contains(&0) {
-        for i in 0..3 {
-            // we already made sure everything is fine
-            grid[i] /= block[i];
-        }
-        return;
-    }
-
-    for i in 0..3 {
-        let t = cmp::min(threads, dim_threads[i] as u32);
-        let gcd = gcd(t, grid[i]);
-
-        block[i] = gcd;
-        grid[i] /= gcd;
-
-        // update limits
-        threads /= block[i];
-    }
-
-    // if we didn't fill the subgroup we can do a bit better if we have threads remaining
-    let total_threads = block[0] * block[1] * block[2];
-    if threads != 1 && total_threads < subgroups {
-        for i in 0..3 {
-            if grid[i] * total_threads < threads {
-                block[i] *= grid[i];
-                grid[i] = 1;
-                // can only do it once as nothing is cleanly divisible
-                break;
-            }
-        }
-    }
-}
-
 impl Kernel {
     pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
         let (mut nirs, args, internal_args, attributes_string) =

@@ -895,6 +862,44 @@ impl Kernel {
         })
     }

+    fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
+        let mut threads = self.max_threads_per_block(d) as u32;
+        let dim_threads = d.max_block_sizes();
+        let subgroups = self.preferred_simd_size(d) as u32;
+
+        if !block.contains(&0) {
+            for i in 0..3 {
+                // we already made sure everything is fine
+                grid[i] /= block[i];
+            }
+            return;
+        }
+
+        for i in 0..3 {
+            let t = cmp::min(threads, dim_threads[i] as u32);
+            let gcd = gcd(t, grid[i]);
+
+            block[i] = gcd;
+            grid[i] /= gcd;
+
+            // update limits
+            threads /= block[i];
+        }
+
+        // if we didn't fill the subgroup we can do a bit better if we have threads remaining
+        let total_threads = block[0] * block[1] * block[2];
+        if threads != 1 && total_threads < subgroups {
+            for i in 0..3 {
+                if grid[i] * total_threads < threads {
+                    block[i] *= grid[i];
+                    grid[i] = 1;
+                    // can only do it once as nothing is cleanly divisible
+                    break;
+                }
+            }
+        }
+    }
+
     // the painful part is, that host threads are allowed to modify the kernel object once it was
     // enqueued, so return a closure with all req data included.
     pub fn launch(

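To make the heuristic above concrete, here is a self-contained sketch of the same gcd walk, with the device and kernel limits passed in as plain values so it runs outside Mesa; all numbers are hypothetical. Per dimension it takes the largest divisor of the grid that fits the remaining thread budget, then, if the chosen block underfills the preferred SIMD width, folds in one whole dimension that still fits.

    // Standalone sketch of the gcd-based local-size heuristic from this diff.
    // Limits are plain parameters here; in rusticl they come from
    // kernel.max_threads_per_block(dev), dev.max_block_sizes() and
    // kernel.preferred_simd_size(dev).
    fn gcd(mut a: u32, mut b: u32) -> u32 {
        while b != 0 {
            (a, b) = (b, a % b);
        }
        a
    }

    fn optimize_local_size(
        mut threads: u32,      // thread budget per block
        dim_threads: [u32; 3], // per-dimension block size caps
        subgroups: u32,        // preferred SIMD width
        grid: &mut [u32; 3],
        block: &mut [u32; 3],
    ) {
        // the application already picked a block size: just divide it out
        if !block.contains(&0) {
            for i in 0..3 {
                grid[i] /= block[i];
            }
            return;
        }

        // largest divisor of each grid dimension that fits the remaining budget
        for i in 0..3 {
            let t = threads.min(dim_threads[i]);
            let g = gcd(t, grid[i]);
            block[i] = g;
            grid[i] /= g;
            threads /= g;
        }

        // underfilled subgroup: fold in one whole dimension if it still fits
        let total = block[0] * block[1] * block[2];
        if threads != 1 && total < subgroups {
            for i in 0..3 {
                if grid[i] * total < threads {
                    block[i] *= grid[i];
                    grid[i] = 1;
                    break; // only one fold is cleanly divisible
                }
            }
        }
    }

    fn main() {
        let mut grid = [3, 100, 1];
        let mut block = [0; 3];
        optimize_local_size(256, [1024, 1024, 64], 32, &mut grid, &mut block);
        assert_eq!(block, [3, 4, 1]);
        assert_eq!(grid, [1, 25, 1]);
    }

The gcd pass alone would stop at a 1x4x1 block (gcd(256, 3) is 1), well under the SIMD width of 32; the fold-in pass then absorbs the whole x extent of 3, giving 3x4x1 with a 1x25x1 grid.
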
@@ -928,7 +933,7 @@ impl Kernel {
             &[0; 4]
         };

-        optimize_local_size(&q.device, &mut grid, &mut block);
+        self.optimize_local_size(&q.device, &mut grid, &mut block);

         for (arg, val) in self.args.iter().zip(&self.values) {
             if arg.dead {

@@ -1225,7 +1230,15 @@ impl Kernel {
     }

     pub fn priv_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
-        self.dev_state.get(dev).nir.scratch_size() as cl_ulong
+        self.dev_state.get(dev).info.private_memory.into()
     }
+
+    pub fn max_threads_per_block(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.max_threads as usize
+    }
+
+    pub fn preferred_simd_size(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.preferred_simd_size as usize
+    }

     pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {

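The upshot for the clGetKernelWorkGroupInfo path in the first hunk: CL_KERNEL_WORK_GROUP_SIZE, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE and CL_KERNEL_PRIVATE_MEM_SIZE now report per-kernel compiler results (a register-heavy kernel can legitimately report a smaller maximum work-group size) rather than device-wide constants and the NIR scratch size.
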
@@ -319,6 +319,14 @@ impl PipeContext {
         unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
     }

+    pub fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        let mut info = pipe_compute_state_object_info::default();
+        unsafe {
+            self.pipe.as_ref().get_compute_state_info.unwrap()(self.pipe.as_ptr(), state, &mut info)
+        }
+        info
+    }
+
     pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
         unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
     }

@@ -530,6 +538,7 @@ fn has_required_cbs(context: &pipe_context) -> bool {
         & has_required_feature!(context, delete_compute_state)
         & has_required_feature!(context, delete_sampler_state)
         & has_required_feature!(context, flush)
+        & has_required_feature!(context, get_compute_state_info)
         & has_required_feature!(context, launch_grid)
         & has_required_feature!(context, memory_barrier)
         & has_required_feature!(context, resource_copy_region)

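Note that adding get_compute_state_info to has_required_cbs() makes it a hard requirement: a gallium driver missing this callback will no longer pass rusticl's context capability check.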