rusticl/kernel: make use of cso info

Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19855>
2026-05-07 11:28:05 +02:00 · 2022-11-18 15:51:18 +01:00 · 2022-11-18 15:51:18 +01:00 · ac993ae828
commit ac993ae828
parent c7dd3677dc
4 changed files with 76 additions and 50 deletions
--- a/src/gallium/frontends/rusticl/api/kernel.rs
+++ b/src/gallium/frontends/rusticl/api/kernel.rs
@ -89,11 +89,10 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
            CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size),
            CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(&dev)),
            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
-                cl_prop::<usize>(dev.subgroups() as usize)
+                cl_prop::<usize>(kernel.preferred_simd_size(&dev))
            }
            CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(&dev)),
-            // TODO
+            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(&dev)),
            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(dev.subgroups() as usize),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => return Err(CL_INVALID_VALUE),
        })
--- a/src/gallium/frontends/rusticl/core/device.rs
+++ b/src/gallium/frontends/rusticl/core/device.rs
@ -75,6 +75,7 @@ pub trait HelperContextWrapper {
    fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
    fn delete_compute_state(&self, cso: *mut c_void);
    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;
    fn unmap(&self, tx: PipeTransfer);
 }
@ -159,6 +160,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
        self.lock.delete_compute_state(cso)
    }
    /// Returns the `pipe_compute_state_object_info` for the compute state `state` by
    /// forwarding the call to `compute_state_info` on `self.lock`.
    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
        self.lock.compute_state_info(state)
    }
    /// Consumes the transfer `tx` by handing it `self.lock` through `with_ctx`.
    ///
    /// NOTE(review): presumably this is what actually unmaps the transfer on the locked
    /// context -- confirm against `PipeTransfer::with_ctx`.
    fn unmap(&self, tx: PipeTransfer) {
        tx.with_ctx(&self.lock);
    }
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@ -258,6 +258,7 @@ struct KernelDevStateInner {
    nir: NirShader,
    constant_buffer: Option<Arc<PipeResource>>,
    cso: *mut c_void,
    info: pipe_compute_state_object_info,
 }
 struct KernelDevState {
@ -279,21 +280,25 @@ impl KernelDevState {
        let states = nirs
            .into_iter()
            .map(|(dev, nir)| {
-                let cso = if dev.shareable_shaders() {
+                let mut cso = dev
-                    dev.helper_ctx()
+                    .helper_ctx()
-                        .create_compute_state(&nir, nir.shared_size())
+                    .create_compute_state(&nir, nir.shared_size());
-                } else {
+                let info = dev.helper_ctx().compute_state_info(cso);
                    ptr::null_mut()
                };
                let cb = Self::create_nir_constant_buffer(&dev, &nir);
                // if we can't share the cso between threads, destroy it now.
                if !dev.shareable_shaders() {
                    dev.helper_ctx().delete_compute_state(cso);
                    cso = ptr::null_mut();
                };
                (
                    dev,
                    KernelDevStateInner {
                        nir: nir,
                        constant_buffer: cb,
                        cso: cso,
                        info: info,
                    },
                )
            })
@ -829,44 +834,6 @@ fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] {
    val.try_into().unwrap()
 }
 /// Chooses `block` (unless it is already fully specified) and divides `grid` by it, in place.
 ///
 /// * If no component of `block` is zero, `block` is left untouched and every `grid[i]` is
 ///   divided by `block[i]`.
 /// * Otherwise `block` is overwritten. For each dimension the candidate size is the `gcd`
 ///   helper's result for `grid[i]` and the remaining thread budget, where the budget starts at
 ///   `d.max_threads_per_block()` and is capped per dimension by `d.max_block_sizes()`.
 ///   `grid[i]` is divided by the chosen size and the budget is reduced accordingly.
 /// * If the resulting block holds fewer threads than `d.subgroups()` and budget is left, the
 ///   first dimension for which `grid[i] * total_threads` is below the remaining budget is
 ///   folded into the block completely (`grid[i]` becomes 1).
 fn optimize_local_size(d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
    let mut threads = d.max_threads_per_block() as u32;
    let dim_threads = d.max_block_sizes();
    let subgroups = d.subgroups();

    // A complete block size was supplied: only rescale the grid.
    if !block.contains(&0) {
        for (g, b) in grid.iter_mut().zip(block.iter()) {
            // we already made sure everything is fine
            *g /= *b;
        }
        return;
    }

    for i in 0..3 {
        let t = cmp::min(threads, dim_threads[i] as u32);
        let divisor = gcd(t, grid[i]);

        block[i] = divisor;
        grid[i] /= divisor;

        // update limits
        threads /= block[i];
    }

    // if we didn't fill the subgroup we can do a bit better if we have threads remaining
    let total_threads = block[0] * block[1] * block[2];
    if threads != 1 && total_threads < subgroups {
        for i in 0..3 {
            // Compare in u64: the u32 product can overflow for large grids, which would panic
            // in debug builds and wrap around (possibly passing this test) in release builds.
            if u64::from(grid[i]) * u64::from(total_threads) < u64::from(threads) {
                block[i] *= grid[i];
                grid[i] = 1;
                // can only do it once as nothing is cleanly divisible
                break;
            }
        }
    }
 }
 impl Kernel {
    pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
        let (mut nirs, args, internal_args, attributes_string) =
@ -895,6 +862,44 @@ impl Kernel {
        })
    }
    /// Chooses `block` (unless it is already fully specified) and divides `grid` by it, in
    /// place, using this kernel's per-device limits.
    ///
    /// * If no component of `block` is zero, `block` is left untouched and every `grid[i]` is
    ///   divided by `block[i]`.
    /// * Otherwise `block` is overwritten. For each dimension the candidate size is the `gcd`
    ///   helper's result for `grid[i]` and the remaining thread budget, where the budget starts
    ///   at `self.max_threads_per_block(d)` and is capped per dimension by
    ///   `d.max_block_sizes()`. `grid[i]` is divided by the chosen size and the budget is
    ///   reduced accordingly.
    /// * If the resulting block holds fewer threads than `self.preferred_simd_size(d)` and
    ///   budget is left, the first dimension for which `grid[i] * total_threads` is below the
    ///   remaining budget is folded into the block completely (`grid[i]` becomes 1).
    fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
        let mut threads = self.max_threads_per_block(d) as u32;
        let dim_threads = d.max_block_sizes();
        let subgroups = self.preferred_simd_size(d) as u32;

        // A complete block size was supplied: only rescale the grid.
        if !block.contains(&0) {
            for (g, b) in grid.iter_mut().zip(block.iter()) {
                // we already made sure everything is fine
                *g /= *b;
            }
            return;
        }

        for i in 0..3 {
            let t = cmp::min(threads, dim_threads[i] as u32);
            let divisor = gcd(t, grid[i]);

            block[i] = divisor;
            grid[i] /= divisor;

            // update limits
            threads /= block[i];
        }

        // if we didn't fill the subgroup we can do a bit better if we have threads remaining
        let total_threads = block[0] * block[1] * block[2];
        if threads != 1 && total_threads < subgroups {
            for i in 0..3 {
                // Compare in u64: the u32 product can overflow for large grids, which would
                // panic in debug builds and wrap around (possibly passing this test) in
                // release builds.
                if u64::from(grid[i]) * u64::from(total_threads) < u64::from(threads) {
                    block[i] *= grid[i];
                    grid[i] = 1;
                    // can only do it once as nothing is cleanly divisible
                    break;
                }
            }
        }
    }
    // the painful part is, that host threads are allowed to modify the kernel object once it was
    // enqueued, so return a closure with all req data included.
    pub fn launch(
@ -928,7 +933,7 @@ impl Kernel {
            &[0; 4]
        };
-        optimize_local_size(&q.device, &mut grid, &mut block);
+        self.optimize_local_size(&q.device, &mut grid, &mut block);
        for (arg, val) in self.args.iter().zip(&self.values) {
            if arg.dead {
@ -1225,7 +1230,15 @@ impl Kernel {
    }
    pub fn priv_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
-        self.dev_state.get(dev).nir.scratch_size() as cl_ulong
+        self.dev_state.get(dev).info.private_memory.into()
    }
    /// Returns the `max_threads` field of the compute state info stored for `dev`, cast to
    /// `usize`.
    pub fn max_threads_per_block(&self, dev: &Device) -> usize {
        self.dev_state.get(dev).info.max_threads as usize
    }
    /// Returns the `preferred_simd_size` field of the compute state info stored for `dev`,
    /// cast to `usize`.
    pub fn preferred_simd_size(&self, dev: &Device) -> usize {
        self.dev_state.get(dev).info.preferred_simd_size as usize
    }
    pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
--- a/src/gallium/frontends/rusticl/mesa/pipe/context.rs
+++ b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
@ -319,6 +319,14 @@ impl PipeContext {
        unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
    }
    /// Queries the context for information about the compute state object `state`.
    ///
    /// A default-initialised `pipe_compute_state_object_info` is passed to the context's
    /// `get_compute_state_info` callback and returned after the call.
    ///
    /// # Panics
    ///
    /// Panics if the context does not provide a `get_compute_state_info` callback.
    pub fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
        let mut out = pipe_compute_state_object_info::default();
        // NOTE(review): soundness relies on `self.pipe` pointing at a live context and on
        // `state` being a compute state created by it -- confirm at the call sites.
        unsafe {
            let ctx = self.pipe.as_ref();
            let query = ctx.get_compute_state_info.unwrap();
            query(self.pipe.as_ptr(), state, &mut out);
        }
        out
    }
    /// Invokes the context's `create_sampler_state` callback with `state` and returns the raw
    /// pointer it produces.
    ///
    /// # Panics
    ///
    /// Panics if the context does not provide a `create_sampler_state` callback.
    pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
        unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
    }
@ -530,6 +538,7 @@ fn has_required_cbs(context: &pipe_context) -> bool {
        & has_required_feature!(context, delete_compute_state)
        & has_required_feature!(context, delete_sampler_state)
        & has_required_feature!(context, flush)
        & has_required_feature!(context, get_compute_state_info)
        & has_required_feature!(context, launch_grid)
        & has_required_feature!(context, memory_barrier)
        & has_required_feature!(context, resource_copy_region)