rusticl/kernel: make use of cso info

Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19855>
Karol Herbst <kherbst@redhat.com> authored 2022-11-18 15:51:18 +01:00, committed by Marge Bot
parent c7dd3677dc
commit ac993ae828
4 changed files with 76 additions and 50 deletions
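
For context: the "cso info" used here is the pipe_compute_state_object_info struct a gallium driver fills in for a compiled compute state object (CSO). Below is a minimal illustrative sketch, not code from the tree, of the resulting behaviour: per-kernel limits now come from the driver-reported info rather than from device-wide defaults such as dev.subgroups(). ComputeStateInfo and KernelDevLimits are hypothetical stand-in names.

// Hypothetical stand-in for the generated pipe_compute_state_object_info binding;
// only the fields this commit consumes are shown.
#[derive(Default, Clone, Copy)]
struct ComputeStateInfo {
    max_threads: u32,
    preferred_simd_size: u32,
    private_memory: u32,
}

// Hypothetical per-device kernel state: the CSO info is queried once after the
// compute state is compiled and then answers the CL work-group queries.
struct KernelDevLimits {
    info: ComputeStateInfo,
}

impl KernelDevLimits {
    fn max_threads_per_block(&self) -> usize {
        self.info.max_threads as usize // feeds CL_KERNEL_WORK_GROUP_SIZE
    }
    fn preferred_simd_size(&self) -> usize {
        self.info.preferred_simd_size as usize // feeds CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
    }
    fn priv_mem_size(&self) -> u64 {
        u64::from(self.info.private_memory) // feeds CL_KERNEL_PRIVATE_MEM_SIZE
    }
}

fn main() {
    let limits = KernelDevLimits {
        info: ComputeStateInfo { max_threads: 1024, preferred_simd_size: 32, private_memory: 256 },
    };
    assert_eq!(limits.max_threads_per_block(), 1024);
    assert_eq!(limits.preferred_simd_size(), 32);
    assert_eq!(limits.priv_mem_size(), 256);
}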

View file

@@ -89,11 +89,10 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
             CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size),
             CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(&dev)),
             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
-                cl_prop::<usize>(dev.subgroups() as usize)
+                cl_prop::<usize>(kernel.preferred_simd_size(&dev))
             }
             CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(&dev)),
-            // TODO
-            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(dev.subgroups() as usize),
+            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(&dev)),
             // CL_INVALID_VALUE if param_name is not one of the supported values
             _ => return Err(CL_INVALID_VALUE),
         })

View file

@@ -75,6 +75,7 @@ pub trait HelperContextWrapper {
     fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
     fn delete_compute_state(&self, cso: *mut c_void);
+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;

     fn unmap(&self, tx: PipeTransfer);
 }
@@ -159,6 +160,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
         self.lock.delete_compute_state(cso)
     }

+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        self.lock.compute_state_info(state)
+    }
+
     fn unmap(&self, tx: PipeTransfer) {
         tx.with_ctx(&self.lock);
     }

View file

@@ -258,6 +258,7 @@ struct KernelDevStateInner {
     nir: NirShader,
     constant_buffer: Option<Arc<PipeResource>>,
     cso: *mut c_void,
+    info: pipe_compute_state_object_info,
 }

 struct KernelDevState {
@@ -279,21 +280,25 @@ impl KernelDevState {
         let states = nirs
             .into_iter()
             .map(|(dev, nir)| {
-                let cso = if dev.shareable_shaders() {
-                    dev.helper_ctx()
-                        .create_compute_state(&nir, nir.shared_size())
-                } else {
-                    ptr::null_mut()
-                };
+                let mut cso = dev
+                    .helper_ctx()
+                    .create_compute_state(&nir, nir.shared_size());
+                let info = dev.helper_ctx().compute_state_info(cso);
                 let cb = Self::create_nir_constant_buffer(&dev, &nir);

+                // if we can't share the cso between threads, destroy it now.
+                if !dev.shareable_shaders() {
+                    dev.helper_ctx().delete_compute_state(cso);
+                    cso = ptr::null_mut();
+                };
+
                 (
                     dev,
                     KernelDevStateInner {
                         nir: nir,
                         constant_buffer: cb,
                         cso: cso,
+                        info: info,
                     },
                 )
             })
@@ -829,44 +834,6 @@ fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] {
     val.try_into().unwrap()
 }

-fn optimize_local_size(d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
-    let mut threads = d.max_threads_per_block() as u32;
-    let dim_threads = d.max_block_sizes();
-    let subgroups = d.subgroups();
-
-    if !block.contains(&0) {
-        for i in 0..3 {
-            // we already made sure everything is fine
-            grid[i] /= block[i];
-        }
-        return;
-    }
-
-    for i in 0..3 {
-        let t = cmp::min(threads, dim_threads[i] as u32);
-        let gcd = gcd(t, grid[i]);
-
-        block[i] = gcd;
-        grid[i] /= gcd;
-
-        // update limits
-        threads /= block[i];
-    }
-
-    // if we didn't fill the subgroup we can do a bit better if we have threads remaining
-    let total_threads = block[0] * block[1] * block[2];
-    if threads != 1 && total_threads < subgroups {
-        for i in 0..3 {
-            if grid[i] * total_threads < threads {
-                block[i] *= grid[i];
-                grid[i] = 1;
-                // can only do it once as nothing is cleanly divisible
-                break;
-            }
-        }
-    }
-}
-
 impl Kernel {
     pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
         let (mut nirs, args, internal_args, attributes_string) =
@@ -895,6 +862,44 @@ impl Kernel {
         })
     }

+    fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
+        let mut threads = self.max_threads_per_block(d) as u32;
+        let dim_threads = d.max_block_sizes();
+        let subgroups = self.preferred_simd_size(d) as u32;
+
+        if !block.contains(&0) {
+            for i in 0..3 {
+                // we already made sure everything is fine
+                grid[i] /= block[i];
+            }
+            return;
+        }
+
+        for i in 0..3 {
+            let t = cmp::min(threads, dim_threads[i] as u32);
+            let gcd = gcd(t, grid[i]);
+
+            block[i] = gcd;
+            grid[i] /= gcd;
+
+            // update limits
+            threads /= block[i];
+        }
+
+        // if we didn't fill the subgroup we can do a bit better if we have threads remaining
+        let total_threads = block[0] * block[1] * block[2];
+        if threads != 1 && total_threads < subgroups {
+            for i in 0..3 {
+                if grid[i] * total_threads < threads {
+                    block[i] *= grid[i];
+                    grid[i] = 1;
+                    // can only do it once as nothing is cleanly divisible
+                    break;
+                }
+            }
+        }
+    }
+
     // the painful part is, that host threads are allowed to modify the kernel object once it was
     // enqueued, so return a closure with all req data included.
     pub fn launch(
@@ -928,7 +933,7 @@
             &[0; 4]
         };

-        optimize_local_size(&q.device, &mut grid, &mut block);
+        self.optimize_local_size(&q.device, &mut grid, &mut block);

         for (arg, val) in self.args.iter().zip(&self.values) {
             if arg.dead {
@@ -1225,7 +1230,15 @@
     }

     pub fn priv_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
-        self.dev_state.get(dev).nir.scratch_size() as cl_ulong
+        self.dev_state.get(dev).info.private_memory.into()
     }

+    pub fn max_threads_per_block(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.max_threads as usize
+    }
+
+    pub fn preferred_simd_size(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.preferred_simd_size as usize
+    }
+
     pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
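
As an illustrative aside: a standalone sketch of the local-size heuristic added above (optimize_local_size), with the kernel and device limits replaced by hard-coded example values (max_threads_per_block = 256, per-dimension limits [256, 256, 64], preferred SIMD width 32). optimize_local_size_example and the main driver are hypothetical names for this sketch only, not code from the tree.

// Euclidean gcd, as used by the heuristic.
fn gcd(mut a: u32, mut b: u32) -> u32 {
    while b != 0 {
        let t = b;
        b = a % b;
        a = t;
    }
    a
}

fn optimize_local_size_example(grid: &mut [u32; 3], block: &mut [u32; 3]) {
    let mut threads = 256u32;            // stand-in for kernel.max_threads_per_block(dev)
    let dim_threads = [256u32, 256, 64]; // stand-in for dev.max_block_sizes()
    let subgroups = 32u32;               // stand-in for kernel.preferred_simd_size(dev)

    // a fully specified local size is taken as-is; only convert the global size to block counts
    if !block.contains(&0) {
        for i in 0..3 {
            grid[i] /= block[i];
        }
        return;
    }

    // per-dimension gcd pass: pick the largest block size that divides the grid
    // and still fits the remaining thread budget
    for i in 0..3 {
        let t = std::cmp::min(threads, dim_threads[i]);
        let g = gcd(t, grid[i]);
        block[i] = g;
        grid[i] /= g;
        threads /= block[i];
    }

    // if the block underfills a subgroup and budget remains, fold one whole grid
    // dimension into the block
    let total_threads = block[0] * block[1] * block[2];
    if threads != 1 && total_threads < subgroups {
        for i in 0..3 {
            if grid[i] * total_threads < threads {
                block[i] *= grid[i];
                grid[i] = 1;
                break;
            }
        }
    }
}

fn main() {
    // grid = [6, 3, 1]: the gcd pass only yields block = [2, 1, 1], which underfills
    // the 32-wide subgroup, so the tail loop folds the whole x dimension into the block.
    let (mut grid, mut block) = ([6u32, 3, 1], [0u32; 3]);
    optimize_local_size_example(&mut grid, &mut block);
    assert_eq!(block, [6, 1, 1]);
    assert_eq!(grid, [1, 3, 1]);
}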

View file

@@ -319,6 +319,14 @@ impl PipeContext {
         unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
     }

+    pub fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        let mut info = pipe_compute_state_object_info::default();
+        unsafe {
+            self.pipe.as_ref().get_compute_state_info.unwrap()(self.pipe.as_ptr(), state, &mut info)
+        }
+        info
+    }
+
     pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
         unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
     }
@@ -530,6 +538,7 @@ fn has_required_cbs(context: &pipe_context) -> bool {
         & has_required_feature!(context, delete_compute_state)
         & has_required_feature!(context, delete_sampler_state)
         & has_required_feature!(context, flush)
+        & has_required_feature!(context, get_compute_state_info)
         & has_required_feature!(context, launch_grid)
         & has_required_feature!(context, memory_barrier)
         & has_required_feature!(context, resource_copy_region)