diff --git a/docs/features.txt b/docs/features.txt index be608220ecc..2ea9a56316d 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -876,7 +876,7 @@ Rusticl OpenCL 2.0 -- all DONE: Rusticl OpenCL 2.1 -- all DONE: - Sub groups in progress + Sub groups DONE (iris, llvmpipe, radeonsi) - cl_khr_subgroups in progress cl_khr_il_program DONE Device and host timer synchronization DONE (iris, llvmpipe, radeonsi) diff --git a/src/gallium/frontends/rusticl/api/device.rs b/src/gallium/frontends/rusticl/api/device.rs index a477d5dfd44..681db4418b5 100644 --- a/src/gallium/frontends/rusticl/api/device.rs +++ b/src/gallium/frontends/rusticl/api/device.rs @@ -171,7 +171,11 @@ impl CLInfo for cl_device_id { CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE => cl_prop::(dev.const_max_size()), CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE => cl_prop::(0), CL_DEVICE_MAX_MEM_ALLOC_SIZE => cl_prop::(dev.max_mem_alloc()), - CL_DEVICE_MAX_NUM_SUB_GROUPS => cl_prop::(0), + CL_DEVICE_MAX_NUM_SUB_GROUPS => cl_prop::(if dev.subgroups_supported() { + dev.max_subgroups() + } else { + 0 + }), CL_DEVICE_MAX_ON_DEVICE_EVENTS => cl_prop::(0), CL_DEVICE_MAX_ON_DEVICE_QUEUES => cl_prop::(0), CL_DEVICE_MAX_PARAMETER_SIZE => cl_prop::(dev.param_max_size()), @@ -274,6 +278,13 @@ impl CLInfo for cl_device_id { (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN) as cl_device_fp_config, ), CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS => cl_prop::(false), + CL_DEVICE_SUB_GROUP_SIZES_INTEL => { + cl_prop::>(if dev.subgroups_supported() { + dev.subgroup_sizes() + } else { + vec![0; 1] + }) + } CL_DEVICE_SVM_CAPABILITIES | CL_DEVICE_SVM_CAPABILITIES_ARM => { cl_prop::( if dev.svm_supported() { diff --git a/src/gallium/frontends/rusticl/api/icd.rs b/src/gallium/frontends/rusticl/api/icd.rs index 1393de54c4f..84b942b28fa 100644 --- a/src/gallium/frontends/rusticl/api/icd.rs +++ b/src/gallium/frontends/rusticl/api/icd.rs @@ -465,7 +465,7 @@ extern "C" fn cl_get_kernel_sub_group_info( param_value_size_ret: *mut usize, ) -> cl_int { match kernel.get_info_obj( - (device, input_value_size, input_value), + (device, input_value_size, input_value, param_value_size), param_name, param_value_size, param_value, diff --git a/src/gallium/frontends/rusticl/api/kernel.rs b/src/gallium/frontends/rusticl/api/kernel.rs index c135f3ac2eb..06439a71288 100644 --- a/src/gallium/frontends/rusticl/api/kernel.rs +++ b/src/gallium/frontends/rusticl/api/kernel.rs @@ -10,6 +10,7 @@ use rusticl_opencl_gen::*; use rusticl_proc_macros::cl_entrypoint; use rusticl_proc_macros::cl_info_entrypoint; +use std::cmp; use std::mem::{self, MaybeUninit}; use std::os::raw::c_void; use std::ptr; @@ -106,16 +107,115 @@ impl CLInfoObj for cl_kernel { } } -impl CLInfoObj for cl_kernel { +impl CLInfoObj + for cl_kernel +{ fn query( &self, - (d, _input_value_size, _input_value): (cl_device_id, usize, *const c_void), - _q: cl_program_build_info, + (dev, input_value_size, input_value, output_value_size): ( + cl_device_id, + usize, + *const c_void, + usize, + ), + q: cl_program_build_info, ) -> CLResult>> { - let _kernel = self.get_ref()?; - let _dev = d.get_arc()?; + let kernel = self.get_ref()?; - Err(CL_INVALID_OPERATION) + // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated + // with kernel. + let dev = if dev.is_null() { + if kernel.prog.devs.len() > 1 { + return Err(CL_INVALID_DEVICE); + } else { + kernel.prog.devs[0].clone() + } + } else { + dev.get_arc()? + }; + + // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel + if !kernel.prog.devs.contains(&dev) { + return Err(CL_INVALID_DEVICE); + } + + // CL_INVALID_OPERATION if device does not support subgroups. + if !dev.subgroups_supported() { + return Err(CL_INVALID_OPERATION); + } + + let usize_byte = mem::size_of::(); + // first we have to convert the input to a proper thing + let input: &[usize] = match q { + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE | CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => { + // CL_INVALID_VALUE if param_name is CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + // CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE or ... and the size in bytes specified by + // input_value_size is not valid or if input_value is NULL. + if ![usize_byte, 2 * usize_byte, 3 * usize_byte].contains(&input_value_size) { + return Err(CL_INVALID_VALUE); + } + // SAFETY: we verified the size as best as possible, with the rest we trust the client + unsafe { slice::from_raw_parts(input_value.cast(), input_value_size / usize_byte) } + } + CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => { + // CL_INVALID_VALUE if param_name is ... CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT + // and the size in bytes specified by input_value_size is not valid or if + // input_value is NULL. + if input_value_size != usize_byte || input_value.is_null() { + return Err(CL_INVALID_VALUE); + } + // SAFETY: we trust the client here + unsafe { slice::from_raw_parts(input_value.cast(), 1) } + } + _ => &[], + }; + + Ok(match q { + CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => { + cl_prop::(kernel.subgroups_for_block(&dev, input)) + } + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE => { + cl_prop::(kernel.subgroup_size_for_block(&dev, input)) + } + CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => { + let subgroups = input[0]; + let mut res = vec![0; 3]; + + for subgroup_size in kernel.subgroup_sizes(&dev) { + let threads = subgroups * subgroup_size; + + if threads > dev.max_threads_per_block() { + continue; + } + + let block = [threads, 1, 1]; + let real_subgroups = kernel.subgroups_for_block(&dev, &block); + + if real_subgroups == subgroups { + res = block.to_vec(); + break; + } + } + + res.truncate(output_value_size / usize_byte); + cl_prop::>(res) + } + CL_KERNEL_MAX_NUM_SUB_GROUPS => { + let threads = kernel.max_threads_per_block(&dev); + let max_groups = dev.max_subgroups(); + + let mut result = 0; + for sgs in kernel.subgroup_sizes(&dev) { + result = cmp::max(result, threads / sgs); + result = cmp::min(result, max_groups as usize); + } + cl_prop::(result) + } + CL_KERNEL_COMPILE_NUM_SUB_GROUPS => cl_prop::(kernel.num_subgroups), + CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL => cl_prop::(kernel.subgroup_size), + // CL_INVALID_VALUE if param_name is not one of the supported values + _ => return Err(CL_INVALID_VALUE), + }) } } diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs index 97a5a59f96e..64b72b42643 100644 --- a/src/gallium/frontends/rusticl/core/device.rs +++ b/src/gallium/frontends/rusticl/core/device.rs @@ -82,6 +82,7 @@ pub trait HelperContextWrapper { fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void; fn delete_compute_state(&self, cso: *mut c_void); fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info; + fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32; fn unmap(&self, tx: PipeTransfer); } @@ -170,6 +171,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> { self.lock.compute_state_info(state) } + fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32 { + self.lock.compute_state_subgroup_size(state, block) + } + fn unmap(&self, tx: PipeTransfer) { tx.with_ctx(&self.lock); } @@ -572,6 +577,12 @@ impl Device { add_ext(1, 0, 0, "cl_khr_device_uuid"); } + if self.subgroups_supported() { + // requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS + //add_ext(1, 0, 0, "cl_khr_subgroups"); + add_feat(1, 0, 0, "__opencl_c_subgroups"); + } + if self.svm_supported() { add_ext(1, 0, 0, "cl_arm_shared_virtual_memory"); } @@ -857,6 +868,22 @@ impl Device { .collect() } + pub fn max_subgroups(&self) -> u32 { + ComputeParam::::compute_param( + self.screen.as_ref(), + pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_SUBGROUPS, + ) + } + + pub fn subgroups_supported(&self) -> bool { + let subgroup_sizes = self.subgroup_sizes().len(); + + // we need to be able to query a CSO for subgroup sizes if multiple sub group sizes are + // supported, doing it without shareable shaders isn't practical + self.max_subgroups() > 0 + && (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders())) + } + pub fn svm_supported(&self) -> bool { self.screen.param(pipe_cap::PIPE_CAP_SYSTEM_SVM) == 1 } @@ -905,7 +932,7 @@ impl Device { images_write_3d: self.image_3d_write_supported(), integer_dot_product: true, intel_subgroups: false, - subgroups: false, + subgroups: self.subgroups_supported(), subgroups_ifp: false, } } diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs index f5af9433f8a..5fcaf650b86 100644 --- a/src/gallium/frontends/rusticl/core/kernel.rs +++ b/src/gallium/frontends/rusticl/core/kernel.rs @@ -335,6 +335,8 @@ pub struct Kernel { pub values: Vec>>, pub work_group_size: [usize; 3], pub build: Arc, + pub subgroup_size: usize, + pub num_subgroups: usize, dev_state: Arc, } @@ -813,6 +815,8 @@ impl Kernel { prog: prog, name: name, work_group_size: work_group_size, + subgroup_size: nir.subgroup_size() as usize, + num_subgroups: nir.num_subgroups() as usize, values: values, dev_state: KernelDevState::new(nirs), build: nir_kernel_build, @@ -1208,6 +1212,42 @@ impl Kernel { pub fn has_svm_devs(&self) -> bool { self.prog.devs.iter().any(|dev| dev.svm_supported()) } + + pub fn subgroup_sizes(&self, dev: &Device) -> Vec { + SetBitIndices::from_msb(self.dev_state.get(dev).info.simd_sizes) + .map(|bit| 1 << bit) + .collect() + } + + pub fn subgroups_for_block(&self, dev: &Device, block: &[usize]) -> usize { + let subgroup_size = self.subgroup_size_for_block(dev, block); + if subgroup_size == 0 { + return 0; + } + + let threads = block.iter().product(); + div_round_up(threads, subgroup_size) + } + + pub fn subgroup_size_for_block(&self, dev: &Device, block: &[usize]) -> usize { + let subgroup_sizes = self.subgroup_sizes(dev); + if subgroup_sizes.is_empty() { + return 0; + } + + if subgroup_sizes.len() == 1 { + return subgroup_sizes[0]; + } + + let block = [ + *block.get(0).unwrap_or(&1) as u32, + *block.get(1).unwrap_or(&1) as u32, + *block.get(2).unwrap_or(&1) as u32, + ]; + + dev.helper_ctx() + .compute_state_subgroup_size(self.dev_state.get(dev).cso, &block) as usize + } } impl Clone for Kernel { @@ -1219,6 +1259,8 @@ impl Clone for Kernel { values: self.values.clone(), work_group_size: self.work_group_size, build: self.build.clone(), + subgroup_size: self.subgroup_size, + num_subgroups: self.num_subgroups, dev_state: self.dev_state.clone(), } } diff --git a/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs b/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs index 3df685a0312..59e00617300 100644 --- a/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs +++ b/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs @@ -332,6 +332,7 @@ impl SPIRVBin { float16: true, float64: true, generic_pointers: true, + groups: true, int8: true, int16: true, int64: true, diff --git a/src/gallium/frontends/rusticl/mesa/compiler/nir.rs b/src/gallium/frontends/rusticl/mesa/compiler/nir.rs index 07173780e31..81aca5cecc6 100644 --- a/src/gallium/frontends/rusticl/mesa/compiler/nir.rs +++ b/src/gallium/frontends/rusticl/mesa/compiler/nir.rs @@ -179,6 +179,27 @@ impl NirShader { unsafe { (*self.nir.as_ptr()).info.workgroup_size } } + pub fn subgroup_size(&self) -> u8 { + let subgroup_size = unsafe { (*self.nir.as_ptr()).info.subgroup_size }; + let valid_subgroup_sizes = [ + gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_8, + gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_16, + gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_32, + gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_64, + gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_128, + ]; + + if valid_subgroup_sizes.contains(&subgroup_size) { + subgroup_size as u8 + } else { + 0 + } + } + + pub fn num_subgroups(&self) -> u8 { + unsafe { (*self.nir.as_ptr()).info.num_subgroups } + } + pub fn set_workgroup_size_variable_if_zero(&self) { let nir = self.nir.as_ptr(); unsafe { diff --git a/src/gallium/frontends/rusticl/mesa/pipe/context.rs b/src/gallium/frontends/rusticl/mesa/pipe/context.rs index 0a7bbe84ecd..90edc2a6612 100644 --- a/src/gallium/frontends/rusticl/mesa/pipe/context.rs +++ b/src/gallium/frontends/rusticl/mesa/pipe/context.rs @@ -327,6 +327,16 @@ impl PipeContext { info } + pub fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32 { + unsafe { + if let Some(cb) = self.pipe.as_ref().get_compute_state_subgroup_size { + cb(self.pipe.as_ptr(), state, block) + } else { + 0 + } + } + } + pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void { unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) } } diff --git a/src/gallium/frontends/rusticl/rusticl_llvm_bindings.hpp b/src/gallium/frontends/rusticl/rusticl_llvm_bindings.hpp new file mode 100644 index 00000000000..d9fb575a4ab --- /dev/null +++ b/src/gallium/frontends/rusticl/rusticl_llvm_bindings.hpp @@ -0,0 +1,4 @@ +//#include +//#include +#include +#include