diff --git a/docs/features.txt b/docs/features.txt
index 789ae241962..1963da4a1b8 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -962,7 +962,7 @@ Rusticl extensions that are not part of any OpenCL version:
   cl_khr_subgroup_rotate                    not started
   cl_khr_subgroup_shuffle                   DONE (iris, llvmpipe, radeonsi)
   cl_khr_subgroup_shuffle_relative          DONE (iris, llvmpipe, radeonsi)
-  cl_khr_suggested_local_work_size          not started
+  cl_khr_suggested_local_work_size          DONE
   cl_khr_terminate_context                  not started
   cl_khr_throttle_hints                     not started
   cl_khr_work_group_uniform_arithmetic      not started
diff --git a/src/gallium/frontends/rusticl/api/icd.rs b/src/gallium/frontends/rusticl/api/icd.rs
index e17713425d3..936f9fcbdd9 100644
--- a/src/gallium/frontends/rusticl/api/icd.rs
+++ b/src/gallium/frontends/rusticl/api/icd.rs
@@ -510,6 +510,11 @@ extern "C" fn cl_get_extension_function_address(
         "clGetGLObjectInfo" => cl_get_gl_object_info as *mut ::std::ffi::c_void,
         "clGetGLTextureInfo" => cl_get_gl_texture_info as *mut ::std::ffi::c_void,
 
+        // cl_khr_suggested_local_work_size
+        "clGetKernelSuggestedLocalWorkSizeKHR" => {
+            cl_get_kernel_suggested_local_work_size_khr as *mut ::std::ffi::c_void
+        }
+
         // cl_arm_shared_virtual_memory
         "clEnqueueSVMFreeARM" => cl_enqueue_svm_free_arm as *mut ::std::ffi::c_void,
         "clEnqueueSVMMapARM" => cl_enqueue_svm_map_arm as *mut ::std::ffi::c_void,
diff --git a/src/gallium/frontends/rusticl/api/kernel.rs b/src/gallium/frontends/rusticl/api/kernel.rs
index 02865dbad38..bdab3e207e4 100644
--- a/src/gallium/frontends/rusticl/api/kernel.rs
+++ b/src/gallium/frontends/rusticl/api/kernel.rs
@@ -236,6 +236,17 @@ unsafe fn kernel_work_arr_or_default<'a>(arr: *const usize, work_dim: cl_uint) -> &'a [usize] {
     }
 }
 
+/// # Safety
+///
+/// This function is only safe when called on an array of `work_dim` length
+unsafe fn kernel_work_arr_mut<'a>(arr: *mut usize, work_dim: cl_uint) -> Option<&'a mut [usize]> {
+    if !arr.is_null() {
+        unsafe { Some(slice::from_raw_parts_mut(arr, work_dim as usize)) }
+    } else {
+        None
+    }
+}
+
 #[cl_entrypoint]
 fn create_kernel(
     program: cl_program,
@@ -649,3 +660,109 @@ fn clone_kernel(source_kernel: cl_kernel) -> CLResult<cl_kernel> {
     let k = Kernel::ref_from_raw(source_kernel)?;
     Ok(Arc::new(k.clone()).into_cl())
 }
+
+#[cl_entrypoint]
+fn get_kernel_suggested_local_work_size_khr(
+    command_queue: cl_command_queue,
+    kernel: cl_kernel,
+    work_dim: cl_uint,
+    global_work_offset: *const usize,
+    global_work_size: *const usize,
+    suggested_local_work_size: *mut usize,
+) -> CLResult<()> {
+    // CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified in
+    // global_work_size are 0.
+    if global_work_size.is_null() {
+        return Err(CL_INVALID_GLOBAL_WORK_SIZE);
+    }
+
+    if global_work_offset.is_null() {
+        return Err(CL_INVALID_GLOBAL_OFFSET);
+    }
+
+    // CL_INVALID_VALUE if suggested_local_work_size is NULL.
+    if suggested_local_work_size.is_null() {
+        return Err(CL_INVALID_VALUE);
+    }
+
+    // CL_INVALID_COMMAND_QUEUE if command_queue is not a valid host command-queue.
+    let queue = Queue::ref_from_raw(command_queue)?;
+
+    // CL_INVALID_KERNEL if kernel is not a valid kernel object.
+    let kernel = Kernel::ref_from_raw(kernel)?;
+
+    // CL_INVALID_CONTEXT if the context associated with kernel is not the same as the context
+    // associated with command_queue.
+    if queue.context != kernel.prog.context {
+        return Err(CL_INVALID_CONTEXT);
+    }
+
+    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
+    // for kernel for the device associated with command_queue.
+    if kernel.prog.status(queue.device) != CL_BUILD_SUCCESS as cl_build_status {
+        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
+    }
+
+    // CL_INVALID_KERNEL_ARGS if all argument values for kernel have not been set.
+    if kernel.arg_values().iter().any(|v| v.is_none()) {
+        return Err(CL_INVALID_KERNEL_ARGS);
+    }
+
+    // CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
+    // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
+    if work_dim == 0 || work_dim > queue.device.max_grid_dimensions() {
+        return Err(CL_INVALID_WORK_DIMENSION);
+    }
+
+    let mut global_work_size =
+        unsafe { kernel_work_arr_or_default(global_work_size, work_dim).to_vec() };
+
+    let suggested_local_work_size = unsafe {
+        kernel_work_arr_mut(suggested_local_work_size, work_dim).ok_or(CL_INVALID_VALUE)?
+    };
+
+    let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };
+
+    let device_bits = queue.device.address_bits();
+    let device_max = u64::MAX >> (u64::BITS - device_bits);
+    for i in 0..work_dim as usize {
+        let gws = global_work_size[i];
+        let gwo = global_work_offset[i];
+
+        // CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified
+        // in global_work_size are 0.
+        if gws == 0 {
+            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
+        }
+        // CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size exceed the
+        // maximum value representable by size_t on the device associated with command_queue.
+        if gws as u64 > device_max {
+            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
+        }
+        // CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size plus the
+        // corresponding value in global_work_offset for dimension exceeds the maximum value
+        // representable by size_t on the device associated with command_queue.
+        if u64::checked_add(gws as u64, gwo as u64)
+            .filter(|&x| x <= device_max)
+            .is_none()
+        {
+            return Err(CL_INVALID_GLOBAL_OFFSET);
+        }
+    }
+
+    kernel.suggest_local_size(
+        queue.device,
+        work_dim as usize,
+        &mut global_work_size,
+        suggested_local_work_size,
+    );
+
+    Ok(())
+
+    // CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is set as an argument to kernel and the offset specified when the sub-buffer object was created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN for the device associated with command_queue.
+    // CL_INVALID_IMAGE_SIZE if an image object is set as an argument to kernel and the image dimensions are not supported by device associated with command_queue.
+    // CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is set as an argument to kernel and the image format is not supported by the device associated with command_queue.
+    // CL_INVALID_OPERATION if an SVM pointer is set as an argument to kernel and the device associated with command_queue does not support SVM or the required SVM capabilities for the SVM pointer.
+    // CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL implementation on the device.
+    // CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
+}
diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs
index c2aafd01532..7eec3a46621 100644
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@@ -831,21 +831,19 @@ impl Kernel {
         })
     }
 
-    fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
-        let mut threads = self.max_threads_per_block(d) as u32;
+    pub fn suggest_local_size(
+        &self,
+        d: &Device,
+        work_dim: usize,
+        grid: &mut [usize],
+        block: &mut [usize],
+    ) {
+        let mut threads = self.max_threads_per_block(d);
         let dim_threads = d.max_block_sizes();
-        let subgroups = self.preferred_simd_size(d) as u32;
+        let subgroups = self.preferred_simd_size(d);
 
-        if !block.contains(&0) {
-            for i in 0..3 {
-                // we already made sure everything is fine
-                grid[i] /= block[i];
-            }
-            return;
-        }
-
-        for i in 0..3 {
-            let t = cmp::min(threads, dim_threads[i] as u32);
+        for i in 0..work_dim {
+            let t = cmp::min(threads, dim_threads[i]);
             let gcd = gcd(t, grid[i]);
 
             block[i] = gcd;
@@ -856,9 +854,9 @@ impl Kernel {
         }
 
         // if we didn't fill the subgroup we can do a bit better if we have threads remaining
-        let total_threads = block[0] * block[1] * block[2];
+        let total_threads = block.iter().take(work_dim).product::<usize>();
         if threads != 1 && total_threads < subgroups {
-            for i in 0..3 {
+            for i in 0..work_dim {
                 if grid[i] * total_threads < threads {
                     block[i] *= grid[i];
                     grid[i] = 1;
@@ -869,6 +867,31 @@ impl Kernel {
         }
     }
 
+    fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
+        if !block.contains(&0) {
+            for i in 0..3 {
+                // we already made sure everything is fine
+                grid[i] /= block[i];
+            }
+            return;
+        }
+
+        let mut usize_grid = [0usize; 3];
+        let mut usize_block = [0usize; 3];
+
+        for i in 0..3 {
+            usize_grid[i] = grid[i] as usize;
+            usize_block[i] = block[i] as usize;
+        }
+
+        self.suggest_local_size(d, 3, &mut usize_grid, &mut usize_block);
+
+        for i in 0..3 {
+            grid[i] = usize_grid[i] as u32;
+            block[i] = usize_block[i] as u32;
+        }
+    }
+
     // the painful part is, that host threads are allowed to modify the kernel object once it was
     // enqueued, so return a closure with all req data included.
     pub fn launch(
diff --git a/src/gallium/frontends/rusticl/core/platform.rs b/src/gallium/frontends/rusticl/core/platform.rs
index c7508bad4ad..f80ed9d3ceb 100644
--- a/src/gallium/frontends/rusticl/core/platform.rs
+++ b/src/gallium/frontends/rusticl/core/platform.rs
@@ -53,6 +53,7 @@ gen_cl_exts!([
     (1, 0, 0, "cl_khr_icd"),
     (1, 0, 0, "cl_khr_il_program"),
     (1, 0, 0, "cl_khr_spirv_no_integer_wrap_decoration"),
+    (1, 0, 0, "cl_khr_suggested_local_work_size"),
 ]);
 
 static mut PLATFORM: Platform = Platform {