From 29026053db4e0d23f04dc0620b1b5c2cfee248cf Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Fri, 25 Mar 2022 14:40:26 +0100
Subject: [PATCH] rusticl/mem: implement copies between buffers and images

v2: Use the pitches from the pipe_transfer_map (Jason)

Signed-off-by: Karol Herbst
Acked-by: Alyssa Rosenzweig
Part-of:
---
 src/gallium/frontends/rusticl/api/icd.rs      |  62 +++--
 src/gallium/frontends/rusticl/api/memory.rs   | 221 +++++++++++++++---
 src/gallium/frontends/rusticl/api/types.rs    |   9 +
 src/gallium/frontends/rusticl/core/context.rs |  32 ++-
 src/gallium/frontends/rusticl/core/device.rs  |  71 +++++-
 src/gallium/frontends/rusticl/core/kernel.rs  |  12 +-
 src/gallium/frontends/rusticl/core/memory.rs  | 163 ++++++++-----
 7 files changed, 453 insertions(+), 117 deletions(-)

diff --git a/src/gallium/frontends/rusticl/api/icd.rs b/src/gallium/frontends/rusticl/api/icd.rs
index 958f77e0d5c..45a421103ed 100644
--- a/src/gallium/frontends/rusticl/api/icd.rs
+++ b/src/gallium/frontends/rusticl/api/icd.rs
@@ -1073,33 +1073,51 @@ extern "C" fn cl_enqueue_copy_image(
 }
 
 extern "C" fn cl_enqueue_copy_image_to_buffer(
-    _command_queue: cl_command_queue,
-    _src_image: cl_mem,
-    _dst_buffer: cl_mem,
-    _src_origin: *const usize,
-    _region: *const usize,
-    _dst_offset: usize,
-    _num_events_in_wait_list: cl_uint,
-    _event_wait_list: *const cl_event,
-    _event: *mut cl_event,
+    command_queue: cl_command_queue,
+    src_image: cl_mem,
+    dst_buffer: cl_mem,
+    src_origin: *const usize,
+    region: *const usize,
+    dst_offset: usize,
+    num_events_in_wait_list: cl_uint,
+    event_wait_list: *const cl_event,
+    event: *mut cl_event,
 ) -> cl_int {
-    println!("cl_enqueue_copy_image_to_buffer not implemented");
-    CL_OUT_OF_HOST_MEMORY
+    match_err!(enqueue_copy_image_to_buffer(
+        command_queue,
+        src_image,
+        dst_buffer,
+        src_origin,
+        region,
+        dst_offset,
+        num_events_in_wait_list,
+        event_wait_list,
+        event,
+    ))
 }
 
 extern "C" fn cl_enqueue_copy_buffer_to_image(
-    _command_queue: cl_command_queue,
-    _src_buffer: cl_mem,
-    _dst_image: cl_mem,
-    _src_offset: usize,
-    _dst_origin: *const usize,
-    _region: *const usize,
-    _num_events_in_wait_list: cl_uint,
-    _event_wait_list: *const cl_event,
-    _event: *mut cl_event,
+    command_queue: cl_command_queue,
+    src_buffer: cl_mem,
+    dst_image: cl_mem,
+    src_offset: usize,
+    dst_origin: *const usize,
+    region: *const usize,
+    num_events_in_wait_list: cl_uint,
+    event_wait_list: *const cl_event,
+    event: *mut cl_event,
 ) -> cl_int {
-    println!("cl_enqueue_copy_buffer_to_image not implemented");
-    CL_OUT_OF_HOST_MEMORY
+    match_err!(enqueue_copy_buffer_to_image(
+        command_queue,
+        src_buffer,
+        dst_image,
+        src_offset,
+        dst_origin,
+        region,
+        num_events_in_wait_list,
+        event_wait_list,
+        event,
+    ))
 }
 
 extern "C" fn cl_enqueue_map_buffer(
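The two entry points above follow the ICD dispatch idiom used throughout icd.rs: the raw extern "C" hook forwards every argument to a safe function returning a CLResult, and match_err! collapses that result into a cl_int status. A minimal stand-alone sketch of the idea (not rusticl's actual macro body, which also handles entry points returning values):

// Sketch of the dispatch idiom used by the entry points above.
const CL_SUCCESS: i32 = 0;
const CL_INVALID_VALUE: i32 = -30; // real value from the OpenCL headers

fn enqueue_something(ok: bool) -> Result<(), i32> {
    if ok { Ok(()) } else { Err(CL_INVALID_VALUE) }
}

fn cl_hook(ok: bool) -> i32 {
    // what match_err!(enqueue_something(..)) expands to, conceptually
    match enqueue_something(ok) {
        Ok(()) => CL_SUCCESS,
        Err(e) => e, // CL error values are already negative cl_int codes
    }
}

fn main() {
    assert_eq!(cl_hook(true), CL_SUCCESS);
    assert_eq!(cl_hook(false), CL_INVALID_VALUE);
}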
diff --git a/src/gallium/frontends/rusticl/api/memory.rs b/src/gallium/frontends/rusticl/api/memory.rs
index dbeea4be062..b81073cc984 100644
--- a/src/gallium/frontends/rusticl/api/memory.rs
+++ b/src/gallium/frontends/rusticl/api/memory.rs
@@ -16,6 +16,7 @@ use self::mesa_rust_util::properties::Properties;
 use self::mesa_rust_util::ptr::*;
 use self::rusticl_opencl_gen::*;
 
+use std::cell::Cell;
 use std::cmp::Ordering;
 use std::os::raw::c_void;
 use std::ptr;
@@ -1502,11 +1503,11 @@ pub fn enqueue_map_buffer(
     event: *mut cl_event,
 ) -> CLResult<*mut c_void> {
     let q = command_queue.get_arc()?;
-    let b = buffer.get_ref()?;
+    let b = buffer.get_arc()?;
     let block = check_cl_bool(blocking_map).ok_or(CL_INVALID_VALUE)?;
     let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;
 
-    validate_map_flags(b, map_flags)?;
+    validate_map_flags(&b, map_flags)?;
 
     // CL_INVALID_VALUE if region being mapped given by (offset, size) is out of bounds or if size
     // is 0
@@ -1525,17 +1526,37 @@ pub fn enqueue_map_buffer(
         return Err(CL_INVALID_CONTEXT);
     }
 
-    create_and_queue(
-        q.clone(),
-        CL_COMMAND_MAP_BUFFER,
-        evs,
-        event,
-        block,
-        // we don't really have anything to do here?
-        Box::new(|_, _| Ok(())),
-    )?;
+    if block {
+        let ptr = Arc::new(Cell::new(Ok(ptr::null_mut())));
+        let cloned = ptr.clone();
+        create_and_queue(
+            q,
+            CL_COMMAND_MAP_BUFFER,
+            evs,
+            event,
+            block,
+            // we don't really have anything to do here?
+            Box::new(move |q, ctx| {
+                cloned.set(b.map_buffer(q, Some(ctx), offset, size));
+                Ok(())
+            }),
+        )?;
+
+        ptr.get()
+    } else {
+        create_and_queue(
+            q.clone(),
+            CL_COMMAND_MAP_BUFFER,
+            evs,
+            event,
+            block,
+            // we don't really have anything to do here?
+            Box::new(|_, _| Ok(())),
+        )?;
+
+        b.map_buffer(&q, None, offset, size)
+    }
 
-    b.map_buffer(&q, offset, size, block)
     // TODO
     // CL_MISALIGNED_SUB_BUFFER_OFFSET if buffer is a sub-buffer object and offset specified when the sub-buffer object is created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN value for the device associated with queue. This error code is missing before version 1.1.
     // CL_MAP_FAILURE if there is a failure to map the requested region into the host address space. This error cannot occur for buffer objects created with CL_MEM_USE_HOST_PTR or CL_MEM_ALLOC_HOST_PTR.
@@ -1815,6 +1836,106 @@ pub fn enqueue_fill_image(
     //image are not supported by device associated with queue.
 }
 
+pub fn enqueue_copy_buffer_to_image(
+    command_queue: cl_command_queue,
+    src_buffer: cl_mem,
+    dst_image: cl_mem,
+    src_offset: usize,
+    dst_origin: *const usize,
+    region: *const usize,
+    num_events_in_wait_list: cl_uint,
+    event_wait_list: *const cl_event,
+    event: *mut cl_event,
+) -> CLResult<()> {
+    let q = command_queue.get_arc()?;
+    let src = src_buffer.get_arc()?;
+    let dst = dst_image.get_arc()?;
+    let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;
+
+    // CL_INVALID_CONTEXT if the context associated with command_queue, src_buffer and dst_image
+    // are not the same
+    if q.context != src.context || q.context != dst.context {
+        return Err(CL_INVALID_CONTEXT);
+    }
+
+    // CL_INVALID_VALUE if dst_origin or region is NULL.
+    if dst_origin.is_null() || region.is_null() {
+        return Err(CL_INVALID_VALUE);
+    }
+
+    let region = unsafe { CLVec::from_raw(region) };
+    let src_origin = CLVec::new([src_offset, 0, 0]);
+    let dst_origin = unsafe { CLVec::from_raw(dst_origin) };
+
+    create_and_queue(
+        q,
+        CL_COMMAND_COPY_BUFFER_TO_IMAGE,
+        evs,
+        event,
+        false,
+        Box::new(move |q, ctx| src.copy_to(q, ctx, &dst, src_origin, dst_origin, &region)),
+    )
+
+    //• CL_INVALID_MEM_OBJECT if src_buffer is not a valid buffer object or dst_image is not a valid image object or if dst_image is a 1D image buffer object created from src_buffer.
+    //• CL_INVALID_VALUE if the 1D, 2D or 3D rectangular region specified by dst_origin and dst_origin + region refer to a region outside dst_image, or if the region specified by src_offset and src_offset + src_cb refer to a region outside src_buffer.
+    //• CL_INVALID_VALUE if values in dst_origin and region do not follow rules described in the argument description for dst_origin and region.
+    //• CL_MISALIGNED_SUB_BUFFER_OFFSET if src_buffer is a sub-buffer object and offset specified when the sub-buffer object is created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN value for device associated with queue.
+    //• CL_INVALID_IMAGE_SIZE if image dimensions (image width, height, specified or compute row and/or slice pitch) for dst_image are not supported by device associated with queue.
+    //• CL_IMAGE_FORMAT_NOT_SUPPORTED if image format (image channel order and data type) for dst_image are not supported by device associated with queue.
+    //• CL_MEM_OBJECT_ALLOCATION_FAILURE if there is a failure to allocate memory for data store associated with src_buffer or dst_image.
+    //• CL_INVALID_OPERATION if the device associated with command_queue does not support images (i.e. CL_DEVICE_IMAGE_SUPPORT specified in the Device Queries table is CL_FALSE).
+}
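Worth noting in enqueue_copy_buffer_to_image above (and its mirror image below): the buffer side of the copy has no 3D coordinates, so its byte offset rides in component 0 of a 3-component origin, and the transfer size is later derived from the region's pixel count times the format's bytes per pixel. A self-contained sketch of that arithmetic, with plain arrays standing in for CLVec:

// Sketch: buffer coordinates in a buffer<->image copy.
fn packed_origin(byte_offset: usize) -> [usize; 3] {
    [byte_offset, 0, 0] // the buffer is addressed linearly
}

fn tight_size(region: [usize; 3], bytes_per_pixel: usize) -> usize {
    region.iter().product::<usize>() * bytes_per_pixel
}

fn main() {
    assert_eq!(packed_origin(64), [64, 0, 0]);
    // a 16x8x2 region of RGBA8 pixels occupies 16 * 8 * 2 * 4 = 1024 bytes
    assert_eq!(tight_size([16, 8, 2], 4), 1024);
}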
+
+pub fn enqueue_copy_image_to_buffer(
+    command_queue: cl_command_queue,
+    src_image: cl_mem,
+    dst_buffer: cl_mem,
+    src_origin: *const usize,
+    region: *const usize,
+    dst_offset: usize,
+    num_events_in_wait_list: cl_uint,
+    event_wait_list: *const cl_event,
+    event: *mut cl_event,
+) -> CLResult<()> {
+    let q = command_queue.get_arc()?;
+    let src = src_image.get_arc()?;
+    let dst = dst_buffer.get_arc()?;
+    let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;
+
+    // CL_INVALID_CONTEXT if the context associated with command_queue, src_image and dst_buffer
+    // are not the same
+    if q.context != src.context || q.context != dst.context {
+        return Err(CL_INVALID_CONTEXT);
+    }
+
+    // CL_INVALID_VALUE if src_origin or region is NULL.
+    if src_origin.is_null() || region.is_null() {
+        return Err(CL_INVALID_VALUE);
+    }
+
+    let region = unsafe { CLVec::from_raw(region) };
+    let src_origin = unsafe { CLVec::from_raw(src_origin) };
+    let dst_origin = CLVec::new([dst_offset, 0, 0]);
+
+    create_and_queue(
+        q,
+        CL_COMMAND_COPY_IMAGE_TO_BUFFER,
+        evs,
+        event,
+        false,
+        Box::new(move |q, ctx| src.copy_to(q, ctx, &dst, src_origin, dst_origin, &region)),
+    )
+
+    //• CL_INVALID_MEM_OBJECT if src_image is not a valid image object or dst_buffer is not a valid buffer object or if src_image is a 1D image buffer object created from dst_buffer.
+    //• CL_INVALID_VALUE if the 1D, 2D or 3D rectangular region specified by src_origin and src_origin + region refers to a region outside src_image, or if the region specified by dst_offset and dst_offset + dst_cb refer to a region outside dst_buffer.
+    //• CL_INVALID_VALUE if values in src_origin and region do not follow rules described in the argument description for src_origin and region.
+    //• CL_MISALIGNED_SUB_BUFFER_OFFSET if dst_buffer is a sub-buffer object and offset specified when the sub-buffer object is created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN value for device associated with queue. This error code is missing before version 1.1.
+    //• CL_INVALID_IMAGE_SIZE if image dimensions (image width, height, specified or compute row and/or slice pitch) for src_image are not supported by device associated with queue.
+    //• CL_IMAGE_FORMAT_NOT_SUPPORTED if image format (image channel order and data type) for src_image are not supported by device associated with queue.
+    //• CL_MEM_OBJECT_ALLOCATION_FAILURE if there is a failure to allocate memory for data store associated with src_image or dst_buffer.
+    //• CL_INVALID_OPERATION if the device associated with command_queue does not support images (i.e. CL_DEVICE_IMAGE_SUPPORT specified in the Device Queries table is CL_FALSE).
+}
+
 pub fn enqueue_map_image(
     command_queue: cl_command_queue,
     image: cl_mem,
@@ -1829,11 +1950,11 @@ pub fn enqueue_map_image(
     event: *mut cl_event,
 ) -> CLResult<*mut ::std::os::raw::c_void> {
     let q = command_queue.get_arc()?;
-    let i = image.get_ref()?;
+    let i = image.get_arc()?;
     let block = check_cl_bool(blocking_map).ok_or(CL_INVALID_VALUE)?;
     let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;
 
-    validate_map_flags(i, map_flags)?;
+    validate_map_flags(&i, map_flags)?;
 
     // CL_INVALID_CONTEXT if context associated with command_queue and image are not the same
     if i.context != q.context {
         return Err(CL_INVALID_CONTEXT);
     }
@@ -1849,16 +1970,6 @@ pub fn enqueue_map_image(
     let region = unsafe { CLVec::from_raw(region) };
     let origin = unsafe { CLVec::from_raw(origin) };
 
-    create_and_queue(
-        q.clone(),
-        CL_COMMAND_MAP_IMAGE,
-        evs,
-        event,
-        block,
-        // we don't really have anything to do here?
-        Box::new(|_, _| Ok(())),
-    )?;
-
     let mut dummy_slice_pitch: usize = 0;
     let image_slice_pitch = if image_slice_pitch.is_null() {
         // CL_INVALID_VALUE if image is a 3D image, 1D or 2D image array object and
@@ -1871,14 +1982,60 @@ pub fn enqueue_map_image(
         unsafe { image_slice_pitch.as_mut().unwrap() }
     };
 
-    i.map_image(
-        &q,
-        &origin,
-        &region,
-        unsafe { image_row_pitch.as_mut().unwrap() },
-        image_slice_pitch,
-        block,
-    )
+    if block {
+        let res = Arc::new(Cell::new((Ok(ptr::null_mut()), 0, 0)));
+        let cloned = res.clone();
+
+        create_and_queue(
+            q.clone(),
+            CL_COMMAND_MAP_IMAGE,
+            evs,
+            event,
+            block,
+            // we don't really have anything to do here?
+            Box::new(move |q, ctx| {
+                let mut image_row_pitch = 0;
+                let mut image_slice_pitch = 0;
+
+                let ptr = i.map_image(
+                    q,
+                    Some(ctx),
+                    &origin,
+                    &region,
+                    &mut image_row_pitch,
+                    &mut image_slice_pitch,
+                );
+                cloned.set((ptr, image_row_pitch, image_slice_pitch));
+
+                Ok(())
+            }),
+        )?;
+
+        let res = res.get();
+        unsafe { *image_row_pitch = res.1 };
+        *image_slice_pitch = res.2;
+        res.0
+    } else {
+        create_and_queue(
+            q.clone(),
+            CL_COMMAND_MAP_IMAGE,
+            evs,
+            event,
+            block,
+            // we don't really have anything to do here?
+            Box::new(|_, _| Ok(())),
+        )?;
+
+        i.map_image(
+            &q,
+            None,
+            &origin,
+            &region,
+            unsafe { image_row_pitch.as_mut().unwrap() },
+            image_slice_pitch,
+        )
+    }
+
     //• CL_INVALID_VALUE if region being mapped given by (origin, origin + region) is out of bounds or if values specified in map_flags are not valid.
     //• CL_INVALID_VALUE if values in origin and region do not follow rules described in the argument description for origin and region.
     //• CL_INVALID_IMAGE_SIZE if image dimensions (image width, height, specified or compute row and/or slice pitch) for image are not supported by device associated with queue.
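Both map entry points above use the same trick for the blocking case: the map result is produced inside the queued closure, so a shared Arc<Cell<...>> carries it back to the caller once the blocking submission has finished. Reduced to its essence (no real queue or events here; the closure stands in for the work item that create_and_queue() completes before returning when block is set):

// Sketch of the blocking-map result channel.
use std::cell::Cell;
use std::sync::Arc;

fn main() {
    let slot: Arc<Cell<Option<usize>>> = Arc::new(Cell::new(None));
    let cloned = Arc::clone(&slot);

    // stand-in for Box::new(move |q, ctx| { cloned.set(b.map_buffer(..)); .. })
    let work = move || cloned.set(Some(0xdead));

    work(); // with block == true the queue finishes this before we read back
    assert_eq!(slot.get(), Some(0xdead));
}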
diff --git a/src/gallium/frontends/rusticl/api/types.rs b/src/gallium/frontends/rusticl/api/types.rs
index 5f1856f1e65..962a7ec4c4a 100644
--- a/src/gallium/frontends/rusticl/api/types.rs
+++ b/src/gallium/frontends/rusticl/api/types.rs
@@ -2,6 +2,8 @@ extern crate rusticl_opencl_gen;
 
 use self::rusticl_opencl_gen::*;
 
+use std::iter::Product;
+
 #[macro_export]
 macro_rules! cl_closure {
     (|$obj:ident| $cb:ident($($arg:ident$(,)?)*)) => {
@@ -91,6 +93,13 @@ impl<T: Copy> CLVec<T> {
     pub unsafe fn from_raw(v: *const T) -> Self {
         Self { vals: *v.cast() }
     }
+
+    pub fn pixels<'a>(&'a self) -> T
+    where
+        T: Product<&'a T>,
+    {
+        self.vals.iter().product()
+    }
 }
 
 impl CLVec<usize> {
diff --git a/src/gallium/frontends/rusticl/core/context.rs b/src/gallium/frontends/rusticl/core/context.rs
index 0dd513fd88c..77299d155d8 100644
--- a/src/gallium/frontends/rusticl/core/context.rs
+++ b/src/gallium/frontends/rusticl/core/context.rs
@@ -4,6 +4,7 @@ extern crate rusticl_opencl_gen;
 use crate::api::icd::*;
 use crate::core::device::*;
 use crate::core::format::*;
+use crate::core::memory::*;
 use crate::core::util::*;
 use crate::impl_cl_type_trait;
 
@@ -35,7 +36,11 @@ impl Context {
         })
     }
 
-    pub fn create_buffer(&self, size: usize) -> CLResult<HashMap<Arc<Device>, Arc<PipeResource>>> {
+    pub fn create_buffer(
+        &self,
+        size: usize,
+        user_ptr: *mut c_void,
+    ) -> CLResult<HashMap<Arc<Device>, Arc<PipeResource>>> {
         let adj_size: u32 = size.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?;
         let mut res = HashMap::new();
         for dev in &self.devs {
@@ -45,6 +50,16 @@ impl Context {
                 .ok_or(CL_OUT_OF_RESOURCES);
             res.insert(Arc::clone(dev), Arc::new(resource?));
         }
+
+        if !user_ptr.is_null() {
+            res.iter()
+                .map(|(d, r)| {
+                    d.helper_ctx()
+                        .exec(|ctx| ctx.buffer_subdata(r, 0, user_ptr, size.try_into().unwrap()))
+                })
+                .for_each(|f| f.wait());
+        }
+
         Ok(res)
     }
 
@@ -69,6 +84,7 @@ impl Context {
         &self,
         desc: &cl_image_desc,
         format: &cl_image_format,
+        user_ptr: *mut c_void,
     ) -> CLResult<HashMap<Arc<Device>, Arc<PipeResource>>> {
         let width = desc
             .image_width
@@ -97,6 +113,20 @@ impl Context {
                 .ok_or(CL_OUT_OF_RESOURCES);
             res.insert(Arc::clone(dev), Arc::new(resource?));
         }
+
+        if !user_ptr.is_null() {
+            let bx = desc.bx()?;
+            let stride = desc.row_pitch()?;
+            let layer_stride = desc.slice_pitch()?;
+
+            res.iter()
+                .map(|(d, r)| {
+                    d.helper_ctx()
+                        .exec(|ctx| ctx.texture_subdata(r, &bx, user_ptr, stride, layer_stride))
+                })
+                .for_each(|f| f.wait());
+        }
+
         Ok(res)
     }
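The pixels() helper added to CLVec above is simply the product of the three region components; core/memory.rs below uses it to turn a pixel region into a flat element count. Demonstrated on a stand-in type (the real CLVec wraps a [T; 3] the same way):

// pixels() in isolation: the product of the region's components.
struct Region {
    vals: [usize; 3],
}

impl Region {
    fn pixels(&self) -> usize {
        self.vals.iter().product()
    }
}

fn main() {
    let region = Region { vals: [16, 8, 2] };
    assert_eq!(region.pixels(), 16 * 8 * 2); // 256 pixels
}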
diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs
index 59295c2526e..c4831b952ab 100644
--- a/src/gallium/frontends/rusticl/core/device.rs
+++ b/src/gallium/frontends/rusticl/core/device.rs
@@ -14,7 +14,10 @@ use self::mesa_rust::compiler::clc::*;
 use self::mesa_rust::compiler::nir::*;
 use self::mesa_rust::pipe::context::*;
 use self::mesa_rust::pipe::device::load_screens;
+use self::mesa_rust::pipe::fence::*;
+use self::mesa_rust::pipe::resource::*;
 use self::mesa_rust::pipe::screen::*;
+use self::mesa_rust::pipe::transfer::*;
 use self::mesa_rust_gen::*;
 use self::rusticl_opencl_gen::*;
 
@@ -23,6 +26,7 @@ use std::cmp::min;
 use std::collections::HashMap;
 use std::convert::TryInto;
 use std::env;
+use std::os::raw::*;
 use std::sync::Arc;
 use std::sync::Mutex;
 use std::sync::MutexGuard;
@@ -42,6 +46,67 @@ pub struct Device {
     helper_ctx: Mutex<Arc<PipeContext>>,
 }
 
+pub trait HelperContextWrapper {
+    #[must_use]
+    fn exec<F>(&self, func: F) -> PipeFence
+    where
+        F: Fn(&HelperContext);
+
+    fn buffer_map_async(&self, res: &PipeResource, offset: i32, size: i32) -> PipeTransfer;
+    fn texture_map_async(&self, res: &PipeResource, bx: &pipe_box) -> PipeTransfer;
+    fn unmap(&self, tx: PipeTransfer);
+}
+
+pub struct HelperContext<'a> {
+    lock: MutexGuard<'a, Arc<PipeContext>>,
+}
+
+impl<'a> HelperContext<'a> {
+    pub fn buffer_subdata(
+        &self,
+        res: &PipeResource,
+        offset: c_uint,
+        data: *const c_void,
+        size: c_uint,
+    ) {
+        self.lock.buffer_subdata(res, offset, data, size)
+    }
+
+    pub fn texture_subdata(
+        &self,
+        res: &PipeResource,
+        bx: &pipe_box,
+        data: *const c_void,
+        stride: u32,
+        layer_stride: u32,
+    ) {
+        self.lock
+            .texture_subdata(res, bx, data, stride, layer_stride)
+    }
+}
+
+impl<'a> HelperContextWrapper for HelperContext<'a> {
+    fn exec<F>(&self, func: F) -> PipeFence
+    where
+        F: Fn(&HelperContext),
+    {
+        func(self);
+        self.lock.flush()
+    }
+
+    fn buffer_map_async(&self, res: &PipeResource, offset: i32, size: i32) -> PipeTransfer {
+        self.lock.buffer_map(res, offset, size, false)
+    }
+
+    fn texture_map_async(&self, res: &PipeResource, bx: &pipe_box) -> PipeTransfer {
+        self.lock.texture_map(res, bx, false)
+    }
+
+    fn unmap(&self, tx: PipeTransfer) {
+        tx.with_ctx(&self.lock);
+    }
+}
+
 impl_cl_type_trait!(cl_device_id, Device, CL_INVALID_DEVICE);
 
 impl Device {
@@ -534,8 +599,10 @@ impl Device {
         id as u32
     }
 
-    pub fn helper_ctx(&self) -> MutexGuard<Arc<PipeContext>> {
-        self.helper_ctx.lock().unwrap()
+    pub fn helper_ctx(&self) -> impl HelperContextWrapper + '_ {
+        HelperContext {
+            lock: self.helper_ctx.lock().unwrap(),
+        }
     }
 
     pub fn cl_features(&self) -> clc_optional_features {
diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs
index c56c2279720..32a0b929f05 100644
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@@ -467,12 +467,12 @@ impl Kernel {
                             .resource_create_buffer(buf.len() as u32)
                             .unwrap(),
                     );
-                    q.device.helper_ctx().buffer_subdata(
-                        &res,
-                        0,
-                        buf.as_ptr().cast(),
-                        buf.len() as u32,
-                    );
+                    q.device
+                        .helper_ctx()
+                        .exec(|ctx| {
+                            ctx.buffer_subdata(&res, 0, buf.as_ptr().cast(), buf.len() as u32)
+                        })
+                        .wait();
                     resource_info.push((Some(res), arg.offset));
                 }
                 InternalKernelArgType::GlobalWorkOffsets => {
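The HelperContextWrapper introduced above changes the helper-context contract: callers no longer poke the locked PipeContext directly, they hand a closure to exec() and get back the fence from the flush, as the kernel.rs hunk shows. A toy model of that contract (stand-in types only, not the Gallium ones):

// Toy model of the exec()/fence pattern from core/device.rs above.
use std::cell::Cell;

struct Fence(u32);
impl Fence {
    fn wait(self) {} // the real PipeFence blocks until the GPU is done
}

struct HelperCtx {
    flushes: Cell<u32>,
}

impl HelperCtx {
    fn exec<F: Fn(&HelperCtx)>(&self, func: F) -> Fence {
        func(self); // record the work, e.g. buffer_subdata(...)
        self.flushes.set(self.flushes.get() + 1);
        Fence(self.flushes.get()) // flush() hands back a fence
    }
}

fn main() {
    let ctx = HelperCtx { flushes: Cell::new(0) };
    let uploaded = Cell::new(false);
    ctx.exec(|_| uploaded.set(true)).wait();
    assert!(uploaded.get() && ctx.flushes.get() == 1);
}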
diff --git a/src/gallium/frontends/rusticl/core/memory.rs b/src/gallium/frontends/rusticl/core/memory.rs
index 30e40442fc5..193d33ad612 100644
--- a/src/gallium/frontends/rusticl/core/memory.rs
+++ b/src/gallium/frontends/rusticl/core/memory.rs
@@ -216,16 +216,10 @@ impl Mem {
         let buffer = if bit_check(flags, CL_MEM_USE_HOST_PTR) {
             context.create_buffer_from_user(size, host_ptr)
         } else {
-            context.create_buffer(size)
+            assert_eq!(bit_check(flags, CL_MEM_COPY_HOST_PTR), !host_ptr.is_null());
+            context.create_buffer(size, host_ptr)
         }?;
 
-        if bit_check(flags, CL_MEM_COPY_HOST_PTR) {
-            for (d, r) in &buffer {
-                d.helper_ctx()
-                    .buffer_subdata(r, 0, host_ptr, size.try_into().unwrap());
-            }
-        }
-
         let host_ptr = if bit_check(flags, CL_MEM_USE_HOST_PTR) {
             host_ptr
         } else {
@@ -313,20 +307,10 @@ impl Mem {
         let texture = if bit_check(flags, CL_MEM_USE_HOST_PTR) {
             context.create_texture_from_user(&image_desc, image_format, host_ptr)
         } else {
-            context.create_texture(&image_desc, image_format)
+            assert_eq!(bit_check(flags, CL_MEM_COPY_HOST_PTR), !host_ptr.is_null());
+            context.create_texture(&image_desc, image_format, host_ptr)
         }?;
 
-        if bit_check(flags, CL_MEM_COPY_HOST_PTR) {
-            let bx = image_desc.bx()?;
-            let stride = image_desc.row_pitch()?;
-            let layer_stride = image_desc.slice_pitch()?;
-
-            for (d, r) in &texture {
-                d.helper_ctx()
-                    .texture_subdata(r, &bx, host_ptr, stride, layer_stride);
-            }
-        }
-
         let host_ptr = if bit_check(flags, CL_MEM_USE_HOST_PTR) {
             host_ptr
         } else {
@@ -356,38 +340,67 @@ impl Mem {
         self.mem_type == CL_MEM_OBJECT_BUFFER
     }
 
-    fn tx(
+    fn tx_raw(
         &self,
         q: &Arc<Queue>,
-        ctx: &PipeContext,
+        ctx: Option<&PipeContext>,
         mut offset: usize,
         size: usize,
-        blocking: bool,
     ) -> CLResult<PipeTransfer> {
         let b = self.to_parent(&mut offset);
         let r = b.get_res()?.get(&q.device).unwrap();
 
         assert!(self.is_buffer());
 
-        Ok(ctx.buffer_map(
-            r,
-            offset.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?,
-            size.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?,
-            blocking,
-        ))
+        Ok(if let Some(ctx) = ctx {
+            ctx.buffer_map(
+                r,
+                offset.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?,
+                size.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?,
+                true,
+            )
+        } else {
+            q.device.helper_ctx().buffer_map_async(
+                r,
+                offset.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?,
+                size.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?,
+            )
+        })
     }
 
-    fn tx_image(
+    fn tx<'a>(
         &self,
         q: &Arc<Queue>,
-        ctx: &PipeContext,
+        ctx: &'a PipeContext,
+        offset: usize,
+        size: usize,
+    ) -> CLResult<GuardedPipeTransfer<'a>> {
+        Ok(self.tx_raw(q, Some(ctx), offset, size)?.with_ctx(ctx))
+    }
+
+    fn tx_image_raw(
+        &self,
+        q: &Arc<Queue>,
+        ctx: Option<&PipeContext>,
         bx: &pipe_box,
-        blocking: bool,
     ) -> CLResult<PipeTransfer> {
         assert!(!self.is_buffer());
 
         let r = self.get_res()?.get(&q.device).unwrap();
-        Ok(ctx.texture_map(r, bx, blocking))
+        Ok(if let Some(ctx) = ctx {
+            ctx.texture_map(r, bx, true)
+        } else {
+            q.device.helper_ctx().texture_map_async(r, bx)
+        })
+    }
+
+    fn tx_image<'a>(
+        &self,
+        q: &Arc<Queue>,
+        ctx: &'a PipeContext,
+        bx: &pipe_box,
+    ) -> CLResult<GuardedPipeTransfer<'a>> {
+        Ok(self.tx_image_raw(q, Some(ctx), bx)?.with_ctx(ctx))
     }
 
     pub fn has_same_parent(&self, other: &Self) -> bool {
@@ -428,7 +441,7 @@ impl Mem {
     ) -> CLResult<()> {
         assert!(self.is_buffer());
 
-        let tx = self.tx(q, ctx, offset, size, true)?;
+        let tx = self.tx(q, ctx, offset, size)?;
 
         unsafe {
             ptr::copy_nonoverlapping(tx.ptr(), ptr, size);
@@ -469,17 +482,59 @@ impl Mem {
     ) -> CLResult<()> {
         let src = self.to_parent(&mut src_origin[0]);
         let dst = dst.to_parent(&mut dst_origin[0]);
-        let bx = create_box(&src_origin, region, self.mem_type)?;
-        let mut dst_origin: [u32; 3] = dst_origin.try_into()?;
 
         let src_res = src.get_res()?.get(&q.device).unwrap();
         let dst_res = dst.get_res()?.get(&q.device).unwrap();
 
-        if self.mem_type == CL_MEM_OBJECT_IMAGE1D_ARRAY {
-            (dst_origin[1], dst_origin[2]) = (dst_origin[2], dst_origin[1]);
-        }
+        if self.is_buffer() && !dst.is_buffer() || !self.is_buffer() && dst.is_buffer() {
+            let tx_src;
+            let tx_dst;
 
-        ctx.resource_copy_region(src_res, dst_res, &dst_origin, &bx);
+            if self.is_buffer() {
+                let bpp = dst.image_format.pixel_size().unwrap() as usize;
+                tx_src = self.tx(q, ctx, src_origin[0], region.pixels() * bpp)?;
+                tx_dst = dst.tx_image(q, ctx, &create_box(&dst_origin, region, dst.mem_type)?)?;
+
+                sw_copy(
+                    tx_src.ptr(),
+                    tx_dst.ptr(),
+                    region,
+                    &CLVec::default(),
+                    region[0] * bpp,
+                    region[0] * region[1] * bpp,
+                    &CLVec::default(),
+                    tx_dst.row_pitch() as usize,
+                    tx_dst.slice_pitch() as usize,
+                    bpp as u8,
+                )
+            } else {
+                let bpp = self.image_format.pixel_size().unwrap() as usize;
+                tx_src = self.tx_image(q, ctx, &create_box(&src_origin, region, self.mem_type)?)?;
+                tx_dst = dst.tx(q, ctx, dst_origin[0], region.pixels() * bpp)?;
+
+                sw_copy(
+                    tx_src.ptr(),
+                    tx_dst.ptr(),
+                    region,
+                    &CLVec::default(),
+                    tx_src.row_pitch() as usize,
+                    tx_src.slice_pitch() as usize,
+                    &CLVec::default(),
+                    region[0] * bpp,
+                    region[0] * region[1] * bpp,
+                    bpp as u8,
+                )
+            }
+        } else {
+            let bx = create_box(&src_origin, region, self.mem_type)?;
+            let mut dst_origin: [u32; 3] = dst_origin.try_into()?;
+
+            if self.mem_type == CL_MEM_OBJECT_IMAGE1D_ARRAY {
+                (dst_origin[1], dst_origin[2]) = (dst_origin[2], dst_origin[1]);
+            }
+
+            ctx.resource_copy_region(src_res, dst_res, &dst_origin, &bx);
+        }
 
         Ok(())
     }
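The sw_copy() calls above are where the v2 note in the commit message lands: the linear buffer side uses tightly packed pitches computed from the region, while the image side uses the row/slice pitches reported by the pipe_transfer_map, which the driver may pad. A small illustration of the addressing difference (the hardware pitch values are made up):

// Byte addressing as used by sw_copy(): x * bpp + y * row_pitch + z * slice_pitch.
fn byte_offset(pos: [usize; 3], bpp: usize, row_pitch: usize, slice_pitch: usize) -> usize {
    pos[0] * bpp + pos[1] * row_pitch + pos[2] * slice_pitch
}

fn main() {
    let (region, bpp) = ([16usize, 8, 2], 4);

    // tight pitches for the linear buffer side
    let tight_row = region[0] * bpp; // 64 bytes per row
    let tight_slice = region[0] * region[1] * bpp; // 512 bytes per slice

    // hypothetical driver pitches for the mapped image side
    let (hw_row, hw_slice) = (256, 4096);

    assert_eq!(byte_offset([2, 1, 1], bpp, tight_row, tight_slice), 584);
    assert_eq!(byte_offset([2, 1, 1], bpp, hw_row, hw_slice), 4360);
}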
@@ -546,7 +601,7 @@ impl Mem {
         dst_slice_pitch: usize,
     ) -> CLResult<()> {
         if self.is_buffer() {
-            let tx = self.tx(q, ctx, 0, self.size, true)?;
+            let tx = self.tx(q, ctx, 0, self.size)?;
             sw_copy(
                 src,
                 tx.ptr(),
@@ -603,13 +658,13 @@ impl Mem {
         let pixel_size;
 
         if self.is_buffer() {
-            tx = self.tx(q, ctx, 0, self.size, true)?;
+            tx = self.tx(q, ctx, 0, self.size)?;
             pixel_size = 1;
         } else {
             assert!(dst_origin == &CLVec::default());
 
             let bx = create_box(src_origin, region, self.mem_type)?;
-            tx = self.tx_image(q, ctx, &bx, true)?;
+            tx = self.tx_image(q, ctx, &bx)?;
             src_row_pitch = tx.row_pitch() as usize;
             src_slice_pitch = tx.slice_pitch() as usize;
 
@@ -646,9 +701,10 @@ impl Mem {
         dst_slice_pitch: usize,
     ) -> CLResult<()> {
         assert!(self.is_buffer());
+        assert!(dst.is_buffer());
 
-        let tx_src = self.tx(q, ctx, 0, self.size, true)?;
-        let tx_dst = dst.tx(q, ctx, 0, self.size, true)?;
+        let tx_src = self.tx(q, ctx, 0, self.size)?;
+        let tx_dst = dst.tx(q, ctx, 0, self.size)?;
 
         // TODO check to use hw accelerated paths (e.g. resource_copy_region or blits)
         sw_copy(
@@ -670,16 +726,15 @@ impl Mem {
     fn map<'a>(
         &self,
         q: &Arc<Queue>,
-        ctx: &PipeContext,
+        ctx: Option<&PipeContext>,
         lock: &'a mut MutexGuard<Mappings>,
-        block: bool,
     ) -> CLResult<&'a PipeTransfer> {
         if !lock.tx.contains_key(&q.device) {
             let tx = if self.is_buffer() {
-                self.tx(q, ctx, 0, self.size, block)?
+                self.tx_raw(q, ctx, 0, self.size)?
             } else {
                 let bx = self.image_desc.bx()?;
-                self.tx_image(q, ctx, &bx, block)?
+                self.tx_image_raw(q, ctx, &bx)?
             };
 
             lock.tx.insert(q.device.clone(), (tx, 0));
@@ -694,14 +749,14 @@ impl Mem {
     pub fn map_buffer(
         &self,
         q: &Arc<Queue>,
+        ctx: Option<&PipeContext>,
         offset: usize,
         _size: usize,
-        block: bool,
     ) -> CLResult<*mut c_void> {
         assert!(self.is_buffer());
 
         let mut lock = self.maps.lock().unwrap();
-        let tx = self.map(q, &q.device.helper_ctx(), &mut lock, block)?;
+        let tx = self.map(q, ctx, &mut lock)?;
 
         let ptr = unsafe { tx.ptr().add(offset) };
         if let Some(e) = lock.maps.get_mut(&ptr) {
@@ -716,16 +771,16 @@ impl Mem {
     pub fn map_image(
         &self,
        q: &Arc<Queue>,
+        ctx: Option<&PipeContext>,
         origin: &CLVec<usize>,
         _region: &CLVec<usize>,
         row_pitch: &mut usize,
         slice_pitch: &mut usize,
-        block: bool,
     ) -> CLResult<*mut c_void> {
         assert!(!self.is_buffer());
 
         let mut lock = self.maps.lock().unwrap();
-        let tx = self.map(q, &q.device.helper_ctx(), &mut lock, block)?;
+        let tx = self.map(q, ctx, &mut lock)?;
 
         *row_pitch = tx.row_pitch() as usize;
         *slice_pitch = tx.slice_pitch() as usize;
@@ -786,7 +841,7 @@ impl Drop for Mem {
             .for_each(|cb| cb(cl));
 
         for (d, tx) in self.maps.lock().unwrap().tx.drain() {
-            tx.0.with_ctx(&d.helper_ctx());
+            d.helper_ctx().unmap(tx.0);
         }
     }
 }
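Finally, a note on the bookkeeping that map_buffer()/map_image() above rely on: one transfer is cached per device in the mappings state, and each returned pointer carries a reference count so nested maps of the same region need matching unmaps before the cached PipeTransfer can be torn down in Drop. A simplified sketch of that counting, with a bare HashMap standing in for the real structures:

// Sketch of the per-pointer map reference counting.
use std::collections::HashMap;

fn main() {
    let mut maps: HashMap<usize, u32> = HashMap::new(); // host ptr -> map count

    let ptr = 0x1000usize;
    *maps.entry(ptr).or_insert(0) += 1; // first map
    *maps.entry(ptr).or_insert(0) += 1; // second map of the same pointer
    assert_eq!(maps[&ptr], 2);

    // two unmaps later the entry can go away and the transfer be released
    for _ in 0..2 {
        let cnt = maps.get_mut(&ptr).unwrap();
        *cnt -= 1;
        if *cnt == 0 {
            maps.remove(&ptr);
        }
    }
    assert!(maps.is_empty());
}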