rusticl: add support for coarse-grain buffer SVM

Reviewed-by: Adam Jackson <ajax@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32942>
Author: Karol Herbst, 2024-12-16 14:56:24 +01:00 (committed by Marge Bot)
Parent: 6e13e438d1
Commit: da4de8d7e3
14 changed files with 685 additions and 157 deletions


@ -756,7 +756,7 @@ Rusticl OpenCL 1.2 -- all DONE:
Rusticl OpenCL 2.0 -- all DONE:
Shared virtual memory in progress (nvc0, llvmpipe)
Shared virtual memory DONE (nvc0, llvmpipe)
Device queues not started
- cl_khr_create_command_queue DONE
- Additional queries for clGetDeviceInfo DONE


@ -28,3 +28,4 @@ VK_EXT_texel_buffer_alignment on panvk
cl_khr_kernel_clock on freedreno, iris, llvmpipe, nvc0, panfrost, radeonsi and zink with llvm-19 or newer
GL_KHR_texture_compression_astc_hdr on panfrost and asahi
cl_ext_buffer_device_address on llvmpipe and zink
Completed OpenCL 2.0 coarse grain buffer SVM support


@ -288,16 +288,17 @@ unsafe impl CLInfo<cl_device_info> for cl_device_id {
}
}
CL_DEVICE_SVM_CAPABILITIES | CL_DEVICE_SVM_CAPABILITIES_ARM => {
v.write::<cl_device_svm_capabilities>(
if dev.svm_supported() {
CL_DEVICE_SVM_COARSE_GRAIN_BUFFER
| CL_DEVICE_SVM_FINE_GRAIN_BUFFER
| CL_DEVICE_SVM_FINE_GRAIN_SYSTEM
} else {
0
}
.into(),
)
let mut caps = 0;
if dev.api_svm_supported() {
caps |= CL_DEVICE_SVM_COARSE_GRAIN_BUFFER;
}
if dev.system_svm_supported() {
caps |= CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM;
}
v.write::<cl_device_svm_capabilities>(caps.into())
}
CL_DEVICE_TYPE => {
// CL_DEVICE_TYPE_DEFAULT ... will never be returned in CL_DEVICE_TYPE for any

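The query above now builds the SVM capability mask from two independent checks. Below is a minimal standalone sketch of that logic, for illustration only: the constants mirror the `CL_DEVICE_SVM_*` bit values from the OpenCL headers, while `svm_caps`, `api_svm`, and `system_svm` are made-up names standing in for the device methods.

```rust
const CL_DEVICE_SVM_COARSE_GRAIN_BUFFER: u64 = 1 << 0;
const CL_DEVICE_SVM_FINE_GRAIN_BUFFER: u64 = 1 << 1;
const CL_DEVICE_SVM_FINE_GRAIN_SYSTEM: u64 = 1 << 2;

/// Hypothetical helper mirroring the match arm above.
fn svm_caps(api_svm: bool, system_svm: bool) -> u64 {
    let mut caps = 0;
    if api_svm {
        // Coarse-grain buffer SVM only needs a shared VA range plus explicit map/unmap.
        caps |= CL_DEVICE_SVM_COARSE_GRAIN_BUFFER;
    }
    if system_svm {
        // Fine-grain support still requires driver-side system SVM.
        caps |= CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM;
    }
    caps
}

fn main() {
    // A device with VM support but without system SVM now reports coarse-grain only.
    assert_eq!(svm_caps(true, false), CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
}
```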

@ -554,7 +554,7 @@ fn set_kernel_exec_info(
// CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM if no devices in the context associated with kernel
// support SVM.
let check_svm_support = || {
if devs.iter().all(|dev| !dev.svm_supported()) {
if devs.iter().all(|dev| !dev.api_svm_supported()) {
Err(CL_INVALID_OPERATION)
} else {
Ok(())
@ -577,16 +577,36 @@ fn set_kernel_exec_info(
CL_KERNEL_EXEC_INFO_SVM_PTRS | CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM => {
check_svm_support()?;
// reuse the existing container so we avoid reallocations
let mut svms = k.svms.lock().unwrap();
// To specify that no SVM allocations will be accessed by a kernel other than those set
// as kernel arguments, specify an empty set by passing param_value_size equal to zero
// and param_value equal to NULL.
if !param_value.is_null() || param_value_size != 0 {
let _ = unsafe {
if param_value_size == 0 && param_value.is_null() {
svms.clear();
} else {
let pointers = unsafe {
cl_slice::from_raw_parts_bytes_len::<*const c_void>(
param_value,
param_value_size,
)?
};
// We need to clear _after_ the error checking above. We could just assign a new
// container, however we also want to reuse the allocations.
svms.clear();
pointers
.iter()
// Each of the pointers can be the pointer returned by clSVMAlloc or can be a
// pointer to the middle of an SVM allocation. It is sufficient to specify one
// pointer for each SVM allocation.
//
// So we'll simply fetch the base and store that one.
.filter_map(|&handle| k.prog.context.find_svm_alloc(handle as usize))
.for_each(|(base, _)| {
svms.insert(base as usize);
});
}
}
CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM
@ -600,14 +620,19 @@ fn set_kernel_exec_info(
if val.len() != 1 {
return Err(CL_INVALID_VALUE);
}
// CL_INVALID_OPERATION if param_name is CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM and
// param_value is CL_TRUE but no devices in context associated with kernel support
// fine-grain system SVM allocations.
if val[0] == CL_TRUE && devs.iter().all(|dev| !dev.system_svm_supported()) {
return Err(CL_INVALID_OPERATION);
}
}
// CL_INVALID_VALUE if param_name is not valid
_ => return Err(CL_INVALID_VALUE),
}
Ok(())
// CL_INVALID_OPERATION if param_name is CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM and param_value is CL_TRUE but no devices in context associated with kernel support fine-grain system SVM allocations.
}
#[cl_entrypoint(clEnqueueNDRangeKernel)]

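For CL_KERNEL_EXEC_INFO_SVM_PTRS the hunk above reads `param_value` as a raw array of `void *` and then collapses each pointer to its allocation base. The sketch below covers only the non-empty parsing step, using plain `std` instead of rusticl's internal `cl_slice::from_raw_parts_bytes_len`; the helper name and exact error handling are illustrative assumptions.

```rust
use std::ffi::c_void;
use std::slice;

const CL_INVALID_VALUE: i32 = -30; // value from the OpenCL headers

/// Hypothetical stand-in for cl_slice::from_raw_parts_bytes_len::<*const c_void>().
///
/// SAFETY: the caller must guarantee `param_value` is suitably aligned, valid
/// for `param_value_size` bytes, and outlives `'a`.
unsafe fn pointers_from_param<'a>(
    param_value: *const c_void,
    param_value_size: usize,
) -> Result<&'a [*const c_void], i32> {
    let entry = std::mem::size_of::<*const c_void>();
    // The buffer has to hold a whole number of pointer-sized entries.
    if param_value.is_null() || param_value_size == 0 || param_value_size % entry != 0 {
        return Err(CL_INVALID_VALUE);
    }
    Ok(slice::from_raw_parts(param_value.cast(), param_value_size / entry))
}
```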

@ -21,7 +21,7 @@ use rusticl_proc_macros::cl_info_entrypoint;
use std::cmp;
use std::cmp::Ordering;
use std::mem::{self, MaybeUninit};
use std::mem;
use std::num::NonZeroU64;
use std::os::raw::c_void;
use std::ptr;
@ -297,13 +297,13 @@ fn create_buffer_with_properties(
// or if CL_MEM_USE_HOST_PTR is set in flags and host_ptr is a pointer returned by clSVMAlloc
// and size is greater than the size passed to clSVMAlloc.
if let Some((svm_ptr, svm_layout)) = c.find_svm_alloc(host_ptr as usize) {
if let Some((svm_ptr, alloc_size)) = c.find_svm_alloc(host_ptr as usize) {
// SAFETY: they are part of the same allocation, and because host_ptr >= svm_ptr we can cast
// to usize.
let diff = unsafe { host_ptr.byte_offset_from(svm_ptr) } as usize;
// technically we don't have to account for the offset, but it's almost for free.
if size > svm_layout - diff {
if size > alloc_size - diff {
return Err(CL_INVALID_BUFFER_SIZE);
}
}
@ -2376,13 +2376,16 @@ pub fn svm_alloc(
return Err(CL_INVALID_VALUE);
}
let alignment = if alignment != 0 {
alignment as usize
} else {
// When alignment is 0, the size of the largest supported type is used.
// In the case of the full profile, that's `long16`.
mem::size_of::<[u64; 16]>()
};
// When alignment is 0, the size of the largest supported type is used.
// In the case of the full profile, that's `long16`.
let alignment = NonZeroU64::new(alignment.into())
.unwrap_or(NonZeroU64::new(mem::size_of::<[u64; 16]>() as u64).unwrap());
// size is 0 or > CL_DEVICE_MAX_MEM_ALLOC_SIZE value for any device in context.
let size = NonZeroU64::new(size as u64).ok_or(CL_INVALID_VALUE)?;
if size.get() > c.max_mem_alloc() {
return Err(CL_INVALID_VALUE);
}
c.alloc_svm_ptr(size, alignment)
@ -2422,7 +2425,7 @@ fn enqueue_svm_free_impl(
}
// CL_INVALID_OPERATION if the device associated with command queue does not support SVM.
if !q.device.svm_supported() {
if !q.device.api_svm_supported() {
return Err(CL_INVALID_OPERATION);
}
@ -2529,47 +2532,33 @@ fn enqueue_svm_memcpy_impl(
let block = check_cl_bool(blocking_copy).ok_or(CL_INVALID_VALUE)?;
// CL_INVALID_OPERATION if the device associated with command queue does not support SVM.
if !q.device.svm_supported() {
if !q.device.api_svm_supported() {
return Err(CL_INVALID_OPERATION);
}
// CL_INVALID_VALUE if dst_ptr or src_ptr is NULL.
if src_ptr.is_null() || dst_ptr.is_null() {
return Err(CL_INVALID_VALUE);
}
let src_ptr = src_ptr as usize;
let dst_ptr = dst_ptr as usize;
// CL_MEM_COPY_OVERLAP if the values specified for dst_ptr, src_ptr and size result in an
// overlapping copy.
let dst_ptr_addr = dst_ptr as usize;
let src_ptr_addr = src_ptr as usize;
if (src_ptr_addr <= dst_ptr_addr && dst_ptr_addr < src_ptr_addr + size)
|| (dst_ptr_addr <= src_ptr_addr && src_ptr_addr < dst_ptr_addr + size)
if (src_ptr <= dst_ptr && dst_ptr < src_ptr + size)
|| (dst_ptr <= src_ptr && src_ptr < dst_ptr + size)
{
return Err(CL_MEM_COPY_OVERLAP);
}
// CAST: We have no idea about the type or initialization status of these bytes.
// MaybeUninit<u8> is the safe bet.
let src_ptr = src_ptr.cast::<MaybeUninit<u8>>();
// CAST: We have no idea about the type or initialization status of these bytes.
// MaybeUninit<u8> is the safe bet.
let dst_ptr = dst_ptr.cast::<MaybeUninit<u8>>();
// SAFETY: It is up to the application to ensure the memory is valid to read for `size` bytes
// and that it doesn't modify it until the command has completed.
let src = unsafe { cl_slice::from_raw_parts(src_ptr, size)? };
// SAFETY: We've ensured there's no aliasing between src and dst. It is up to the application
// to ensure the memory is valid to read and write for `size` bytes and that it doesn't modify
// or read from it until the command has completed.
let dst = unsafe { cl_slice::from_raw_parts_mut(dst_ptr, size)? };
create_and_queue(
q,
cmd_type,
evs,
event,
block,
Box::new(move |_, _| {
dst.copy_from_slice(src);
Ok(())
}),
Box::new(move |q, ctx| q.context.copy_svm(ctx, src_ptr, dst_ptr, size)),
)
}
@ -2636,10 +2625,20 @@ fn enqueue_svm_mem_fill_impl(
let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;
// CL_INVALID_OPERATION if the device associated with command queue does not support SVM.
if !q.device.svm_supported() {
if !q.device.api_svm_supported() {
return Err(CL_INVALID_OPERATION);
}
// CL_INVALID_VALUE if svm_ptr is NULL.
if svm_ptr.is_null() {
return Err(CL_INVALID_VALUE);
}
// CL_INVALID_VALUE if svm_ptr is not aligned to pattern_size bytes.
if !is_aligned_to(svm_ptr, pattern_size) {
return Err(CL_INVALID_VALUE);
}
// CL_INVALID_VALUE if pattern is NULL [...]
if pattern.is_null() {
return Err(CL_INVALID_VALUE);
@ -2701,35 +2700,9 @@ fn enqueue_svm_mem_fill_impl(
// `pattern_size` bytes and properly initialized.
// Creating a bitwise copy can't create memory safety issues, since `Pattern` is `Copy`.
let pattern = unsafe { pattern_ptr.read_unaligned() };
let svm_ptr = svm_ptr as usize;
// CAST: Same as with `pattern`, we don't know the exact type of `svm_ptr`, but we do
// know it's fine if we choose the same type here. The application might reasonably
// give us uninitialized memory though, so cast to a `MaybeUninit<Pattern>`, which has
// the same layout as `Pattern`.
let svm_ptr = svm_ptr.cast::<MaybeUninit<Pattern>>();
// SAFETY: It is the calling application's responsibility to ensure that `svm_ptr` is
// valid for reads and writes up to `size` bytes.
// Since `pattern_size == mem::size_of::<Pattern>()` and `MaybeUninit<Pattern>` has the
// same layout as `Pattern`, we know that
// `size / pattern_size * mem::size_of<MaybeUninit<Pattern>>` equals `size`.
//
// Since we're creating a `&[MaybeUninit<Pattern>]` the initialization status does not
// matter.
//
// From here on out we only access the referenced memory though this slice. In
// particular, since we've made a copy of `pattern`, it doesn't matter if the memory
// region referenced by `pattern` aliases the one referenced by this slice. It is up to
// the application not to access it at all until this command has been completed.
let svm_slice = unsafe { cl_slice::from_raw_parts_mut(svm_ptr, size / pattern_size)? };
Box::new(move |_, _| {
for x in svm_slice {
x.write(pattern);
}
Ok(())
})
Box::new(move |q, ctx| q.context.clear_svm(ctx, svm_ptr, size, pattern.0))
}};
}
@ -2817,7 +2790,7 @@ fn enqueue_svm_map_impl(
let block = check_cl_bool(blocking_map).ok_or(CL_INVALID_VALUE)?;
// CL_INVALID_OPERATION if the device associated with command queue does not support SVM.
if !q.device.svm_supported() {
if !q.device.api_svm_supported() {
return Err(CL_INVALID_OPERATION);
}
@ -2834,7 +2807,15 @@ fn enqueue_svm_map_impl(
// ... or if values specified in map_flags are not valid.
validate_map_flags_common(flags)?;
create_and_queue(q, cmd_type, evs, event, block, Box::new(|_, _| Ok(())))
let svm_ptr = svm_ptr as usize;
create_and_queue(
q,
cmd_type,
evs,
event,
block,
Box::new(move |q, ctx| q.context.copy_svm_to_host(ctx, svm_ptr, flags)),
)
}
#[cl_entrypoint(clEnqueueSVMMap)]
@ -2897,7 +2878,7 @@ fn enqueue_svm_unmap_impl(
let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;
// CL_INVALID_OPERATION if the device associated with command queue does not support SVM.
if !q.device.svm_supported() {
if !q.device.api_svm_supported() {
return Err(CL_INVALID_OPERATION);
}
@ -2906,7 +2887,15 @@ fn enqueue_svm_unmap_impl(
return Err(CL_INVALID_VALUE);
}
create_and_queue(q, cmd_type, evs, event, false, Box::new(|_, _| Ok(())))
create_and_queue(
q,
cmd_type,
evs,
event,
false,
// TODO: we _could_ migrate the content somewhere, but it's really pointless to do
Box::new(move |_, _| Ok(())),
)
}
#[cl_entrypoint(clEnqueueSVMUnmap)]
@ -2960,7 +2949,7 @@ fn enqueue_svm_migrate_mem(
let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;
// CL_INVALID_OPERATION if the device associated with command queue does not support SVM.
if !q.device.svm_supported() {
if !q.device.api_svm_supported() {
return Err(CL_INVALID_OPERATION);
}
@ -3010,9 +2999,9 @@ fn enqueue_svm_migrate_mem(
evs,
event,
false,
Box::new(move |_, ctx| {
ctx.svm_migrate(&svm_pointers, &sizes, to_device, content_undefined);
Ok(())
Box::new(move |q, ctx| {
q.context
.migrate_svm(ctx, svm_pointers, sizes, to_device, content_undefined)
}),
)
}

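The rewritten `enqueue_svm_memcpy_impl` above keeps the spec-mandated overlap check but now hands raw addresses to the context instead of building host slices. The check is easy to restate on its own; the helper below is illustrative, not rusticl API.

```rust
/// Two half-open ranges [src, src + size) and [dst, dst + size) overlap exactly
/// when either start address falls inside the other range.
fn copy_overlaps(src: usize, dst: usize, size: usize) -> bool {
    (src <= dst && dst < src + size) || (dst <= src && src < dst + size)
}

fn main() {
    assert!(copy_overlaps(0x1000, 0x10ff, 0x100)); // dst starts inside src
    assert!(!copy_overlaps(0x1000, 0x1100, 0x100)); // back-to-back, no overlap
    assert!(copy_overlaps(0x1100, 0x1000, 0x200)); // src starts inside dst
}
```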

@ -1,13 +1,15 @@
use crate::api::icd::*;
use crate::api::types::DeleteContextCB;
use crate::api::util::checked_compare;
use crate::api::util::bit_check;
use crate::core::device::*;
use crate::core::format::*;
use crate::core::gl::*;
use crate::core::memory::*;
use crate::core::queue::*;
use crate::core::util::*;
use crate::impl_cl_type_trait;
use mesa_rust::pipe::context::RWFlags;
use mesa_rust::pipe::resource::*;
use mesa_rust::pipe::screen::ResourceType;
use mesa_rust_gen::*;
@ -19,15 +21,21 @@ use rusticl_opencl_gen::*;
use std::alloc;
use std::alloc::Layout;
use std::cmp::Ordering;
use std::cmp;
use std::collections::HashMap;
use std::convert::TryInto;
use std::ffi::c_int;
use std::mem;
use std::num::NonZeroU64;
use std::os::raw::c_void;
use std::ptr;
use std::slice;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::Weak;
use super::platform::Platform;
struct TrackedBDAAlloc {
buffer: Weak<Buffer>,
size: cl_mem_device_address_ext,
@ -39,6 +47,53 @@ impl AllocSize<cl_mem_device_address_ext> for TrackedBDAAlloc {
}
}
struct SVMAlloc {
layout: Layout,
vma: Option<NonZeroU64>,
alloc: Arc<Allocation>,
}
impl SVMAlloc {
pub fn size(&self) -> usize {
self.layout.size()
}
}
impl Drop for SVMAlloc {
fn drop(&mut self) {
if let Some(vma) = self.vma {
let address = vma.get() as usize as *mut c_void;
unsafe {
debug_assert_eq!(0, munmap(address, self.size()));
}
Platform::get()
.vm
.as_ref()
.unwrap()
.lock()
.unwrap()
.free(vma, NonZeroU64::new(self.size() as u64).unwrap());
} else {
// SAFETY: we make sure that svm_pointer is a valid allocation and reuse the same layout
// from the allocation
unsafe {
alloc::dealloc(self.alloc.host_ptr().cast(), self.layout);
}
}
}
}
impl AllocSize<usize> for SVMAlloc {
fn size(&self) -> usize {
SVMAlloc::size(self)
}
}
struct SVMContext {
svm_ptrs: TrackedPointers<usize, SVMAlloc>,
}
pub struct Context {
pub base: CLObjectBase<CL_INVALID_CONTEXT>,
pub devs: Vec<&'static Device>,
@ -48,7 +103,7 @@ pub struct Context {
bda_ptrs: Mutex<
HashMap<&'static Device, TrackedPointers<cl_mem_device_address_ext, TrackedBDAAlloc>>,
>,
svm_ptrs: Mutex<TrackedPointers<usize, Layout>>,
svm: Mutex<SVMContext>,
pub gl_ctx_manager: Option<GLCtxManager>,
}
@ -66,7 +121,9 @@ impl Context {
properties: properties,
dtors: Mutex::new(Vec::new()),
bda_ptrs: Mutex::new(HashMap::new()),
svm_ptrs: Mutex::new(TrackedPointers::new()),
svm: Mutex::new(SVMContext {
svm_ptrs: TrackedPointers::new(),
}),
gl_ctx_manager: gl_ctx_manager,
})
}
@ -207,13 +264,21 @@ impl Context {
}
pub fn has_svm_devs(&self) -> bool {
self.devs.iter().any(|dev| dev.svm_supported())
self.devs.iter().any(|dev| dev.api_svm_supported())
}
pub fn alloc_svm_ptr(&self, size: usize, alignment: usize) -> CLResult<*mut c_void> {
pub fn alloc_svm_ptr(
&self,
size: NonZeroU64,
mut alignment: NonZeroU64,
) -> CLResult<*mut c_void> {
// TODO: choose better alignment in regards to huge pages
alignment = cmp::max(alignment, NonZeroU64::new(0x1000).unwrap());
// clSVMAlloc will fail if alignment is not a power of two.
// `from_size_align()` verifies this condition is met.
let layout = Layout::from_size_align(size, alignment).or(Err(CL_INVALID_VALUE))?;
let layout = Layout::from_size_align(size.get() as usize, alignment.get() as usize)
.or(Err(CL_INVALID_VALUE))?;
// clSVMAlloc will fail if size is 0 or > CL_DEVICE_MAX_MEM_ALLOC_SIZE value
// for any device in context.
@ -222,38 +287,252 @@ impl Context {
// `from_size_align()` ensures that the allocation will fit in host memory,
// the maximum allocation may be smaller due to limitations from gallium or
// devices.
let size_aligned = layout.pad_to_align().size();
if size == 0 || checked_compare(size_aligned, Ordering::Greater, self.max_mem_alloc()) {
return Err(CL_INVALID_VALUE);
}
// let size_aligned = layout.pad_to_align().size();
// SAFETY: `size` is verified to be non-zero and the returned pointer is not
// expected to point to initialized memory.
let ptr = unsafe { alloc::alloc(layout) };
// allocate a vma if one of the devices doesn't support system SVM
let vma = if let Some(vm) = &Platform::get().vm {
Some(
vm.lock()
.unwrap()
.alloc(size, alignment)
.ok_or(CL_OUT_OF_RESOURCES)?,
)
} else {
None
};
let ptr: *mut c_void = if let Some(vma) = &vma {
let res = unsafe {
mmap(
vma.get() as usize as *mut c_void,
size.get() as usize,
(PROT_READ | PROT_WRITE) as c_int,
// MAP_FIXED_NOREPLACE needs 4.17
(MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE | MAP_NORESERVE) as c_int,
-1,
0,
)
};
// mmap returns MAP_FAILED on error which is -1
if res as usize == usize::MAX {
return Err(CL_OUT_OF_HOST_MEMORY);
}
res.cast()
} else {
unsafe { alloc::alloc(layout) }.cast()
};
if ptr.is_null() {
Err(CL_OUT_OF_HOST_MEMORY)
} else {
Ok(ptr.cast())
return Err(CL_OUT_OF_HOST_MEMORY);
}
let address = ptr as u64;
let mut buffers = HashMap::new();
for &dev in &self.devs {
let size: u32 = size.get().try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?;
// For system SVM devices we simply create a userptr resource.
let res = if dev.system_svm_supported() {
dev.screen()
.resource_create_buffer_from_user(size, ptr, PIPE_BIND_GLOBAL, 0)
} else {
dev.screen().resource_create_buffer(
size,
ResourceType::Normal,
PIPE_BIND_GLOBAL,
PIPE_RESOURCE_FLAG_FRONTEND_VM,
)
};
let res = res.ok_or(CL_OUT_OF_RESOURCES)?;
if !dev.system_svm_supported() {
if !dev.screen().resource_assign_vma(&res, address) {
return Err(CL_OUT_OF_RESOURCES);
}
}
buffers.insert(dev, Arc::new(res));
}
self.svm.lock().unwrap().svm_ptrs.insert(
ptr as usize,
SVMAlloc {
layout: layout,
vma,
alloc: Arc::new(Allocation::new(buffers, 0, ptr)),
},
);
Ok(ptr)
}
pub fn find_svm_alloc(&self, ptr: usize) -> Option<(*const c_void, usize)> {
self.svm_ptrs
pub fn copy_svm_to_dev(
&self,
ctx: &QueueContext,
ptr: usize,
) -> CLResult<Option<Arc<PipeResource>>> {
let svm = self.svm.lock().unwrap();
let Some(alloc) = svm.svm_ptrs.find_alloc_precise(ptr) else {
return Ok(None);
};
Ok(Some(Arc::clone(
alloc.alloc.get_res_for_access(ctx, RWFlags::RW)?,
)))
}
pub fn copy_svm_to_host(
&self,
ctx: &QueueContext,
svm_ptr: usize,
flags: cl_map_flags,
) -> CLResult<()> {
// no need to copy
if bit_check(flags, CL_MAP_WRITE_INVALIDATE_REGION) {
return Ok(());
}
let svm = self.svm.lock().unwrap();
let Some((_, alloc)) = svm.svm_ptrs.find_alloc(svm_ptr) else {
return Ok(());
};
alloc.alloc.migrate_to_hostptr(ctx, RWFlags::RW)
}
pub fn copy_svm(
&self,
ctx: &QueueContext,
src_addr: usize,
dst_addr: usize,
size: usize,
) -> CLResult<()> {
let svm = self.svm.lock().unwrap();
let src = svm.svm_ptrs.find_alloc(src_addr);
let dst = svm.svm_ptrs.find_alloc(dst_addr);
#[allow(clippy::collapsible_else_if)]
if let Some((src_base, src_alloc)) = src {
let src_res = src_alloc.alloc.get_res_for_access(ctx, RWFlags::RD)?;
let src_offset = src_addr - src_base;
if let Some((dst_base, dst_alloc)) = dst {
let dst_res = dst_alloc.alloc.get_res_for_access(ctx, RWFlags::WR)?;
let dst_offset = dst_addr - dst_base;
ctx.resource_copy_buffer(
src_res,
src_offset as i32,
dst_res,
dst_offset as u32,
size as i32,
);
} else {
let map = ctx
.buffer_map(src_res, src_offset as i32, size as i32, RWFlags::RD)
.ok_or(CL_OUT_OF_HOST_MEMORY)?;
unsafe {
ptr::copy_nonoverlapping(map.ptr(), dst_addr as *mut c_void, size);
}
}
} else {
if let Some((dst_base, dst_alloc)) = dst {
let dst_res = dst_alloc.alloc.get_res_for_access(ctx, RWFlags::WR)?;
let dst_offset = dst_addr - dst_base;
ctx.buffer_subdata(
dst_res,
dst_offset as u32,
src_addr as *const c_void,
size as u32,
);
} else {
unsafe {
ptr::copy(src_addr as *const c_void, dst_addr as *mut c_void, size);
}
}
}
Ok(())
}
pub fn clear_svm<const T: usize>(
&self,
ctx: &QueueContext,
svm_ptr: usize,
size: usize,
pattern: [u8; T],
) -> CLResult<()> {
let svm = self.svm.lock().unwrap();
if let Some((base, alloc)) = svm.svm_ptrs.find_alloc(svm_ptr) {
let res = alloc.alloc.get_res_for_access(ctx, RWFlags::WR)?;
let offset = svm_ptr - base;
ctx.clear_buffer(res, &pattern, offset as u32, size as u32);
} else {
let slice = unsafe {
slice::from_raw_parts_mut(svm_ptr as *mut _, size / mem::size_of_val(&pattern))
};
slice.fill(pattern);
}
Ok(())
}
pub fn migrate_svm(
&self,
ctx: &QueueContext,
pointers: Vec<usize>,
sizes: Vec<usize>,
to_device: bool,
content_undefined: bool,
) -> CLResult<()> {
let svm = self.svm.lock().unwrap();
if ctx.dev.system_svm_supported() {
ctx.svm_migrate(&pointers, &sizes, to_device, content_undefined);
} else {
for ptr in pointers {
let Some((_, alloc)) = svm.svm_ptrs.find_alloc(ptr) else {
continue;
};
// we assume it's only read, so it remains valid on the host until future commands
// have different needs.
if to_device {
alloc.alloc.get_res_for_access(ctx, RWFlags::RD)?;
} else {
alloc.alloc.migrate_to_hostptr(ctx, RWFlags::RD)?;
}
}
}
Ok(())
}
pub fn get_svm_alloc(&self, ptr: usize) -> Option<(*mut c_void, Arc<Allocation>)> {
self.svm
.lock()
.unwrap()
.svm_ptrs
.find_alloc(ptr)
.map(|(ptr, layout)| (ptr as *const c_void, layout.size()))
.map(|(base, alloc)| (base as *mut c_void, Arc::clone(&alloc.alloc)))
}
pub fn find_svm_alloc(&self, ptr: usize) -> Option<(*mut c_void, usize)> {
self.svm
.lock()
.unwrap()
.svm_ptrs
.find_alloc(ptr)
.map(|(ptr, alloc)| (ptr as _, alloc.size()))
}
pub fn remove_svm_ptr(&self, ptr: usize) {
if let Some(layout) = self.svm_ptrs.lock().unwrap().remove(ptr) {
// SAFETY: we make sure that svm_pointer is a valid allocation and reuse the same layout
// from the allocation
unsafe {
alloc::dealloc(ptr as *mut u8, layout);
}
}
self.svm.lock().unwrap().svm_ptrs.remove(ptr);
}
pub fn add_bda_ptr(&self, buffer: &Arc<Buffer>) {

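For devices without system SVM, `alloc_svm_ptr` above reserves a range out of the platform VM and then maps anonymous host memory at exactly that address, so CPU and GPU agree on pointer values; `SVMAlloc::drop` unmaps it again. Below is a minimal sketch of that mapping pattern, assuming Linux and the `libc` crate (the real code goes through rusticl's bindgen'd `mmap`/`munmap` bindings), with made-up helper names.

```rust
use std::ffi::c_void;

unsafe fn map_at_reserved_va(addr: u64, size: usize) -> Result<*mut c_void, ()> {
    let ptr = libc::mmap(
        addr as usize as *mut c_void,
        size,
        libc::PROT_READ | libc::PROT_WRITE,
        // MAP_FIXED_NOREPLACE (Linux 4.17+) fails instead of silently replacing
        // an existing mapping at the requested address.
        libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED_NOREPLACE | libc::MAP_NORESERVE,
        -1,
        0,
    );
    if ptr == libc::MAP_FAILED {
        return Err(());
    }
    Ok(ptr)
}

unsafe fn unmap_reserved_va(ptr: *mut c_void, size: usize) {
    // The CPU mapping goes away here; the VA range itself stays reserved until
    // it is handed back to the platform VM allocator.
    let ret = libc::munmap(ptr, size);
    debug_assert_eq!(0, ret);
}
```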

@ -27,6 +27,7 @@ use std::convert::TryInto;
use std::env;
use std::ffi::CStr;
use std::mem::transmute;
use std::num::NonZeroU64;
use std::os::raw::*;
use std::sync::Arc;
use std::sync::Mutex;
@ -104,6 +105,13 @@ pub trait HelperContextWrapper {
where
F: Fn(&HelperContext);
fn buffer_map(
&self,
res: &PipeResource,
offset: i32,
size: i32,
rw: RWFlags,
) -> Option<PipeTransfer>;
fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
fn delete_compute_state(&self, cso: *mut c_void);
fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;
@ -165,6 +173,16 @@ impl HelperContextWrapper for HelperContext<'_> {
self.lock.flush()
}
fn buffer_map(
&self,
res: &PipeResource,
offset: i32,
size: i32,
rw: RWFlags,
) -> Option<PipeTransfer> {
self.lock.buffer_map(res, offset, size, rw)
}
fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void {
self.lock.create_compute_state(nir, static_local_mem)
}
@ -1157,10 +1175,33 @@ impl Device {
&& (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders()))
}
pub fn svm_supported(&self) -> bool {
pub fn system_svm_supported(&self) -> bool {
self.screen.caps().system_svm
}
pub fn svm_supported(&self) -> bool {
if cfg!(not(target_pointer_width = "64")) {
return false;
}
self.system_svm_supported() || self.screen().is_vm_supported()
}
/// Checks if the device supports SVM _and_ that we were able to initialize SVM support on a
/// platform level.
pub fn api_svm_supported(&self) -> bool {
self.system_svm_supported()
|| (self.screen().is_vm_supported() && Platform::get().vm.is_some())
}
// returns (start, end)
pub fn vm_alloc_range(&self) -> Option<(NonZeroU64, NonZeroU64)> {
let min = self.screen.caps().min_vma;
let max = self.screen.caps().max_vma;
Some((NonZeroU64::new(min)?, NonZeroU64::new(max)?))
}
pub fn unified_memory(&self) -> bool {
self.screen.caps().uma
}


@ -524,6 +524,7 @@ pub struct Kernel {
pub name: String,
values: Mutex<Vec<Option<KernelArgValue>>>,
pub bdas: Mutex<Vec<cl_mem_device_address_ext>>,
pub svms: Mutex<HashSet<usize>>,
builds: HashMap<&'static Device, Arc<NirKernelBuilds>>,
pub kernel_info: Arc<KernelInfo>,
}
@ -1246,6 +1247,7 @@ impl Kernel {
name: name,
values: Mutex::new(values),
bdas: Mutex::new(Vec::new()),
svms: Mutex::new(HashSet::new()),
builds: builds,
kernel_info: kernel_info,
})
@ -1323,6 +1325,7 @@ impl Kernel {
let arg_values = self.arg_values().clone();
let nir_kernel_builds = Arc::clone(&self.builds[q.device]);
let mut bdas = self.bdas.lock().unwrap().clone();
let svms = self.svms.lock().unwrap().clone();
let mut buffer_arcs = HashMap::new();
let mut image_arcs = HashMap::new();
@ -1442,6 +1445,12 @@ impl Kernel {
printf_buf = Some(buf);
}
// translate SVM pointers to their base first
let mut svms: HashSet<_> = svms
.into_iter()
.filter_map(|svm_pointer| Some(q.context.find_svm_alloc(svm_pointer)?.0 as usize))
.collect();
for arg in &nir_kernel_build.compiled_args {
let is_opaque = if let CompiledKernelArgType::APIArg(idx) = arg.kind {
kernel_info.args[idx].kind.is_opaque()
@ -1485,6 +1494,9 @@ impl Kernel {
if let Some(address) = buffer.dev_address(ctx.dev) {
let _ = buffer.get_res_for_access(ctx, rw)?;
bdas.push(address.get());
} else if buffer.is_svm() {
let _ = buffer.get_res_for_access(ctx, rw)?;
svms.insert(buffer.host_ptr() as usize);
}
} else {
let res = buffer.get_res_for_access(ctx, rw)?;
@ -1498,6 +1510,11 @@ impl Kernel {
}
}
&KernelArgValue::SVM(handle) => {
// get the base address so we deduplicate properly
if let Some((base, _)) = q.context.find_svm_alloc(handle) {
svms.insert(base as usize);
}
if !api_arg.dead {
add_pointer(q, &mut input, handle as u64);
}
@ -1603,11 +1620,21 @@ impl Kernel {
.filter_map(|address| q.context.find_bda_alloc(q.device, address))
.collect::<HashSet<_>>();
let bdas: Vec<_> = bdas
let mut bdas: Vec<_> = bdas
.iter()
.map(|buffer| Ok(buffer.get_res_for_access(ctx, RWFlags::RW)?.deref()))
.collect::<CLResult<_>>()?;
let svms_new = svms
.into_iter()
.filter_map(|svm| q.context.copy_svm_to_dev(ctx, svm).transpose())
.collect::<CLResult<Vec<_>>>()?;
// uhhh
for svm in &svms_new {
bdas.push(svm);
}
// subtract the shader local_size as we only request something on top of that.
variable_local_size -= static_local_size;
@ -1819,7 +1846,7 @@ impl Kernel {
}
pub fn has_svm_devs(&self) -> bool {
self.prog.devs.iter().any(|dev| dev.svm_supported())
self.prog.devs.iter().any(|dev| dev.api_svm_supported())
}
pub fn subgroup_sizes(&self, dev: &Device) -> impl ExactSizeIterator<Item = usize> {
@ -1876,6 +1903,7 @@ impl Clone for Kernel {
name: self.name.clone(),
values: Mutex::new(self.arg_values().clone()),
bdas: Mutex::new(self.bdas.lock().unwrap().clone()),
svms: Mutex::new(self.svms.lock().unwrap().clone()),
builds: self.builds.clone(),
kernel_info: Arc::clone(&self.kernel_info),
}

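The launch path above first translates every known SVM pointer to its allocation base, so interior pointers into the same allocation only make the backing resource resident once. A small illustration of that dedup step; `find_base` is a stand-in for `Context::find_svm_alloc` and the addresses are made up.

```rust
use std::collections::HashSet;

fn dedup_to_bases(
    pointers: impl IntoIterator<Item = usize>,
    find_base: impl Fn(usize) -> Option<usize>,
) -> HashSet<usize> {
    // Pointers outside any tracked allocation are dropped; the rest collapse
    // to one entry per allocation base.
    pointers.into_iter().filter_map(|p| find_base(p)).collect()
}

fn main() {
    // Pretend there is a single SVM allocation covering [0x1000, 0x1100).
    let find_base = |p: usize| (0x1000..0x1100).contains(&p).then_some(0x1000);
    let bases = dedup_to_bases([0x1000, 0x1008, 0x10f0, 0x2000], find_base);
    assert_eq!(bases, HashSet::from([0x1000usize]));
}
```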

@ -398,11 +398,17 @@ pub struct SubAllocation {
offset: usize,
}
pub struct SvmAllocation {
alloc: Arc<Allocation>,
offset: usize,
}
/// Abstraction over the memory allocation. It might be a real GPU backing storage or simply a sub
/// allocation over an existing memory object.
enum Allocation {
pub enum Allocation {
Resource(ResourceAllocation),
SubAlloc(SubAllocation),
Svm(SvmAllocation),
}
// TODO: - Once it's used for more stuff might make sense to split it into an Image and Buffer
@ -455,6 +461,14 @@ impl Allocation {
})
}
fn new_svm(alloc: Arc<Allocation>, offset: usize) -> Self {
Self::Svm(SvmAllocation {
// we precalculate the entire offset here.
offset: alloc.offset() + offset,
alloc: alloc,
})
}
/// Returns true if the backing storage of the two objects is equal.
fn backing_resource_eq(&self, other: &Self) -> bool {
ptr::eq(self.get_real_resource(), other.get_real_resource())
@ -465,24 +479,26 @@ impl Allocation {
match self {
Allocation::SubAlloc(sub) => sub.mem.alloc.get_real_resource(),
Allocation::Resource(res) => res,
Allocation::Svm(svm) => svm.alloc.get_real_resource(),
}
}
/// Returns the resource associated with `dev` without any data migration.
fn get_res_of_dev(&self, dev: &Device) -> CLResult<&Arc<PipeResource>> {
self.get_real_resource()
.res
.get(dev)
.ok_or(CL_OUT_OF_HOST_MEMORY)
fn get_res_of_dev(&self, dev: &Device) -> Option<&Arc<PipeResource>> {
self.get_real_resource().res.get(dev)
}
/// Returns the resource associated with `ctx.dev` and transparently migrate the data.
fn get_res_for_access(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<&Arc<PipeResource>> {
pub fn get_res_for_access(
&self,
ctx: &QueueContext,
rw: RWFlags,
) -> CLResult<&Arc<PipeResource>> {
self.get_real_resource().get_res_for_access(ctx, rw)
}
/// Migrates the content to the host. Fails if there is no host ptr.
pub fn _migrate_to_hostptr(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<()> {
pub fn migrate_to_hostptr(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<()> {
self.get_real_resource().migrate_to_hostptr(ctx, rw)
}
@ -497,14 +513,20 @@ impl Allocation {
host_ptr as _
}
fn is_user_alloc_for_dev(&self, dev: &Device) -> CLResult<bool> {
Ok(self.get_res_of_dev(dev)?.is_user())
fn is_user_alloc_for_dev(&self, dev: &Device) -> bool {
if let Some(res) = self.get_res_of_dev(dev) {
res.is_user()
} else {
// for SVM allocations there might not even be a real resource
dev.system_svm_supported()
}
}
fn offset(&self) -> usize {
match self {
Allocation::Resource(res) => res.offset,
Allocation::SubAlloc(sub) => sub.offset,
Allocation::Svm(svm) => svm.offset,
}
}
}
@ -773,39 +795,71 @@ impl MemBase {
.copied()
== Some(CL_TRUE.into());
let res_type = if bit_check(flags, CL_MEM_ALLOC_HOST_PTR) {
ResourceType::Staging
// if it's a SVM host ptr, we just use the already allocated resource if it exists, because
// this is actually mandated by the spec. The size requirement will be checked inside the
// API layer.
//
// From the OpenCL spec:
// If clCreateBuffer or clCreateBufferWithProperties is called with a pointer returned
// clSVMAlloc as its host_ptr argument, and CL_MEM_USE_HOST_PTR is set in its flags
// argument, clCreateBuffer or clCreateBufferWithProperties will succeed and return
// valid non-zero buffer object as long as the size argument is no larger than the size
// argument passed in the original clSVMAlloc call. The new buffer object returned has the
// shared memory as the underlying storage.
let svm = bit_check(flags, CL_MEM_USE_HOST_PTR)
.then(|| context.get_svm_alloc(host_ptr as usize))
.flatten();
let alloc = if let Some((svm_ptr, ref svm_alloc)) = svm {
// SAFETY: svm_ptr is the base of the allocation host_ptr points into.
let offset = unsafe { host_ptr.byte_offset_from(svm_ptr) } as usize;
Allocation::new_svm(Arc::clone(svm_alloc), offset)
} else {
ResourceType::Normal
let res_type = if bit_check(flags, CL_MEM_ALLOC_HOST_PTR) {
ResourceType::Staging
} else {
ResourceType::Normal
};
let buffer = context.create_buffer(
size,
host_ptr,
bit_check(flags, CL_MEM_COPY_HOST_PTR),
bda,
res_type,
)?;
// We can only keep the host_ptr when `CL_MEM_USE_HOST_PTR` is set.
if !bit_check(flags, CL_MEM_USE_HOST_PTR) {
host_ptr = ptr::null_mut()
}
Allocation::new(buffer, 0, host_ptr)
};
let buffer = context.create_buffer(
size,
host_ptr,
bit_check(flags, CL_MEM_COPY_HOST_PTR),
bda,
res_type,
)?;
// We can only keep the host_ptr when `CL_MEM_USE_HOST_PTR` is set.
if !bit_check(flags, CL_MEM_USE_HOST_PTR) {
host_ptr = ptr::null_mut()
}
let addresses = bda.then(|| {
context
.devs
.iter()
.filter(|dev| dev.bda_supported())
.map(|&dev| {
let address = buffer[dev].resource_get_address();
// If the buffer is backed by an SVM allocation, we need to use its address.
let address = if let Some((address, _)) = svm {
NonZeroU64::new(address as usize as u64)
} else if let Some(res) = alloc.get_res_of_dev(dev) {
res.resource_get_address()
} else {
// if there is no resource, it's a system SVM allocation
assert!(dev.system_svm_supported());
NonZeroU64::new(alloc.host_ptr() as u64)
};
Some((dev, address?))
})
.collect::<Option<_>>()
.unwrap()
});
let alloc = Allocation::new(buffer, 0, host_ptr);
let buffer = Arc::new(Buffer {
base: Self {
base: CLObjectBase::new(RusticlTypes::Buffer),
@ -1102,6 +1156,9 @@ impl MemBase {
match &self.alloc {
Allocation::SubAlloc(sub) => Some(&sub.mem),
Allocation::Resource(_) => None,
// In theory the SVM allocation is the parent, but that's not a memory object on the API
// level.
Allocation::Svm(_) => None,
}
}
@ -1109,10 +1166,10 @@ impl MemBase {
self.alloc.host_ptr()
}
fn is_pure_user_memory(&self, d: &Device) -> CLResult<bool> {
fn is_pure_user_memory(&self, d: &Device) -> bool {
// 1Dbuffer objects are weird. The parent memory object can be a host_ptr thing, but we are
// not allowed to actually return a pointer based on the host_ptr when mapping.
Ok(self.alloc.is_user_alloc_for_dev(d)? && !self.host_ptr().is_null())
self.alloc.is_user_alloc_for_dev(d) && !self.host_ptr().is_null()
}
fn map<T>(
@ -1430,7 +1487,7 @@ impl Buffer {
// in this case we only need to migrate to the device if the data is located on a device not
// having a userptr allocation.
if self.is_pure_user_memory(ctx.dev)? {
if self.is_pure_user_memory(ctx.dev) {
let rw = if mapping.writes {
RWFlags::RW
} else {
@ -1446,7 +1503,7 @@ impl Buffer {
pub fn sync_unmap(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
// no need to update
if self.is_pure_user_memory(ctx.dev)? {
if self.is_pure_user_memory(ctx.dev) {
return Ok(());
}
@ -1904,7 +1961,7 @@ impl Image {
// in this case we only need to migrate to the device if the data is located on a device not
// having a userptr allocation.
if self.is_pure_user_memory(ctx.dev)? {
if self.is_pure_user_memory(ctx.dev) {
let rw = if mapping.writes {
RWFlags::RW
} else {
@ -1930,7 +1987,7 @@ impl Image {
pub fn sync_unmap(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
// no need to update
if self.is_pure_user_memory(ctx.dev)? {
if self.is_pure_user_memory(ctx.dev) {
return Ok(());
}

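When a `CL_MEM_USE_HOST_PTR` buffer is created on top of an SVM allocation, `Allocation::new_svm` above only needs the distance of `host_ptr` from the allocation base. A sketch of that offset computation, with plain heap memory standing in for the SVM region; the helper name is made up.

```rust
use std::ffi::c_void;

/// SAFETY (caller): both pointers belong to the same allocation and
/// host_ptr >= base, so the distance is non-negative and fits in usize.
unsafe fn offset_into_alloc(base: *const c_void, host_ptr: *const c_void) -> usize {
    host_ptr.byte_offset_from(base) as usize
}

fn main() {
    let storage = vec![0u8; 4096];
    let base = storage.as_ptr().cast::<c_void>();
    let host_ptr = storage[128..].as_ptr().cast::<c_void>();
    assert_eq!(unsafe { offset_into_alloc(base, host_ptr) }, 128);
}
```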

@ -3,25 +3,47 @@ use crate::api::icd::DISPATCH;
use crate::core::device::*;
use crate::core::version::*;
use mesa_rust::pipe::screen::ScreenVMAllocation;
use mesa_rust::util::vm::VM;
use mesa_rust_gen::*;
use mesa_rust_util::string::char_arr_to_cstr;
use rusticl_opencl_gen::*;
use std::cmp;
use std::env;
use std::num::NonZeroU64;
use std::ops::Deref;
use std::ptr;
use std::ptr::addr_of;
use std::ptr::addr_of_mut;
use std::sync::Mutex;
use std::sync::Once;
/// Maximum size a pixel can be across all supported image formats.
pub const MAX_PIXEL_SIZE_BYTES: u64 = 4 * 4;
pub struct PlatformVM<'a> {
vm: Mutex<VM>,
// we make use of the drop to automatically free the reserved VM
_dev_allocs: Vec<ScreenVMAllocation<'a>>,
}
impl Deref for PlatformVM<'_> {
type Target = Mutex<VM>;
fn deref(&self) -> &Self::Target {
&self.vm
}
}
#[repr(C)]
pub struct Platform {
dispatch: &'static cl_icd_dispatch,
pub devs: Vec<Device>,
pub extension_string: String,
pub extensions: Vec<cl_name_version>,
// lifetime has to match the one of devs
pub vm: Option<PlatformVM<'static>>,
}
pub enum PerfDebugLevel {
@ -57,6 +79,7 @@ static mut PLATFORM: Platform = Platform {
devs: Vec::new(),
extension_string: String::new(),
extensions: Vec::new(),
vm: None,
};
static mut PLATFORM_DBG: PlatformDebug = PlatformDebug {
allow_invalid_spirv: false,
@ -139,13 +162,56 @@ impl Platform {
unsafe { &*addr_of!(PLATFORM_FEATURES) }
}
fn init(&mut self) {
fn alloc_vm(devs: &[Device]) -> Option<PlatformVM<'_>> {
// We support buffer SVM only on 64 bit platforms
if cfg!(not(target_pointer_width = "64")) {
return None;
}
// No need to check system SVM devices
let devs = devs.iter().filter(|dev| !dev.system_svm_supported());
let (start, end) = devs.clone().filter_map(|dev| dev.vm_alloc_range()).reduce(
|(min_a, max_a), (min_b, max_b)| (cmp::max(min_a, min_b), cmp::min(max_a, max_b)),
)?;
// Allocate 1/8 of the available VM. No specific reason for this limit. Might have to bump
// this later, but it's probably fine as there is plenty of VM available.
let size = NonZeroU64::new((end.get() / 8).next_power_of_two())?;
if start > size {
return None;
}
let mut allocs = Vec::new();
for dev in devs {
allocs.push(dev.screen().alloc_vm(size, size)?);
}
Some(PlatformVM {
vm: Mutex::new(VM::new(size, size)),
_dev_allocs: allocs,
})
}
fn init(&'static mut self) {
unsafe {
glsl_type_singleton_init_or_ref();
}
self.devs = Device::all();
self.vm = Self::alloc_vm(&self.devs);
if self
.devs
.iter()
.any(|dev| !dev.system_svm_supported() && dev.svm_supported())
&& self.vm.is_none()
{
// TODO: in theory we should also remove the exposed SVM extension, but...
eprintln!("rusticl: could not initialize SVM support");
}
let mut exts_str: Vec<&str> = Vec::new();
let mut add_ext = |major, minor, patch, ext: &'static str| {
self.extensions

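`alloc_vm` above intersects the VA windows reported by every device that lacks system SVM and then reserves part of the common range on each of them. The sketch below covers just the intersection step (the real code additionally sizes the reservation to a power-of-two fraction of the window); names and example values are made up.

```rust
use std::num::NonZeroU64;

fn common_va_window(
    windows: impl IntoIterator<Item = (NonZeroU64, NonZeroU64)>,
) -> Option<(NonZeroU64, NonZeroU64)> {
    windows
        .into_iter()
        // Keep only addresses every device accepts: highest start, lowest end.
        .reduce(|(start_a, end_a), (start_b, end_b)| (start_a.max(start_b), end_a.min(end_b)))
        .filter(|(start, end)| start <= end)
}

fn main() {
    let dev_a = (NonZeroU64::new(0x1000).unwrap(), NonZeroU64::new(1 << 47).unwrap());
    let dev_b = (NonZeroU64::new(0x10000).unwrap(), NonZeroU64::new(1 << 40).unwrap());
    assert_eq!(common_va_window([dev_a, dev_b]), Some((dev_b.0, dev_b.1)));
}
```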

@ -282,7 +282,18 @@ impl PipeResource {
impl Drop for PipeResource {
fn drop(&mut self) {
unsafe { pipe_resource_reference(&mut self.pipe.as_ptr(), ptr::null_mut()) }
unsafe {
let pipe = self.pipe.as_ref();
let screen = pipe.screen.as_ref().unwrap();
if pipe.flags & PIPE_RESOURCE_FLAG_FRONTEND_VM != 0 {
if let Some(resource_assign_vma) = screen.resource_assign_vma {
resource_assign_vma(pipe.screen, self.pipe(), 0);
}
}
pipe_resource_reference(&mut self.pipe.as_ptr(), ptr::null_mut());
}
}
}


@ -308,6 +308,12 @@ rusticl_mesa_bindings = rust.bindgen(
'--allowlist-function', 'rusticl_.*',
'--allowlist-function', 'std(err|out)_ptr',
# libc
'--allowlist-function', 'mmap',
'--allowlist-function', 'munmap',
'--allowlist-var', 'MAP_.*',
'--allowlist-var', 'PROT_.*',
# winsys
'--allowlist-var', 'WINSYS_HANDLE_TYPE_.*',
],


@ -1,4 +1,5 @@
#include <stdio.h>
#include <sys/mman.h>
FILE *stdout_ptr(void);
FILE *stderr_ptr(void);


@ -1,6 +1,9 @@
use std::{
alloc::Layout,
collections::{btree_map::Entry, BTreeMap},
collections::{
btree_map::{Entry, Values, ValuesMut},
BTreeMap,
},
hash::{Hash, Hasher},
mem,
ops::{Add, Deref},
@ -175,6 +178,14 @@ impl<P, T: AllocSize<P>> TrackedPointers<P, T> {
ptrs: BTreeMap::new(),
}
}
pub fn values(&self) -> Values<'_, P, T> {
self.ptrs.values()
}
pub fn values_mut(&mut self) -> ValuesMut<'_, P, T> {
self.ptrs.values_mut()
}
}
impl<P, T: AllocSize<P>> TrackedPointers<P, T>
@ -201,6 +212,18 @@ where
None
}
pub fn find_alloc_mut(&mut self, ptr: P) -> Option<(P, &mut T)> {
if let Some((&base, val)) = self.ptrs.range_mut(..=ptr).next_back() {
let size = val.size();
// we check if ptr is within [base..base+size)
// means we can check if ptr - (base + size) < 0
if ptr < (base + size) {
return Some((base, val));
}
}
None
}
pub fn find_alloc_precise(&self, ptr: P) -> Option<&T> {
self.ptrs.get(&ptr)
}