mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 11:28:05 +02:00
rusticl/kernel: make use of cso info
Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19855>
This commit is contained in:
parent
c7dd3677dc
commit
ac993ae828
4 changed files with 76 additions and 50 deletions
|
|
@@ -89,11 +89,10 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
|
||||||
CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size),
|
CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size),
|
||||||
CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(&dev)),
|
CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(&dev)),
|
||||||
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
|
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
|
||||||
cl_prop::<usize>(dev.subgroups() as usize)
|
cl_prop::<usize>(kernel.preferred_simd_size(&dev))
|
||||||
}
|
}
|
||||||
CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(&dev)),
|
CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(&dev)),
|
||||||
// TODO
|
CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(&dev)),
|
||||||
CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(dev.subgroups() as usize),
|
|
||||||
// CL_INVALID_VALUE if param_name is not one of the supported values
|
// CL_INVALID_VALUE if param_name is not one of the supported values
|
||||||
_ => return Err(CL_INVALID_VALUE),
|
_ => return Err(CL_INVALID_VALUE),
|
||||||
})
|
})
|
||||||
|
|
|
||||||
|
|
@@ -75,6 +75,7 @@ pub trait HelperContextWrapper {
|
||||||
|
|
||||||
fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
|
fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
|
||||||
fn delete_compute_state(&self, cso: *mut c_void);
|
fn delete_compute_state(&self, cso: *mut c_void);
|
||||||
|
fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;
|
||||||
|
|
||||||
fn unmap(&self, tx: PipeTransfer);
|
fn unmap(&self, tx: PipeTransfer);
|
||||||
}
|
}
|
||||||
|
|
@@ -159,6 +160,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
|
||||||
self.lock.delete_compute_state(cso)
|
self.lock.delete_compute_state(cso)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
|
||||||
|
self.lock.compute_state_info(state)
|
||||||
|
}
|
||||||
|
|
||||||
fn unmap(&self, tx: PipeTransfer) {
|
fn unmap(&self, tx: PipeTransfer) {
|
||||||
tx.with_ctx(&self.lock);
|
tx.with_ctx(&self.lock);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -258,6 +258,7 @@ struct KernelDevStateInner {
|
||||||
nir: NirShader,
|
nir: NirShader,
|
||||||
constant_buffer: Option<Arc<PipeResource>>,
|
constant_buffer: Option<Arc<PipeResource>>,
|
||||||
cso: *mut c_void,
|
cso: *mut c_void,
|
||||||
|
info: pipe_compute_state_object_info,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct KernelDevState {
|
struct KernelDevState {
|
||||||
|
|
@@ -279,21 +280,25 @@ impl KernelDevState {
|
||||||
let states = nirs
|
let states = nirs
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(dev, nir)| {
|
.map(|(dev, nir)| {
|
||||||
let cso = if dev.shareable_shaders() {
|
let mut cso = dev
|
||||||
dev.helper_ctx()
|
.helper_ctx()
|
||||||
.create_compute_state(&nir, nir.shared_size())
|
.create_compute_state(&nir, nir.shared_size());
|
||||||
} else {
|
let info = dev.helper_ctx().compute_state_info(cso);
|
||||||
ptr::null_mut()
|
|
||||||
};
|
|
||||||
|
|
||||||
let cb = Self::create_nir_constant_buffer(&dev, &nir);
|
let cb = Self::create_nir_constant_buffer(&dev, &nir);
|
||||||
|
|
||||||
|
// if we can't share the cso between threads, destroy it now.
|
||||||
|
if !dev.shareable_shaders() {
|
||||||
|
dev.helper_ctx().delete_compute_state(cso);
|
||||||
|
cso = ptr::null_mut();
|
||||||
|
};
|
||||||
|
|
||||||
(
|
(
|
||||||
dev,
|
dev,
|
||||||
KernelDevStateInner {
|
KernelDevStateInner {
|
||||||
nir: nir,
|
nir: nir,
|
||||||
constant_buffer: cb,
|
constant_buffer: cb,
|
||||||
cso: cso,
|
cso: cso,
|
||||||
|
info: info,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|
@@ -829,44 +834,6 @@ fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] {
|
||||||
val.try_into().unwrap()
|
val.try_into().unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn optimize_local_size(d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
|
|
||||||
let mut threads = d.max_threads_per_block() as u32;
|
|
||||||
let dim_threads = d.max_block_sizes();
|
|
||||||
let subgroups = d.subgroups();
|
|
||||||
|
|
||||||
if !block.contains(&0) {
|
|
||||||
for i in 0..3 {
|
|
||||||
// we already made sure everything is fine
|
|
||||||
grid[i] /= block[i];
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for i in 0..3 {
|
|
||||||
let t = cmp::min(threads, dim_threads[i] as u32);
|
|
||||||
let gcd = gcd(t, grid[i]);
|
|
||||||
|
|
||||||
block[i] = gcd;
|
|
||||||
grid[i] /= gcd;
|
|
||||||
|
|
||||||
// update limits
|
|
||||||
threads /= block[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// if we didn't fill the subgroup we can do a bit better if we have threads remaining
|
|
||||||
let total_threads = block[0] * block[1] * block[2];
|
|
||||||
if threads != 1 && total_threads < subgroups {
|
|
||||||
for i in 0..3 {
|
|
||||||
if grid[i] * total_threads < threads {
|
|
||||||
block[i] *= grid[i];
|
|
||||||
grid[i] = 1;
|
|
||||||
// can only do it once as nothing is cleanly divisible
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Kernel {
|
impl Kernel {
|
||||||
pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
|
pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
|
||||||
let (mut nirs, args, internal_args, attributes_string) =
|
let (mut nirs, args, internal_args, attributes_string) =
|
||||||
|
|
@@ -895,6 +862,44 @@ impl Kernel {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
|
||||||
|
let mut threads = self.max_threads_per_block(d) as u32;
|
||||||
|
let dim_threads = d.max_block_sizes();
|
||||||
|
let subgroups = self.preferred_simd_size(d) as u32;
|
||||||
|
|
||||||
|
if !block.contains(&0) {
|
||||||
|
for i in 0..3 {
|
||||||
|
// we already made sure everything is fine
|
||||||
|
grid[i] /= block[i];
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 0..3 {
|
||||||
|
let t = cmp::min(threads, dim_threads[i] as u32);
|
||||||
|
let gcd = gcd(t, grid[i]);
|
||||||
|
|
||||||
|
block[i] = gcd;
|
||||||
|
grid[i] /= gcd;
|
||||||
|
|
||||||
|
// update limits
|
||||||
|
threads /= block[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we didn't fill the subgroup we can do a bit better if we have threads remaining
|
||||||
|
let total_threads = block[0] * block[1] * block[2];
|
||||||
|
if threads != 1 && total_threads < subgroups {
|
||||||
|
for i in 0..3 {
|
||||||
|
if grid[i] * total_threads < threads {
|
||||||
|
block[i] *= grid[i];
|
||||||
|
grid[i] = 1;
|
||||||
|
// can only do it once as nothing is cleanly divisible
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// the painful part is, that host threads are allowed to modify the kernel object once it was
|
// the painful part is, that host threads are allowed to modify the kernel object once it was
|
||||||
// enqueued, so return a closure with all req data included.
|
// enqueued, so return a closure with all req data included.
|
||||||
pub fn launch(
|
pub fn launch(
|
||||||
|
|
@@ -928,7 +933,7 @@ impl Kernel {
|
||||||
&[0; 4]
|
&[0; 4]
|
||||||
};
|
};
|
||||||
|
|
||||||
optimize_local_size(&q.device, &mut grid, &mut block);
|
self.optimize_local_size(&q.device, &mut grid, &mut block);
|
||||||
|
|
||||||
for (arg, val) in self.args.iter().zip(&self.values) {
|
for (arg, val) in self.args.iter().zip(&self.values) {
|
||||||
if arg.dead {
|
if arg.dead {
|
||||||
|
|
@@ -1225,7 +1230,15 @@ impl Kernel {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn priv_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
|
pub fn priv_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
|
||||||
self.dev_state.get(dev).nir.scratch_size() as cl_ulong
|
self.dev_state.get(dev).info.private_memory.into()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn max_threads_per_block(&self, dev: &Device) -> usize {
|
||||||
|
self.dev_state.get(dev).info.max_threads as usize
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn preferred_simd_size(&self, dev: &Device) -> usize {
|
||||||
|
self.dev_state.get(dev).info.preferred_simd_size as usize
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
|
pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
|
||||||
|
|
|
||||||
|
|
@@ -319,6 +319,14 @@ impl PipeContext {
|
||||||
unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
|
unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
|
||||||
|
let mut info = pipe_compute_state_object_info::default();
|
||||||
|
unsafe {
|
||||||
|
self.pipe.as_ref().get_compute_state_info.unwrap()(self.pipe.as_ptr(), state, &mut info)
|
||||||
|
}
|
||||||
|
info
|
||||||
|
}
|
||||||
|
|
||||||
pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
|
pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
|
||||||
unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
|
unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
|
||||||
}
|
}
|
||||||
|
|
@@ -530,6 +538,7 @@ fn has_required_cbs(context: &pipe_context) -> bool {
|
||||||
& has_required_feature!(context, delete_compute_state)
|
& has_required_feature!(context, delete_compute_state)
|
||||||
& has_required_feature!(context, delete_sampler_state)
|
& has_required_feature!(context, delete_sampler_state)
|
||||||
& has_required_feature!(context, flush)
|
& has_required_feature!(context, flush)
|
||||||
|
& has_required_feature!(context, get_compute_state_info)
|
||||||
& has_required_feature!(context, launch_grid)
|
& has_required_feature!(context, launch_grid)
|
||||||
& has_required_feature!(context, memory_barrier)
|
& has_required_feature!(context, memory_barrier)
|
||||||
& has_required_feature!(context, resource_copy_region)
|
& has_required_feature!(context, resource_copy_region)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue