diff --git a/src/gallium/frontends/rusticl/api/device.rs b/src/gallium/frontends/rusticl/api/device.rs index a5ecdc5bcbd..7c215b2cd89 100644 --- a/src/gallium/frontends/rusticl/api/device.rs +++ b/src/gallium/frontends/rusticl/api/device.rs @@ -139,8 +139,7 @@ impl CLInfo for cl_device_id { CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG => cl_prop::(1), CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT => cl_prop::(1), CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => cl_prop::(1), - // TODO - CL_DEVICE_PRINTF_BUFFER_SIZE => cl_prop::(0), + CL_DEVICE_PRINTF_BUFFER_SIZE => cl_prop::(dev.printf_buffer_size()), // TODO CL_DEVICE_PROFILING_TIMER_RESOLUTION => cl_prop::(0), CL_DEVICE_OPENCL_C_FEATURES => cl_prop::>(Vec::new()), diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs index e7d47612c3b..59295c2526e 100644 --- a/src/gallium/frontends/rusticl/core/device.rs +++ b/src/gallium/frontends/rusticl/core/device.rs @@ -265,6 +265,18 @@ impl Device { } } + if self.embedded { + // The minimum value for the EMBEDDED profile is 1 KB. + if self.printf_buffer_size() < 1024 { + res = CLVersion::Cl1_1; + } + } else { + // The minimum value for the FULL profile is 1 MB. + if self.printf_buffer_size() < 1024 * 1024 { + res = CLVersion::Cl1_1; + } + } + if !exts.contains(&"cl_khr_byte_addressable_store") || !exts.contains(&"cl_khr_global_int32_base_atomics") || !exts.contains(&"cl_khr_global_int32_extended_atomics") @@ -495,6 +507,10 @@ impl Device { ) as usize } + pub fn printf_buffer_size(&self) -> usize { + 1024 * 1024 + } + pub fn screen(&self) -> &Arc { &self.screen } diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs index 8d65119d880..79e460f0a6e 100644 --- a/src/gallium/frontends/rusticl/core/kernel.rs +++ b/src/gallium/frontends/rusticl/core/kernel.rs @@ -21,6 +21,7 @@ use std::collections::HashMap; use std::collections::HashSet; use std::convert::TryInto; use std::ptr; +use std::slice; use std::sync::Arc; // ugh, we are not allowed to take refs, so... @@ -46,6 +47,7 @@ pub enum KernelArgType { pub enum InternalKernelArgType { ConstantBuffer, GlobalWorkOffsets, + PrintfBuffer, } #[derive(Clone)] @@ -156,7 +158,7 @@ where // mostly like clc_spirv_to_dxil // does not DCEe uniforms or images! -fn lower_and_optimize_nir_pre_inputs(nir: &mut NirShader, lib_clc: &NirShader) { +fn lower_and_optimize_nir_pre_inputs(dev: &Device, nir: &mut NirShader, lib_clc: &NirShader) { nir.set_workgroup_size_variable_if_zero(); nir.structurize(); while { @@ -212,7 +214,12 @@ fn lower_and_optimize_nir_pre_inputs(nir: &mut NirShader, lib_clc: &NirShader) { nir_variable_mode::nir_var_function_temp, Some(glsl_get_cl_type_size_align), ); - // TODO printf + + let mut printf_opts = nir_lower_printf_options::default(); + printf_opts.set_treat_doubles_as_floats(false); + printf_opts.max_buffer_size = dev.printf_buffer_size() as u32; + nir.pass1(nir_lower_printf, &printf_opts); + nir.pass0(nir_split_var_copies); nir.pass0(nir_opt_copy_prop_vars); nir.pass0(nir_lower_var_copies); @@ -255,6 +262,7 @@ fn lower_and_optimize_nir_late( Some(glsl_get_cl_type_size_align), ); nir.extract_constant_initializers(); + // TODO printf // TODO 32 bit devices // add vars for global offsets @@ -282,6 +290,19 @@ fn lower_and_optimize_nir_late( "constant_buffer_addr", ); } + if nir.has_printf() { + res.push(InternalKernelArg { + kind: InternalKernelArgType::PrintfBuffer, + offset: 0, + size: 8, + }); + lower_state.printf_buf = nir.add_var( + nir_variable_mode::nir_var_uniform, + unsafe { glsl_uint64_t_type() }, + args + res.len() - 1, + "printf_buffer_addr", + ); + } nir.pass2( nir_lower_vars_to_explicit_types, @@ -332,6 +353,14 @@ fn lower_and_optimize_nir_late( res } +fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] { + let val; + (val, *buf) = (*buf).split_at(S); + // we split of 4 bytes and convert to [u8; 4], so this should be safe + // use split_array_ref once it's stable + val.try_into().unwrap() +} + impl Kernel { pub fn new( name: String, @@ -340,7 +369,7 @@ impl Kernel { args: Vec, ) -> Arc { nirs.iter_mut() - .for_each(|(d, n)| lower_and_optimize_nir_pre_inputs(n, &d.lib_clc)); + .for_each(|(d, n)| lower_and_optimize_nir_pre_inputs(d, n, &d.lib_clc)); let nir = nirs.values_mut().next().unwrap(); let wgs = nir.workgroup_size(); let work_group_size = [wgs[0] as usize, wgs[1] as usize, wgs[2] as usize]; @@ -374,7 +403,7 @@ impl Kernel { // the painful part is, that host threads are allowed to modify the kernel object once it was // enqueued, so return a closure with all req data included. pub fn launch( - &self, + self: &Arc, q: &Arc, work_dim: u32, block: &[usize], @@ -388,6 +417,7 @@ impl Kernel { let mut input: Vec = Vec::new(); let mut resource_info = Vec::new(); let mut local_size: u32 = nir.shared_size(); + let printf_size = q.device.printf_buffer_size() as u32; for i in 0..3 { if block[i] == 0 { @@ -424,6 +454,7 @@ impl Kernel { } } + let mut printf_buf = None; for arg in &self.internal_args { input.append(&mut vec![0; arg.offset - input.len()]); match arg.kind { @@ -447,30 +478,72 @@ impl Kernel { InternalKernelArgType::GlobalWorkOffsets => { input.extend_from_slice(&cl_prop::<[u64; 3]>(offsets)); } + InternalKernelArgType::PrintfBuffer => { + let buf = + Arc::new(q.device.screen.resource_create_buffer(printf_size).unwrap()); + + input.extend_from_slice(&[0; 8]); + resource_info.push((Some(buf.clone()), arg.offset)); + + printf_buf = Some(buf); + } } } - let cso = q - .device - .helper_ctx() - .create_compute_state(nir, input.len() as u32, local_size); - - Box::new(move |_, ctx| { + let k = self.clone(); + Box::new(move |q, ctx| { + let nir = k.nirs.get(&q.device).unwrap(); let mut input = input.clone(); let mut resources = Vec::with_capacity(resource_info.len()); let mut globals: Vec<*mut u32> = Vec::new(); + let printf_format = nir.printf_format(); + let printf_buf = printf_buf.clone(); for (res, offset) in resource_info.clone() { resources.push(res); globals.push(unsafe { input.as_mut_ptr().add(offset) }.cast()); } + if let Some(printf_buf) = &printf_buf { + let init_data: [u8; 1] = [4]; + ctx.buffer_subdata( + printf_buf, + 0, + init_data.as_ptr().cast(), + init_data.len() as u32, + ); + } + let cso = ctx.create_compute_state(nir, input.len() as u32, local_size); + ctx.bind_compute_state(cso); ctx.set_global_binding(resources.as_slice(), &mut globals); ctx.launch_grid(work_dim, block, grid, &input); ctx.clear_global_binding(globals.len() as u32); ctx.delete_compute_state(cso); ctx.memory_barrier(PIPE_BARRIER_GLOBAL_BUFFER); + + if let Some(printf_buf) = &printf_buf { + let tx = ctx + .buffer_map(printf_buf, 0, printf_size as i32, true) + .with_ctx(ctx); + let mut buf: &[u8] = + unsafe { slice::from_raw_parts(tx.ptr().cast(), printf_size as usize) }; + let length = u32::from_ne_bytes(*extract(&mut buf)); + + // update our slice to make sure we don't go out of bounds + buf = &buf[0..(length - 4) as usize]; + + unsafe { + u_printf( + stdout, + buf.as_ptr().cast(), + buf.len(), + printf_format.as_ptr(), + printf_format.len() as u32, + ); + } + } + Ok(()) }) } diff --git a/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs b/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs index c337117da05..1230565ee3a 100644 --- a/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs +++ b/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs @@ -189,6 +189,7 @@ impl SPIRVBin { kernel: true, kernel_image: true, linkage: true, + printf: true, ..Default::default() }, diff --git a/src/gallium/frontends/rusticl/mesa/compiler/nir.rs b/src/gallium/frontends/rusticl/mesa/compiler/nir.rs index 7737c23687b..ec11e9f2331 100644 --- a/src/gallium/frontends/rusticl/mesa/compiler/nir.rs +++ b/src/gallium/frontends/rusticl/mesa/compiler/nir.rs @@ -199,6 +199,23 @@ impl NirShader { } } + pub fn has_printf(&self) -> bool { + unsafe { + !self.nir.as_ref().printf_info.is_null() && self.nir.as_ref().printf_info_count != 0 + } + } + + pub fn printf_format(&self) -> &[u_printf_info] { + if self.has_printf() { + unsafe { + let nir = self.nir.as_ref(); + slice::from_raw_parts(nir.printf_info, nir.printf_info_count as usize) + } + } else { + &[] + } + } + pub fn get_constant_buffer(&self) -> &[u8] { unsafe { let nir = self.nir.as_ref(); diff --git a/src/gallium/frontends/rusticl/mesa/pipe/screen.rs b/src/gallium/frontends/rusticl/mesa/pipe/screen.rs index 60155e1f53c..1b546a00535 100644 --- a/src/gallium/frontends/rusticl/mesa/pipe/screen.rs +++ b/src/gallium/frontends/rusticl/mesa/pipe/screen.rs @@ -91,6 +91,7 @@ impl PipeScreen { tmpl.height0 = 1; tmpl.depth0 = 1; tmpl.array_size = 1; + tmpl.bind = PIPE_BIND_GLOBAL; PipeResource::new(unsafe { (*self.screen).resource_create.unwrap()(self.screen, &tmpl) }) } @@ -107,6 +108,7 @@ impl PipeScreen { tmpl.height0 = 1; tmpl.depth0 = 1; tmpl.array_size = 1; + tmpl.bind = PIPE_BIND_GLOBAL; PipeResource::new(unsafe { (*self.screen).resource_from_user_memory.unwrap()(self.screen, &tmpl, mem) @@ -184,7 +186,7 @@ impl PipeScreen { let s = &mut unsafe { *self.screen }; if let Some(func) = s.finalize_nir { unsafe { - func(s, nir.get_nir().cast()); + func(self.screen, nir.get_nir().cast()); } } } diff --git a/src/gallium/frontends/rusticl/meson.build b/src/gallium/frontends/rusticl/meson.build index 44d2c853f18..d746e7c223b 100644 --- a/src/gallium/frontends/rusticl/meson.build +++ b/src/gallium/frontends/rusticl/meson.build @@ -193,12 +193,13 @@ rusticl_mesa_bindings_rs = rust.bindgen( rusticl_bindgen_args, '--whitelist-function', 'clc_.*', '--whitelist-function', 'glsl_.*', + '--whitelist-function', 'malloc', '--whitelist-function', 'nir_.*', '--whitelist-function', 'pipe_.*', '--whitelist-function', 'rusticl_.*', '--whitelist-function', 'rz?alloc_.*', '--whitelist-function', 'spirv_.*', - '--whitelist-function', 'malloc', + '--whitelist-function', 'u_.*', '--whitelist-type', 'pipe_endian', '--whitelist-type', 'clc_kernel_arg_access_qualifier', '--bitfield-enum', 'clc_kernel_arg_access_qualifier', @@ -210,6 +211,7 @@ rusticl_mesa_bindings_rs = rust.bindgen( '--whitelist-var', 'PIPE_.*', '--bitfield-enum', 'pipe_map_flags', '--allowlist-var', 'stderr', + '--allowlist-var', 'stdout', '--bitfield-enum', 'nir_lower_int64_options', ], ) diff --git a/src/gallium/frontends/rusticl/rusticl_mesa_bindings.h b/src/gallium/frontends/rusticl/rusticl_mesa_bindings.h index ad4ff8290ee..5d40f17bc9f 100644 --- a/src/gallium/frontends/rusticl/rusticl_mesa_bindings.h +++ b/src/gallium/frontends/rusticl/rusticl_mesa_bindings.h @@ -10,4 +10,6 @@ #include "pipe/p_state.h" #include "pipe-loader/pipe_loader.h" +#include "util/u_printf.h" + #include "rusticl_nir.h" diff --git a/src/gallium/frontends/rusticl/rusticl_nir.c b/src/gallium/frontends/rusticl/rusticl_nir.c index 973b2151e91..3485f8c6507 100644 --- a/src/gallium/frontends/rusticl/rusticl_nir.c +++ b/src/gallium/frontends/rusticl/rusticl_nir.c @@ -23,6 +23,8 @@ rusticl_lower_intrinsics_instr( return nir_load_var(b, state->base_global_invoc_id); case nir_intrinsic_load_constant_base_ptr: return nir_load_var(b, state->const_buf); + case nir_intrinsic_load_printf_buffer_address: + return nir_load_var(b, state->printf_buf); default: return NULL; } diff --git a/src/gallium/frontends/rusticl/rusticl_nir.h b/src/gallium/frontends/rusticl/rusticl_nir.h index f34e91dd248..78ea550e955 100644 --- a/src/gallium/frontends/rusticl/rusticl_nir.h +++ b/src/gallium/frontends/rusticl/rusticl_nir.h @@ -1,6 +1,7 @@ struct rusticl_lower_state { nir_variable *base_global_invoc_id; nir_variable *const_buf; + nir_variable *printf_buf; }; bool rusticl_lower_intrinsics(nir_shader *nir, struct rusticl_lower_state *state);