From 43ea91b10d480e9dd9d563842da8cfb9a9663a6a Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 8 Nov 2025 14:23:02 +0100 Subject: [PATCH 01/12] rusticl/queue: fix error code for invalid queue properties part 1 Cc: mesa-stable --- src/gallium/frontends/rusticl/api/queue.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/frontends/rusticl/api/queue.rs b/src/gallium/frontends/rusticl/api/queue.rs index fb38fa9c768..04327f523b0 100644 --- a/src/gallium/frontends/rusticl/api/queue.rs +++ b/src/gallium/frontends/rusticl/api/queue.rs @@ -135,7 +135,7 @@ fn create_command_queue_with_properties( let d = Device::ref_from_raw(device)?; // SAFETY: properties is a 0 terminated array by spec. - let properties = unsafe { Properties::new(properties) }.ok_or(CL_INVALID_PROPERTY)?; + let properties = unsafe { Properties::new(properties) }.ok_or(CL_INVALID_VALUE)?; for (&key, &val) in properties.iter() { match u32::try_from(key).or(Err(CL_INVALID_PROPERTY))? { CL_QUEUE_PROPERTIES => queue_properties = val, @@ -152,7 +152,7 @@ fn create_command_queue_with_properties( // CL_INVALID_QUEUE_PROPERTIES if values specified in properties are valid but are not // supported by the device. 
CL_QUEUE_SIZE => return Err(CL_INVALID_QUEUE_PROPERTIES), - _ => return Err(CL_INVALID_PROPERTY), + _ => return Err(CL_INVALID_VALUE), } } From ccb3a06f7ac8f91a22b40d8ff7cbe70913d21742 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 8 Nov 2025 14:24:01 +0100 Subject: [PATCH 02/12] rusticl/queue: fix error code for invalid queue properties part 2 Fixes: 2c202eb7870 ("rusticl: verify validity of property names and values") --- src/gallium/frontends/rusticl/api/queue.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/frontends/rusticl/api/queue.rs b/src/gallium/frontends/rusticl/api/queue.rs index 04327f523b0..9b4e8114f1b 100644 --- a/src/gallium/frontends/rusticl/api/queue.rs +++ b/src/gallium/frontends/rusticl/api/queue.rs @@ -137,7 +137,7 @@ fn create_command_queue_with_properties( // SAFETY: properties is a 0 terminated array by spec. let properties = unsafe { Properties::new(properties) }.ok_or(CL_INVALID_VALUE)?; for (&key, &val) in properties.iter() { - match u32::try_from(key).or(Err(CL_INVALID_PROPERTY))? { + match u32::try_from(key).or(Err(CL_INVALID_VALUE))? 
{ CL_QUEUE_PROPERTIES => queue_properties = val, CL_QUEUE_PRIORITY_KHR if d.context_priority_supported() != 0 => { let valid_props: cl_queue_properties = (CL_QUEUE_PRIORITY_LOW_KHR From 315024248aab8de402d351c28bee802cb8a4a120 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 8 Nov 2025 14:29:47 +0100 Subject: [PATCH 03/12] rusticl/queue: fix error code for invalid sampler kernel arg Fixes: 5795ee0e083 ("rusticl: translate spirv to nir and first steps to kernel arg handling") --- src/gallium/frontends/rusticl/api/kernel.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/gallium/frontends/rusticl/api/kernel.rs b/src/gallium/frontends/rusticl/api/kernel.rs index b3571cf25be..164d0aa1c11 100644 --- a/src/gallium/frontends/rusticl/api/kernel.rs +++ b/src/gallium/frontends/rusticl/api/kernel.rs @@ -416,13 +416,18 @@ fn set_kernel_arg( return Err(CL_INVALID_ARG_VALUE); } } - // If the argument is of type sampler_t, the arg_value entry must be a pointer to the - // sampler object. - KernelArgType::Constant(_) | KernelArgType::Sampler => { + KernelArgType::Constant(_) => { if arg_value.is_null() { return Err(CL_INVALID_ARG_VALUE); } } + KernelArgType::Sampler => { + // CL_INVALID_SAMPLER for an argument declared to be of type sampler_t when the + // specified arg_value is not a valid sampler object. 
+ if arg_value.is_null() { + return Err(CL_INVALID_SAMPLER); + } + } _ => {} }; From 04ef78b786c67bd2a19b6b0abdc187db70432e32 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 8 Nov 2025 16:15:07 +0100 Subject: [PATCH 04/12] rusticl/kernel: take no kernel_info reference inside the launch closure Otherwise patterns like this wouldn't work: clCreateKernel(prog) clEnqueueNDRangeKernel clReleaseKernel clBuildProgram(prog) Fixes: bb2453c6495 ("rusticl/kernel: move most of the code in launch inside the closure") --- src/gallium/frontends/rusticl/core/kernel.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs index 748bd3c2143..aa1ce195242 100644 --- a/src/gallium/frontends/rusticl/core/kernel.rs +++ b/src/gallium/frontends/rusticl/core/kernel.rs @@ -1501,7 +1501,8 @@ impl Kernel { offsets: &[usize], ) -> CLResult { // Clone all the data we need to execute this kernel - let kernel_info = Arc::clone(&self.kernel_info); + let work_group_size_hint = self.kernel_info.work_group_size_hint; + let args = self.kernel_info.args.clone(); let arg_values = self.values.clone(); let nir_kernel_builds = Arc::clone(&self.builds[q.device]); let mut bdas = self.bdas.clone(); @@ -1548,8 +1549,7 @@ impl Kernel { && grid[0] <= hw_max_grid[0] && grid[1] <= hw_max_grid[1] && grid[2] <= hw_max_grid[2] - && (kernel_info.work_group_size_hint == [0; 3] - || block == kernel_info.work_group_size_hint) + && (work_group_size_hint == [0; 3] || block == work_group_size_hint) { NirKernelVariant::Optimized } else { @@ -1603,7 +1603,7 @@ impl Kernel { for arg in &nir_kernel_build.compiled_args { let is_opaque = if let CompiledKernelArgType::APIArg(idx) = arg.kind { - kernel_info.args[idx].kind.is_opaque() + args[idx].kind.is_opaque() } else { false }; @@ -1614,7 +1614,7 @@ impl Kernel { match arg.kind { CompiledKernelArgType::APIArg(idx) => { - let api_arg = &kernel_info.args[idx]; + 
let api_arg = &args[idx]; let Some(value) = &arg_values[idx] else { continue; }; From 97de88fa2ab7c919cf690013c57a83b973cedcf2 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 9 Nov 2025 00:30:06 +0100 Subject: [PATCH 05/12] rusticl/spirv: preserve signed zeroes by default Cc: mesa-stable --- src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs b/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs index 3482bb045e3..9f84f67d4e9 100644 --- a/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs +++ b/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs @@ -303,13 +303,13 @@ impl SPIRVBin { private_data: ptr::from_mut(log).cast(), }); + let float_controls = float_controls::FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32 as u32 + | float_controls::FLOAT_CONTROLS_SIGNED_ZERO_PRESERVE as u32; spirv_to_nir_options { create_library: library, environment: nir_spirv_execution_environment::NIR_SPIRV_OPENCL, clc_shader: clc_shader, - float_controls_execution_mode: float_controls::FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32 - as u32, - + float_controls_execution_mode: float_controls, printf: true, capabilities: caps, constant_addr_format: global_addr_format, From 9807cc078e075e30155e29cb908e6cd4efd6a768 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 19 Oct 2025 15:57:14 +0200 Subject: [PATCH 06/12] glsl_types: include compiler/builtin_types.h Apparently CUDA has a builtin_types.h file and it can conflict with ours... 
Reviewed-by: Adam Jackson --- src/compiler/glsl_types.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h index 85028726622..abf804ebd7c 100644 --- a/src/compiler/glsl_types.h +++ b/src/compiler/glsl_types.h @@ -29,6 +29,7 @@ #include #include +#include "compiler/builtin_types.h" #include "shader_enums.h" #include "c11/threads.h" #include "util/blob.h" From 834a50a3ebcca582b34efc8d90ad33e3d3ba38f9 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 11 Oct 2025 17:32:04 +0200 Subject: [PATCH 07/12] nir/lower_system_values: add num_subgroups and subgroup_id lowering Nvidia hardware doesn't have those in hardware. NAK currently lowers them itself. Reviewed-by: Adam Jackson --- src/compiler/nir/nir.h | 2 ++ src/compiler/nir/nir_lower_system_values.c | 24 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 05b9a4c76e0..b962c16025d 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -5560,6 +5560,8 @@ typedef struct nir_lower_compute_system_values_options { bool lower_local_invocation_index : 1; bool lower_cs_local_id_to_index : 1; bool lower_workgroup_id_to_index : 1; + bool lower_num_subgroups : 1; + bool lower_subgroup_id : 1; bool global_id_is_32bit : 1; /* At shader execution time, check if WorkGroupId should be 1D * and compute it quickly. Fall back to slow computation if not.
diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c index 61d5282db80..f00ee581bce 100644 --- a/src/compiler/nir/nir_lower_system_values.c +++ b/src/compiler/nir/nir_lower_system_values.c @@ -794,6 +794,30 @@ lower_compute_system_value_instr(nir_builder *b, case nir_intrinsic_load_shader_index: return nir_imm_int(b, b->shader->info.cs.shader_index); + case nir_intrinsic_load_num_subgroups: { + if (!options || !options->lower_num_subgroups) + return NULL; + + nir_def *group_size = nir_load_workgroup_size(b); + nir_def *threads = nir_imul(b, nir_channel(b, group_size, 0), + nir_channel(b, group_size, 1)); + threads = nir_imul(b, threads, nir_channel(b, group_size, 2)); + + /* DIV_ROUND_UP(A, B) = ((A + B - 1) / B) */ + nir_def *subgroup_size = nir_load_subgroup_size(b); + nir_def *subgroup_size_m1 = nir_iadd_imm(b, subgroup_size, -1); + nir_def *numerator = nir_iadd(b, threads, subgroup_size_m1); + return nir_udiv(b, numerator, subgroup_size); + } + + case nir_intrinsic_load_subgroup_id: { + if (!options || !options->lower_subgroup_id) + return NULL; + + nir_def *tid = nir_load_local_invocation_index(b); + return nir_udiv(b, tid, nir_load_subgroup_size(b)); + } + default: return NULL; } From 8e8a6d1e09cbfdbd0b215d550fe81d9efe4771ad Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 4 Sep 2025 10:29:20 +0200 Subject: [PATCH 08/12] gallium: add SUBGROUP_FEATURE bits for rotate and rotate_clustered The bit values are taken from Vulkan to make it easy for Zink. Those new subgroup features will be used by rusticl for cl_khr_subgroup_rotate. 
Reviewed-by: Adam Jackson --- src/gallium/include/pipe/p_defines.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index ae0fdba1a9c..ced85b506e3 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -698,6 +698,14 @@ enum pipe_conservative_raster_mode #define PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED (1 << 6) #define PIPE_SHADER_SUBGROUP_FEATURE_QUAD (1 << 7) #define PIPE_SHADER_SUBGROUP_NUM_FEATURES 8 +/* VK_SUBGROUP_FEATURE_ROTATE_BIT */ +#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE (1 << 9) +/* VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT */ +#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED (1 << 10) +#define PIPE_SHADER_SUBGROUP_FEATURE_MASK \ + (BITFIELD_MASK(PIPE_SHADER_SUBGROUP_NUM_FEATURES) | \ + PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | \ + PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED) enum pipe_point_size_lower_mode { PIPE_POINT_SIZE_LOWER_ALWAYS, From 2f7c79cf5ff08d152ef83430607affd93ef34bc0 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 3 Sep 2025 21:52:08 +0200 Subject: [PATCH 09/12] clc: handle all optional subgroup extensions Reviewed-by: Alyssa Rosenzweig Reviewed-by: Adam Jackson --- src/compiler/clc/clc.h | 8 +++++++- src/compiler/clc/clc_helpers.cpp | 24 +++++++++++++++++++++--- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/compiler/clc/clc.h b/src/compiler/clc/clc.h index 95ef4dd759b..73588a31ad5 100644 --- a/src/compiler/clc/clc.h +++ b/src/compiler/clc/clc.h @@ -72,9 +72,15 @@ struct clc_optional_features { * progress */ bool subgroups_ifp; + bool subgroups_ballot; + bool subgroups_clustered; + bool subgroups_extended_types; + bool subgroups_named_barrier; + bool subgroups_non_uniform_arithmetic; + bool subgroups_non_uniform_vote; + bool subgroups_rotate; bool subgroups_shuffle; bool subgroups_shuffle_relative; - bool subgroups_ballot; }; struct clc_compile_args { diff --git 
a/src/compiler/clc/clc_helpers.cpp b/src/compiler/clc/clc_helpers.cpp index 8e69ef845a1..9912f9b0256 100644 --- a/src/compiler/clc/clc_helpers.cpp +++ b/src/compiler/clc/clc_helpers.cpp @@ -1024,15 +1024,33 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx, } if (args->features.subgroups) { c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_subgroups"); + if (args->features.subgroups_ballot) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1"); + } + if (args->features.subgroups_clustered) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_clustered_reduce=1"); + } + if (args->features.subgroups_extended_types) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_extended_types=1"); + } + if (args->features.subgroups_named_barrier) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_named_barrier=1"); + } + if (args->features.subgroups_non_uniform_arithmetic) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_arithmetic=1"); + } + if (args->features.subgroups_non_uniform_vote) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_vote=1"); + } + if (args->features.subgroups_rotate) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_rotate=1"); + } if (args->features.subgroups_shuffle) { c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle=1"); } if (args->features.subgroups_shuffle_relative) { c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle_relative=1"); } - if (args->features.subgroups_ballot) { - c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1"); - } } if (args->features.subgroups_ifp) { assert(args->features.subgroups); From cb5e5222877b2c9c577ca8e4e5da11e36a91f2db Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 3 Sep 2025 21:55:48 +0200 Subject: [PATCH 10/12] rusticl: properly check for subgroup support Also add zink to the features.txt entries. 
Reviewed-by: Alyssa Rosenzweig Reviewed-by: Adam Jackson --- docs/features.txt | 6 ++-- src/gallium/frontends/rusticl/core/device.rs | 36 ++++++++++++++++---- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index 08763314c91..5cea523628c 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -814,7 +814,7 @@ Rusticl Optional OpenCL 2.x Features: Device and host timer synchronization DONE (freedreno, iris, llvmpipe, radeonsi, zink) OpenCL C 2.0 in progress - Memory Consistency Model (atomics) not started - - Sub-groups DONE (iris, llvmpipe, radeonsi, asahi) + - Sub-groups DONE (asahi, iris, llvmpipe, radeonsi, zink) - Work-group Collective Functions not started - Generic Address Space in progress cl_khr_il_program DONE @@ -877,8 +877,8 @@ Rusticl extensions: cl_khr_subgroup_non_uniform_arithmetic not started cl_khr_subgroup_non_uniform_vote not started cl_khr_subgroup_rotate not started - cl_khr_subgroup_shuffle DONE (iris, llvmpipe, radeonsi, asahi) - cl_khr_subgroup_shuffle_relative DONE (iris, llvmpipe, radeonsi, asahi) + cl_khr_subgroup_shuffle DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_shuffle_relative DONE (asahi, iris, llvmpipe, radeonsi, zink) cl_khr_subgroups in progress cl_khr_suggested_local_work_size DONE cl_khr_terminate_context not started diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs index f1b8671e589..9239a89a287 100644 --- a/src/gallium/frontends/rusticl/core/device.rs +++ b/src/gallium/frontends/rusticl/core/device.rs @@ -741,17 +741,22 @@ impl DeviceBase { } if self.subgroups_supported() { - add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle); - add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative); add_cap(SpvCapability::SpvCapabilityGroups); add_cap(SpvCapability::SpvCapabilitySubgroupDispatch); // requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS //add_ext(1, 0, 0, 
"cl_khr_subgroups"); add_feat(1, 0, 0, "__opencl_c_subgroups"); - // we have lowering in `nir_lower_subgroups`, drivers can just use that - add_ext(1, 0, 0, "cl_khr_subgroup_shuffle"); - add_ext(1, 0, 0, "cl_khr_subgroup_shuffle_relative"); + if self.subgroup_shuffle_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle); + add_ext(1, 0, 0, "cl_khr_subgroup_shuffle"); + } + + if self.subgroup_shuffle_relative_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative); + add_ext(1, 0, 0, "cl_khr_subgroup_shuffle_relative"); + } + if self.intel_subgroups_supported() { // add_cap(SpvCapability::SpvCapabilitySubgroupBufferBlockIOINTEL); // add_cap(SpvCapability::SpvCapabilitySubgroupImageBlockIOINTEL); @@ -1174,6 +1179,23 @@ impl DeviceBase { // supported, doing it without shareable shaders isn't practical self.max_subgroups() > 0 && (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders())) + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_BASIC + != 0 + } + + pub fn subgroup_shuffle_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE + != 0 + } + + pub fn subgroup_shuffle_relative_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE_RELATIVE + != 0 } pub fn system_svm_supported(&self) -> bool { @@ -1256,8 +1278,8 @@ impl DeviceBase { intel_subgroups: self.intel_subgroups_supported(), kernel_clock: self.kernel_clock_supported(), subgroups: subgroups_supported, - subgroups_shuffle: subgroups_supported, - subgroups_shuffle_relative: subgroups_supported, + subgroups_shuffle: self.subgroup_shuffle_supported(), + subgroups_shuffle_relative: self.subgroup_shuffle_relative_supported(), ..Default::default() } } From d1870c2212e38427dad9adc54308fce74b58ab63 Mon Sep 
17 00:00:00 2001 From: Karol Herbst Date: Wed, 3 Sep 2025 21:57:31 +0200 Subject: [PATCH 11/12] rusticl: support more subgroup extensions Reviewed-by: Alyssa Rosenzweig Reviewed-by: Adam Jackson --- docs/features.txt | 12 ++-- src/gallium/frontends/rusticl/core/device.rs | 69 ++++++++++++++++++++ 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index 5cea523628c..6f6adea8b60 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -870,13 +870,13 @@ Rusticl extensions: cl_khr_spirv_no_integer_wrap_decoration DONE cl_khr_spirv_queries DONE cl_khr_srgb_image_writes not started - cl_khr_subgroup_ballot not started - cl_khr_subgroup_clustered_reduce not started - cl_khr_subgroup_extended_types not started + cl_khr_subgroup_ballot DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_clustered_reduce DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_extended_types DONE (asahi, iris, llvmpipe, radeonsi, zink) cl_khr_subgroup_named_barrier not started - cl_khr_subgroup_non_uniform_arithmetic not started - cl_khr_subgroup_non_uniform_vote not started - cl_khr_subgroup_rotate not started + cl_khr_subgroup_non_uniform_arithmetic DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_non_uniform_vote DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_rotate DONE (iris, llvmpipe, zink) cl_khr_subgroup_shuffle DONE (asahi, iris, llvmpipe, radeonsi, zink) cl_khr_subgroup_shuffle_relative DONE (asahi, iris, llvmpipe, radeonsi, zink) cl_khr_subgroups in progress diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs index 9239a89a287..dac45f8f7f5 100644 --- a/src/gallium/frontends/rusticl/core/device.rs +++ b/src/gallium/frontends/rusticl/core/device.rs @@ -745,8 +745,36 @@ impl DeviceBase { add_cap(SpvCapability::SpvCapabilitySubgroupDispatch); // requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS //add_ext(1, 0, 0, 
"cl_khr_subgroups"); + add_ext(1, 0, 0, "cl_khr_subgroup_extended_types"); add_feat(1, 0, 0, "__opencl_c_subgroups"); + if self.subgroup_ballot_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformBallot); + add_ext(1, 0, 0, "cl_khr_subgroup_ballot"); + } + + if self.subgroup_clustered_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformClustered); + add_ext(1, 0, 0, "cl_khr_subgroup_clustered_reduce"); + } + + if self.subgroup_non_uniform_arithmetic_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformArithmetic); + add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_arithmetic"); + } + + if self.subgroup_non_uniform_vote_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniform); + add_cap(SpvCapability::SpvCapabilityGroupNonUniformVote); + add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_vote"); + } + + if self.subgroup_rotate_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformRotateKHR); + add_ext(1, 0, 0, "cl_khr_subgroup_rotate"); + add_spirv(c"SPV_KHR_subgroup_rotate"); + } + if self.subgroup_shuffle_supported() { add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle); add_ext(1, 0, 0, "cl_khr_subgroup_shuffle"); @@ -1184,6 +1212,41 @@ impl DeviceBase { != 0 } + pub fn subgroup_ballot_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_BALLOT + != 0 + } + + pub fn subgroup_clustered_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED + != 0 + } + + pub fn subgroup_non_uniform_arithmetic_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_ARITHMETIC + != 0 + } + + pub fn subgroup_non_uniform_vote_supported(&self) -> bool { + self.subgroups_supported() + && 
self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_VOTE + != 0 + } + + pub fn subgroup_rotate_supported(&self) -> bool { + let mask = + PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED; + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features & mask == mask + } + pub fn subgroup_shuffle_supported(&self) -> bool { self.subgroups_supported() && self.screen().caps().shader_subgroup_supported_features @@ -1278,6 +1341,12 @@ impl DeviceBase { intel_subgroups: self.intel_subgroups_supported(), kernel_clock: self.kernel_clock_supported(), subgroups: subgroups_supported, + subgroups_ballot: self.subgroup_ballot_supported(), + subgroups_clustered: self.subgroup_clustered_supported(), + subgroups_extended_types: subgroups_supported, + subgroups_non_uniform_arithmetic: self.subgroup_non_uniform_arithmetic_supported(), + subgroups_non_uniform_vote: self.subgroup_non_uniform_vote_supported(), + subgroups_rotate: self.subgroup_rotate_supported(), subgroups_shuffle: self.subgroup_shuffle_supported(), subgroups_shuffle_relative: self.subgroup_shuffle_relative_supported(), ..Default::default() From a18a4750069169b6c9fa39a627380dfb71a3b10d Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 4 Oct 2025 19:05:10 +0200 Subject: [PATCH 12/12] nocl: add it Acked-by: Alyssa Rosenzweig --- docs/drivers/nocl.rst | 13 + docs/envvars.rst | 23 + docs/features.txt | 26 +- docs/index.rst | 1 + meson.build | 13 +- meson.options | 2 +- src/compiler/meson.build | 2 +- .../auxiliary/pipe-loader/pipe_loader.c | 3 + .../auxiliary/pipe-loader/pipe_loader.h | 5 + .../drivers/nocl/compiler/cbindgen.toml | 12 + src/gallium/drivers/nocl/compiler/lib.rs | 1191 +++++++++++++++++ src/gallium/drivers/nocl/compiler/meson.build | 44 + src/gallium/drivers/nocl/meson.build | 54 + src/gallium/drivers/nocl/nocl_nir.c | 824 ++++++++++++ .../drivers/nocl/nocl_nir_algebraic.py | 65 + 
src/gallium/drivers/nocl/nocl_pipe.c | 1027 ++++++++++++++ src/gallium/drivers/nocl/nocl_private.h | 147 ++ .../frontends/rusticl/mesa/pipe/device.rs | 1 + src/gallium/meson.build | 6 + src/gallium/targets/rusticl/meson.build | 1 + src/gallium/winsys/nocl/meson.build | 25 + src/gallium/winsys/nocl/nocl_cuda.c | 212 +++ src/gallium/winsys/nocl/nocl_cuda_public.h | 73 + 23 files changed, 3754 insertions(+), 16 deletions(-) create mode 100644 docs/drivers/nocl.rst create mode 100644 src/gallium/drivers/nocl/compiler/cbindgen.toml create mode 100644 src/gallium/drivers/nocl/compiler/lib.rs create mode 100644 src/gallium/drivers/nocl/compiler/meson.build create mode 100644 src/gallium/drivers/nocl/meson.build create mode 100644 src/gallium/drivers/nocl/nocl_nir.c create mode 100644 src/gallium/drivers/nocl/nocl_nir_algebraic.py create mode 100644 src/gallium/drivers/nocl/nocl_pipe.c create mode 100644 src/gallium/drivers/nocl/nocl_private.h create mode 100644 src/gallium/winsys/nocl/meson.build create mode 100644 src/gallium/winsys/nocl/nocl_cuda.c create mode 100644 src/gallium/winsys/nocl/nocl_cuda_public.h diff --git a/docs/drivers/nocl.rst b/docs/drivers/nocl.rst new file mode 100644 index 00000000000..885dbfc45e5 --- /dev/null +++ b/docs/drivers/nocl.rst @@ -0,0 +1,13 @@ +NoCL +==== + +Overview +-------- + +The NoCL driver is a Gallium driver that targets the CUDA driver API and +therefore can be used to layer OpenCL on top of the NVIDIA proprietary driver. + +The minimum supported CUDA API version is 6.5. + +It translates NIR to PTX and makes use of the CUDA internal JIT compiler to +generate device code. diff --git a/docs/envvars.rst b/docs/envvars.rst index 6fbf8e6bc03..9952db963fb 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -2147,6 +2147,29 @@ Freedreno driver environment variables Other Gallium drivers have their own environment variables. These may change frequently so the source code should be consulted for details. 
+NoCL driver environment variables +--------------------------------- + +.. envvar:: NOCL_DEBUG + + Debug flags for the NoCL driver. + +.. envvar:: NOCL_API_TARGET + + Assumes a lower CUDA Driver version is used than what's loaded at runtime. + Useful to ensure that code runs on older Driver versions. For development + only. + +.. envvar:: NOCL_PTX_VERSION + + Overrides the PTX target version with the specified value. Can be used to + ensure code compiles successfully on various CUDA versions. For development + only. Specified without the dot. + +.. envvar:: NOCL_SM_TARGET + + Overrides the PTX target SM level with the specified value. Can be used to + ensure code compiles successfully on various GPUs. For development only. Vulkan loader environment variables ----------------------------------- diff --git a/docs/features.txt b/docs/features.txt index 6f6adea8b60..15860805104 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -814,7 +814,7 @@ Rusticl Optional OpenCL 2.x Features: Device and host timer synchronization DONE (freedreno, iris, llvmpipe, radeonsi, zink) OpenCL C 2.0 in progress - Memory Consistency Model (atomics) not started - - Sub-groups DONE (asahi, iris, llvmpipe, radeonsi, zink) + - Sub-groups DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) - Work-group Collective Functions not started - Generic Address Space in progress cl_khr_il_program DONE @@ -834,7 +834,7 @@ Rusticl extensions: cl_khr_d3d10_sharing not started cl_khr_d3d11_sharing not started cl_khr_device_enqueue_local_arg_types not started - cl_khr_device_uuid DONE (freedreno, iris, llvmpipe, radeonsi, zink) + cl_khr_device_uuid DONE (freedreno, iris, llvmpipe, nocl, radeonsi, zink) cl_khr_dx9_media_sharing not started cl_khr_egl_event not started cl_khr_egl_image not started @@ -849,7 +849,7 @@ Rusticl extensions: cl_khr_external_semaphore_opaque_fd not started cl_khr_external_semaphore_sync_fd DONE (radeonsi, zink) cl_khr_external_semaphore_win32 not started - cl_khr_fp16 DONE 
(asahi, freedreno, llvmpipe, panfrost, radeonsi, zink) + cl_khr_fp16 DONE (asahi, freedreno, llvmpipe, nocl, panfrost, radeonsi, zink) cl_khr_gl_depth_images not started cl_khr_gl_event not started cl_khr_gl_msaa_sharing not started @@ -859,10 +859,10 @@ Rusticl extensions: cl_khr_int64_base_atomics not started cl_khr_int64_extended_atomics not started cl_khr_integer_dot_product DONE - cl_khr_kernel_clock DONE (freedreno, iris, llvmpipe, nvc0, panfrost, radeonsi, zink, needs llvm-19) + cl_khr_kernel_clock DONE (freedreno, iris, llvmpipe, nocl, nvc0, panfrost, radeonsi, zink, needs llvm-19) cl_khr_mipmap_image not started cl_khr_mipmap_image_writes not started - cl_khr_pci_bus_info DONE (iris, nvc0, radeonsi, zink) + cl_khr_pci_bus_info DONE (iris, nocl, nvc0, radeonsi, zink) cl_khr_priority_hints DONE (asahi, freedreno, iris, panfrost, radeonsi) cl_khr_semaphore DONE (radeonsi, zink) cl_khr_spirv_extended_debug_info not started @@ -870,21 +870,21 @@ Rusticl extensions: cl_khr_spirv_no_integer_wrap_decoration DONE cl_khr_spirv_queries DONE cl_khr_srgb_image_writes not started - cl_khr_subgroup_ballot DONE (asahi, iris, llvmpipe, radeonsi, zink) - cl_khr_subgroup_clustered_reduce DONE (asahi, iris, llvmpipe, radeonsi, zink) - cl_khr_subgroup_extended_types DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_ballot DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) + cl_khr_subgroup_clustered_reduce DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) + cl_khr_subgroup_extended_types DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) cl_khr_subgroup_named_barrier not started - cl_khr_subgroup_non_uniform_arithmetic DONE (asahi, iris, llvmpipe, radeonsi, zink) - cl_khr_subgroup_non_uniform_vote DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_non_uniform_arithmetic DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) + cl_khr_subgroup_non_uniform_vote DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) cl_khr_subgroup_rotate DONE (iris, llvmpipe, 
zink) - cl_khr_subgroup_shuffle DONE (asahi, iris, llvmpipe, radeonsi, zink) - cl_khr_subgroup_shuffle_relative DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_shuffle DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) + cl_khr_subgroup_shuffle_relative DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) cl_khr_subgroups in progress cl_khr_suggested_local_work_size DONE cl_khr_terminate_context not started cl_khr_throttle_hints not started cl_khr_work_group_uniform_arithmetic not started - cl_ext_buffer_device_address DONE (iris, llvmpipe, radeonsi, zink) + cl_ext_buffer_device_address DONE (iris, llvmpipe, nocl, radeonsi, zink) cl_ext_cxx_for_opencl not started cl_ext_device_fission not started cl_ext_float_atomics not started diff --git a/docs/index.rst b/docs/index.rst index 36b6acd2192..161ffcd36f0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -85,6 +85,7 @@ Linux, FreeBSD, and other operating systems. drivers/freedreno drivers/lima drivers/llvmpipe + drivers/nocl drivers/nvk drivers/panfrost drivers/powervr diff --git a/meson.build b/meson.build index f241ba862cd..bb7750154a3 100644 --- a/meson.build +++ b/meson.build @@ -215,6 +215,7 @@ with_gallium_d3d12 = gallium_drivers.contains('d3d12') with_gallium_asahi = gallium_drivers.contains('asahi') with_gallium_rocket = gallium_drivers.contains('rocket') with_gallium_ethosu = gallium_drivers.contains('ethosu') +with_gallium_nocl = gallium_drivers.contains('nocl') foreach gallium_driver : gallium_drivers pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper()) endforeach @@ -754,7 +755,7 @@ if with_gallium_rusticl endif with_virtgpu_kumquat = get_option('virtgpu_kumquat') and with_gfxstream_vk -if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat +if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat or with_gallium_nocl # rust.bindgen() does not pass `--rust-target` to bindgen until 1.7.0. 
if meson.version().version_compare('< 1.7.0') error('Mesa Rust support requires Meson 1.7.0 or newer') @@ -1600,6 +1601,16 @@ elif with_shader_cache error('Shader Cache requires compression') endif +if with_gallium_nocl + dep_libcuda = dependency( + 'cuda', + # By default meson searches for the cudart, which we don't want to use. We want to use the + # CUDA driver library. + modules : ['cuda'], + version : '>= 6.5', + ) +endif + if host_machine.system() == 'windows' # For MSVC and MinGW we aren't using pthreads, and dependency('threads') will add linkage # to pthread for MinGW, so leave the dependency null_dep for Windows. For Windows linking to diff --git a/meson.options b/meson.options index 75731475c12..43ca2c7c367 100644 --- a/meson.options +++ b/meson.options @@ -87,7 +87,7 @@ option( choices : [ 'all', 'auto', 'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris', - 'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi', + 'lima', 'llvmpipe', 'nocl', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi', 'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink', ], description : 'List of gallium drivers to build. 
If this is set to auto ' + diff --git a/src/compiler/meson.build b/src/compiler/meson.build index 92e72a44f9a..8742b8ce533 100644 --- a/src/compiler/meson.build +++ b/src/compiler/meson.build @@ -72,6 +72,6 @@ if with_gallium endif subdir('isaspec') -if with_nouveau_vk +if with_nouveau_vk or with_gallium_nocl subdir('rust') endif diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.c b/src/gallium/auxiliary/pipe-loader/pipe_loader.c index 1f54a3b8b84..56fc578909d 100644 --- a/src/gallium/auxiliary/pipe-loader/pipe_loader.c +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.c @@ -44,6 +44,9 @@ static int (*backends[])(struct pipe_loader_device **, int) = { #ifdef HAVE_LIBDRM &pipe_loader_drm_probe, +#endif +#ifdef HAVE_NOCL + &pipe_loader_cuda_probe, #endif &pipe_loader_sw_probe }; diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.h b/src/gallium/auxiliary/pipe-loader/pipe_loader.h index eda16a71692..9f04e40094e 100644 --- a/src/gallium/auxiliary/pipe-loader/pipe_loader.h +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.h @@ -215,6 +215,11 @@ pipe_loader_sw_probe_wrapped(struct pipe_loader_device **dev, int pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev); +#ifdef HAVE_NOCL +int +pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev); +#endif + /** * Get a list of known DRM accel devices. * diff --git a/src/gallium/drivers/nocl/compiler/cbindgen.toml b/src/gallium/drivers/nocl/compiler/cbindgen.toml new file mode 100644 index 00000000000..82b8a9dc0e9 --- /dev/null +++ b/src/gallium/drivers/nocl/compiler/cbindgen.toml @@ -0,0 +1,12 @@ +language = "C" + +includes = ["nir.h"] +autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. 
*/" +include_guard = "NIR_TO_PTX_H" +usize_is_size_t = true +style = "tag" + +[export] +include = ["NirToPtxOutput"] +prefix = "" +renaming_overrides_prefixing = true diff --git a/src/gallium/drivers/nocl/compiler/lib.rs b/src/gallium/drivers/nocl/compiler/lib.rs new file mode 100644 index 00000000000..a0015561192 --- /dev/null +++ b/src/gallium/drivers/nocl/compiler/lib.rs @@ -0,0 +1,1191 @@ +// Copyright © 2025 Karol Herbst +// SPDX-License-Identifier: MIT + +#![allow(non_upper_case_globals)] + +use std::{cmp, collections::HashMap, ffi::CStr, str::Utf8Error}; + +use compiler::{bindings::*, nir::AsDef}; + +#[repr(C)] +pub struct NirToPtxOptions { + pub target_ptx: u16, + pub target_sm: u16, +} + +#[repr(C)] +pub struct NirToPtxOutput { + pub ptx: *mut u8, + pub ptx_len: usize, + pub ptx_capacity: usize, +} + +struct Reg { + index: u32, + bit_size: u8, +} + +impl Reg { + fn src_string(&self) -> String { + def_string_raw_prefixed(self.index, 1, 0, "") + } +} + +struct NirToPtxState<'n> { + nir: &'n nir_shader, + options: NirToPtxOptions, + vars: Vec, + block: Vec, + lines: Vec, + regs: HashMap, +} + +fn opcode(alu: &nir_alu_instr) -> Option<(&'static str, &'static str)> { + let alu_op = alu.op; + let def_bit_size = alu.def.bit_size; + let src_bit_size = alu.get_src(0).bit_size(); + + // Not all ops support predicates, so let's be specific about it here. 
+ Some(if src_bit_size == 1 { + match alu_op { + nir_op_iand => ("and", "pred"), + nir_op_ine => ("xor", "pred"), + nir_op_inot => ("not", "pred"), + nir_op_ior => ("or", "pred"), + nir_op_ixor => ("xor", "pred"), + _ => return None, + } + } else if def_bit_size == 1 { + match alu_op { + nir_op_feq => ("setp.eq", "f"), + nir_op_fequ => ("setp.equ", "f"), + nir_op_fisfinite => ("testp.finite", "f"), + nir_op_fneo => ("setp.ne", "f"), + nir_op_fneu => ("setp.neu", "f"), + nir_op_fge => ("setp.ge", "f"), + nir_op_fgeu => ("setp.geu", "f"), + nir_op_flt => ("setp.lt", "f"), + nir_op_fltu => ("setp.ltu", "f"), + nir_op_ieq => ("setp.eq", "b"), + nir_op_ine => ("setp.ne", "b"), + nir_op_ige => ("setp.ge", "s"), + nir_op_ilt => ("setp.lt", "s"), + nir_op_uge => ("setp.ge", "u"), + nir_op_ult => ("setp.lt", "u"), + _ => return None, + } + } else { + match alu_op { + nir_op_bit_count => ("popc", "b"), + nir_op_bitfield_reverse => ("brev", "b"), + nir_op_f2f16 => ("cvt.rn", "f16.f"), + nir_op_f2f16_rtz => ("cvt.rz", "f16.f"), + nir_op_f2f32 => ("cvt", "f32.f"), + nir_op_fabs => ("abs", "f"), + // .rn prevents contraction + nir_op_fadd => ("add.rn", "f"), + nir_op_fdiv => ("div.approx", "f"), + nir_op_ffma => ("fma.rn", "f"), + nir_op_fmax => ("max", "f"), + nir_op_fmin => ("min", "f"), + // .rn prevents contraction + nir_op_fmul => ("mul.rn", "f"), + nir_op_fneg => ("neg", "f"), + nir_op_frcp => ("rcp.approx", "f"), + nir_op_frsq => ("rsqrt.approx", "f"), + nir_op_fsqrt => ("sqrt.approx", "f"), + nir_op_fsub => ("sub", "f"), + nir_op_i2i8 => ("cvt", "s8.s"), + nir_op_i2i16 => ("cvt", "s16.s"), + nir_op_i2i32 => ("cvt", "s32.s"), + nir_op_i2i64 => ("cvt", "s64.s"), + nir_op_iabs => ("abs", "s"), + nir_op_iadd => ("add", "s"), + nir_op_iadd_sat => ("add.sat", "s"), + nir_op_iand => ("and", "b"), + nir_op_ibitfield_extract => ("bfe", "s"), + nir_op_idiv => ("div", "s"), + nir_op_ieq => ("set.eq", "b"), + nir_op_imad => ("mad.lo", "s"), + nir_op_imax => ("max", "s"), + 
nir_op_imin => ("min", "s"), + nir_op_imul => ("mul.lo", "s"), + nir_op_imul24 => ("mul24.lo", "s"), + nir_op_imul_high => ("mul.hi", "s"), + nir_op_ineg => ("neg", "s"), + nir_op_inot => ("not", "b"), + nir_op_ior => ("or", "b"), + nir_op_irem => ("rem", "s"), + nir_op_ishl => ("shl", "b"), + nir_op_ishr => ("shr", "s"), + nir_op_isub => ("sub", "s"), + nir_op_isub_sat => ("sub.sat", "s"), + nir_op_ixor => ("xor", "b"), + nir_op_sdot_4x8_iadd => ("dp4a.s32", "s"), + nir_op_sudot_4x8_iadd => ("dp4a.s32", "u"), + nir_op_u2u8 => ("cvt", "u8.u"), + nir_op_u2u16 => ("cvt", "u16.u"), + nir_op_u2u32 => ("cvt", "u32.u"), + nir_op_u2u64 => ("cvt", "u64.u"), + nir_op_ubitfield_extract => ("bfe", "u"), + nir_op_udiv => ("div", "u"), + nir_op_udot_4x8_uadd => ("dp4a", "u32.u"), + nir_op_ufind_msb => ("bfind", "u"), + nir_op_umad24 => ("mad24.lo", "u"), + nir_op_umax => ("max", "u"), + nir_op_umin => ("min", "u"), + nir_op_umod => ("rem", "u"), + nir_op_umul24 => ("mul24.lo", "u"), + nir_op_umul_2x32_64 => ("mul.wide", "u"), + nir_op_umul_high => ("mul.hi", "u"), + nir_op_ushr => ("shr", "u"), + _ => return None, + } + }) +} + +fn bit_size_string(bit_size: u8) -> &'static str { + match bit_size { + 1 => "", + 8 => "8", + 16 => "16", + 32 => "32", + 64 => "64", + _ => unreachable!("unsupported bit_size"), + } +} + +fn def_string(def: &nir_def, comp: u8) -> String { + def_string_prefixed(def, comp, "") +} + +fn def_string_raw_prefixed(index: u32, num_components: u8, comp: u8, prefix: &str) -> String { + let swizzle = swizzle_reg(comp, num_components); + format!("%r{index}{prefix}{swizzle}") +} + +fn def_string_prefixed(def: &nir_def, comp: u8, prefix: &str) -> String { + def_string_raw_prefixed(def.index, def.num_components, comp, prefix) +} + +fn def_var_string_raw(index: u32, bit_size: u8, num_components: u8, prefix: &str) -> String { + let vec = vec_string(num_components); + + if bit_size == 1 { + format!("\t.reg {vec} .pred %r{index}{prefix};") + } else { + format!("\t.reg 
{vec} .b{bit_size} %r{index}{prefix};") + } +} + +fn src_string(src: &nir_alu_src, comp: u8) -> String { + def_string(src.src.as_def(), src.swizzle[usize::from(comp)]) +} + +fn mem_from_intrinsic(intrin: &nir_intrinsic_instr) -> &'static str { + match intrin.intrinsic { + nir_intrinsic_load_global_constant => "global", + nir_intrinsic_load_global | nir_intrinsic_store_global => "global", + nir_intrinsic_load_shared | nir_intrinsic_store_shared => "shared", + nir_intrinsic_load_scratch | nir_intrinsic_store_scratch => "local", + _ => unreachable!("intrinsic not a memory operation!"), + } +} + +fn swizzle_reg(swizzle: u8, num_components: u8) -> &'static str { + if num_components == 1 { + "" + } else { + match swizzle { + 0 => ".x", + 1 => ".y", + 2 => ".z", + 3 => ".w", + _ => unreachable!("unsupported swizzle"), + } + } +} + +fn vec_string(num_components: u8) -> &'static str { + match num_components { + 1 => "", + 2 => ".v2", + 3 | 4 => ".v4", + 8 => ".v8", + _ => unreachable!("unsupported vec size {num_components}"), + } +} + +impl NirToPtxState<'_> { + fn finalize(self) -> NirToPtxOutput { + // Until `String::into_raw_parts` is stabilized, this string needs to be turned + // into a `Vec` before calling `as_mut_ptr`. `String` does not implement + // `as_mut_ptr` itself but relies on `Deref`. + // By going through a slice the resulting pointer only has access rights to the + // portion of the string containing the actual string data, that is in the range + // `0..len`. The pointer does not have access rights to the over-allocated part + // in the range `len..capacity` though. + // Full access rights are later needed to reconstruct the string soundly under + // Stacked Borrows rules. `Vec` does implement `as_mut_ptr` directly specifically + // to avoid this problem. 
+ let mut ptx = self.lines.join("\n").into_bytes(); + ptx.push(b'\0'); + + let mut md_ptx = std::mem::ManuallyDrop::new(ptx); + let (ptr, len, capacity) = (md_ptx.as_mut_ptr(), md_ptx.len(), md_ptx.capacity()); + let result = NirToPtxOutput { + ptx: ptr, + ptx_len: len, + ptx_capacity: capacity, + }; + + result + } + + fn init(&mut self) { + // We need to cap the versions otherwise compilation can fail on newer GPUs + let target_ptx = cmp::min(83, self.options.target_ptx); + let sm = cmp::min(80, self.options.target_sm); + let address_size = unsafe { self.nir.info.__bindgen_anon_1.cs }.ptr_size; + + let ptx_major = target_ptx / 10; + let ptx_minor = target_ptx % 10; + + self.lines.extend_from_slice(&[ + format!(".version {ptx_major}.{ptx_minor}"), + format!(".target sm_{sm}, texmode_independent"), + format!(".address_size {address_size}"), + ]); + } + + fn new(nir: *const nir_shader, options: NirToPtxOptions) -> Self { + Self { + nir: unsafe { &*nir }, + options: options, + vars: Vec::new(), + block: Vec::new(), + lines: Vec::new(), + regs: HashMap::new(), + } + } + + fn parse(&mut self) -> Result<(), Utf8Error> { + for func in self.nir.iter_functions() { + assert!(!func.impl_.is_null()); + assert!(func.is_entrypoint); + + self.lines.push(format!( + ".entry {}(", + unsafe { CStr::from_ptr(func.name) }.to_str()? + )); + + let mut iter = self + .nir + .iter_variables() + .filter(|var| var.data.mode() == nir_var_uniform) + .peekable(); + + let mut last_offset = 0; + while let Some(var) = iter.next() { + let mut byte_size = 0; + let mut align = 0; + let vec: u32; + let driver_location = var.data.driver_location; + + let bit_size; + unsafe { glsl_get_cl_type_size_align(var.type_, &mut byte_size, &mut align) }; + if unsafe { glsl_type_is_vector_or_scalar(var.type_) } { + vec = unsafe { *var.type_ }.vector_elements.into(); + bit_size = (byte_size / u32::from(vec)) * 8; + } else { + // If we have anything we don't parse properly, just emit a byte array. 
+ bit_size = 8; + vec = byte_size; + } + + // Add explicit padding, *sigh* + if last_offset != driver_location { + self.lines.push(format!( + "\t.param .b8 padding{last_offset}[{}],", + driver_location - last_offset, + )); + } + + self.lines.push(format!( + "\t.param .u{bit_size} param{driver_location}{}{}", + if vec > 1 { + format!("[{vec}]") + } else { + "".to_owned() + }, + if iter.peek().is_some() { "," } else { "" } + )); + + last_offset = driver_location + byte_size; + } + + self.lines.push(") {".to_owned()); + + // declare local memory inside the function so we don't run into ABI issues + if self.nir.scratch_size != 0 { + self.lines.extend_from_slice(&[ + format!("\t.local .u8 local_mem[{}];", self.nir.scratch_size), + "\t.reg .b32 %LMEM;".to_owned(), + "\t.reg .b32 %LMEM_TMP;".to_owned(), + "\tmov.u32 %LMEM, local_mem;".to_owned(), + ]); + } + + let func_impl = func.get_impl().unwrap(); + for cf_node in func_impl.iter_body() { + match cf_node.type_ { + nir_cf_node_block => { + let block = cf_node.as_block().unwrap(); + let index = block.index; + self.lines.push(format!("b{index}:")); + + for instr in block.iter_instr_list() { + self.parse_instruction(instr); + } + + self.lines.append(&mut self.vars); + self.lines.append(&mut self.block); + } + _ => unreachable!("Unsupported cf_node type"), + } + } + + let end_block = func_impl.end_block().index; + self.lines.push(format!("b{end_block}:")); + + self.lines.push("}".to_owned()); + } + + Ok(()) + } + + fn create_dest_var(&mut self, def: &nir_def) { + self.create_dest_var_prefixed(def, ""); + } + + fn create_dest_var_prefixed(&mut self, def: &nir_def, prefix: &str) { + self.create_dest_var_raw(def.index, def.bit_size, def.num_components, prefix); + } + + fn create_dest_var_raw(&mut self, index: u32, bit_size: u8, num_components: u8, prefix: &str) { + self.vars + .push(def_var_string_raw(index, bit_size, num_components, prefix)); + } + + fn parse_alu(&mut self, alu: &nir_alu_instr) { + let info = alu.info(); + 
let dst_bit_size = alu.def.bit_size; + let src0_bit_size = alu.get_src(0).bit_size(); + self.create_dest_var(&alu.def); + + let float_exec_mode = self.nir.info.float_controls_execution_mode; + #[rustfmt::skip] + let ftz = (info.input_types[0] & nir_type_float != 0 && ( + (src0_bit_size == 16 && float_exec_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 != 0) || + (src0_bit_size == 32 && float_exec_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32 != 0) || + (src0_bit_size == 64 && float_exec_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64 != 0) + )).then_some(".ftz").unwrap_or_default(); + + if let Some((op_str, type_str)) = opcode(alu) { + let dst = def_string(&alu.def, 0); + let op_bit_size = bit_size_string(src0_bit_size); + self.block.push(format!( + "\t{op_str}{ftz}.{type_str}{op_bit_size} {dst}, {};", + alu.srcs_as_slice() + .iter() + .map(|src| { src_string(src, 0) }) + .collect::>() + .join(", "), + )); + + return; + } + + match alu.op { + // conversions + nir_op_b2b1 => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tsetp.eq.b{src0_bit_size} {dst}, {src}, 1;")); + } + nir_op_b2f16 => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tselp.b16 {dst}, 0x3c00, 0x0000, {src};")); + } + nir_op_b2f32 => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tselp.f32 {dst}, 1.0, 0.0, {src};")); + } + nir_op_b2i16 | nir_op_b2b32 | nir_op_b2i32 | nir_op_b2i64 => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tselp.s{dst_bit_size} {dst}, 1, 0, {src};")); + } + nir_op_bcsel => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let src2 = src_string(alu.get_src(2), 0); + let dst = def_string(&alu.def, 0); + + self.block.push(format!( + "\tselp.b{dst_bit_size} {dst}, {src1}, 
{src2}, {src0};" + )); + } + nir_op_i32csel_ge => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let src2 = src_string(alu.get_src(2), 0); + let dst = def_string(&alu.def, 0); + + self.block.push(format!( + "\tslct.s{dst_bit_size}.s32 {dst}, {src1}, {src2}, {src0};" + )); + } + + #[rustfmt::skip] + nir_op_f2i8 | + nir_op_f2u8 | + nir_op_f2i16 | + nir_op_f2u16 | + nir_op_f2i32 | + nir_op_f2u32 | + nir_op_f2i64 | + nir_op_f2u64 | + nir_op_i2f16 | + nir_op_i2f32 | + nir_op_i2f64 | + nir_op_u2f16 | + nir_op_u2f32 | + nir_op_u2f64 | + nir_op_fceil | + nir_op_ffloor | + nir_op_ftrunc | + nir_op_fround_even => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + + let dtype = match alu.op { + nir_op_f2i8 | + nir_op_f2i16 | + nir_op_f2i32 | + nir_op_f2i64 => "s", + nir_op_f2u8 | + nir_op_f2u16 | + nir_op_f2u32 | + nir_op_f2u64 => "u", + nir_op_i2f16 | + nir_op_i2f32 | + nir_op_i2f64 | + nir_op_u2f16 | + nir_op_u2f32 | + nir_op_u2f64 | + nir_op_fceil | + nir_op_ffloor | + nir_op_ftrunc | + nir_op_fround_even => "f", + _ => unreachable!("invalid conversion dtype"), + }; + + let stype = match alu.op { + nir_op_i2f16 | + nir_op_i2f32 | + nir_op_i2f64 => "s", + nir_op_u2f16 | + nir_op_u2f32 | + nir_op_u2f64 => "u", + nir_op_f2i8 | + nir_op_f2u8 | + nir_op_f2i16 | + nir_op_f2u16 | + nir_op_f2i32 | + nir_op_f2u32 | + nir_op_f2i64 | + nir_op_f2u64 | + nir_op_fceil | + nir_op_ffloor | + nir_op_ftrunc | + nir_op_fround_even => "f", + _ => unreachable!("invalid conversion stype"), + }; + + let rnd = match alu.op { + nir_op_fceil => "rpi", + nir_op_ffloor => "rmi", + nir_op_fround_even => "rni", + nir_op_ftrunc => "rzi", + nir_op_f2i8 | + nir_op_f2i16 | + nir_op_f2i32 | + nir_op_f2i64 | + nir_op_f2u8 | + nir_op_f2u16 | + nir_op_f2u32 | + nir_op_f2u64 => "rzi", + nir_op_i2f16 | + nir_op_i2f32 | + nir_op_i2f64 | + nir_op_u2f16 | + nir_op_u2f32 | + nir_op_u2f64 => "rn", + _ => unreachable!("invalid conversion 
rounding mode"), + }; + + self.block.push(format!( + "\tcvt.{rnd}{ftz}.{dtype}{dst_bit_size}.{stype}{src0_bit_size} {dst}, {src};" + )); + } + + // extract + nir_op_extract_i8 => { + let src0 = src_string(alu.get_src(0), 0); + let index = alu.get_src(1).comp_as_uint(0).unwrap(); + let sign = 0x8 | index; + let index = sign << 12 | sign << 8 | sign << 4 | index; + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tprmt.b32 {dst}, {src0}, 0, 0x{index:x};")); + } + nir_op_extract_i16 => { + let src0 = src_string(alu.get_src(0), 0); + let index = 2 * alu.get_src(1).comp_as_uint(0).unwrap(); + let indexp1 = index + 1; + let sign = 0x8 | indexp1; + let index = sign << 12 | sign << 8 | indexp1 << 4 | index; + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tprmt.b32 {dst}, {src0}, 0, 0x{index:x};")); + } + nir_op_extract_u8 => { + let src0 = src_string(alu.get_src(0), 0); + let index = 0x4440 | alu.get_src(1).comp_as_uint(0).unwrap(); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tprmt.b32 {dst}, {src0}, 0, 0x{index:x};")); + } + nir_op_extract_u16 => { + let src0 = src_string(alu.get_src(0), 0); + let index = 2 * alu.get_src(1).comp_as_uint(0).unwrap(); + let indexp1 = index + 1; + let index = 0x4400 | indexp1 << 4 | index; + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tprmt.b32 {dst}, {src0}, 0, 0x{index:x};")); + } + + // bitfield + nir_op_bitfield_insert => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let src2 = src_string(alu.get_src(2), 0); + let src3 = src_string(alu.get_src(3), 0); + let dst = def_string(&alu.def, 0); + + self.block + .push(format!("\tbfi.b{dst_bit_size} {dst}, {src1}, {src0}, {src2}, {src3};")); + } + + // pack and unpack + nir_op_pack_32_4x8 | nir_op_pack_64_4x16 => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(0), 1); + let src2 = src_string(alu.get_src(0), 2); + let src3 = 
src_string(alu.get_src(0), 3); + let dst = def_string(&alu.def, 0); + self.block.push(format!( + "\tmov.b{dst_bit_size} {dst}, {{{src0}, {src1}, {src2}, {src3}}};" + )); + } + nir_op_pack_32_2x16 | nir_op_pack_64_2x32 => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(0), 1); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tmov.b{dst_bit_size} {dst}, {{{src0}, {src1}}};")); + } + nir_op_pack_64_2x32_split => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tmov.b64 {dst}, {{{src0}, {src1}}};")); + } + + nir_op_unpack_32_4x8 + | nir_op_unpack_32_2x16 + | nir_op_unpack_64_4x16 + | nir_op_unpack_64_2x32 => { + let index = alu.def.index; + let src = src_string(alu.get_src(0), 0); + + // TODO: PTX seems to ignore the vector component on unpack movs? + if alu.get_src(0).src.num_components() != 1 { + self.create_dest_var_raw(index, src0_bit_size, 1, "unpack"); + self.block.extend_from_slice(&[ + format!("\tmov.b{src0_bit_size} %r{index}unpack, {src};"), + format!("\tmov.b{src0_bit_size} %r{index}, %r{index}unpack;"), + ]); + } else { + self.block + .push(format!("\tmov.b{src0_bit_size} %r{index}, {src};")); + } + } + nir_op_unpack_64_2x32_split_x => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block.push(format!("\tmov.b64 {{{dst}, _}}, {src};")); + } + nir_op_unpack_32_2x16_split_y | nir_op_unpack_64_2x32_split_y => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tmov.b{src0_bit_size} {{_, {dst}}}, {src};")); + } + + // rotate + nir_op_urol | nir_op_uror => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let dst = def_string(&alu.def, 0); + + let op = match alu.op { + nir_op_urol => "shf.l.wrap.b", + nir_op_uror => "shf.r.wrap.b", + _ => unreachable!(), + }; + 
+ self.block + .push(format!("\t{op}32 {dst}, {src0}, {src0}, {src1};")) + } + + nir_op_bitfield_select => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let src2 = src_string(alu.get_src(2), 0); + let dst = def_string(&alu.def, 0); + + self.block + .push(format!("\tlop3.b32 {dst}, {src0}, {src1}, {src2}, 0xca;")); + } + + // movs + nir_op_mov | nir_op_vec3 => { + if src0_bit_size == 8 { + let index = alu.def.index; + let vec = vec_string(alu.def.num_components); + self.vars.push(format!("\t.reg {vec} .b16 %r{index}mov;")); + } + + for lane in 0..info.num_inputs { + let src = src_string(alu.get_src(lane.into()), 0); + let dst = def_string(&alu.def, lane); + let dst_tmp = def_string_prefixed(&alu.def, lane, "mov"); + + // TODO: mov.b8 does not exist. Is there a better solution here? + // Maybe we should track movs and just copy prop while generating PTX? + if src0_bit_size == 8 { + self.block.extend_from_slice(&[ + format!("\tmov.b16 {dst_tmp}, {{{src}, {src}}};"), + format!("\tmov.b16 {{{dst}, _}}, {dst_tmp};"), + ]); + } else { + self.block + .push(format!("\tmov.b{src0_bit_size} {dst}, {src};")); + } + } + } + nir_op_vec2 | nir_op_vec4 => { + // TODO: mov.b8 does not exist. Is there a better solution here? + // Maybe we should track movs and just copy prop while generating PTX? + if src0_bit_size == 8 { + let def_bit_size = 8 * info.num_inputs; + let index = alu.def.index; + let mut srcs = Vec::new(); + for lane in 0..info.num_inputs { + srcs.push(src_string(alu.get_src(lane.into()), 0)); + } + let srcs = srcs.join(", "); + + self.vars + .push(format!("\t.reg .b{def_bit_size} %r{index}mov;")); + self.block.extend_from_slice(&[ + format!("\tmov.b{def_bit_size} %r{index}mov, {{{srcs}}};"), + format!("\tmov.b{def_bit_size} %r{index}, %r{index}mov;"), + ]); + } else { + // We can't move from vectors to vectors, so we need to do it lane by lane. 
+ for lane in 0..info.num_inputs { + let src = src_string(alu.get_src(lane.into()), 0); + let dst = def_string(&alu.def, lane); + self.block + .push(format!("\tmov.b{src0_bit_size} {dst}, {src};")); + } + } + } + _ => unreachable!("Unknown ALU {}", info.name()), + } + } + + fn parse_intrinsic(&mut self, intrin: &nir_intrinsic_instr) { + let vec; + let info = intrin.info(); + let bit_size = intrin.def.bit_size; + let num_components = intrin.def.num_components; + let index = intrin.def.index; + + if info.has_dest && intrin.intrinsic != nir_intrinsic_decl_reg { + self.create_dest_var(&intrin.def); + vec = vec_string(num_components); + } else { + vec = ""; + } + + match intrin.intrinsic { + // registers + nir_intrinsic_decl_reg => { + assert_eq!(intrin.get_const_index(NIR_INTRINSIC_NUM_ARRAY_ELEMS), 0); + assert_eq!(intrin.get_const_index(NIR_INTRINSIC_NUM_COMPONENTS), 1); + + let bit_size = intrin.get_const_index(NIR_INTRINSIC_BIT_SIZE); + + // Create temporary for 8 bit movs + if bit_size == 8 { + self.vars.push(format!("\t.reg .b16 %r{index}store_reg;")); + } + + self.vars.push(if bit_size == 1 { + format!("\t.reg .pred %r{index};") + } else { + format!("\t.reg .b{bit_size} %r{index};") + }); + + self.regs.insert( + index, + Reg { + index: index, + bit_size: bit_size as u8, + }, + ); + } + nir_intrinsic_load_reg => { + assert_eq!(intrin.base(), 0); + + let reg = intrin.get_src(0); + let reg = &self.regs[®.as_def().index]; + let val = def_string(&intrin.def, 0); + + let type_ = if reg.bit_size == 1 { "pred" } else { "b" }; + let bit_size = reg.bit_size; + let bit_size_str = bit_size_string(bit_size); + let reg = reg.src_string(); + + if bit_size == 8 { + self.vars.push(format!("\t.reg .b16 {val}load_reg;")); + self.block.extend_from_slice(&[ + format!("\tmov.b16 {val}load_reg, {{{reg}, {reg}}};"), + format!("\tmov.b16 {{{val}, _}}, {val}load_reg;"), + ]); + } else { + self.block + .push(format!("\tmov.{type_}{bit_size_str} {val}, {reg};")); + } + } + 
nir_intrinsic_store_reg => { + assert_eq!(intrin.base(), 0); + assert_eq!(intrin.write_mask(), 1); + + let reg = intrin.get_src(1); + let reg = &self.regs[®.as_def().index]; + let val = def_string(intrin.get_src(0).as_def(), 0); + + let type_ = if reg.bit_size == 1 { "pred" } else { "b" }; + let bit_size = reg.bit_size; + let bit_size_str = bit_size_string(bit_size); + let reg = reg.src_string(); + + if bit_size == 8 { + self.block.extend_from_slice(&[ + format!("\tmov.b16 {reg}store_reg, {{{val}, {val}}};"), + format!("\tmov.b16 {{{reg}, _}}, {reg}store_reg;"), + ]); + } else { + self.block + .push(format!("\tmov.{type_}{bit_size_str} {reg}, {val};")); + } + } + + // system values + nir_intrinsic_load_local_invocation_id + | nir_intrinsic_load_workgroup_id + | nir_intrinsic_load_workgroup_size => { + let sreg = match intrin.intrinsic { + nir_intrinsic_load_local_invocation_id => "tid", + nir_intrinsic_load_workgroup_id => "ctaid", + nir_intrinsic_load_workgroup_size => "ntid", + _ => unreachable!("unknown intrinsic"), + }; + + self.block.extend_from_slice(&[ + format!("\tmov.b32 %r{index}.x, %{sreg}.x;"), + format!("\tmov.b32 %r{index}.y, %{sreg}.y;"), + format!("\tmov.b32 %r{index}.z, %{sreg}.z;"), + ]); + } + nir_intrinsic_load_subgroup_invocation + | nir_intrinsic_load_subgroup_eq_mask + | nir_intrinsic_load_subgroup_ge_mask + | nir_intrinsic_load_subgroup_gt_mask + | nir_intrinsic_load_subgroup_le_mask + | nir_intrinsic_load_subgroup_lt_mask => { + let sreg = match intrin.intrinsic { + nir_intrinsic_load_subgroup_invocation => "laneid", + nir_intrinsic_load_subgroup_eq_mask => "lanemask_eq", + nir_intrinsic_load_subgroup_ge_mask => "lanemask_ge", + nir_intrinsic_load_subgroup_gt_mask => "lanemask_gt", + nir_intrinsic_load_subgroup_le_mask => "lanemask_le", + nir_intrinsic_load_subgroup_lt_mask => "lanemask_lt", + _ => unreachable!("unknown intrinsic"), + }; + + if num_components != 1 { + self.block + .push(format!("\tmov.b{bit_size} %r{index}.x, %{sreg};")); + 
for comp in 1..num_components { + let def = def_string(&intrin.def, comp); + self.block.push(format!("\tmov.b{bit_size} {def}, 0;")); + } + } else { + self.block + .push(format!("\tmov.b{bit_size} %r{index}, %{sreg};")); + } + } + nir_intrinsic_shader_clock => { + self.block + .push(format!("\tmov.b64 {{%r{index}.x, %r{index}.y}}, %clock64;")); + } + + // memory + nir_intrinsic_global_atomic + | nir_intrinsic_global_atomic_swap + | nir_intrinsic_shared_atomic + | nir_intrinsic_shared_atomic_swap => { + let address = intrin.get_src(0).as_def().index; + let value = intrin.get_src(1).as_def().index; + + let op = match intrin.atomic_op() { + nir_atomic_op_cmpxchg => "cas.b", + nir_atomic_op_iadd => "add.s", + nir_atomic_op_iand => "and.b", + nir_atomic_op_imax => "max.s", + nir_atomic_op_imin => "min.s", + nir_atomic_op_ior => "or.b", + nir_atomic_op_ixor => "xor.b", + nir_atomic_op_umax => "max.u", + nir_atomic_op_umin => "min.u", + nir_atomic_op_xchg => "exch.b", + op => unreachable!("unknown atomic op {op}"), + }; + + let mem = match intrin.intrinsic { + nir_intrinsic_global_atomic | nir_intrinsic_global_atomic_swap => "global", + nir_intrinsic_shared_atomic | nir_intrinsic_shared_atomic_swap => "shared", + _ => unreachable!(""), + }; + + let scope = match intrin.intrinsic { + _ if self.options.target_sm < 60 => "", + nir_intrinsic_global_atomic | nir_intrinsic_global_atomic_swap => ".gpu", + nir_intrinsic_shared_atomic | nir_intrinsic_shared_atomic_swap => ".cta", + _ => unreachable!(""), + }; + + let src1 = match intrin.intrinsic { + nir_intrinsic_global_atomic_swap | nir_intrinsic_shared_atomic_swap => { + let src1 = intrin.get_src(2).as_def().index; + format!(", %r{src1}") + } + _ => "".to_owned(), + }; + + self.block.push(format!( + "\tatom{scope}.{mem}.{op}{bit_size} %r{index}, [%r{address}], %r{value}{src1};" + )); + } + nir_intrinsic_load_kernel_input => { + let src = intrin.get_src(0); + let param = unsafe { (*nir_src_as_const_value(*src)).u64_ }; + + // 
Not sure if this is defined behavior, but we can offset the kernel parameter. + // TODO: only do it for array arguments? + self.block.push(format!( + "\tld.param{vec}.b{bit_size} %r{index}, [param0 + {param}];" + )); + } + nir_intrinsic_load_global + | nir_intrinsic_load_global_constant + | nir_intrinsic_load_shared => { + let address = intrin.get_src(0); + let address = address.as_def().index; + let mem = mem_from_intrinsic(intrin); + + self.block.push(format!( + "\tld.{mem}{vec}.b{bit_size} %r{index}, [%r{address}];" + )); + } + nir_intrinsic_store_global | nir_intrinsic_store_shared => { + let value = intrin.get_src(0); + let address = intrin.get_src(1); + let bit_size = value.bit_size(); + let num_components = value.num_components(); + let address = address.as_def().index; + let index = value.as_def().index; + let mem = mem_from_intrinsic(intrin); + let vec = if num_components > 1 { + format!(".v{num_components}") + } else { + "".to_owned() + }; + + self.block.push(format!( + "\tst.{mem}{vec}.b{bit_size} [%r{address}], %r{index};" + )); + } + nir_intrinsic_load_scratch => { + let address = intrin.get_src(0); + let address = address.as_def().index; + + self.block.extend_from_slice(&[ + format!("\tadd.u32 %LMEM_TMP, %LMEM, %r{address};"), + format!("\tld.local{vec}.b{bit_size} %r{index}, [%LMEM_TMP];"), + ]); + } + nir_intrinsic_store_scratch => { + let value = intrin.get_src(0); + let address = intrin.get_src(1); + let bit_size = value.bit_size(); + let num_components = value.num_components(); + let address = address.as_def().index; + let index = value.as_def().index; + let vec = if num_components > 1 { + format!(".v{num_components}") + } else { + "".to_owned() + }; + + self.block.extend_from_slice(&[ + format!("\tadd.u32 %LMEM_TMP, %LMEM, %r{address};"), + format!("\tst.local{vec}.b{bit_size} [%LMEM_TMP], %r{index};"), + ]); + } + + // barriers + nir_intrinsic_barrier => { + let exec_scope = intrin.execution_scope(); + let mem_scope = intrin.memory_scope(); + 
let mem_sem = intrin.memory_semantics(); + let mem_modes = intrin.memory_modes(); + + match exec_scope { + SCOPE_SUBGROUP => { + self.block.push(format!("\tbar.warp.sync 0xffffffff;")); + } + SCOPE_WORKGROUP => { + self.block.push(format!("\tbar.sync 0;")); + } + _ => {} + } + + if mem_modes & (nir_var_mem_global | nir_var_image | nir_var_mem_shared) != 0 { + if self.options.target_sm >= 70 { + let mem_sem = match mem_sem { + // TODO: bug in the PTX compiler? + NIR_MEMORY_ACQUIRE | //=> "acquire", + NIR_MEMORY_RELEASE | //=> "release", + NIR_MEMORY_ACQ_REL => "acq_rel", + _ => unreachable!("unkown barrier semantics"), + }; + + let mem_scope = match mem_scope { + SCOPE_SUBGROUP | SCOPE_SHADER_CALL | SCOPE_WORKGROUP => "cta", + SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => "gpu", + _ => unreachable!("unkown barrier scope"), + }; + + self.block.push(format!("\tfence.{mem_sem}.{mem_scope};")); + } else { + let level = match mem_scope { + SCOPE_SUBGROUP | SCOPE_SHADER_CALL | SCOPE_WORKGROUP => "cta", + SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => "gl", + _ => unreachable!("unkown barrier scope"), + }; + + self.block.push(format!("\tmembar.{level};")); + } + } + } + + // subgroups + nir_intrinsic_ballot => { + let value = intrin.get_src(0).as_def().index; + + self.block.push(format!( + "\tvote.sync.ballot.b32 %r{index}, %r{value}, 0xffffffff;" + )); + } + // TODO: sm90+ + // nir_intrinsic_elect => { + // self.block.push(format!( + // "\telect.sync _|%r{index}, 0xffffffff;" + // )); + // } + nir_intrinsic_read_invocation + | nir_intrinsic_shuffle + | nir_intrinsic_shuffle_down + | nir_intrinsic_shuffle_up + | nir_intrinsic_shuffle_xor => { + let value = intrin.get_src(0).as_def().index; + let lane = intrin.get_src(1).as_def().index; + + let (mode, val) = match intrin.intrinsic { + nir_intrinsic_shuffle_down => ("down", 0x1f), + nir_intrinsic_shuffle_up => ("up", 0), + nir_intrinsic_shuffle_xor => ("bfly", 0x1f), + nir_intrinsic_read_invocation | nir_intrinsic_shuffle => ("idx", 0x1f), + _ 
=> unreachable!(), + }; + + self.block.push(format!( + "\tshfl.sync.{mode}.b32 %r{index}, %r{value}, %r{lane}, 0x{val:x}, 0xffffffff;" + )); + } + nir_intrinsic_reduce => { + let value = intrin.get_src(0).as_def().index; + + let op = match intrin.reduction_op() { + nir_op_iadd => "add.s", + nir_op_iand => "and.b", + nir_op_imax => "max.s", + nir_op_imin => "min.s", + nir_op_ior => "or.b", + nir_op_ixor => "xor.b", + nir_op_umax => "max.u", + nir_op_umin => "min.u", + op => unreachable!("Unknown reduction op: {op}"), + }; + + self.block.push(format!( + "\tredux.sync.{op}32 %r{index}, %r{value}, 0xffffffff;", + )); + } + nir_intrinsic_vote_all | nir_intrinsic_vote_any => { + let value = intrin.get_src(0).as_def().index; + let mode = match intrin.intrinsic { + nir_intrinsic_vote_all => "all", + nir_intrinsic_vote_any => "any", + _ => unreachable!(""), + }; + + self.block.push(format!( + "\tvote.sync.{mode}.pred %r{index}, %r{value}, 0xffffffff;" + )); + } + nir_intrinsic_vote_ieq => { + let value = intrin.get_src(0).as_def(); + let bit_size = value.bit_size; + let value = value.index; + + self.block.push(format!( + "\tmatch.all.sync.b{bit_size} _|%r{index}, %r{value}, 0xffffffff;" + )); + } + _ => unreachable!("Unknown intrinsic {}", info.name()), + } + } + + fn parse_jump(&mut self, jump: &nir_jump_instr) { + match jump.type_ { + nir_jump_goto => { + let jump_target = jump.target().unwrap().index; + self.block.push(format!("\t\tbra b{jump_target};")); + } + nir_jump_goto_if => { + let jump_target = jump.target().unwrap().index; + let else_target = jump.else_target().unwrap().index; + let cond = &jump.condition.as_def().index; + + self.block.extend_from_slice(&[ + format!("\t@%r{cond}\tbra b{jump_target};"), + format!("\t\tbra b{else_target};"), + ]); + } + _ => unreachable!("Unsupported jump instruction: {}", jump.type_), + } + } + + fn parse_instruction(&mut self, instr: &nir_instr) { + match instr.type_ { + nir_instr_type_load_const => { + let load_const = 
instr.as_load_const().unwrap(); + debug_assert_eq!(load_const.def.num_components, 1); + + let index = load_const.def.index; + let bit_size = load_const.def.bit_size; + + self.create_dest_var(&load_const.def); + let val = match bit_size { + 64 => unsafe { load_const.values()[0].u64_ }, + 32 => unsafe { load_const.values()[0].u32_ }.into(), + 16 => unsafe { load_const.values()[0].u16_ }.into(), + 8 => unsafe { load_const.values()[0].u8_ }.into(), + 1 => unsafe { load_const.values()[0].b }.into(), + _ => unreachable!("unsupported load_const"), + }; + + // Yeah.... + if bit_size == 1 { + self.block.push(format!("\tmov.pred %r{index}, {val};")); + } else if bit_size == 8 { + self.vars.push(format!("\t.reg .b16 %r{index}load_const;")); + self.block.extend_from_slice(&[ + format!("\tmov.b16 %r{index}load_const, {{{val}, 0}};"), + format!("\tmov.b16 {{%r{index}, _}}, %r{index}load_const;"), + ]); + } else { + self.block + .push(format!("\tmov.b{bit_size} %r{index}, {val};")); + } + } + nir_instr_type_undef => { + let undef = instr.as_undef().unwrap(); + self.create_dest_var(&undef.def); + } + nir_instr_type_alu => self.parse_alu(instr.as_alu().unwrap()), + nir_instr_type_intrinsic => self.parse_intrinsic(instr.as_intrinsic().unwrap()), + nir_instr_type_jump => self.parse_jump(instr.as_jump().unwrap()), + _ => unreachable!("Unsupported instruction type {}", instr.type_), + } + } +} + +#[no_mangle] +pub extern "C" fn nir_to_ptx(nir: *const nir_shader, options: NirToPtxOptions) -> NirToPtxOutput { + let mut state = NirToPtxState::new(nir, options); + state.init(); + state.parse().unwrap(); + state.finalize() +} + +#[no_mangle] +pub extern "C" fn nir_to_ptx_free(ptx: NirToPtxOutput) { + unsafe { Vec::from_raw_parts(ptx.ptx, ptx.ptx_len, ptx.ptx_capacity) }; +} diff --git a/src/gallium/drivers/nocl/compiler/meson.build b/src/gallium/drivers/nocl/compiler/meson.build new file mode 100644 index 00000000000..d630a0aeae2 --- /dev/null +++ 
b/src/gallium/drivers/nocl/compiler/meson.build @@ -0,0 +1,44 @@ +# Copyright © 2025 Karol Herbst +# SPDX-License-Identifier: MIT + +prog_cbindgen = find_program( + 'cbindgen', + required : true, + native : true, + version : '>= 0.25' +) + +libnir_to_ptx_h = custom_target( + 'libnir_to_ptx_h', + input : [files('cbindgen.toml'), 'lib.rs'], + output : ['nir_to_ptx.h'], + command : [ + prog_cbindgen, '-q', '--config', '@INPUT0@', '--lang', 'c', + '--output', '@OUTPUT0@', '--depfile', '@DEPFILE@', + '--', '@INPUT1@', + ], + depfile : 'nir_to_ptx.h.d', +) + +files_libnir_to_ptx = files( + 'lib.rs', +) + +libnir_to_ptx = static_library( + 'nir_to_ptx', + [files_libnir_to_ptx], + gnu_symbol_visibility : 'hidden', + rust_abi : 'c', + dependencies : [ + idep_compiler_rs, + ], +) + +idep_nir_to_ptx = declare_dependency( + sources : [ + libnir_to_ptx_h + ], + link_with : [ + libnir_to_ptx, + ] +) diff --git a/src/gallium/drivers/nocl/meson.build b/src/gallium/drivers/nocl/meson.build new file mode 100644 index 00000000000..50d1930a70e --- /dev/null +++ b/src/gallium/drivers/nocl/meson.build @@ -0,0 +1,54 @@ +# Copyright © 2025 Karol Herbst +# SPDX-License-Identifier: MIT + +subdir('compiler') + +files_libnocl = files( + 'nocl_nir.c', + 'nocl_pipe.c', + 'nocl_private.h', +) + +nocl_nir_algebraic_c = custom_target( + 'nocl_nir_algebraic.c', + input : 'nocl_nir_algebraic.py', + output : 'nocl_nir_algebraic.c', + command : [ + prog_python, '@INPUT@', + '-p', dir_compiler_nir, + '--out', '@OUTPUT@', + ], + depend_files : nir_algebraic_depends, +) + +libnocl = static_library( + 'nocl', + [ + files_libnocl, + nocl_nir_algebraic_c, + ], + include_directories : [ + inc_gallium, + inc_gallium_aux, + inc_gallium_winsys, + inc_util, + ], + gnu_symbol_visibility : 'hidden', + dependencies : [ + dep_libcuda, + idep_nir, + idep_nir_to_ptx, + ], +) + +driver_nocl = declare_dependency( + compile_args : '-DGALLIUM_NOCL', + dependencies : [ + dep_libcuda, + idep_libnoclwinsys, + ], + link_with 
: [ + libnocl, + libnir_to_ptx, + ], +) diff --git a/src/gallium/drivers/nocl/nocl_nir.c b/src/gallium/drivers/nocl/nocl_nir.c new file mode 100644 index 00000000000..12f2e029c5d --- /dev/null +++ b/src/gallium/drivers/nocl/nocl_nir.c @@ -0,0 +1,824 @@ +/* + * Copyright © 2022 Collabora, Ltd. + * Copyright © 2025 Karol Herbst + * + * SPDX-License-Identifier: MIT + */ + +#include "nocl_private.h" + +#include "nir.h" +#include "nir_builder.h" + +static void +push_block(nir_builder *b, nir_block *block) +{ + assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl))); + block->cf_node.parent = &b->impl->cf_node; + exec_list_push_tail(&b->impl->body, &block->cf_node.node); + b->cursor = nir_after_block(block); +} + +enum scope_type { + SCOPE_TYPE_SHADER, + SCOPE_TYPE_IF_MERGE, + SCOPE_TYPE_LOOP_BREAK, + SCOPE_TYPE_LOOP_CONT, +}; + +struct scope { + enum scope_type type; + + struct scope *parent; + uint32_t depth; + + nir_block *merge; + nir_def *bar; + + uint32_t escapes; +}; + +static struct scope +push_scope(nir_builder *b, + enum scope_type scope_type, + struct scope *parent, + nir_block *merge_block) +{ + struct scope scope = { + .type = scope_type, + .parent = parent, + .depth = parent->depth + 1, + .merge = merge_block, + }; + + return scope; +} + +static void +pop_scope(nir_builder *b, nir_def *esc_reg, struct scope scope) +{ + if (scope.bar == NULL) + return; + + if (scope.escapes > 0) { + /* Find the nearest scope with a sync. */ + nir_block *parent_merge = b->impl->end_block; + for (struct scope *p = scope.parent; p != NULL; p = p->parent) { + if (p->bar != NULL) { + parent_merge = p->merge; + break; + } + } + + /* No escape is ~0, halt is 0, and we choose outer scope indices such + * that outer scopes always have lower indices than inner scopes. + */ + nir_def *esc = nir_ult_imm(b, nir_load_reg(b, esc_reg), scope.depth); + + /* We have to put the escape in its own block to avoid critical edges. 
+ * If we just did goto_if, we would end up with multiple successors, + * including a jump to the parent's merge block which has multiple + * predecessors. + */ + nir_block *esc_block = nir_block_create(b->shader); + nir_block *next_block = nir_block_create(b->shader); + nir_goto_if(b, esc_block, esc, next_block); + push_block(b, esc_block); + nir_goto(b, parent_merge); + push_block(b, next_block); + } +} + +static enum scope_type +jump_target_scope_type(nir_jump_type jump_type) +{ + switch (jump_type) { + case nir_jump_break: return SCOPE_TYPE_LOOP_BREAK; + case nir_jump_continue: return SCOPE_TYPE_LOOP_CONT; + default: + UNREACHABLE("Unknown jump type"); + } +} + +static void +break_scopes(nir_builder *b, nir_def *esc_reg, + struct scope *current_scope, + nir_jump_type jump_type) +{ + nir_block *first_sync = NULL; + uint32_t target_depth = UINT32_MAX; + enum scope_type target_scope_type = jump_target_scope_type(jump_type); + for (struct scope *scope = current_scope; scope; scope = scope->parent) { + if (first_sync == NULL && scope->bar != NULL) + first_sync = scope->merge; + + if (scope->type == target_scope_type) { + if (first_sync == NULL) { + first_sync = scope->merge; + } else { + /* In order for our cascade to work, we need to have the invariant + * that anything which escapes any scope with a warp sync needs to + * target a scope with a warp sync. 
+ */ + assert(scope->bar != NULL); + } + target_depth = scope->depth; + break; + } else { + scope->escapes++; + } + } + assert(target_depth < UINT32_MAX); + + nir_store_reg(b, nir_imm_int(b, target_depth), esc_reg); + nir_goto(b, first_sync); +} + +static void +normal_exit(nir_builder *b, nir_def *esc_reg, nir_block *merge_block) +{ + assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl))); + nir_block *block = nir_cursor_current_block(b->cursor); + + if (!nir_block_ends_in_jump(block)) { + nir_store_reg(b, nir_imm_int(b, ~0), esc_reg); + nir_goto(b, merge_block); + } +} + +/* This is a heuristic for what instructions are allowed before we sync. + * Annoyingly, we've gotten rid of phis so it's not as simple as "is it a + * phi?". + */ +static bool +instr_is_allowed_before_sync(nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + /* We could probably allow more ALU as long as it doesn't contain + * derivatives but let's be conservative and only allow mov for now. + */ + return alu->op == nir_op_mov; + } + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_load_reg || + intrin->intrinsic == nir_intrinsic_store_reg; + } + + default: + return false; + } +} + +/** Returns true if our successor will sync for us + * + * This is a bit of a heuristic + */ +static bool +parent_scope_will_sync(nir_cf_node *node, struct scope *parent_scope) +{ + /* First search forward to see if there's anything non-trivial after this + * node within the parent scope. + */ + nir_block *block = nir_cf_node_as_block(nir_cf_node_next(node)); + nir_foreach_instr(instr, block) { + if (!instr_is_allowed_before_sync(instr)) + return false; + } + + /* There's another loop or if following and we didn't find a sync */ + if (nir_cf_node_next(&block->cf_node)) + return false; + + /* See if the parent scope will sync for us. 
*/ + if (parent_scope->bar != NULL) + return true; + + switch (parent_scope->type) { + case SCOPE_TYPE_SHADER: + return true; + + case SCOPE_TYPE_IF_MERGE: + return parent_scope_will_sync(block->cf_node.parent, + parent_scope->parent); + + case SCOPE_TYPE_LOOP_CONT: + /* In this case, the loop doesn't have a sync of its own so we're + * expected to be uniform before we hit the continue. + */ + return false; + + case SCOPE_TYPE_LOOP_BREAK: + UNREACHABLE("Loops must have a continue scope"); + + default: + UNREACHABLE("Unknown scope type"); + } +} + +static bool +block_is_merge(const nir_block *block) +{ + /* If it's unreachable, there is no merge */ + if (block->imm_dom == NULL) + return false; + + unsigned num_preds = 0; + set_foreach(&block->predecessors, entry) { + const nir_block *pred = entry->key; + + /* We don't care about unreachable blocks */ + if (pred->imm_dom == NULL) + continue; + + num_preds++; + } + + return num_preds > 1; +} + +static void +lower_cf_list(nir_builder *b, nir_def *esc_reg, struct scope *parent_scope, + struct exec_list *cf_list) +{ + foreach_list_typed_safe(nir_cf_node, node, node, cf_list) { + switch (node->type) { + case nir_cf_node_block: { + nir_block *block = nir_cf_node_as_block(node); + if (exec_list_is_empty(&block->instr_list)) + break; + + nir_cursor start = nir_before_block(block); + nir_cursor end = nir_after_block(block); + + nir_jump_instr *jump = NULL; + nir_instr *last_instr = nir_block_last_instr(block); + if (last_instr->type == nir_instr_type_jump) { + jump = nir_instr_as_jump(last_instr); + end = nir_before_instr(&jump->instr); + } + + nir_cf_list instrs; + nir_cf_extract(&instrs, start, end); + b->cursor = nir_cf_reinsert(&instrs, b->cursor); + + if (jump != NULL) { + if (jump->type == nir_jump_halt) { + /* Halt instructions map to OpExit on NVIDIA hardware and + * exited lanes never block a bsync. 
+ */ + nir_instr_remove(&jump->instr); + nir_builder_instr_insert(b, &jump->instr); + } else { + /* Everything else needs a break cascade */ + break_scopes(b, esc_reg, parent_scope, jump->type); + } + } + break; + } + + case nir_cf_node_if: { + nir_if *nif = nir_cf_node_as_if(node); + + nir_def *cond = nif->condition.ssa; + nir_instr_clear_src(NULL, &nif->condition); + + nir_block *then_block = nir_block_create(b->shader); + nir_block *else_block = nir_block_create(b->shader); + nir_block *merge_block = nir_block_create(b->shader); + + struct scope scope = push_scope(b, SCOPE_TYPE_IF_MERGE, + parent_scope, merge_block); + + nir_goto_if(b, then_block, cond, else_block); + + push_block(b, then_block); + lower_cf_list(b, esc_reg, &scope, &nif->then_list); + normal_exit(b, esc_reg, merge_block); + + push_block(b, else_block); + lower_cf_list(b, esc_reg, &scope, &nif->else_list); + normal_exit(b, esc_reg, merge_block); + + push_block(b, merge_block); + pop_scope(b, esc_reg, scope); + + break; + } + + case nir_cf_node_loop: { + nir_loop *loop = nir_cf_node_as_loop(node); + + nir_block *head_block = nir_block_create(b->shader); + nir_block *break_block = nir_block_create(b->shader); + nir_block *cont_block = nir_block_create(b->shader); + + /* TODO: We can potentially avoid the break sync for loops when the + * parent scope syncs for us. However, we still need to handle the + * continue clause cascading to the break. If there is a + * nir_jump_halt involved, then we have a real cascade where it needs + * to then jump to the next scope. Getting all these cases right + * while avoiding an extra sync for the loop break is tricky at best. 
+ */ + struct scope break_scope = push_scope(b, SCOPE_TYPE_LOOP_BREAK, + parent_scope, + break_block); + + nir_goto(b, head_block); + push_block(b, head_block); + + struct scope cont_scope = push_scope(b, SCOPE_TYPE_LOOP_CONT, + &break_scope, + cont_block); + + lower_cf_list(b, esc_reg, &cont_scope, &loop->body); + normal_exit(b, esc_reg, cont_block); + + push_block(b, cont_block); + + pop_scope(b, esc_reg, cont_scope); + + lower_cf_list(b, esc_reg, &break_scope, &loop->continue_list); + + nir_goto(b, head_block); + push_block(b, break_block); + + pop_scope(b, esc_reg, break_scope); + + break; + } + + default: + UNREACHABLE("Unknown CF node type"); + } + } +} + +static bool +lower_cf_func(nir_function *func) +{ + if (func->impl == NULL) + return false; + + if (exec_list_is_singular(&func->impl->body)) { + return nir_no_progress(func->impl); + } + + nir_function_impl *old_impl = func->impl; + + /* We use this in block_is_merge() */ + nir_metadata_require(old_impl, nir_metadata_dominance | nir_metadata_divergence); + + /* First, we temporarily get rid of SSA. This will make all our block + * motion way easier. Ask the pass to place reg writes directly in the + * immediate predecessors of the phis instead of trying to be clever. + * This will ensure that we never get a write to a uniform register from + * non-uniform control flow and makes our divergence reconstruction for + * phis more reliable. 
+ */ + nir_foreach_block(block, old_impl) + nir_lower_phis_to_regs_block(block, true); + + /* We create a whole new nir_function_impl and copy the contents over */ + func->impl = NULL; + nir_function_impl *new_impl = nir_function_impl_create(func); + new_impl->structured = false; + + /* We copy defs from the old impl */ + new_impl->ssa_alloc = old_impl->ssa_alloc; + + nir_builder b = nir_builder_at(nir_before_impl(new_impl)); + nir_def *esc_reg = nir_decl_reg(&b, 1, 32, 0); + + /* Having a function scope makes everything easier */ + struct scope scope = { + .type = SCOPE_TYPE_SHADER, + .merge = new_impl->end_block, + }; + lower_cf_list(&b, esc_reg, &scope, &old_impl->body); + normal_exit(&b, esc_reg, new_impl->end_block); + + /* Now sort by reverse PDFS and restore SSA + * + * Note: Since we created a new nir_function_impl, there is no metadata, + * dirty or otherwise, so we have no need to call nir_progress(). + */ + nir_sort_unstructured_blocks(new_impl); + + return true; +} + +/* Copied from nak_nir_lower_cf.c but without divergency or barriers */ +bool +nocl_nir_lower_cf(struct nir_shader *nir) +{ + bool progress = false; + + nir_foreach_function(func, nir) { + if (lower_cf_func(func)) + progress = true; + } + + return progress; +} + +nir_shader_compiler_options * +nocl_get_nir_options(struct nocl_screen *nocl) +{ + nir_shader_compiler_options *nir_options = ralloc(nocl, nir_shader_compiler_options); + *nir_options = (nir_shader_compiler_options) { + // TODO: + // .has_bit_test = nocl->ptx >= 60, + // .has_bitfield_select = nocl->ptx >= 43, + // .has_f2i32_rtne = true, + // .has_uclz = true, + // .has_dot_2x16 = true, + // .has_fneo_fcmpu = true, + // .has_fused_comp_and_csel = true, + // .has_pack_32_4x8 = true, + // .has_shfr32 = true, + + .fuse_ffma32 = true, + .has_fsub = true, + .has_imul24 = true, + .has_imad32 = true, + .has_isub = true, + .has_rotate32 = nocl->sm >= 32, + .has_sdot_4x8 = nocl->sm >= 61, + .has_sudot_4x8 = nocl->sm >= 61, + .has_udot_4x8 
= nocl->sm >= 61, + .has_umul24 = true, + .has_umad24 = true, + .lower_hadd = true, + .lower_find_lsb = true, + .lower_flrp16 = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_fisnormal = true, + .lower_fsat = true, + .lower_fsign = true, + .lower_int64_options = + /* _technically PTX supports it, but it fails to compile with vector sources... + * The hardware doesn't have an idiv anyway. */ + nir_lower_divmod64 | + nir_lower_iadd_sat64 | + nir_lower_imul64 | + nir_lower_scan_reduce_bitwise64 | + nir_lower_scan_reduce_iadd64 | + nir_lower_subgroup_shuffle64 | + (nocl->sm < 70 ? nir_lower_vote_ieq64 : 0), + .lower_insert_byte = true, + .lower_insert_word = true, + .lower_isign = true, + .lower_ldexp = true, + .lower_uadd_sat = true, + .lower_usub_sat = true, + .max_unroll_iterations = 32, + .support_16bit_alu = true, + }; + + return nir_options; +} + +static uint8_t +alu_width_cb(const nir_instr *instr, const void *) +{ + return 1; +} + +static unsigned +lower_bit_size_callback(const nir_instr *instr, void *data) +{ + const struct nocl_screen *nocl = data; + + switch (instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + unsigned dest_bit_size = alu->def.bit_size; + unsigned src0_bit_size = alu->src[0].src.ssa->bit_size; + switch (alu->op) { + case nir_op_extract_i8: + case nir_op_extract_i16: + case nir_op_extract_u8: + case nir_op_extract_u16: + case nir_op_bitfield_reverse: + if (dest_bit_size < 32) + return 32; + return 0; + case nir_op_bcsel: + if (dest_bit_size < 16) + return 16; + return 0; + + case nir_op_bit_count: + case nir_op_bitfield_insert: + case nir_op_fdiv: + case nir_op_frsq: + case nir_op_fsqrt: + case nir_op_iadd_sat: + case nir_op_ibitfield_extract: + case nir_op_isub_sat: + case nir_op_ubitfield_extract: + case nir_op_ufind_msb: + if (src0_bit_size < 32) + return 32; + return 0; + + case nir_op_iabs: + case nir_op_iadd: + case nir_op_iand: + case nir_op_idiv: + case nir_op_ieq: + case 
nir_op_ige: + case nir_op_ilt: + case nir_op_imad: + case nir_op_imax: + case nir_op_imin: + case nir_op_imul: + case nir_op_imul_high: + case nir_op_ine: + case nir_op_ineg: + case nir_op_inot: + case nir_op_ior: + case nir_op_irem: + case nir_op_ishl: + case nir_op_ishr: + case nir_op_isub: + case nir_op_ixor: + case nir_op_udiv: + case nir_op_uge: + case nir_op_ult: + case nir_op_umax: + case nir_op_umin: + case nir_op_umod: + case nir_op_umul_high: + case nir_op_ushr: + if (src0_bit_size == 8) + return 16; + return 0; + + case nir_op_fmax: + case nir_op_fmin: + if (src0_bit_size == 16 && nocl->sm < 80) + return 32; + return 0; + + case nir_op_feq: + case nir_op_fequ: + case nir_op_fge: + case nir_op_fgeu: + case nir_op_flt: + case nir_op_fltu: + case nir_op_fneo: + case nir_op_fneu: + if (src0_bit_size == 16 && nocl->sm < 53) + return 32; + return 0; + default: + return 0; + } + } + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_read_invocation: + case nir_intrinsic_reduce: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_xor: + if (intrin->def.bit_size < 32) + return 32; + return 0; + default: + return 0; + } + } + + default: + return 0; + } +} + +static bool +lower_subgroups_filter(const nir_intrinsic_instr *intrin, const void *data) +{ + const struct nocl_screen *nocl = data; + if (intrin->def.num_components > 1) + return true; + + switch (intrin->intrinsic) { + case nir_intrinsic_ballot_bit_count_reduce: + case nir_intrinsic_ballot_bit_count_exclusive: + case nir_intrinsic_ballot_bit_count_inclusive: + case nir_intrinsic_ballot_bitfield_extract: + case nir_intrinsic_ballot_find_lsb: + case nir_intrinsic_ballot_find_msb: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_first_invocation: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_inverse_ballot: + case 
nir_intrinsic_load_subgroup_size: + case nir_intrinsic_quad_vote_all: + case nir_intrinsic_quad_vote_any: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_rotate: + case nir_intrinsic_vote_feq: + /* TODO: exists on SM90+ */ + case nir_intrinsic_elect: + return true; + case nir_intrinsic_vote_ieq: + return nocl->sm < 70; + case nir_intrinsic_reduce: { + if (nocl->sm < 80) + return true; + unsigned cluster_size = nir_intrinsic_cluster_size(intrin); + if (cluster_size != 0 && cluster_size != 32) + return true; + + switch (nir_intrinsic_reduction_op(intrin)) { + case nir_op_fadd: + case nir_op_fmax: + case nir_op_fmin: + case nir_op_fmul: + case nir_op_imul: + return true; + case nir_op_imax: + case nir_op_imin: + case nir_op_umax: + case nir_op_umin: + return intrin->def.bit_size == 64; + default: + /* nir_lower_int64 will handle some of it for us */ + return false; + } + } + case nir_intrinsic_read_invocation: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_xor: + return intrin->def.bit_size > 32; + default: + return false; + } +} + +static nir_mem_access_size_align +lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin, + uint8_t bytes, + uint8_t bit_size, + uint32_t align, + uint32_t align_offset, + bool offset_is_const, + enum gl_access_qualifier, + const void *cb_data) +{ + align = nir_combined_align(align, align_offset); + bytes = MIN2(MIN2(bytes, align), 16); + bit_size = MIN2(bit_size, bytes * 8); + + unsigned num_components = MIN2(bytes / (bit_size / 8), 4); + if (num_components == 3) + num_components = 2; + + return (nir_mem_access_size_align) { + .bit_size = bit_size, + .num_components = num_components, + .align = align, + .shift = nir_mem_access_shift_method_scalar, + }; +} + +static bool +lower_alu_vec_srcs(nir_builder *b, nir_alu_instr *alu, void *_data) +{ + const nir_op_info *info = &nir_op_infos[alu->op]; + + bool changed = false; + b->cursor = 
nir_before_instr(&alu->instr); + for (int i = 0; i < info->num_inputs; i++) { + if (info->input_sizes[i]) + continue; + + /* We lower everything that is bigger than vec4 _and_ 128 bits */ + nir_def *src_ssa = alu->src[i].src.ssa; + if (src_ssa->bit_size * src_ssa->num_components <= 128 && src_ssa->num_components <= 4) + continue; + + changed = true; + nir_def *comps[NIR_MAX_VEC_COMPONENTS]; + for (int c = 0; c < alu->def.num_components; c++) { + unsigned swizzle = alu->src[i].swizzle[c]; + alu->src[i].swizzle[c] = c; + + nir_const_value *const_val = nir_src_as_const_value(alu->src[i].src); + if (const_val) { + comps[c] = nir_build_imm(b, 1, alu->src[i].src.ssa->bit_size, &const_val[swizzle]); + } else { + comps[c] = nir_swizzle(b, alu->src[i].src.ssa, &swizzle, 1); + } + } + nir_def *src = nir_vec(b, comps, alu->def.num_components); + nir_src_rewrite(&alu->src[i].src, src); + } + + return changed; +} + +/* copied from nir_lower_alu_vec8_16_srcs */ +static bool +nir_lower_alu_vec_srcs(nir_shader *shader) +{ + return nir_shader_alu_pass(shader, lower_alu_vec_srcs, + nir_metadata_control_flow, + NULL); +} + +void +nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir) +{ + struct nocl_screen *nocl = nocl_screen(pscreen); + nir->info.max_subgroup_size = nocl->base.caps.shader_subgroup_size; + nir->info.min_subgroup_size = nocl->base.caps.shader_subgroup_size; + nir->info.api_subgroup_size = nocl->base.caps.shader_subgroup_size; + + struct nir_lower_compute_system_values_options sysval_options = { + .lower_local_invocation_index = true, + .lower_num_subgroups = true, + .lower_subgroup_id = true, + }; + + struct nir_lower_mem_access_bit_sizes_options mem_access_bit_sizes_options = { + .callback = lower_mem_access_bit_sizes_cb, + .cb_data = nocl, + .modes = nir_var_all, + }; + + struct nir_lower_subgroups_options subgroup_options = { + .ballot_bit_size = 32, + .ballot_components = 1, + .subgroup_size = 32, + .lower_elect = true, + 
.lower_first_invocation_to_ballot = true, + .lower_inverse_ballot = true, + .lower_quad_vote = true, + .lower_read_first_invocation = true, + .lower_reduce = true, + .lower_rotate_to_shuffle = true, + .lower_shuffle_to_32bit = true, + .lower_to_scalar = true, + .lower_vote_feq = true, + .lower_vote_ieq = nocl->sm < 70, + .filter = lower_subgroups_filter, + .filter_data = nocl, + }; + + NIR_PASS(_, nir, nir_lower_compute_system_values, &sysval_options); + + /* TODO: keeping 8 bit vectors would help a bit... */ + NIR_PASS(_, nir, nir_lower_all_phis_to_scalar); + NIR_PASS(_, nir, nir_lower_subgroups, &subgroup_options); + + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_opt_shrink_vectors, true); + NIR_PASS(progress, nir, nir_lower_mem_access_bit_sizes, &mem_access_bit_sizes_options); + NIR_PASS(progress, nir, nir_lower_alu_width, alu_width_cb, nocl); + NIR_PASS(progress, nir, nir_lower_alu_vec_srcs); + NIR_PASS(progress, nir, nir_lower_flrp, 16 | 32 | 64, false); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_cse); + } while (progress); + + do { + progress = false; + NIR_PASS(progress, nir, nir_lower_bit_size, lower_bit_size_callback, nocl); + NIR_PASS(progress, nir, nir_opt_algebraic_late); + NIR_PASS(progress, nir, nocl_nir_opt_algebraic_late, nocl); + if (progress) { + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, nir_copy_prop); + NIR_PASS(_, nir, nir_opt_dce); + NIR_PASS(_, nir, nir_opt_cse); + } + } while (progress); + + /* Run only once */ + NIR_PASS(progress, nir, nocl_nir_lower_algebraic_late, nocl); + if (progress) { + NIR_PASS(_, nir, nir_opt_constant_folding); + } + + /* After nir_opt_constant_folding */ + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_opt_dce); +} diff --git a/src/gallium/drivers/nocl/nocl_nir_algebraic.py 
b/src/gallium/drivers/nocl/nocl_nir_algebraic.py new file mode 100644 index 00000000000..756e1e69545 --- /dev/null +++ b/src/gallium/drivers/nocl/nocl_nir_algebraic.py @@ -0,0 +1,65 @@ +# Copyright © 2025 Karol Herbst +# SPDX-License-Identifier: MIT + +import argparse +import sys + +a = 'a' +b = 'b' +c = 'c' +s = 's' + +late_algebraic = [ + (('ior@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)), + (('iadd@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)), + (('ixor@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)), + (('ixor@32', ('iand', a, ('ixor', b, c)), c), ('bitfield_select', a, b, c)), + + (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c)), + (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c)), + (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b)), + (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c)), + (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c)), + (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c)), +] + +late_algebraic_lowering = [ + (('b2i8', a), ('u2u8', ('b2i16', a))), +] + +for bit_size in [16, 32, 64]: + late_algebraic_lowering += [ + ((f'ishl@{bit_size}', a, b), ('ishl', a, ('iand', b, bit_size - 1))), + ((f'ishr@{bit_size}', a, b), ('ishr', a, ('iand', b, bit_size - 1))), + ((f'ushr@{bit_size}', a, b), ('ushr', a, ('iand', b, bit_size - 1))), + ] + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--out', required=True, help='Output file.') + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + + import nir_algebraic # pylint: disable=import-error + + try: + with open(args.out, 'w', encoding='utf-8') as f: + f.write('#include "nocl_private.h"') + f.write(nir_algebraic.AlgebraicPass( + "nocl_nir_opt_algebraic_late", + late_algebraic, + [ + ("const 
struct nocl_screen *", "nocl"), + ]).render()) + f.write(nir_algebraic.AlgebraicPass( + "nocl_nir_lower_algebraic_late", + late_algebraic_lowering, + [ + ("const struct nocl_screen *", "nocl"), + ]).render()) + except Exception: + sys.exit(1) + +if __name__ == '__main__': + main() diff --git a/src/gallium/drivers/nocl/nocl_pipe.c b/src/gallium/drivers/nocl/nocl_pipe.c new file mode 100644 index 00000000000..a3a549af35e --- /dev/null +++ b/src/gallium/drivers/nocl/nocl_pipe.c @@ -0,0 +1,1027 @@ +/* + * Copyright © 2025 Karol Herbst + * SPDX-License-Identifier: MIT + */ + +#include "nocl_private.h" +#include "compiler/nir_to_ptx.h" +#include "util/blob.h" +#include "util/disk_cache.h" +#include "util/helpers.h" +#include "util/mesa-sha1.h" +#include "util/u_debug.h" +#include "util/u_screen.h" +#include "util/u_transfer.h" + +#include "git_sha1.h" +#include "nir.h" +#include "nir_serialize.h" + +/* clang-format off */ +static const struct debug_named_value nocl_debug_options[] = { + {"nir", NOCL_DBG_NIR, "Dump the generated NIRs"}, + {"ptx", NOCL_DBG_PTX, "Dump the PTX and compiler info messages"}, + {"jit_no_opts", NOCL_DBG_JIT_NO_OPTS, "Disables all JIT optimizations"}, + {"verbose", NOCL_DBG_VERBOSE, "Enables verbose logging"}, + DEBUG_NAMED_VALUE_END +}; +/* clang-format on */ + +static void +nocl_bind_compute_state(struct pipe_context *pctx, void *cso) +{ + struct nocl_context *ctx = nocl_context(pctx); + ctx->prog = nocl_program(cso); +} + +static void +nocl_bind_sampler_states(struct pipe_context *pctx, + mesa_shader_stage shader, + unsigned start_slot, unsigned num_samplers, + void **samplers) +{ + assert(!num_samplers); +} + +static void * +nocl_buffer_map(struct pipe_context *pctx, + struct pipe_resource *pres, + unsigned level, + unsigned usage, /* a combination of PIPE_MAP_x */ + const struct pipe_box *box, + struct pipe_transfer **out_transfer) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + struct nocl_buffer *res = 
nocl_buffer(pres); + struct nocl_transfer *xfer = CALLOC_STRUCT(nocl_transfer); + bool sync = !(usage & PIPE_MAP_UNSYNCHRONIZED); + + if (!xfer) + return NULL; + + void *ptr; + if (pres->usage & PIPE_USAGE_STAGING) { + ptr = res->ptr + box->x; + } else { + if (NOCL_CALL(cuMemAllocHost_v2(&xfer->ptr, box->width))) + goto err_alloc_host; + + if (!(usage & (PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_DISCARD_RANGE))) { + if (NOCL_CALL(cuMemcpyDtoHAsync_v2(xfer->ptr, res->dptr + box->x, box->width, ctx->cu_stream))) + goto err_copy_host; + } + + ptr = xfer->ptr; + } + + if (sync) + NOCL_CALL(cuStreamSynchronize(ctx->cu_stream)); + + *out_transfer = &xfer->base; + xfer->base = (struct pipe_transfer) { + .box = *box, + .level = level, + .resource = pres, + .usage = usage, + }; + + return ptr; + +err_copy_host: + NOCL_CALL(cuMemFreeHost(xfer->ptr)); +err_alloc_host: + FREE(xfer); + return NULL; +} + +static void +nocl_buffer_subdata(struct pipe_context *pctx, + struct pipe_resource *pres, + unsigned usage, + unsigned offset, + unsigned size, + const void *data) +{ + nocl_context_and_make_current(pctx); + struct nocl_buffer *res = nocl_buffer(pres); + NOCL_CALL(cuMemcpyHtoD_v2(res->dptr + offset, data, size)); +} + +static void +nocl_buffer_unmap(struct pipe_context *pctx, + struct pipe_transfer *pxfer) +{ + struct nocl_transfer *xfer = nocl_transfer(pxfer); + struct nocl_buffer *res = nocl_buffer(pxfer->resource); + + if (!(res->base.usage & PIPE_USAGE_STAGING)) { + nocl_context_and_make_current(pctx); + if (pxfer->usage & PIPE_MAP_WRITE) { + /* TODO: use async and delay freeing the host alloc until finish? 
*/ + NOCL_CALL(cuMemcpyHtoD_v2(res->dptr + pxfer->box.x, xfer->ptr, pxfer->box.width)); + } + + NOCL_CALL(cuMemFreeHost(xfer->ptr)); + } + + FREE(xfer); +} + +static void +nocl_clear_buffer(struct pipe_context *pctx, + struct pipe_resource *pres, + unsigned offset, + unsigned size, + const void *clear_value, + int clear_value_size) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + struct nocl_buffer *res = nocl_buffer(pres); + CUdeviceptr ptr = res->dptr + offset; + + uint32_t clamped; + + if (clear_value_size > 4) { + if (util_lower_clearsize_to_dword(clear_value, &clear_value_size, &clamped)) + clear_value = &clamped; + } + + switch (clear_value_size) { + case 1: + NOCL_CALL(cuMemsetD8Async(ptr, *(uint8_t*)clear_value, size, ctx->cu_stream)); + break; + case 2: + NOCL_CALL(cuMemsetD16Async(ptr, *(uint16_t*)clear_value, size / 2, ctx->cu_stream)); + break; + case 4: + NOCL_CALL(cuMemsetD32Async(ptr, *(uint32_t*)clear_value, size / 4, ctx->cu_stream)); + break; + default: + /* TODO: accelerate */ + u_default_clear_buffer(pctx, pres, offset, size, clear_value, clear_value_size); + break; + } +} + +static void +nocl_context_destroy(struct pipe_context *pctx) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + NOCL_CALL(cuEventDestroy_v2(ctx->cu_timestamp_start)); + NOCL_CALL(cuStreamSynchronize(ctx->cu_stream)); + NOCL_CALL(cuStreamDestroy_v2(ctx->cu_stream)); + ralloc_free(ctx); +} + +static struct nocl_program * +nocl_compile_nir(struct nocl_screen *nocl, nir_shader *nir) +{ + const char *kernel_name = nir_shader_get_entrypoint(nir)->function->name; + CUlinkState cu_linker = NULL; + void *cubin = NULL; + cache_key key; + + struct nocl_program *prog = rzalloc(nocl, struct nocl_program); + if (!prog) + return NULL; + + unsigned int threads; + if (nir->info.workgroup_size_variable) { + threads = nir->info.cs.workgroup_size_hint[0] + * nir->info.cs.workgroup_size_hint[1] + * nir->info.cs.workgroup_size_hint[2]; + } else { + 
threads = nir->info.workgroup_size[0] + * nir->info.workgroup_size[1] + * nir->info.workgroup_size[2]; + } + + if (nocl->disk_cache) { + struct blob blob; + size_t size; + + blob_init(&blob); + nir_serialize(&blob, nir, false); + disk_cache_compute_key(nocl->disk_cache, blob.data, blob.size, key); + blob_finish(&blob); + + cubin = disk_cache_get(nocl->disk_cache, key, &size); + } + + if (!cubin) { + NIR_PASS(_, nir, nocl_nir_lower_cf); + nir_foreach_function_impl(impl, nir) { + nir_metadata_require(impl, nir_metadata_block_index); + } + + if (nocl->debug & NOCL_DBG_NIR) + nir_print_shader(nir, stdout); + + char *error_log = CALLOC(1, NOCL_COMPILER_LOG_SIZE); + if (!error_log) + goto out_err; + + char *info_log = CALLOC(1, NOCL_COMPILER_LOG_SIZE); + if (!info_log) { + FREE(error_log); + goto out_err; + } + + /* TODO: better occupancy calculation, e.g. take shared_memory usage into account */ + CUjit_option options[] = { + CU_JIT_TARGET_FROM_CUCONTEXT, + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_LOG_VERBOSE, + CU_JIT_OPTIMIZATION_LEVEL, + CU_JIT_MAX_REGISTERS, + CU_JIT_POSITION_INDEPENDENT_CODE, + /* Keep last so we can skip setting it */ + CU_JIT_THREADS_PER_BLOCK, + }; + + void * values[] = { + NULL, + info_log, + (void *)(uintptr_t)NOCL_COMPILER_LOG_SIZE, + error_log, + (void *)(uintptr_t)NOCL_COMPILER_LOG_SIZE, + (void *)(nocl->debug & NOCL_DBG_VERBOSE ? 1ull : 0ull), + (void *)(nocl->debug & NOCL_DBG_JIT_NO_OPTS ? 
0ull : 4ull), + (void *)255, + (void *)0, + (void *)(uintptr_t)threads, + }; + STATIC_ASSERT(ARRAY_SIZE(options) == ARRAY_SIZE(values)); + + unsigned num_options = ARRAY_SIZE(options); + if (threads == 0) + num_options -= 1; + + struct NirToPtxOptions ptx_options = { + .target_sm = nocl->sm, + .target_ptx = nocl->ptx, + }; + struct NirToPtxOutput ptx = nir_to_ptx(nir, ptx_options); + if (nocl->debug & NOCL_DBG_PTX) + printf("PTX input:\n%s\n", ptx.ptx); + + size_t cubin_size; + NOCL_CALL(cuLinkCreate_v2(num_options, options, values, &cu_linker)); + if (!NOCL_CALL(cuLinkAddData_v2(cu_linker, CU_JIT_INPUT_PTX, ptx.ptx, ptx.ptx_len, kernel_name, 0, NULL, NULL))) + NOCL_CALL(cuLinkComplete(cu_linker, &cubin, &cubin_size)); + nir_to_ptx_free(ptx); + + if (nocl->debug & NOCL_DBG_PTX) { + if (strlen(info_log)) + printf("PTXJIT info log:\n%s\n", info_log); + } + + if (strlen(error_log)) + printf("PTXJIT error log:\n%s\n", error_log); + + FREE(info_log); + FREE(error_log); + + if (!cubin) + goto out_err; + + if (nocl->disk_cache) + disk_cache_put(nocl->disk_cache, key, cubin, cubin_size, NULL); + } + + prog->shared_size = nir->info.shared_size; + + CUjit_option options[] = { + CU_JIT_TARGET_FROM_CUCONTEXT, + CU_JIT_OPTIMIZATION_LEVEL, + CU_JIT_MAX_REGISTERS, + CU_JIT_POSITION_INDEPENDENT_CODE, + /* Keep last so we can skip setting it */ + CU_JIT_THREADS_PER_BLOCK, + }; + + unsigned num_options = ARRAY_SIZE(options); + if (threads == 0) + num_options -= 1; + + void * values[] = { + NULL, + (void *)(nocl->debug & NOCL_DBG_JIT_NO_OPTS ? 
0ull : 4ull), + (void *)256, + (void *)0, + (void *)(uintptr_t)threads, + }; + STATIC_ASSERT(ARRAY_SIZE(options) == ARRAY_SIZE(values)); + + NOCL_CALL(cuModuleLoadDataEx(&prog->cu_mod, cubin, num_options, options, values)); + NOCL_CALL(cuModuleGetFunction(&prog->cu_func, prog->cu_mod, kernel_name)); + + /* The linker owns the cubin, so destroy only after creating the lib object */ + if (cu_linker) + NOCL_CALL(cuLinkDestroy(cu_linker)); + +out_err: + return prog; +} + +static void +nocl_delete_compute_state(struct pipe_context *, void *mod_ptr) +{ + struct nocl_program *prog = nocl_program(mod_ptr); + NOCL_CALL(cuModuleUnload(prog->cu_mod)); + ralloc_free(prog); +} + +static void * +nocl_create_compute_state(struct pipe_context *pctx, + const struct pipe_compute_state *comp) +{ + assert(comp->ir_type == PIPE_SHADER_IR_NIR); + + struct nocl_screen *nocl = nocl_screen_and_make_current(pctx->screen); + nir_shader *nir = (void *)comp->prog; + + struct nocl_program *prog = nocl_compile_nir(nocl, nir); + if (!prog->cu_mod || !prog->cu_func) { + nocl_delete_compute_state(pctx, prog); + prog = NULL; + goto out_err; + } + +out_err: + ralloc_free(nir); + return prog; +} + +static struct pipe_query * +nocl_create_query(struct pipe_context *pctx, + unsigned query_type, + unsigned index) +{ + assert(query_type == PIPE_QUERY_TIMESTAMP); + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + + struct pipe_query *query = CALLOC_STRUCT(pipe_query); + if (!query) + return NULL; + + NOCL_CALL(cuEventCreate(&query->cu_event_base, CU_EVENT_DEFAULT)); + NOCL_CALL(cuEventCreate(&query->cu_event, CU_EVENT_BLOCKING_SYNC)); + if (!query->cu_event_base || !query->cu_event) { + if (query->cu_event) + NOCL_CALL(cuEventDestroy(query->cu_event)); + if (query->cu_event_base) + NOCL_CALL(cuEventDestroy(query->cu_event_base)); + FREE(query); + return NULL; + } + + NOCL_CALL(cuEventRecord(query->cu_event_base, ctx->cu_stream)); + return query; +} + +static void * 
+nocl_create_sampler_state(struct pipe_context *, + const struct pipe_sampler_state *) +{ + assert(false); + return NULL; +} + +static void +nocl_delete_sampler_state(struct pipe_context *, void *) +{ + assert(false); +} + +static void +nocl_destroy_query(struct pipe_context *pipe, + struct pipe_query *query) +{ + NOCL_CALL(cuEventDestroy(query->cu_event)); + NOCL_CALL(cuEventDestroy(query->cu_event_base)); + FREE(query); +} + +static bool +nocl_end_query(struct pipe_context *pctx, struct pipe_query *query) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + NOCL_CALL(cuEventRecord(query->cu_event, ctx->cu_stream)); + return true; +} + +static void +nocl_flush(struct pipe_context *pctx, + struct pipe_fence_handle **fence, + unsigned flags) +{ + struct pipe_fence_handle *new_fence = CALLOC_STRUCT(pipe_fence_handle); + if (!new_fence) { + *fence = NULL; + return; + } + + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + if (NOCL_CALL(cuEventCreate(&new_fence->cu_event, CU_EVENT_BLOCKING_SYNC))) { + *fence = NULL; + FREE(new_fence); + } else { + NOCL_CALL(cuEventRecord(new_fence->cu_event, ctx->cu_stream)); + pipe_reference_init(&new_fence->ref, 1); + *fence = new_fence; + } +} + +static void +nocl_get_compute_state_info(struct pipe_context *pctx, void *cso, + struct pipe_compute_state_object_info *info) +{ + struct nocl_screen *nocl = nocl_screen(pctx->screen); + struct nocl_program *prog = nocl_program(cso); + + if (!prog) + return; + + *info = (struct pipe_compute_state_object_info) { + .max_threads = nocl->base.compute_caps.max_threads_per_block, + .preferred_simd_size = nocl->base.caps.shader_subgroup_size, + .simd_sizes = nocl->base.compute_caps.subgroup_sizes, + }; + + NOCL_CALL(cuFuncGetAttribute((int*)&info->private_memory, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, prog->cu_func)); + NOCL_CALL(cuFuncGetAttribute((int*)&info->max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, prog->cu_func)); +} + +static bool 
+nocl_get_query_result(struct pipe_context *pctx, + struct pipe_query *query, + bool wait, + union pipe_query_result *result) +{ + assert(wait); + + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + float time_base, time; + + /* In order to not lose precision over time, we'll use two event objects */ + NOCL_CALL(cuEventSynchronize(query->cu_event)); + NOCL_CALL(cuEventElapsedTime_v1(&time_base, ctx->cu_timestamp_start, query->cu_event_base)); + NOCL_CALL(cuEventElapsedTime_v1(&time, query->cu_event_base, query->cu_event)); + + double hires_time_base = (double)time_base; + double hires_time = (double)time; + + /* the resolution is 0.5us, so let's add half of that that just in case */ + result->u64 = ceil((hires_time_base + hires_time) * 1000000) + ctx->timestamp_start + 250; + return true; +} + +static void +nocl_launch_grid(struct pipe_context *pctx, + const struct pipe_grid_info *info) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + struct nocl_program *prog = ctx->prog; + + void *input[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, (void *)ctx->kernel_input, + CU_LAUNCH_PARAM_BUFFER_SIZE, &ctx->kernel_input_size, + CU_LAUNCH_PARAM_END + }; + + NOCL_CALL(cuLaunchKernel( + prog->cu_func, + info->grid[0], info->grid[1], info->grid[2], + info->block[0], info->block[1], info->block[2], + prog->shared_size + info->variable_shared_mem, + ctx->cu_stream, + NULL, + input + )); +} + +static void +nocl_memory_barrier(struct pipe_context *, unsigned flags) +{ + // assert(false); +} + +static void +nocl_resource_copy_region(struct pipe_context *pctx, + struct pipe_resource *pdst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *psrc, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + struct nocl_buffer *src = nocl_buffer(psrc); + struct nocl_buffer *dst = nocl_buffer(pdst); + + if (psrc->target == PIPE_BUFFER && pdst->target 
== PIPE_BUFFER) { + NOCL_CALL(cuMemcpyAsync(dst->dptr + dstx, src->dptr + src_box->x, src_box->width, ctx->cu_stream)); + } else { + assert(false); + } +} + +static void +nocl_sampler_view_destroy(struct pipe_context *ctx, + struct pipe_sampler_view *view) +{ + assert(false); +} + +static void +nocl_sampler_view_release(struct pipe_context *ctx, + struct pipe_sampler_view *view) +{ + assert(false); +} + +static void +nocl_set_constant_buffer(struct pipe_context *pctx, + mesa_shader_stage shader, uint index, + const struct pipe_constant_buffer *buf) +{ + struct nocl_context *ctx = nocl_context(pctx); + assert(index == 0); + assert(shader == MESA_SHADER_COMPUTE); + + if (buf) { + assert(buf->buffer == NULL); + assert(buf->buffer_offset == 0); + + if (buf->buffer_size > ctx->kernel_input_capacity) { + ralloc_free(ctx->kernel_input); + ctx->kernel_input = ralloc_size(ctx, buf->buffer_size); + } + + ctx->kernel_input_size = buf->buffer_size; + memcpy(ctx->kernel_input, buf->user_buffer, buf->buffer_size); + } else { + ctx->kernel_input_size = 0; + } +} + +static void +nocl_set_global_binding(struct pipe_context *pctx, + unsigned first, unsigned count, + struct pipe_resource **resources, + uint32_t **handles) +{ + if (!resources) + return; + + for (unsigned i = 0; i < count; i++) { + struct nocl_buffer *res = nocl_buffer(resources[i]); + uint64_t handle; + + memcpy(&handle, handles[i], 8); + handle += res->dptr; + memcpy(handles[i], &handle, 8); + } +} + +static void +nocl_set_sampler_views(struct pipe_context *pctx, + mesa_shader_stage shader, + unsigned start_slot, unsigned num_views, + unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views) +{ + assert(!num_views); +} + +static void +nocl_set_shader_images(struct pipe_context *pctx, + mesa_shader_stage shader, + unsigned start_slot, unsigned count, + unsigned unbind_num_trailing_slots, + const struct pipe_image_view *images) +{ + assert(!count); +} + +static void * +nocl_texture_map(struct pipe_context 
*, + struct pipe_resource *resource, + unsigned level, + unsigned usage, /* a combination of PIPE_MAP_x */ + const struct pipe_box *, + struct pipe_transfer **out_transfer) +{ + assert(false); + return NULL; +} + +static void +nocl_texture_subdata(struct pipe_context *, + struct pipe_resource *, + unsigned level, + unsigned usage, /* a combination of PIPE_MAP_x */ + const struct pipe_box *, + const void *data, + unsigned stride, + uintptr_t layer_stride) +{ + assert(false); +} + +static void +nocl_texture_unmap(struct pipe_context *, + struct pipe_transfer *transfer) +{ + assert(false); +} + +static struct pipe_context * +nocl_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) +{ + nocl_screen_and_make_current(pscreen); + struct nocl_context *ctx = rzalloc(pscreen, struct nocl_context); + + ctx->base = (struct pipe_context) { + .bind_compute_state = nocl_bind_compute_state, + .bind_sampler_states = nocl_bind_sampler_states, + .buffer_map = nocl_buffer_map, + .buffer_subdata = nocl_buffer_subdata, + .buffer_unmap = nocl_buffer_unmap, + .clear_buffer = nocl_clear_buffer, + .create_compute_state = nocl_create_compute_state, + .create_query = nocl_create_query, + .create_sampler_state = nocl_create_sampler_state, + .delete_compute_state = nocl_delete_compute_state, + .delete_sampler_state = nocl_delete_sampler_state, + .destroy = nocl_context_destroy, + .destroy_query = nocl_destroy_query, + .end_query = nocl_end_query, + .flush = nocl_flush, + .get_compute_state_info = nocl_get_compute_state_info, + .get_query_result = nocl_get_query_result, + .launch_grid = nocl_launch_grid, + .memory_barrier = nocl_memory_barrier, + .resource_copy_region = nocl_resource_copy_region, + .sampler_view_destroy = nocl_sampler_view_destroy, + .sampler_view_release = nocl_sampler_view_release, + .set_constant_buffer = nocl_set_constant_buffer, + .set_global_binding = nocl_set_global_binding, + .set_sampler_views = nocl_set_sampler_views, + .set_shader_images = 
nocl_set_shader_images, + .screen = pscreen, + .texture_map = nocl_texture_map, + .texture_subdata = nocl_texture_subdata, + .texture_unmap = nocl_texture_unmap, + }; + + /* TODO: support multiple streams */ + ctx->cu_stream = NULL; + // if (nocl_error(cuStreamCreate(&ctx->cu_stream, CU_STREAM_NON_BLOCKING))) { + // FREE(ctx); + // return NULL; + // } + + NOCL_CALL(cuEventCreate(&ctx->cu_timestamp_start, CU_EVENT_BLOCKING_SYNC)); + NOCL_CALL(cuEventRecord(ctx->cu_timestamp_start, ctx->cu_stream)); + NOCL_CALL(cuEventSynchronize(ctx->cu_timestamp_start)); + /* Need to fetch the host time after waiting on the event */ + ctx->timestamp_start = pscreen->get_timestamp(pscreen); + + return &ctx->base; +} + +static void +nocl_destroy(struct pipe_screen *pscreen) +{ + struct nocl_screen *nocl = nocl_screen_and_make_current(pscreen); +#if CUDA_VERSION >= 12090 + NOCL_CALL(cuLogsUnregisterCallback(nocl->verbose_handle)); +#endif + disk_cache_destroy(nocl->disk_cache); + NOCL_CALL(cuCtxDestroy(nocl->cu_ctx)); + ralloc_free(nocl); +} + +#if CUDA_VERSION >= 11040 +static void +nocl_get_device_uuid(struct pipe_screen *pscreen, char *uuid) +{ + struct nocl_screen *nocl = nocl_screen(pscreen); + CUuuid uuid_tmp; + + STATIC_ASSERT(sizeof(uuid_tmp.bytes) == PIPE_UUID_SIZE); + NOCL_CALL(cuDeviceGetUuid_v2(&uuid_tmp, nocl->cu_dev)); + memcpy(uuid, uuid_tmp.bytes, PIPE_UUID_SIZE); +} +#endif + +static void +nocl_get_driver_uuid(struct pipe_screen *screen, char *uuid) +{ + const char* nocl_driver = PACKAGE_VERSION MESA_GIT_SHA1; + struct mesa_sha1 sha1_ctx; + uint8_t sha1[20]; + + _mesa_sha1_init(&sha1_ctx); + _mesa_sha1_update(&sha1_ctx, nocl_driver, strlen(nocl_driver)); + _mesa_sha1_final(&sha1_ctx, sha1); + memcpy(uuid, sha1, PIPE_UUID_SIZE); +} + +static bool +nocl_fence_finish(struct pipe_screen *pscreen, + struct pipe_context *pctx, + struct pipe_fence_handle *fence, + uint64_t timeout) +{ + nocl_screen_and_make_current(pscreen); + return 
!NOCL_CALL(cuEventSynchronize(fence->cu_event)); +} + +static void +nocl_fence_reference(struct pipe_screen *screen, + struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) +{ + if (pipe_reference(*dst ? &(*dst)->ref : NULL, + src ? &src->ref : NULL)) { + nocl_fence_finish(screen, NULL, *dst, OS_TIMEOUT_INFINITE); + NOCL_CALL(cuEventDestroy_v2((*dst)->cu_event)); + FREE(*dst); + } + + *dst = src; +} + +static const char * +nocl_get_device_vendor(struct pipe_screen *) +{ + return "NVIDIA?"; +} + +static struct disk_cache * +nocl_get_disk_shader_cache(struct pipe_screen *pscreen) +{ + return nocl_screen(pscreen)->disk_cache; +} + +static const char * +nocl_get_name(struct pipe_screen *pscreen) +{ + struct nocl_screen *nocl = nocl_screen(pscreen); + return nocl->name; +} + +static bool +nocl_is_format_supported(struct pipe_screen *, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, + unsigned bindings) +{ + return false; +} + +static struct pipe_resource * +nocl_buffer_create(struct pipe_screen *pscreen, + const struct pipe_resource *template) +{ + nocl_screen_and_make_current(pscreen); + struct nocl_buffer *cu_res = CALLOC_STRUCT_CL(nocl_buffer); + if (!cu_res) + return NULL; + + if (template->usage & PIPE_USAGE_STAGING) { + if (NOCL_CALL(cuMemAllocHost_v2(&cu_res->ptr, template->width0))) { + FREE(cu_res); + return NULL; + } + } else { + if (NOCL_CALL(cuMemAlloc_v2(&cu_res->dptr, template->width0))) { + FREE(cu_res); + return NULL; + } + } + + cu_res->base = *template; + cu_res->base.screen = pscreen; + pipe_reference_init(&cu_res->base.reference, 1); + return &cu_res->base; +} + +static struct pipe_resource * +nocl_resource_create(struct pipe_screen *pscreen, + const struct pipe_resource *template) +{ + if (template->target == PIPE_BUFFER) + return nocl_buffer_create(pscreen, template); + return NULL; +} + +static void +nocl_resource_destroy(struct pipe_screen *pscreen, + struct 
pipe_resource *pres) +{ + if (pres->target == PIPE_BUFFER) { + struct nocl_buffer *res = nocl_buffer(pres); + if (pres->usage & PIPE_USAGE_STAGING) { + NOCL_CALL(cuMemFreeHost(res->ptr)); + } else { + NOCL_CALL(cuMemFree_v2(res->dptr)); + } + } + + FREE(pres); +} + +static struct pipe_resource * +nocl_resource_from_user_memory(struct pipe_screen *pscreen, + const struct pipe_resource *pres, + void *user_memory) +{ + /* TODO: use cuMemHostRegister */ + return NULL; +} + +static uint64_t +nocl_resource_get_address(struct pipe_screen *pscreen, + struct pipe_resource *pres) +{ + STATIC_ASSERT(sizeof(CUdeviceptr) == sizeof(void *)); + assert(pres->target == PIPE_BUFFER); + struct nocl_buffer *res = nocl_buffer(pres); + return res->dptr; +} + +#if CUDA_VERSION >= 12090 +static void +nocl_log_callback(void *data, CUlogLevel logLevel, char *message, size_t length) +{ + UNUSED struct nocl_screen *nocl = data; + printf("NOCL-VERBOSE: %s\n", message); +} +#endif + +struct pipe_screen * +nocl_create_screen(struct cuda_pipe_loader_device *dev) +{ + CUdevice cu_dev = dev->dev; + struct nocl_screen *nocl = rzalloc(NULL, struct nocl_screen); + if (!nocl) + return NULL; + + nocl->debug = debug_get_flags_option("NOCL_DEBUG", nocl_debug_options, 0); + +#if CUDA_VERSION >= 12090 + if (nocl->debug & NOCL_DBG_VERBOSE) + NOCL_CALL(cuLogsRegisterCallback(nocl_log_callback, nocl, &nocl->verbose_handle)); +#endif + + NOCL_CALL(cuDeviceGetName(nocl->name, sizeof(nocl->name), cu_dev)); + if (NOCL_CALL(cuCtxCreate_v2(&nocl->cu_ctx, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, cu_dev))) { +#if CUDA_VERSION >= 12090 + NOCL_CALL(cuLogsUnregisterCallback(nocl->verbose_handle)); +#endif + ralloc_free(nocl); + return NULL; + } + + nocl->sm = debug_get_num_option("NOCL_SM_TARGET", 0); + if (!nocl->sm) { + int sm_tmp; + NOCL_CALL(cuDeviceGetAttribute(&sm_tmp, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_dev)); + nocl->sm = sm_tmp * 10; + NOCL_CALL(cuDeviceGetAttribute(&sm_tmp, 
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_dev)); + assert(sm_tmp < 10); + nocl->sm |= sm_tmp; + } + + nocl->ptx = debug_get_num_option("NOCL_PTX_VERSION", 0); + if (!nocl->ptx) { + /* We have to set the highest PTX version to the sm level we enable features upon + * e.g. SM80+ enables fp16 min/max and subgroup reduce, therefore the highest PTX version we + * use is 7.0. If we'd start supporting `match`, we'd need to check for SM90 -> PTX 7.8 + */ + if (dev->cuda_version >= 11000) + nocl->ptx = 70; /* SM80 */ + else if (dev->cuda_version >= 10000) + nocl->ptx = 63; /* SM75 */ + else if (dev->cuda_version >= 9010) + nocl->ptx = 61; /* SM72 */ + else if (dev->cuda_version >= 9000) + nocl->ptx = 60; /* SM70 */ + else if (dev->cuda_version >= 8000) + nocl->ptx = 50; /* SM60/SM61/SM62 */ + else if (dev->cuda_version >= 7500) + nocl->ptx = 43; /* lop3 */ + else if (dev->cuda_version >= 7000) + nocl->ptx = 42; /* SM53 */ + else if (dev->cuda_version >= 6500) + nocl->ptx = 41; + } + + { + /* TODO: also check libcuda */ + const struct build_id_note *note = build_id_find_nhdr_for_addr(nocl_create_screen); + assert(note && build_id_length(note) == 20); /* sha1 */ + const uint8_t *id_sha1 = build_id_data(note); + assert(id_sha1); + + char timestamp[41]; + _mesa_sha1_format(timestamp, id_sha1); + + uint64_t driver_flags = 0; + driver_flags |= nocl->sm; + driver_flags |= nocl->ptx << 16; + if (nocl->debug & NOCL_DBG_JIT_NO_OPTS) + driver_flags |= BITFIELD64_BIT(32); + + nocl->disk_cache = disk_cache_create( + nocl->name, timestamp, driver_flags + ); + } + + nocl->cu_dev = cu_dev; + nocl->base.context_create = nocl_context_create; + nocl->base.destroy = nocl_destroy; +#if CUDA_VERSION >= 11040 + if (nocl_dispatch.cuDeviceGetUuid_v2) + nocl->base.get_device_uuid = nocl_get_device_uuid; +#endif + nocl->base.get_driver_uuid = nocl_get_driver_uuid; + nocl->base.fence_finish = nocl_fence_finish; + nocl->base.fence_reference = nocl_fence_reference; + nocl->base.finalize_nir = 
nocl_finalize_nir; + nocl->base.get_device_vendor = nocl_get_device_vendor; + nocl->base.get_disk_shader_cache = nocl_get_disk_shader_cache; + nocl->base.get_name = nocl_get_name; + nocl->base.get_timestamp = u_default_get_timestamp; + nocl->base.is_format_supported = nocl_is_format_supported; + nocl->base.resource_create = nocl_resource_create; + nocl->base.resource_destroy = nocl_resource_destroy; + nocl->base.resource_from_user_memory = nocl_resource_from_user_memory; + nocl->base.resource_get_address = nocl_resource_get_address; + + struct pipe_caps *caps = (void *)&nocl->base.caps; + struct pipe_compute_caps *compute_caps = (void *)&nocl->base.compute_caps; + struct pipe_shader_caps *shader_caps = (void *)&nocl->base.shader_caps[MESA_SHADER_COMPUTE]; + struct nir_shader_compiler_options *nir_options = nocl_get_nir_options(nocl); + nocl->base.nir_options[MESA_SHADER_COMPUTE] = nir_options; + + caps->accelerated = true; + caps->compute = true; + caps->fp16 = nocl->sm >= 53; + caps->int64 = true; + caps->max_shader_buffer_size = INT32_MAX; + caps->query_timestamp = true; + caps->shader_clock = true; + caps->shareable_shaders = true; + if (nocl->sm >= 30 && nocl->ptx >= 60) { + caps->shader_subgroup_supported_features = PIPE_SHADER_SUBGROUP_FEATURE_MASK; + caps->shader_subgroup_supported_stages = MESA_SHADER_COMPUTE; + } + caps->timer_resolution = 500; + + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->max_constant_buffer_size, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->pci_group, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->pci_bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->pci_device, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cu_dev)); + caps->pci_function = 0; + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->shader_subgroup_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cu_dev)); + + shader_caps->fp16 = caps->fp16; + 
shader_caps->max_const_buffer0_size = caps->max_constant_buffer_size; + shader_caps->max_const_buffers = 8; // TODO: is there a query for it? + shader_caps->supported_irs = 1 << PIPE_SHADER_IR_NIR; + + compute_caps->address_bits = nocl->sm < 20 ? 32 : 64; + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_clock_frequency, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cu_dev)); + compute_caps->max_clock_frequency /= 1000; + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_compute_units, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_block_size[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_block_size[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_block_size[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, cu_dev)); + size_t max_global_size; + NOCL_CALL(cuDeviceTotalMem_v2(&max_global_size, cu_dev)); + compute_caps->max_global_size = max_global_size; + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_grid_size[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_grid_size[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_grid_size[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, cu_dev)); + compute_caps->max_mem_alloc_size = INT32_MAX; + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_local_size, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, cu_dev)); + compute_caps->max_variable_threads_per_block = compute_caps->max_threads_per_block; + + if (nocl->sm >= 30) { + compute_caps->max_subgroups = compute_caps->max_threads_per_block / caps->shader_subgroup_size; + compute_caps->subgroup_sizes = caps->shader_subgroup_size; + } + + return 
&nocl->base; +} diff --git a/src/gallium/drivers/nocl/nocl_private.h b/src/gallium/drivers/nocl/nocl_private.h new file mode 100644 index 00000000000..8dc271b82c4 --- /dev/null +++ b/src/gallium/drivers/nocl/nocl_private.h @@ -0,0 +1,147 @@ +/* + * Copyright © 2025 Karol Herbst + * SPDX-License-Identifier: MIT + */ + +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" + +#include "nocl/nocl_cuda_public.h" + +static const uint32_t NOCL_COMPILER_LOG_SIZE = 0x1000; + +enum nocl_dbg { + NOCL_DBG_32 = BITFIELD_BIT(0), + NOCL_DBG_JIT_NO_OPTS = BITFIELD_BIT(1), + NOCL_DBG_NIR = BITFIELD_BIT(2), + NOCL_DBG_PTX = BITFIELD_BIT(3), + NOCL_DBG_VERBOSE = BITFIELD_BIT(4), +}; + +#define NOCL_CALL(func) nocl_error(nocl_dispatch.func) + +static inline bool +nocl_error(CUresult err) +{ + if (err == CUDA_SUCCESS) + return false; + + const char *err_str; + if (nocl_dispatch.cuGetErrorString(err, &err_str)) { + printf("NOCL-ERROR: UNKNOWN\n"); + } else { + printf("NOCL-ERROR: %s\n", err_str); + } + + return true; +} + +struct nocl_screen { + struct pipe_screen base; +#if CUDA_VERSION >= 12090 + CUlogsCallbackHandle verbose_handle; +#endif + CUdevice cu_dev; + CUcontext cu_ctx; + struct disk_cache *disk_cache; + uint32_t debug; + uint16_t ptx; + uint16_t sm; + char name[128]; +}; + +struct nocl_buffer { + struct pipe_resource base; + union { + CUdeviceptr dptr; + void *ptr; + }; +}; + +struct nocl_program { + CUmodule cu_mod; + CUfunction cu_func; + uint32_t shared_size; +}; + +struct nocl_context { + struct pipe_context base; + void *kernel_input; + uint32_t kernel_input_size; + uint32_t kernel_input_capacity; + CUstream cu_stream; + CUevent cu_timestamp_start; + uint64_t timestamp_start; + struct nocl_program *prog; +}; + +struct pipe_fence_handle { + struct pipe_reference ref; + CUevent cu_event; +}; + +struct nocl_transfer { + struct 
pipe_transfer base; + void *ptr; +}; + +struct pipe_query { + CUevent cu_event_base; + CUevent cu_event; +}; + +static inline struct nocl_screen* +nocl_screen(struct pipe_screen *pscreen) +{ + return (struct nocl_screen *)pscreen; +} + +static inline struct nocl_screen* +nocl_screen_and_make_current(struct pipe_screen *pscreen) +{ + struct nocl_screen *nocl = nocl_screen(pscreen); + NOCL_CALL(cuCtxSetCurrent(nocl->cu_ctx)); + return nocl; +} + +static inline struct nocl_context* +nocl_context(struct pipe_context *pctx) +{ + return (struct nocl_context *)pctx; +} + +static inline struct nocl_context* +nocl_context_and_make_current(struct pipe_context *pctx) +{ + nocl_screen_and_make_current(pctx->screen); + return nocl_context(pctx); +} + +static inline struct nocl_program* +nocl_program(void *prog) +{ + return (struct nocl_program *)prog; +} + +static inline struct nocl_buffer* +nocl_buffer(struct pipe_resource *pres) +{ + return (struct nocl_buffer *)pres; +} + +static inline struct nocl_transfer* +nocl_transfer(struct pipe_transfer *pxfer) +{ + return (struct nocl_transfer *)pxfer; +} + +void nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir); +bool nocl_nir_lower_cf(struct nir_shader *nir); +bool nocl_nir_opt_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl); +bool nocl_nir_lower_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl); +struct nir_shader_compiler_options *nocl_get_nir_options(struct nocl_screen *nocl); diff --git a/src/gallium/frontends/rusticl/mesa/pipe/device.rs b/src/gallium/frontends/rusticl/mesa/pipe/device.rs index edb7af668bc..de1fb5b22a9 100644 --- a/src/gallium/frontends/rusticl/mesa/pipe/device.rs +++ b/src/gallium/frontends/rusticl/mesa/pipe/device.rs @@ -99,6 +99,7 @@ fn get_enabled_devs() -> HashMap { let driver_str = match driver_str[0] { "llvmpipe" | "lp" => "swrast", + "cuda" => "nocl", "freedreno" => "msm", a => a, }; diff --git a/src/gallium/meson.build 
b/src/gallium/meson.build index 9b02fd1189a..2d7d8ec436a 100644 --- a/src/gallium/meson.build +++ b/src/gallium/meson.build @@ -206,6 +206,12 @@ if with_gallium_d3d12 else driver_d3d12 = declare_dependency() endif +if with_gallium_nocl + subdir('winsys/nocl') + subdir('drivers/nocl') +else + driver_nocl = declare_dependency() +endif if with_gallium_rusticl subdir('frontends/rusticl') subdir('targets/rusticl') diff --git a/src/gallium/targets/rusticl/meson.build b/src/gallium/targets/rusticl/meson.build index 2b214adbbf5..06b86b65595 100644 --- a/src/gallium/targets/rusticl/meson.build +++ b/src/gallium/targets/rusticl/meson.build @@ -34,6 +34,7 @@ librusticl = shared_library( ], dependencies : [ driver_asahi, + driver_nocl, driver_freedreno, driver_iris, driver_nouveau, diff --git a/src/gallium/winsys/nocl/meson.build b/src/gallium/winsys/nocl/meson.build new file mode 100644 index 00000000000..19c435948c0 --- /dev/null +++ b/src/gallium/winsys/nocl/meson.build @@ -0,0 +1,25 @@ +# Copyright © 2025 Karol Herbst +# SPDX-License-Identifier: MIT + +libnocl_cuda = static_library( + 'nocl_cuda', + [ + 'nocl_cuda.c', + 'nocl_cuda_public.h', + ], + include_directories : [ + inc_util, + inc_gallium, + inc_gallium_aux, + ], + dependencies : [ + dep_libcuda, + ], +) + +idep_libnoclwinsys = declare_dependency( + include_directories : [ + include_directories('.'), + ], + link_with : [libnocl_cuda], +) diff --git a/src/gallium/winsys/nocl/nocl_cuda.c b/src/gallium/winsys/nocl/nocl_cuda.c new file mode 100644 index 00000000000..611267b054e --- /dev/null +++ b/src/gallium/winsys/nocl/nocl_cuda.c @@ -0,0 +1,212 @@ +/* + * Copyright © 2025 Karol Herbst + * + * SPDX-License-Identifier: MIT + */ + +#include "pipe-loader/pipe_loader.h" +#include "pipe-loader/pipe_loader_priv.h" +#include "target-helpers/inline_debug_helper.h" +#include "util/u_dl.h" +#include "util/u_memory.h" + +#include "cuda.h" +#include "nocl_cuda_public.h" + + +#define cuda_err(err) if (err) { \ + const char 
*err_str; \ + nocl_dispatch.cuGetErrorString(err, &err_str); \ + printf("CUDA returned error: %s\n", err_str); \ + return 0; \ +} + +static struct driOptionDescription nocl_dri_conf = {}; +static struct util_dl_library *libcuda = NULL; +struct cuda_symbol_table nocl_dispatch = {}; + +static struct pipe_screen * +nocl_loader_create_screen(struct pipe_loader_device *dev, + const struct pipe_screen_config *config, + bool sw_vk) +{ + struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)dev; + struct pipe_screen *pscreen = nocl_create_screen(cuda_dev); + return pscreen ? debug_screen_wrap(pscreen) : NULL; +} + +static const struct driOptionDescription * +nocl_get_driconf(struct pipe_loader_device *dev, unsigned *count) +{ + *count = 0; + return &nocl_dri_conf; +} + +static void +nocl_release(struct pipe_loader_device **dev) +{ + struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)*dev; + FREE(cuda_dev); +} + +static struct pipe_loader_ops cuda_loader_ops = { + .create_screen = nocl_loader_create_screen, + .release = nocl_release, + .get_driconf = nocl_get_driconf, +}; + +static int32_t faked_version = 0; + +static bool +nocl_load_pfn(const char *name, void **pfn, int version) +{ + if (version > faked_version) { + *pfn = NULL; + return false; + } + + CUdriverProcAddressQueryResult proc_result; + CUresult res = nocl_dispatch.cuGetProcAddress_v2(name, pfn, version, CU_GET_PROC_ADDRESS_LEGACY_STREAM, &proc_result); + bool success = res == CUDA_SUCCESS && proc_result == CU_GET_PROC_ADDRESS_SUCCESS; + /* Probably always NULL on errors but let's be sure */ + if (!success) + *pfn = NULL; + return success; +} + +int +pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev) +{ + if (!libcuda) { + UNUSED void *func = NULL; + faked_version = debug_get_num_option("NOCL_API_TARGET", INT32_MAX); + libcuda = util_dl_open("libcuda.so"); + if (!libcuda) + return 0; + + /* Needs 12.0 */ + nocl_dispatch.cuGetProcAddress_v2 = 
(void*)util_dl_get_proc_address(libcuda, "cuGetProcAddress_v2"); + if (!nocl_dispatch.cuGetProcAddress_v2) { + return 0; + } + + bool success = true; + success &= nocl_load_pfn("cuCtxCreate", (void**)&nocl_dispatch.cuCtxCreate_v2, 3020); + success &= nocl_load_pfn("cuCtxDestroy", (void**)&nocl_dispatch.cuCtxDestroy_v2, 4000); + success &= nocl_load_pfn("cuCtxSetCurrent", (void**)&nocl_dispatch.cuCtxSetCurrent, 4000); + success &= nocl_load_pfn("cuDeviceGet", (void**)&nocl_dispatch.cuDeviceGet, 2000); + success &= nocl_load_pfn("cuDeviceGetAttribute", (void**)&nocl_dispatch.cuDeviceGetAttribute, 2000); + success &= nocl_load_pfn("cuDeviceGetCount", (void**)&nocl_dispatch.cuDeviceGetCount, 2000); + success &= nocl_load_pfn("cuDeviceGetName", (void**)&nocl_dispatch.cuDeviceGetName, 2000); + success &= nocl_load_pfn("cuDeviceTotalMem", (void**)&nocl_dispatch.cuDeviceTotalMem_v2, 3020); + success &= nocl_load_pfn("cuDriverGetVersion", (void**)&nocl_dispatch.cuDriverGetVersion, 2020); + success &= nocl_load_pfn("cuEventCreate", (void**)&nocl_dispatch.cuEventCreate, 2000); + success &= nocl_load_pfn("cuEventDestroy", (void**)&nocl_dispatch.cuEventDestroy_v2, 4000); + success &= nocl_load_pfn("cuEventElapsedTime", (void**)&nocl_dispatch.cuEventElapsedTime_v1, 2000); + success &= nocl_load_pfn("cuEventRecord", (void**)&nocl_dispatch.cuEventRecord, 2000); + success &= nocl_load_pfn("cuEventSynchronize", (void**)&nocl_dispatch.cuEventSynchronize, 2000); + success &= nocl_load_pfn("cuFuncGetAttribute", (void**)&nocl_dispatch.cuFuncGetAttribute, 2020); + success &= nocl_load_pfn("cuGetErrorString", (void**)&nocl_dispatch.cuGetErrorString, 6000); + success &= nocl_load_pfn("cuInit", (void**)&nocl_dispatch.cuInit, 2000); + success &= nocl_load_pfn("cuLaunchKernel", (void**)&nocl_dispatch.cuLaunchKernel, 4000); + success &= nocl_load_pfn("cuLinkAddData", (void**)&nocl_dispatch.cuLinkAddData_v2, 6050); + success &= nocl_load_pfn("cuLinkComplete", 
(void**)&nocl_dispatch.cuLinkComplete, 5050); + success &= nocl_load_pfn("cuLinkCreate", (void**)&nocl_dispatch.cuLinkCreate_v2, 6050); + success &= nocl_load_pfn("cuLinkDestroy", (void**)&nocl_dispatch.cuLinkDestroy, 5050); + success &= nocl_load_pfn("cuMemAlloc", (void**)&nocl_dispatch.cuMemAlloc_v2, 3020); + success &= nocl_load_pfn("cuMemAllocHost", (void**)&nocl_dispatch.cuMemAllocHost_v2, 3020); + success &= nocl_load_pfn("cuMemcpy", (void**)&nocl_dispatch.cuMemcpy, 4000); + success &= nocl_load_pfn("cuMemcpyAsync", (void**)&nocl_dispatch.cuMemcpyAsync, 4000); + success &= nocl_load_pfn("cuMemcpyDtoHAsync", (void**)&nocl_dispatch.cuMemcpyDtoHAsync_v2, 3020); + success &= nocl_load_pfn("cuMemcpyHtoD", (void**)&nocl_dispatch.cuMemcpyHtoD_v2, 3020); + success &= nocl_load_pfn("cuMemFree", (void**)&nocl_dispatch.cuMemFree_v2, 3020); + success &= nocl_load_pfn("cuMemFreeHost", (void**)&nocl_dispatch.cuMemFreeHost, 2000); + success &= nocl_load_pfn("cuMemsetD8Async", (void**)&nocl_dispatch.cuMemsetD8Async, 3020); + success &= nocl_load_pfn("cuMemsetD16Async", (void**)&nocl_dispatch.cuMemsetD16Async, 3020); + success &= nocl_load_pfn("cuMemsetD32Async", (void**)&nocl_dispatch.cuMemsetD32Async, 3020); + success &= nocl_load_pfn("cuModuleGetFunction", (void**)&nocl_dispatch.cuModuleGetFunction, 2000); + success &= nocl_load_pfn("cuModuleLoadDataEx", (void**)&nocl_dispatch.cuModuleLoadDataEx, 2010); + success &= nocl_load_pfn("cuModuleUnload", (void**)&nocl_dispatch.cuModuleUnload, 2000); + success &= nocl_load_pfn("cuStreamDestroy", (void**)&nocl_dispatch.cuStreamDestroy_v2, 4000); + success &= nocl_load_pfn("cuStreamSynchronize", (void**)&nocl_dispatch.cuStreamSynchronize, 2000); + if (!success) + return 0; + + /* Only documented difference appears to be that v2 won't return errors on previous async + * commands, so just overwrite the previously fetched pointer */ + nocl_load_pfn("cuEventElapsedTime", &func, 12080); + if (func) + nocl_dispatch.cuEventElapsedTime_v1 = 
func; + + nocl_dispatch.all_required_available = true; +#if CUDA_VERSION >= 11040 + success &= nocl_load_pfn("cuDeviceGetUuid", (void**)&nocl_dispatch.cuDeviceGetUuid_v2, 11040); +#endif +#if CUDA_VERSION >= 12090 + nocl_load_pfn("cuLogsRegisterCallback", (void**)&nocl_dispatch.cuLogsRegisterCallback, 12090); + nocl_load_pfn("cuLogsUnregisterCallback", (void**)&nocl_dispatch.cuLogsUnregisterCallback, 12090); +#endif + } + + if (!nocl_dispatch.all_required_available) + return 0; + + int driver_version; + CUresult res = nocl_dispatch.cuInit(0); + cuda_err(res); + + res = nocl_dispatch.cuDriverGetVersion(&driver_version); + /* Our highest version requirement across all required APIs */ + if (driver_version < 8000) + return 0; + + if (!ndev) { + res = nocl_dispatch.cuDeviceGetCount(&ndev); + cuda_err(res); + return ndev; + } + + if (!devs) + return 0; + + int cnt; + res = nocl_dispatch.cuDeviceGetCount(&cnt); + cuda_err(res); + for (int i = 0; i < ndev; i++) { + if (i >= cnt) + return i; + + CUdevice cuda_dev; + res = nocl_dispatch.cuDeviceGet(&cuda_dev, i); + if (res) + return i; + + int pci_device_id; +#if CUDA_VERSION >= 12080 + res = nocl_dispatch.cuDeviceGetAttribute(&pci_device_id, CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID, cuda_dev); + cuda_err(res); +#else + /* TODO: we might need to get the device id from somewhere, but so far + * nothing needs it */ + pci_device_id = 0x10de; +#endif + + struct cuda_pipe_loader_device *dev = CALLOC_STRUCT(cuda_pipe_loader_device); + *dev = (struct cuda_pipe_loader_device) { + .base = { + .driver_name = "nocl", + .ops = &cuda_loader_ops, + /* TODO: tegra */ + .type = PIPE_LOADER_DEVICE_PCI, + .u.pci.chip_id = ((unsigned int)pci_device_id) >> 16, + .u.pci.vendor_id = pci_device_id & 0xffff, + }, + .dev = cuda_dev, + .cuda_version = driver_version, + }; + + devs[i] = &dev->base; + } + + return ndev; +} diff --git a/src/gallium/winsys/nocl/nocl_cuda_public.h b/src/gallium/winsys/nocl/nocl_cuda_public.h new file mode 100644 index 
00000000000..c8a9b800d3e --- /dev/null +++ b/src/gallium/winsys/nocl/nocl_cuda_public.h @@ -0,0 +1,73 @@ +/* + * Copyright © 2025 Karol Herbst + * + * SPDX-License-Identifier: MIT + */ + +#ifndef __NOCL_DRM_PUBLIC_H__ +#define __NOCL_DRM_PUBLIC_H__ + +#include <cudaTypedefs.h> + +#include "pipe-loader/pipe_loader.h" + +struct cuda_pipe_loader_device { + struct pipe_loader_device base; + CUdevice dev; + int cuda_version; +}; + +extern struct cuda_symbol_table { + PFN_cuCtxCreate_v3020 cuCtxCreate_v2; + PFN_cuCtxDestroy_v4000 cuCtxDestroy_v2; + PFN_cuCtxSetCurrent_v4000 cuCtxSetCurrent; + PFN_cuDeviceGet_v2000 cuDeviceGet; + PFN_cuDeviceGetAttribute_v2000 cuDeviceGetAttribute; + PFN_cuDeviceGetCount_v2000 cuDeviceGetCount; + PFN_cuDeviceGetName_v2000 cuDeviceGetName; + PFN_cuDeviceTotalMem_v3020 cuDeviceTotalMem_v2; + PFN_cuDriverGetVersion_v2020 cuDriverGetVersion; + PFN_cuEventCreate_v2000 cuEventCreate; + PFN_cuEventDestroy_v4000 cuEventDestroy_v2; + PFN_cuEventElapsedTime_v2000 cuEventElapsedTime_v1; + PFN_cuEventRecord_v2000 cuEventRecord; + PFN_cuEventSynchronize_v2000 cuEventSynchronize; + PFN_cuFuncGetAttribute_v2020 cuFuncGetAttribute; + PFN_cuGetErrorString_v6000 cuGetErrorString; + PFN_cuGetProcAddress_v12000 cuGetProcAddress_v2; + PFN_cuInit_v2000 cuInit; + PFN_cuLaunchKernel_v4000 cuLaunchKernel; + PFN_cuLinkAddData_v6050 cuLinkAddData_v2; + PFN_cuLinkComplete_v5050 cuLinkComplete; + PFN_cuLinkCreate_v6050 cuLinkCreate_v2; + PFN_cuLinkDestroy_v5050 cuLinkDestroy; + PFN_cuMemAlloc_v3020 cuMemAlloc_v2; + PFN_cuMemAllocHost_v3020 cuMemAllocHost_v2; + PFN_cuMemcpy_v4000 cuMemcpy; + PFN_cuMemcpyAsync_v4000 cuMemcpyAsync; + PFN_cuMemcpyDtoHAsync_v3020 cuMemcpyDtoHAsync_v2; + PFN_cuMemcpyHtoD_v3020 cuMemcpyHtoD_v2; + PFN_cuMemFree_v3020 cuMemFree_v2; + PFN_cuMemFreeHost_v2000 cuMemFreeHost; + PFN_cuMemsetD8Async_v3020 cuMemsetD8Async; + PFN_cuMemsetD16Async_v3020 cuMemsetD16Async; + PFN_cuMemsetD32Async_v3020 cuMemsetD32Async; + PFN_cuModuleGetFunction_v2000
cuModuleGetFunction; + PFN_cuModuleLoadDataEx_v2010 cuModuleLoadDataEx; + PFN_cuModuleUnload_v2000 cuModuleUnload; + PFN_cuStreamDestroy_v4000 cuStreamDestroy_v2; + PFN_cuStreamSynchronize_v2000 cuStreamSynchronize; +#if CUDA_VERSION >= 11040 + PFN_cuDeviceGetUuid_v11040 cuDeviceGetUuid_v2; +#endif +#if CUDA_VERSION >= 12090 + PFN_cuLogsRegisterCallback_v12090 cuLogsRegisterCallback; + PFN_cuLogsUnregisterCallback_v12090 cuLogsUnregisterCallback; +#endif + bool all_required_available; +} nocl_dispatch; + +typedef int CUdevice; +struct pipe_screen *nocl_create_screen(struct cuda_pipe_loader_device *dev); + +#endif /* __NOCL_DRM_PUBLIC_H__ */