From 43ea91b10d480e9dd9d563842da8cfb9a9663a6a Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 8 Nov 2025 14:23:02 +0100 Subject: [PATCH 01/12] rusticl/queue: fix error code for invalid queue properties part 1 Cc: mesa-stable --- src/gallium/frontends/rusticl/api/queue.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/frontends/rusticl/api/queue.rs b/src/gallium/frontends/rusticl/api/queue.rs index fb38fa9c768..04327f523b0 100644 --- a/src/gallium/frontends/rusticl/api/queue.rs +++ b/src/gallium/frontends/rusticl/api/queue.rs @@ -135,7 +135,7 @@ fn create_command_queue_with_properties( let d = Device::ref_from_raw(device)?; // SAFETY: properties is a 0 terminated array by spec. - let properties = unsafe { Properties::new(properties) }.ok_or(CL_INVALID_PROPERTY)?; + let properties = unsafe { Properties::new(properties) }.ok_or(CL_INVALID_VALUE)?; for (&key, &val) in properties.iter() { match u32::try_from(key).or(Err(CL_INVALID_PROPERTY))? { CL_QUEUE_PROPERTIES => queue_properties = val, @@ -152,7 +152,7 @@ fn create_command_queue_with_properties( // CL_INVALID_QUEUE_PROPERTIES if values specified in properties are valid but are not // supported by the device. 
CL_QUEUE_SIZE => return Err(CL_INVALID_QUEUE_PROPERTIES), - _ => return Err(CL_INVALID_PROPERTY), + _ => return Err(CL_INVALID_VALUE), } } From ccb3a06f7ac8f91a22b40d8ff7cbe70913d21742 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 8 Nov 2025 14:24:01 +0100 Subject: [PATCH 02/12] rusticl/queue: fix error code for invalid queue properties part 2 Fixes: 2c202eb7870 ("rusticl: verify validity of property names and values") --- src/gallium/frontends/rusticl/api/queue.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/frontends/rusticl/api/queue.rs b/src/gallium/frontends/rusticl/api/queue.rs index 04327f523b0..9b4e8114f1b 100644 --- a/src/gallium/frontends/rusticl/api/queue.rs +++ b/src/gallium/frontends/rusticl/api/queue.rs @@ -137,7 +137,7 @@ fn create_command_queue_with_properties( // SAFETY: properties is a 0 terminated array by spec. let properties = unsafe { Properties::new(properties) }.ok_or(CL_INVALID_VALUE)?; for (&key, &val) in properties.iter() { - match u32::try_from(key).or(Err(CL_INVALID_PROPERTY))? { + match u32::try_from(key).or(Err(CL_INVALID_VALUE))? 
{ CL_QUEUE_PROPERTIES => queue_properties = val, CL_QUEUE_PRIORITY_KHR if d.context_priority_supported() != 0 => { let valid_props: cl_queue_properties = (CL_QUEUE_PRIORITY_LOW_KHR From 315024248aab8de402d351c28bee802cb8a4a120 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 8 Nov 2025 14:29:47 +0100 Subject: [PATCH 03/12] rusticl/queue: fix error code for invalid sampler kernel arg Fixes: 5795ee0e083 ("rusticl: translate spirv to nir and first steps to kernel arg handling") --- src/gallium/frontends/rusticl/api/kernel.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/gallium/frontends/rusticl/api/kernel.rs b/src/gallium/frontends/rusticl/api/kernel.rs index b3571cf25be..164d0aa1c11 100644 --- a/src/gallium/frontends/rusticl/api/kernel.rs +++ b/src/gallium/frontends/rusticl/api/kernel.rs @@ -416,13 +416,18 @@ fn set_kernel_arg( return Err(CL_INVALID_ARG_VALUE); } } - // If the argument is of type sampler_t, the arg_value entry must be a pointer to the - // sampler object. - KernelArgType::Constant(_) | KernelArgType::Sampler => { + KernelArgType::Constant(_) => { if arg_value.is_null() { return Err(CL_INVALID_ARG_VALUE); } } + KernelArgType::Sampler => { + // CL_INVALID_SAMPLER for an argument declared to be of type sampler_t when the + // specified arg_value is not a valid sampler object. 
+ if arg_value.is_null() { + return Err(CL_INVALID_SAMPLER); + } + } _ => {} }; From 04ef78b786c67bd2a19b6b0abdc187db70432e32 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 8 Nov 2025 16:15:07 +0100 Subject: [PATCH 04/12] rusticl/kernel: take no kernel_info reference inside the launch closure Otherwise patterns like this wouldn't work: clCreateKernel(prog) clEnqueueNDRangeKernel clReleaseKernel clBuildProgram(prog) Fixes: bb2453c6495 ("rusticl/kernel: move most of the code in launch inside the closure") --- src/gallium/frontends/rusticl/core/kernel.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs index 748bd3c2143..aa1ce195242 100644 --- a/src/gallium/frontends/rusticl/core/kernel.rs +++ b/src/gallium/frontends/rusticl/core/kernel.rs @@ -1501,7 +1501,8 @@ impl Kernel { offsets: &[usize], ) -> CLResult { // Clone all the data we need to execute this kernel - let kernel_info = Arc::clone(&self.kernel_info); + let work_group_size_hint = self.kernel_info.work_group_size_hint; + let args = self.kernel_info.args.clone(); let arg_values = self.values.clone(); let nir_kernel_builds = Arc::clone(&self.builds[q.device]); let mut bdas = self.bdas.clone(); @@ -1548,8 +1549,7 @@ impl Kernel { && grid[0] <= hw_max_grid[0] && grid[1] <= hw_max_grid[1] && grid[2] <= hw_max_grid[2] - && (kernel_info.work_group_size_hint == [0; 3] - || block == kernel_info.work_group_size_hint) + && (work_group_size_hint == [0; 3] || block == work_group_size_hint) { NirKernelVariant::Optimized } else { @@ -1603,7 +1603,7 @@ impl Kernel { for arg in &nir_kernel_build.compiled_args { let is_opaque = if let CompiledKernelArgType::APIArg(idx) = arg.kind { - kernel_info.args[idx].kind.is_opaque() + args[idx].kind.is_opaque() } else { false }; @@ -1614,7 +1614,7 @@ impl Kernel { match arg.kind { CompiledKernelArgType::APIArg(idx) => { - let api_arg = &kernel_info.args[idx]; + 
let api_arg = &args[idx]; let Some(value) = &arg_values[idx] else { continue; }; From 97de88fa2ab7c919cf690013c57a83b973cedcf2 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 9 Nov 2025 00:30:06 +0100 Subject: [PATCH 05/12] rusticl/spirv: preserve signed zeroes by default Cc: mesa-stable --- src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs b/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs index 3482bb045e3..9f84f67d4e9 100644 --- a/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs +++ b/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs @@ -303,13 +303,13 @@ impl SPIRVBin { private_data: ptr::from_mut(log).cast(), }); + let float_controls = float_controls::FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32 as u32 + | float_controls::FLOAT_CONTROLS_SIGNED_ZERO_PRESERVE as u32; spirv_to_nir_options { create_library: library, environment: nir_spirv_execution_environment::NIR_SPIRV_OPENCL, clc_shader: clc_shader, - float_controls_execution_mode: float_controls::FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32 - as u32, - + float_controls_execution_mode: float_controls, printf: true, capabilities: caps, constant_addr_format: global_addr_format, From 9807cc078e075e30155e29cb908e6cd4efd6a768 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 19 Oct 2025 15:57:14 +0200 Subject: [PATCH 06/12] glsl_types: include compiler/builtin_types.h Apparently CUDA has a builtin_types.h file and it can conflict with ours... 
Reviewed-by: Adam Jackson --- src/compiler/glsl_types.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h index 85028726622..abf804ebd7c 100644 --- a/src/compiler/glsl_types.h +++ b/src/compiler/glsl_types.h @@ -29,6 +29,7 @@ #include #include +#include "compiler/builtin_types.h" #include "shader_enums.h" #include "c11/threads.h" #include "util/blob.h" From 834a50a3ebcca582b34efc8d90ad33e3d3ba38f9 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 11 Oct 2025 17:32:04 +0200 Subject: [PATCH 07/12] nir/lower_system_values: add num_subgroups and subgroup_id lowering Nvidia hardware doesn't have those in hardware. NAK currently lowers them itself. Reviewed-by: Adam Jackson --- src/compiler/nir/nir.h | 2 ++ src/compiler/nir/nir_lower_system_values.c | 24 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 05b9a4c76e0..b962c16025d 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -5560,6 +5560,8 @@ typedef struct nir_lower_compute_system_values_options { bool lower_local_invocation_index : 1; bool lower_cs_local_id_to_index : 1; bool lower_workgroup_id_to_index : 1; + bool lower_num_subgroups : 1; + bool lower_subgroup_id : 1; bool global_id_is_32bit : 1; /* At shader execution time, check if WorkGroupId should be 1D * and compute it quickly. Fall back to slow computation if not.
diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c index 61d5282db80..f00ee581bce 100644 --- a/src/compiler/nir/nir_lower_system_values.c +++ b/src/compiler/nir/nir_lower_system_values.c @@ -794,6 +794,30 @@ lower_compute_system_value_instr(nir_builder *b, case nir_intrinsic_load_shader_index: return nir_imm_int(b, b->shader->info.cs.shader_index); + case nir_intrinsic_load_num_subgroups: { + if (!options || !options->lower_num_subgroups) + return NULL; + + nir_def *group_size = nir_load_workgroup_size(b); + nir_def *threads = nir_imul(b, nir_channel(b, group_size, 0), + nir_channel(b, group_size, 1)); + threads = nir_imul(b, threads, nir_channel(b, group_size, 2)); + + /* DIV_ROUND_UP(A, B) = ((A + B - 1) / B) */ + nir_def *subgroup_size = nir_load_subgroup_size(b); + nir_def *subgroup_size_m1 = nir_iadd_imm(b, subgroup_size, -1); + nir_def *numerator = nir_iadd(b, threads, subgroup_size_m1); + return nir_udiv(b, numerator, subgroup_size); + } + + case nir_intrinsic_load_subgroup_id: { + if (!options || !options->lower_subgroup_id) + return NULL; + + nir_def *tid = nir_load_local_invocation_index(b); + return nir_udiv(b, tid, nir_load_subgroup_size(b)); + } + default: return NULL; } From 8e8a6d1e09cbfdbd0b215d550fe81d9efe4771ad Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 4 Sep 2025 10:29:20 +0200 Subject: [PATCH 08/12] gallium: add SUBGROUP_FEATURE bits for rotate and rotate_clustered The bit values are taken from Vulkan to make it easy for Zink. Those new subgroup features will be used by rusticl for cl_khr_subgroup_rotate. 
Reviewed-by: Adam Jackson --- src/gallium/include/pipe/p_defines.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index ae0fdba1a9c..ced85b506e3 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -698,6 +698,14 @@ enum pipe_conservative_raster_mode #define PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED (1 << 6) #define PIPE_SHADER_SUBGROUP_FEATURE_QUAD (1 << 7) #define PIPE_SHADER_SUBGROUP_NUM_FEATURES 8 +/* VK_SUBGROUP_FEATURE_ROTATE_BIT */ +#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE (1 << 9) +/* VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT */ +#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED (1 << 10) +#define PIPE_SHADER_SUBGROUP_FEATURE_MASK \ + (BITFIELD_MASK(PIPE_SHADER_SUBGROUP_NUM_FEATURES) | \ + PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | \ + PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED) enum pipe_point_size_lower_mode { PIPE_POINT_SIZE_LOWER_ALWAYS, From 2f7c79cf5ff08d152ef83430607affd93ef34bc0 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 3 Sep 2025 21:52:08 +0200 Subject: [PATCH 09/12] clc: handle all optional subgroup extensions Reviewed-by: Alyssa Rosenzweig Reviewed-by: Adam Jackson --- src/compiler/clc/clc.h | 8 +++++++- src/compiler/clc/clc_helpers.cpp | 24 +++++++++++++++++++++--- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/compiler/clc/clc.h b/src/compiler/clc/clc.h index 95ef4dd759b..73588a31ad5 100644 --- a/src/compiler/clc/clc.h +++ b/src/compiler/clc/clc.h @@ -72,9 +72,15 @@ struct clc_optional_features { * progress */ bool subgroups_ifp; + bool subgroups_ballot; + bool subgroups_clustered; + bool subgroups_extended_types; + bool subgroups_named_barrier; + bool subgroups_non_uniform_arithmetic; + bool subgroups_non_uniform_vote; + bool subgroups_rotate; bool subgroups_shuffle; bool subgroups_shuffle_relative; - bool subgroups_ballot; }; struct clc_compile_args { diff --git 
a/src/compiler/clc/clc_helpers.cpp b/src/compiler/clc/clc_helpers.cpp index 8e69ef845a1..9912f9b0256 100644 --- a/src/compiler/clc/clc_helpers.cpp +++ b/src/compiler/clc/clc_helpers.cpp @@ -1024,15 +1024,33 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx, } if (args->features.subgroups) { c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_subgroups"); + if (args->features.subgroups_ballot) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1"); + } + if (args->features.subgroups_clustered) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_clustered_reduce=1"); + } + if (args->features.subgroups_extended_types) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_extended_types=1"); + } + if (args->features.subgroups_named_barrier) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_named_barrier=1"); + } + if (args->features.subgroups_non_uniform_arithmetic) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_arithmetic=1"); + } + if (args->features.subgroups_non_uniform_vote) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_vote=1"); + } + if (args->features.subgroups_rotate) { + c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_rotate=1"); + } if (args->features.subgroups_shuffle) { c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle=1"); } if (args->features.subgroups_shuffle_relative) { c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle_relative=1"); } - if (args->features.subgroups_ballot) { - c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1"); - } } if (args->features.subgroups_ifp) { assert(args->features.subgroups); From cb5e5222877b2c9c577ca8e4e5da11e36a91f2db Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 3 Sep 2025 21:55:48 +0200 Subject: [PATCH 10/12] rusticl: properly check for subgroup support Also add zink to the features.txt entries. 
Reviewed-by: Alyssa Rosenzweig Reviewed-by: Adam Jackson --- docs/features.txt | 6 ++-- src/gallium/frontends/rusticl/core/device.rs | 36 ++++++++++++++++---- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index 08763314c91..5cea523628c 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -814,7 +814,7 @@ Rusticl Optional OpenCL 2.x Features: Device and host timer synchronization DONE (freedreno, iris, llvmpipe, radeonsi, zink) OpenCL C 2.0 in progress - Memory Consistency Model (atomics) not started - - Sub-groups DONE (iris, llvmpipe, radeonsi, asahi) + - Sub-groups DONE (asahi, iris, llvmpipe, radeonsi, zink) - Work-group Collective Functions not started - Generic Address Space in progress cl_khr_il_program DONE @@ -877,8 +877,8 @@ Rusticl extensions: cl_khr_subgroup_non_uniform_arithmetic not started cl_khr_subgroup_non_uniform_vote not started cl_khr_subgroup_rotate not started - cl_khr_subgroup_shuffle DONE (iris, llvmpipe, radeonsi, asahi) - cl_khr_subgroup_shuffle_relative DONE (iris, llvmpipe, radeonsi, asahi) + cl_khr_subgroup_shuffle DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_shuffle_relative DONE (asahi, iris, llvmpipe, radeonsi, zink) cl_khr_subgroups in progress cl_khr_suggested_local_work_size DONE cl_khr_terminate_context not started diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs index f1b8671e589..9239a89a287 100644 --- a/src/gallium/frontends/rusticl/core/device.rs +++ b/src/gallium/frontends/rusticl/core/device.rs @@ -741,17 +741,22 @@ impl DeviceBase { } if self.subgroups_supported() { - add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle); - add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative); add_cap(SpvCapability::SpvCapabilityGroups); add_cap(SpvCapability::SpvCapabilitySubgroupDispatch); // requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS //add_ext(1, 0, 0, 
"cl_khr_subgroups"); add_feat(1, 0, 0, "__opencl_c_subgroups"); - // we have lowering in `nir_lower_subgroups`, drivers can just use that - add_ext(1, 0, 0, "cl_khr_subgroup_shuffle"); - add_ext(1, 0, 0, "cl_khr_subgroup_shuffle_relative"); + if self.subgroup_shuffle_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle); + add_ext(1, 0, 0, "cl_khr_subgroup_shuffle"); + } + + if self.subgroup_shuffle_relative_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative); + add_ext(1, 0, 0, "cl_khr_subgroup_shuffle_relative"); + } + if self.intel_subgroups_supported() { // add_cap(SpvCapability::SpvCapabilitySubgroupBufferBlockIOINTEL); // add_cap(SpvCapability::SpvCapabilitySubgroupImageBlockIOINTEL); @@ -1174,6 +1179,23 @@ impl DeviceBase { // supported, doing it without shareable shaders isn't practical self.max_subgroups() > 0 && (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders())) + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_BASIC + != 0 + } + + pub fn subgroup_shuffle_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE + != 0 + } + + pub fn subgroup_shuffle_relative_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE_RELATIVE + != 0 } pub fn system_svm_supported(&self) -> bool { @@ -1256,8 +1278,8 @@ impl DeviceBase { intel_subgroups: self.intel_subgroups_supported(), kernel_clock: self.kernel_clock_supported(), subgroups: subgroups_supported, - subgroups_shuffle: subgroups_supported, - subgroups_shuffle_relative: subgroups_supported, + subgroups_shuffle: self.subgroup_shuffle_supported(), + subgroups_shuffle_relative: self.subgroup_shuffle_relative_supported(), ..Default::default() } } From d1870c2212e38427dad9adc54308fce74b58ab63 Mon Sep 
17 00:00:00 2001 From: Karol Herbst Date: Wed, 3 Sep 2025 21:57:31 +0200 Subject: [PATCH 11/12] rusticl: support more subgroup extensions Reviewed-by: Alyssa Rosenzweig Reviewed-by: Adam Jackson --- docs/features.txt | 12 ++-- src/gallium/frontends/rusticl/core/device.rs | 69 ++++++++++++++++++++ 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index 5cea523628c..6f6adea8b60 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -870,13 +870,13 @@ Rusticl extensions: cl_khr_spirv_no_integer_wrap_decoration DONE cl_khr_spirv_queries DONE cl_khr_srgb_image_writes not started - cl_khr_subgroup_ballot not started - cl_khr_subgroup_clustered_reduce not started - cl_khr_subgroup_extended_types not started + cl_khr_subgroup_ballot DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_clustered_reduce DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_extended_types DONE (asahi, iris, llvmpipe, radeonsi, zink) cl_khr_subgroup_named_barrier not started - cl_khr_subgroup_non_uniform_arithmetic not started - cl_khr_subgroup_non_uniform_vote not started - cl_khr_subgroup_rotate not started + cl_khr_subgroup_non_uniform_arithmetic DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_non_uniform_vote DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_rotate DONE (iris, llvmpipe, zink) cl_khr_subgroup_shuffle DONE (asahi, iris, llvmpipe, radeonsi, zink) cl_khr_subgroup_shuffle_relative DONE (asahi, iris, llvmpipe, radeonsi, zink) cl_khr_subgroups in progress diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs index 9239a89a287..dac45f8f7f5 100644 --- a/src/gallium/frontends/rusticl/core/device.rs +++ b/src/gallium/frontends/rusticl/core/device.rs @@ -745,8 +745,36 @@ impl DeviceBase { add_cap(SpvCapability::SpvCapabilitySubgroupDispatch); // requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS //add_ext(1, 0, 0, 
"cl_khr_subgroups"); + add_ext(1, 0, 0, "cl_khr_subgroup_extended_types"); add_feat(1, 0, 0, "__opencl_c_subgroups"); + if self.subgroup_ballot_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformBallot); + add_ext(1, 0, 0, "cl_khr_subgroup_ballot"); + } + + if self.subgroup_clustered_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformClustered); + add_ext(1, 0, 0, "cl_khr_subgroup_clustered_reduce"); + } + + if self.subgroup_non_uniform_arithmetic_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformArithmetic); + add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_arithmetic"); + } + + if self.subgroup_non_uniform_vote_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniform); + add_cap(SpvCapability::SpvCapabilityGroupNonUniformVote); + add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_vote"); + } + + if self.subgroup_rotate_supported() { + add_cap(SpvCapability::SpvCapabilityGroupNonUniformRotateKHR); + add_ext(1, 0, 0, "cl_khr_subgroup_rotate"); + add_spirv(c"SPV_KHR_subgroup_rotate"); + } + if self.subgroup_shuffle_supported() { add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle); add_ext(1, 0, 0, "cl_khr_subgroup_shuffle"); @@ -1184,6 +1212,41 @@ impl DeviceBase { != 0 } + pub fn subgroup_ballot_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_BALLOT + != 0 + } + + pub fn subgroup_clustered_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED + != 0 + } + + pub fn subgroup_non_uniform_arithmetic_supported(&self) -> bool { + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_ARITHMETIC + != 0 + } + + pub fn subgroup_non_uniform_vote_supported(&self) -> bool { + self.subgroups_supported() + && 
self.screen().caps().shader_subgroup_supported_features + & PIPE_SHADER_SUBGROUP_FEATURE_VOTE + != 0 + } + + pub fn subgroup_rotate_supported(&self) -> bool { + let mask = + PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED; + self.subgroups_supported() + && self.screen().caps().shader_subgroup_supported_features & mask == mask + } + pub fn subgroup_shuffle_supported(&self) -> bool { self.subgroups_supported() && self.screen().caps().shader_subgroup_supported_features @@ -1278,6 +1341,12 @@ impl DeviceBase { intel_subgroups: self.intel_subgroups_supported(), kernel_clock: self.kernel_clock_supported(), subgroups: subgroups_supported, + subgroups_ballot: self.subgroup_ballot_supported(), + subgroups_clustered: self.subgroup_clustered_supported(), + subgroups_extended_types: subgroups_supported, + subgroups_non_uniform_arithmetic: self.subgroup_non_uniform_arithmetic_supported(), + subgroups_non_uniform_vote: self.subgroup_non_uniform_vote_supported(), + subgroups_rotate: self.subgroup_rotate_supported(), subgroups_shuffle: self.subgroup_shuffle_supported(), subgroups_shuffle_relative: self.subgroup_shuffle_relative_supported(), ..Default::default() From a18a4750069169b6c9fa39a627380dfb71a3b10d Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 4 Oct 2025 19:05:10 +0200 Subject: [PATCH 12/12] nocl: add it Acked-by: Alyssa Rosenzweig --- docs/drivers/nocl.rst | 13 + docs/envvars.rst | 23 + docs/features.txt | 26 +- docs/index.rst | 1 + meson.build | 13 +- meson.options | 2 +- src/compiler/meson.build | 2 +- .../auxiliary/pipe-loader/pipe_loader.c | 3 + .../auxiliary/pipe-loader/pipe_loader.h | 5 + .../drivers/nocl/compiler/cbindgen.toml | 12 + src/gallium/drivers/nocl/compiler/lib.rs | 1191 +++++++++++++++++ src/gallium/drivers/nocl/compiler/meson.build | 44 + src/gallium/drivers/nocl/meson.build | 54 + src/gallium/drivers/nocl/nocl_nir.c | 824 ++++++++++++ .../drivers/nocl/nocl_nir_algebraic.py | 65 + 
src/gallium/drivers/nocl/nocl_pipe.c | 1027 ++++++++++++++ src/gallium/drivers/nocl/nocl_private.h | 147 ++ .../frontends/rusticl/mesa/pipe/device.rs | 1 + src/gallium/meson.build | 6 + src/gallium/targets/rusticl/meson.build | 1 + src/gallium/winsys/nocl/meson.build | 25 + src/gallium/winsys/nocl/nocl_cuda.c | 212 +++ src/gallium/winsys/nocl/nocl_cuda_public.h | 73 + 23 files changed, 3754 insertions(+), 16 deletions(-) create mode 100644 docs/drivers/nocl.rst create mode 100644 src/gallium/drivers/nocl/compiler/cbindgen.toml create mode 100644 src/gallium/drivers/nocl/compiler/lib.rs create mode 100644 src/gallium/drivers/nocl/compiler/meson.build create mode 100644 src/gallium/drivers/nocl/meson.build create mode 100644 src/gallium/drivers/nocl/nocl_nir.c create mode 100644 src/gallium/drivers/nocl/nocl_nir_algebraic.py create mode 100644 src/gallium/drivers/nocl/nocl_pipe.c create mode 100644 src/gallium/drivers/nocl/nocl_private.h create mode 100644 src/gallium/winsys/nocl/meson.build create mode 100644 src/gallium/winsys/nocl/nocl_cuda.c create mode 100644 src/gallium/winsys/nocl/nocl_cuda_public.h diff --git a/docs/drivers/nocl.rst b/docs/drivers/nocl.rst new file mode 100644 index 00000000000..885dbfc45e5 --- /dev/null +++ b/docs/drivers/nocl.rst @@ -0,0 +1,13 @@ +NoCL +==== + +Overview +-------- + +The NoCL driver is a Gallium driver that targets the CUDA driver API and +therefore can be used to layer OpenCL on top of the NVIDIA proprietary driver. + +The minimum supported CUDA API version is 6.5. + +It translates NIR to PTX and makes use of the CUDA internal JIT compiler to +generate device code. diff --git a/docs/envvars.rst b/docs/envvars.rst index 6fbf8e6bc03..9952db963fb 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -2147,6 +2147,29 @@ Freedreno driver environment variables Other Gallium drivers have their own environment variables. These may change frequently so the source code should be consulted for details. 
+NoCL driver environment variables +--------------------------------- + +.. envvar:: NOCL_DEBUG + + Debug flags for the NoCL driver. + +.. envvar:: NOCL_API_TARGET + + Assumes a lower CUDA Driver version is used than what's loaded at runtime. + Useful to ensure that code runs on older Driver versions. For development + only. + +.. envvar:: NOCL_PTX_VERSION + + Overrides the PTX target version with the specified value. Can be used to + ensure code compiles successfully on various CUDA versions. For development + only. Specified without the dot. + +.. envvar:: NOCL_SM_TARGET + + Overrides the PTX target SM level with the specified value. Can be used to + ensure code compiles successfully on various GPUs. For development only. Vulkan loader environment variables ----------------------------------- diff --git a/docs/features.txt b/docs/features.txt index 6f6adea8b60..15860805104 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -814,7 +814,7 @@ Rusticl Optional OpenCL 2.x Features: Device and host timer synchronization DONE (freedreno, iris, llvmpipe, radeonsi, zink) OpenCL C 2.0 in progress - Memory Consistency Model (atomics) not started - - Sub-groups DONE (asahi, iris, llvmpipe, radeonsi, zink) + - Sub-groups DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) - Work-group Collective Functions not started - Generic Address Space in progress cl_khr_il_program DONE @@ -834,7 +834,7 @@ Rusticl extensions: cl_khr_d3d10_sharing not started cl_khr_d3d11_sharing not started cl_khr_device_enqueue_local_arg_types not started - cl_khr_device_uuid DONE (freedreno, iris, llvmpipe, radeonsi, zink) + cl_khr_device_uuid DONE (freedreno, iris, llvmpipe, nocl, radeonsi, zink) cl_khr_dx9_media_sharing not started cl_khr_egl_event not started cl_khr_egl_image not started @@ -849,7 +849,7 @@ Rusticl extensions: cl_khr_external_semaphore_opaque_fd not started cl_khr_external_semaphore_sync_fd DONE (radeonsi, zink) cl_khr_external_semaphore_win32 not started - cl_khr_fp16 DONE 
(asahi, freedreno, llvmpipe, panfrost, radeonsi, zink) + cl_khr_fp16 DONE (asahi, freedreno, llvmpipe, nocl, panfrost, radeonsi, zink) cl_khr_gl_depth_images not started cl_khr_gl_event not started cl_khr_gl_msaa_sharing not started @@ -859,10 +859,10 @@ Rusticl extensions: cl_khr_int64_base_atomics not started cl_khr_int64_extended_atomics not started cl_khr_integer_dot_product DONE - cl_khr_kernel_clock DONE (freedreno, iris, llvmpipe, nvc0, panfrost, radeonsi, zink, needs llvm-19) + cl_khr_kernel_clock DONE (freedreno, iris, llvmpipe, nocl, nvc0, panfrost, radeonsi, zink, needs llvm-19) cl_khr_mipmap_image not started cl_khr_mipmap_image_writes not started - cl_khr_pci_bus_info DONE (iris, nvc0, radeonsi, zink) + cl_khr_pci_bus_info DONE (iris, nocl, nvc0, radeonsi, zink) cl_khr_priority_hints DONE (asahi, freedreno, iris, panfrost, radeonsi) cl_khr_semaphore DONE (radeonsi, zink) cl_khr_spirv_extended_debug_info not started @@ -870,21 +870,21 @@ Rusticl extensions: cl_khr_spirv_no_integer_wrap_decoration DONE cl_khr_spirv_queries DONE cl_khr_srgb_image_writes not started - cl_khr_subgroup_ballot DONE (asahi, iris, llvmpipe, radeonsi, zink) - cl_khr_subgroup_clustered_reduce DONE (asahi, iris, llvmpipe, radeonsi, zink) - cl_khr_subgroup_extended_types DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_ballot DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) + cl_khr_subgroup_clustered_reduce DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) + cl_khr_subgroup_extended_types DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) cl_khr_subgroup_named_barrier not started - cl_khr_subgroup_non_uniform_arithmetic DONE (asahi, iris, llvmpipe, radeonsi, zink) - cl_khr_subgroup_non_uniform_vote DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_non_uniform_arithmetic DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) + cl_khr_subgroup_non_uniform_vote DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) cl_khr_subgroup_rotate DONE (iris, llvmpipe, 
zink) - cl_khr_subgroup_shuffle DONE (asahi, iris, llvmpipe, radeonsi, zink) - cl_khr_subgroup_shuffle_relative DONE (asahi, iris, llvmpipe, radeonsi, zink) + cl_khr_subgroup_shuffle DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) + cl_khr_subgroup_shuffle_relative DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink) cl_khr_subgroups in progress cl_khr_suggested_local_work_size DONE cl_khr_terminate_context not started cl_khr_throttle_hints not started cl_khr_work_group_uniform_arithmetic not started - cl_ext_buffer_device_address DONE (iris, llvmpipe, radeonsi, zink) + cl_ext_buffer_device_address DONE (iris, llvmpipe, nocl, radeonsi, zink) cl_ext_cxx_for_opencl not started cl_ext_device_fission not started cl_ext_float_atomics not started diff --git a/docs/index.rst b/docs/index.rst index 36b6acd2192..161ffcd36f0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -85,6 +85,7 @@ Linux, FreeBSD, and other operating systems. drivers/freedreno drivers/lima drivers/llvmpipe + drivers/nocl drivers/nvk drivers/panfrost drivers/powervr diff --git a/meson.build b/meson.build index f241ba862cd..bb7750154a3 100644 --- a/meson.build +++ b/meson.build @@ -215,6 +215,7 @@ with_gallium_d3d12 = gallium_drivers.contains('d3d12') with_gallium_asahi = gallium_drivers.contains('asahi') with_gallium_rocket = gallium_drivers.contains('rocket') with_gallium_ethosu = gallium_drivers.contains('ethosu') +with_gallium_nocl = gallium_drivers.contains('nocl') foreach gallium_driver : gallium_drivers pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper()) endforeach @@ -754,7 +755,7 @@ if with_gallium_rusticl endif with_virtgpu_kumquat = get_option('virtgpu_kumquat') and with_gfxstream_vk -if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat +if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat or with_gallium_nocl # rust.bindgen() does not pass `--rust-target` to bindgen until 1.7.0. 
if meson.version().version_compare('< 1.7.0') error('Mesa Rust support requires Meson 1.7.0 or newer') @@ -1600,6 +1601,16 @@ elif with_shader_cache error('Shader Cache requires compression') endif +if with_gallium_nocl + dep_libcuda = dependency( + 'cuda', + # By default meson searches for the cudart, which we don't want to use. We want to use the + # CUDA driver library. + modules : ['cuda'], + version : '>= 6.5', + ) +endif + if host_machine.system() == 'windows' # For MSVC and MinGW we aren't using pthreads, and dependency('threads') will add linkage # to pthread for MinGW, so leave the dependency null_dep for Windows. For Windows linking to diff --git a/meson.options b/meson.options index 75731475c12..43ca2c7c367 100644 --- a/meson.options +++ b/meson.options @@ -87,7 +87,7 @@ option( choices : [ 'all', 'auto', 'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris', - 'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi', + 'lima', 'llvmpipe', 'nocl', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi', 'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink', ], description : 'List of gallium drivers to build. 
If this is set to auto ' + diff --git a/src/compiler/meson.build b/src/compiler/meson.build index 92e72a44f9a..8742b8ce533 100644 --- a/src/compiler/meson.build +++ b/src/compiler/meson.build @@ -72,6 +72,6 @@ if with_gallium endif subdir('isaspec') -if with_nouveau_vk +if with_nouveau_vk or with_gallium_nocl subdir('rust') endif diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.c b/src/gallium/auxiliary/pipe-loader/pipe_loader.c index 1f54a3b8b84..56fc578909d 100644 --- a/src/gallium/auxiliary/pipe-loader/pipe_loader.c +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.c @@ -44,6 +44,9 @@ static int (*backends[])(struct pipe_loader_device **, int) = { #ifdef HAVE_LIBDRM &pipe_loader_drm_probe, +#endif +#ifdef HAVE_NOCL + &pipe_loader_cuda_probe, #endif &pipe_loader_sw_probe }; diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.h b/src/gallium/auxiliary/pipe-loader/pipe_loader.h index eda16a71692..9f04e40094e 100644 --- a/src/gallium/auxiliary/pipe-loader/pipe_loader.h +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.h @@ -215,6 +215,11 @@ pipe_loader_sw_probe_wrapped(struct pipe_loader_device **dev, int pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev); +#ifdef HAVE_NOCL +int +pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev); +#endif + /** * Get a list of known DRM accel devices. * diff --git a/src/gallium/drivers/nocl/compiler/cbindgen.toml b/src/gallium/drivers/nocl/compiler/cbindgen.toml new file mode 100644 index 00000000000..82b8a9dc0e9 --- /dev/null +++ b/src/gallium/drivers/nocl/compiler/cbindgen.toml @@ -0,0 +1,12 @@ +language = "C" + +includes = ["nir.h"] +autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. 
*/" +include_guard = "NIR_TO_PTX_H" +usize_is_size_t = true +style = "tag" + +[export] +include = ["NirToPtxOutput"] +prefix = "" +renaming_overrides_prefixing = true diff --git a/src/gallium/drivers/nocl/compiler/lib.rs b/src/gallium/drivers/nocl/compiler/lib.rs new file mode 100644 index 00000000000..a0015561192 --- /dev/null +++ b/src/gallium/drivers/nocl/compiler/lib.rs @@ -0,0 +1,1191 @@ +// Copyright © 2025 Karol Herbst +// SPDX-License-Identifier: MIT + +#![allow(non_upper_case_globals)] + +use std::{cmp, collections::HashMap, ffi::CStr, str::Utf8Error}; + +use compiler::{bindings::*, nir::AsDef}; + +#[repr(C)] +pub struct NirToPtxOptions { + pub target_ptx: u16, + pub target_sm: u16, +} + +#[repr(C)] +pub struct NirToPtxOutput { + pub ptx: *mut u8, + pub ptx_len: usize, + pub ptx_capacity: usize, +} + +struct Reg { + index: u32, + bit_size: u8, +} + +impl Reg { + fn src_string(&self) -> String { + def_string_raw_prefixed(self.index, 1, 0, "") + } +} + +struct NirToPtxState<'n> { + nir: &'n nir_shader, + options: NirToPtxOptions, + vars: Vec, + block: Vec, + lines: Vec, + regs: HashMap, +} + +fn opcode(alu: &nir_alu_instr) -> Option<(&'static str, &'static str)> { + let alu_op = alu.op; + let def_bit_size = alu.def.bit_size; + let src_bit_size = alu.get_src(0).bit_size(); + + // Not all ops support predicates, so let's be specific about it here. 
+ Some(if src_bit_size == 1 { + match alu_op { + nir_op_iand => ("and", "pred"), + nir_op_ine => ("xor", "pred"), + nir_op_inot => ("not", "pred"), + nir_op_ior => ("or", "pred"), + nir_op_ixor => ("xor", "pred"), + _ => return None, + } + } else if def_bit_size == 1 { + match alu_op { + nir_op_feq => ("setp.eq", "f"), + nir_op_fequ => ("setp.equ", "f"), + nir_op_fisfinite => ("testp.finite", "f"), + nir_op_fneo => ("setp.ne", "f"), + nir_op_fneu => ("setp.neu", "f"), + nir_op_fge => ("setp.ge", "f"), + nir_op_fgeu => ("setp.geu", "f"), + nir_op_flt => ("setp.lt", "f"), + nir_op_fltu => ("setp.ltu", "f"), + nir_op_ieq => ("setp.eq", "b"), + nir_op_ine => ("setp.ne", "b"), + nir_op_ige => ("setp.ge", "s"), + nir_op_ilt => ("setp.lt", "s"), + nir_op_uge => ("setp.ge", "u"), + nir_op_ult => ("setp.lt", "u"), + _ => return None, + } + } else { + match alu_op { + nir_op_bit_count => ("popc", "b"), + nir_op_bitfield_reverse => ("brev", "b"), + nir_op_f2f16 => ("cvt.rn", "f16.f"), + nir_op_f2f16_rtz => ("cvt.rz", "f16.f"), + nir_op_f2f32 => ("cvt", "f32.f"), + nir_op_fabs => ("abs", "f"), + // .rn prevents contraction + nir_op_fadd => ("add.rn", "f"), + nir_op_fdiv => ("div.approx", "f"), + nir_op_ffma => ("fma.rn", "f"), + nir_op_fmax => ("max", "f"), + nir_op_fmin => ("min", "f"), + // .rn prevents contraction + nir_op_fmul => ("mul.rn", "f"), + nir_op_fneg => ("neg", "f"), + nir_op_frcp => ("rcp.approx", "f"), + nir_op_frsq => ("rsqrt.approx", "f"), + nir_op_fsqrt => ("sqrt.approx", "f"), + nir_op_fsub => ("sub", "f"), + nir_op_i2i8 => ("cvt", "s8.s"), + nir_op_i2i16 => ("cvt", "s16.s"), + nir_op_i2i32 => ("cvt", "s32.s"), + nir_op_i2i64 => ("cvt", "s64.s"), + nir_op_iabs => ("abs", "s"), + nir_op_iadd => ("add", "s"), + nir_op_iadd_sat => ("add.sat", "s"), + nir_op_iand => ("and", "b"), + nir_op_ibitfield_extract => ("bfe", "s"), + nir_op_idiv => ("div", "s"), + nir_op_ieq => ("set.eq", "b"), + nir_op_imad => ("mad.lo", "s"), + nir_op_imax => ("max", "s"), + 
nir_op_imin => ("min", "s"), + nir_op_imul => ("mul.lo", "s"), + nir_op_imul24 => ("mul24.lo", "s"), + nir_op_imul_high => ("mul.hi", "s"), + nir_op_ineg => ("neg", "s"), + nir_op_inot => ("not", "b"), + nir_op_ior => ("or", "b"), + nir_op_irem => ("rem", "s"), + nir_op_ishl => ("shl", "b"), + nir_op_ishr => ("shr", "s"), + nir_op_isub => ("sub", "s"), + nir_op_isub_sat => ("sub.sat", "s"), + nir_op_ixor => ("xor", "b"), + nir_op_sdot_4x8_iadd => ("dp4a.s32", "s"), + nir_op_sudot_4x8_iadd => ("dp4a.s32", "u"), + nir_op_u2u8 => ("cvt", "u8.u"), + nir_op_u2u16 => ("cvt", "u16.u"), + nir_op_u2u32 => ("cvt", "u32.u"), + nir_op_u2u64 => ("cvt", "u64.u"), + nir_op_ubitfield_extract => ("bfe", "u"), + nir_op_udiv => ("div", "u"), + nir_op_udot_4x8_uadd => ("dp4a", "u32.u"), + nir_op_ufind_msb => ("bfind", "u"), + nir_op_umad24 => ("mad24.lo", "u"), + nir_op_umax => ("max", "u"), + nir_op_umin => ("min", "u"), + nir_op_umod => ("rem", "u"), + nir_op_umul24 => ("mul24.lo", "u"), + nir_op_umul_2x32_64 => ("mul.wide", "u"), + nir_op_umul_high => ("mul.hi", "u"), + nir_op_ushr => ("shr", "u"), + _ => return None, + } + }) +} + +fn bit_size_string(bit_size: u8) -> &'static str { + match bit_size { + 1 => "", + 8 => "8", + 16 => "16", + 32 => "32", + 64 => "64", + _ => unreachable!("unsupported bit_size"), + } +} + +fn def_string(def: &nir_def, comp: u8) -> String { + def_string_prefixed(def, comp, "") +} + +fn def_string_raw_prefixed(index: u32, num_components: u8, comp: u8, prefix: &str) -> String { + let swizzle = swizzle_reg(comp, num_components); + format!("%r{index}{prefix}{swizzle}") +} + +fn def_string_prefixed(def: &nir_def, comp: u8, prefix: &str) -> String { + def_string_raw_prefixed(def.index, def.num_components, comp, prefix) +} + +fn def_var_string_raw(index: u32, bit_size: u8, num_components: u8, prefix: &str) -> String { + let vec = vec_string(num_components); + + if bit_size == 1 { + format!("\t.reg {vec} .pred %r{index}{prefix};") + } else { + format!("\t.reg 
{vec} .b{bit_size} %r{index}{prefix};") + } +} + +fn src_string(src: &nir_alu_src, comp: u8) -> String { + def_string(src.src.as_def(), src.swizzle[usize::from(comp)]) +} + +fn mem_from_intrinsic(intrin: &nir_intrinsic_instr) -> &'static str { + match intrin.intrinsic { + nir_intrinsic_load_global_constant => "global", + nir_intrinsic_load_global | nir_intrinsic_store_global => "global", + nir_intrinsic_load_shared | nir_intrinsic_store_shared => "shared", + nir_intrinsic_load_scratch | nir_intrinsic_store_scratch => "local", + _ => unreachable!("intrinsic not a memory operation!"), + } +} + +fn swizzle_reg(swizzle: u8, num_components: u8) -> &'static str { + if num_components == 1 { + "" + } else { + match swizzle { + 0 => ".x", + 1 => ".y", + 2 => ".z", + 3 => ".w", + _ => unreachable!("unsupported swizzle"), + } + } +} + +fn vec_string(num_components: u8) -> &'static str { + match num_components { + 1 => "", + 2 => ".v2", + 3 | 4 => ".v4", + 8 => ".v8", + _ => unreachable!("unsupported vec size {num_components}"), + } +} + +impl NirToPtxState<'_> { + fn finalize(self) -> NirToPtxOutput { + // Until `String::into_raw_parts` is stabilized, this string needs to be turned + // into a `Vec` before calling `as_mut_ptr`. `String` does not implement + // `as_mut_ptr` itself but relies on `Deref`. + // By going through a slice the resulting pointer only has access rights to the + // portion of the string containing the actual string data, that is in the range + // `0..len`. The pointer does not have access rights to the over-allocated part + // in the range `len..capacity` though. + // Full access rights are later needed to reconstruct the string soundly under + // Stacked Borrows rules. `Vec` does implement `as_mut_ptr` directly specifically + // to avoid this problem. 
+ let mut ptx = self.lines.join("\n").into_bytes(); + ptx.push(b'\0'); + + let mut md_ptx = std::mem::ManuallyDrop::new(ptx); + let (ptr, len, capacity) = (md_ptx.as_mut_ptr(), md_ptx.len(), md_ptx.capacity()); + let result = NirToPtxOutput { + ptx: ptr, + ptx_len: len, + ptx_capacity: capacity, + }; + + result + } + + fn init(&mut self) { + // We need to cap the versions otherwise compilation can fail on newer GPUs + let target_ptx = cmp::min(83, self.options.target_ptx); + let sm = cmp::min(80, self.options.target_sm); + let address_size = unsafe { self.nir.info.__bindgen_anon_1.cs }.ptr_size; + + let ptx_major = target_ptx / 10; + let ptx_minor = target_ptx % 10; + + self.lines.extend_from_slice(&[ + format!(".version {ptx_major}.{ptx_minor}"), + format!(".target sm_{sm}, texmode_independent"), + format!(".address_size {address_size}"), + ]); + } + + fn new(nir: *const nir_shader, options: NirToPtxOptions) -> Self { + Self { + nir: unsafe { &*nir }, + options: options, + vars: Vec::new(), + block: Vec::new(), + lines: Vec::new(), + regs: HashMap::new(), + } + } + + fn parse(&mut self) -> Result<(), Utf8Error> { + for func in self.nir.iter_functions() { + assert!(!func.impl_.is_null()); + assert!(func.is_entrypoint); + + self.lines.push(format!( + ".entry {}(", + unsafe { CStr::from_ptr(func.name) }.to_str()? + )); + + let mut iter = self + .nir + .iter_variables() + .filter(|var| var.data.mode() == nir_var_uniform) + .peekable(); + + let mut last_offset = 0; + while let Some(var) = iter.next() { + let mut byte_size = 0; + let mut align = 0; + let vec: u32; + let driver_location = var.data.driver_location; + + let bit_size; + unsafe { glsl_get_cl_type_size_align(var.type_, &mut byte_size, &mut align) }; + if unsafe { glsl_type_is_vector_or_scalar(var.type_) } { + vec = unsafe { *var.type_ }.vector_elements.into(); + bit_size = (byte_size / u32::from(vec)) * 8; + } else { + // If we have anything we don't parse properly, just emit a byte array. 
+ bit_size = 8; + vec = byte_size; + } + + // Add explicit padding, *sigh* + if last_offset != driver_location { + self.lines.push(format!( + "\t.param .b8 padding{last_offset}[{}],", + driver_location - last_offset, + )); + } + + self.lines.push(format!( + "\t.param .u{bit_size} param{driver_location}{}{}", + if vec > 1 { + format!("[{vec}]") + } else { + "".to_owned() + }, + if iter.peek().is_some() { "," } else { "" } + )); + + last_offset = driver_location + byte_size; + } + + self.lines.push(") {".to_owned()); + + // declare local memory inside the function so we don't run into ABI issues + if self.nir.scratch_size != 0 { + self.lines.extend_from_slice(&[ + format!("\t.local .u8 local_mem[{}];", self.nir.scratch_size), + "\t.reg .b32 %LMEM;".to_owned(), + "\t.reg .b32 %LMEM_TMP;".to_owned(), + "\tmov.u32 %LMEM, local_mem;".to_owned(), + ]); + } + + let func_impl = func.get_impl().unwrap(); + for cf_node in func_impl.iter_body() { + match cf_node.type_ { + nir_cf_node_block => { + let block = cf_node.as_block().unwrap(); + let index = block.index; + self.lines.push(format!("b{index}:")); + + for instr in block.iter_instr_list() { + self.parse_instruction(instr); + } + + self.lines.append(&mut self.vars); + self.lines.append(&mut self.block); + } + _ => unreachable!("Unsupported cf_node type"), + } + } + + let end_block = func_impl.end_block().index; + self.lines.push(format!("b{end_block}:")); + + self.lines.push("}".to_owned()); + } + + Ok(()) + } + + fn create_dest_var(&mut self, def: &nir_def) { + self.create_dest_var_prefixed(def, ""); + } + + fn create_dest_var_prefixed(&mut self, def: &nir_def, prefix: &str) { + self.create_dest_var_raw(def.index, def.bit_size, def.num_components, prefix); + } + + fn create_dest_var_raw(&mut self, index: u32, bit_size: u8, num_components: u8, prefix: &str) { + self.vars + .push(def_var_string_raw(index, bit_size, num_components, prefix)); + } + + fn parse_alu(&mut self, alu: &nir_alu_instr) { + let info = alu.info(); + 
let dst_bit_size = alu.def.bit_size; + let src0_bit_size = alu.get_src(0).bit_size(); + self.create_dest_var(&alu.def); + + let float_exec_mode = self.nir.info.float_controls_execution_mode; + #[rustfmt::skip] + let ftz = (info.input_types[0] & nir_type_float != 0 && ( + (src0_bit_size == 16 && float_exec_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 != 0) || + (src0_bit_size == 32 && float_exec_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32 != 0) || + (src0_bit_size == 64 && float_exec_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64 != 0) + )).then_some(".ftz").unwrap_or_default(); + + if let Some((op_str, type_str)) = opcode(alu) { + let dst = def_string(&alu.def, 0); + let op_bit_size = bit_size_string(src0_bit_size); + self.block.push(format!( + "\t{op_str}{ftz}.{type_str}{op_bit_size} {dst}, {};", + alu.srcs_as_slice() + .iter() + .map(|src| { src_string(src, 0) }) + .collect::>() + .join(", "), + )); + + return; + } + + match alu.op { + // conversions + nir_op_b2b1 => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tsetp.eq.b{src0_bit_size} {dst}, {src}, 1;")); + } + nir_op_b2f16 => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tselp.b16 {dst}, 0x3c00, 0x0000, {src};")); + } + nir_op_b2f32 => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tselp.f32 {dst}, 1.0, 0.0, {src};")); + } + nir_op_b2i16 | nir_op_b2b32 | nir_op_b2i32 | nir_op_b2i64 => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tselp.s{dst_bit_size} {dst}, 1, 0, {src};")); + } + nir_op_bcsel => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let src2 = src_string(alu.get_src(2), 0); + let dst = def_string(&alu.def, 0); + + self.block.push(format!( + "\tselp.b{dst_bit_size} {dst}, {src1}, 
{src2}, {src0};" + )); + } + nir_op_i32csel_ge => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let src2 = src_string(alu.get_src(2), 0); + let dst = def_string(&alu.def, 0); + + self.block.push(format!( + "\tslct.s{dst_bit_size}.s32 {dst}, {src1}, {src2}, {src0};" + )); + } + + #[rustfmt::skip] + nir_op_f2i8 | + nir_op_f2u8 | + nir_op_f2i16 | + nir_op_f2u16 | + nir_op_f2i32 | + nir_op_f2u32 | + nir_op_f2i64 | + nir_op_f2u64 | + nir_op_i2f16 | + nir_op_i2f32 | + nir_op_i2f64 | + nir_op_u2f16 | + nir_op_u2f32 | + nir_op_u2f64 | + nir_op_fceil | + nir_op_ffloor | + nir_op_ftrunc | + nir_op_fround_even => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + + let dtype = match alu.op { + nir_op_f2i8 | + nir_op_f2i16 | + nir_op_f2i32 | + nir_op_f2i64 => "s", + nir_op_f2u8 | + nir_op_f2u16 | + nir_op_f2u32 | + nir_op_f2u64 => "u", + nir_op_i2f16 | + nir_op_i2f32 | + nir_op_i2f64 | + nir_op_u2f16 | + nir_op_u2f32 | + nir_op_u2f64 | + nir_op_fceil | + nir_op_ffloor | + nir_op_ftrunc | + nir_op_fround_even => "f", + _ => unreachable!("invalid conversion dtype"), + }; + + let stype = match alu.op { + nir_op_i2f16 | + nir_op_i2f32 | + nir_op_i2f64 => "s", + nir_op_u2f16 | + nir_op_u2f32 | + nir_op_u2f64 => "u", + nir_op_f2i8 | + nir_op_f2u8 | + nir_op_f2i16 | + nir_op_f2u16 | + nir_op_f2i32 | + nir_op_f2u32 | + nir_op_f2i64 | + nir_op_f2u64 | + nir_op_fceil | + nir_op_ffloor | + nir_op_ftrunc | + nir_op_fround_even => "f", + _ => unreachable!("invalid conversion stype"), + }; + + let rnd = match alu.op { + nir_op_fceil => "rpi", + nir_op_ffloor => "rmi", + nir_op_fround_even => "rni", + nir_op_ftrunc => "rzi", + nir_op_f2i8 | + nir_op_f2i16 | + nir_op_f2i32 | + nir_op_f2i64 | + nir_op_f2u8 | + nir_op_f2u16 | + nir_op_f2u32 | + nir_op_f2u64 => "rzi", + nir_op_i2f16 | + nir_op_i2f32 | + nir_op_i2f64 | + nir_op_u2f16 | + nir_op_u2f32 | + nir_op_u2f64 => "rn", + _ => unreachable!("invalid conversion 
rounding mode"), + }; + + self.block.push(format!( + "\tcvt.{rnd}{ftz}.{dtype}{dst_bit_size}.{stype}{src0_bit_size} {dst}, {src};" + )); + } + + // extract + nir_op_extract_i8 => { + let src0 = src_string(alu.get_src(0), 0); + let index = alu.get_src(1).comp_as_uint(0).unwrap(); + let sign = 0x8 | index; + let index = sign << 12 | sign << 8 | sign << 4 | index; + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tprmt.b32 {dst}, {src0}, 0, 0x{index:x};")); + } + nir_op_extract_i16 => { + let src0 = src_string(alu.get_src(0), 0); + let index = 2 * alu.get_src(1).comp_as_uint(0).unwrap(); + let indexp1 = index + 1; + let sign = 0x8 | indexp1; + let index = sign << 12 | sign << 8 | indexp1 << 4 | index; + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tprmt.b32 {dst}, {src0}, 0, 0x{index:x};")); + } + nir_op_extract_u8 => { + let src0 = src_string(alu.get_src(0), 0); + let index = 0x4440 | alu.get_src(1).comp_as_uint(0).unwrap(); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tprmt.b32 {dst}, {src0}, 0, 0x{index:x};")); + } + nir_op_extract_u16 => { + let src0 = src_string(alu.get_src(0), 0); + let index = 2 * alu.get_src(1).comp_as_uint(0).unwrap(); + let indexp1 = index + 1; + let index = 0x4400 | indexp1 << 4 | index; + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tprmt.b32 {dst}, {src0}, 0, 0x{index:x};")); + } + + // bitfield + nir_op_bitfield_insert => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let src2 = src_string(alu.get_src(2), 0); + let src3 = src_string(alu.get_src(3), 0); + let dst = def_string(&alu.def, 0); + + self.block + .push(format!("\tbfi.b{dst_bit_size} {dst}, {src1}, {src0}, {src2}, {src3};")); + } + + // pack and unpack + nir_op_pack_32_4x8 | nir_op_pack_64_4x16 => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(0), 1); + let src2 = src_string(alu.get_src(0), 2); + let src3 = 
src_string(alu.get_src(0), 3); + let dst = def_string(&alu.def, 0); + self.block.push(format!( + "\tmov.b{dst_bit_size} {dst}, {{{src0}, {src1}, {src2}, {src3}}};" + )); + } + nir_op_pack_32_2x16 | nir_op_pack_64_2x32 => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(0), 1); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tmov.b{dst_bit_size} {dst}, {{{src0}, {src1}}};")); + } + nir_op_pack_64_2x32_split => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tmov.b64 {dst}, {{{src0}, {src1}}};")); + } + + nir_op_unpack_32_4x8 + | nir_op_unpack_32_2x16 + | nir_op_unpack_64_4x16 + | nir_op_unpack_64_2x32 => { + let index = alu.def.index; + let src = src_string(alu.get_src(0), 0); + + // TODO: PTX seems to ignore the vector component on unpack movs? + if alu.get_src(0).src.num_components() != 1 { + self.create_dest_var_raw(index, src0_bit_size, 1, "unpack"); + self.block.extend_from_slice(&[ + format!("\tmov.b{src0_bit_size} %r{index}unpack, {src};"), + format!("\tmov.b{src0_bit_size} %r{index}, %r{index}unpack;"), + ]); + } else { + self.block + .push(format!("\tmov.b{src0_bit_size} %r{index}, {src};")); + } + } + nir_op_unpack_64_2x32_split_x => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block.push(format!("\tmov.b64 {{{dst}, _}}, {src};")); + } + nir_op_unpack_32_2x16_split_y | nir_op_unpack_64_2x32_split_y => { + let src = src_string(alu.get_src(0), 0); + let dst = def_string(&alu.def, 0); + self.block + .push(format!("\tmov.b{src0_bit_size} {{_, {dst}}}, {src};")); + } + + // rotate + nir_op_urol | nir_op_uror => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let dst = def_string(&alu.def, 0); + + let op = match alu.op { + nir_op_urol => "shf.l.wrap.b", + nir_op_uror => "shf.r.wrap.b", + _ => unreachable!(), + }; + 
+ self.block + .push(format!("\t{op}32 {dst}, {src0}, {src0}, {src1};")) + } + + nir_op_bitfield_select => { + let src0 = src_string(alu.get_src(0), 0); + let src1 = src_string(alu.get_src(1), 0); + let src2 = src_string(alu.get_src(2), 0); + let dst = def_string(&alu.def, 0); + + self.block + .push(format!("\tlop3.b32 {dst}, {src0}, {src1}, {src2}, 0xca;")); + } + + // movs + nir_op_mov | nir_op_vec3 => { + if src0_bit_size == 8 { + let index = alu.def.index; + let vec = vec_string(alu.def.num_components); + self.vars.push(format!("\t.reg {vec} .b16 %r{index}mov;")); + } + + for lane in 0..info.num_inputs { + let src = src_string(alu.get_src(lane.into()), 0); + let dst = def_string(&alu.def, lane); + let dst_tmp = def_string_prefixed(&alu.def, lane, "mov"); + + // TODO: mov.b8 does not exist. Is there a better solution here? + // Maybe we should track movs and just copy prop while generating PTX? + if src0_bit_size == 8 { + self.block.extend_from_slice(&[ + format!("\tmov.b16 {dst_tmp}, {{{src}, {src}}};"), + format!("\tmov.b16 {{{dst}, _}}, {dst_tmp};"), + ]); + } else { + self.block + .push(format!("\tmov.b{src0_bit_size} {dst}, {src};")); + } + } + } + nir_op_vec2 | nir_op_vec4 => { + // TODO: mov.b8 does not exist. Is there a better solution here? + // Maybe we should track movs and just copy prop while generating PTX? + if src0_bit_size == 8 { + let def_bit_size = 8 * info.num_inputs; + let index = alu.def.index; + let mut srcs = Vec::new(); + for lane in 0..info.num_inputs { + srcs.push(src_string(alu.get_src(lane.into()), 0)); + } + let srcs = srcs.join(", "); + + self.vars + .push(format!("\t.reg .b{def_bit_size} %r{index}mov;")); + self.block.extend_from_slice(&[ + format!("\tmov.b{def_bit_size} %r{index}mov, {{{srcs}}};"), + format!("\tmov.b{def_bit_size} %r{index}, %r{index}mov;"), + ]); + } else { + // We can't move from vectors to vectors, so we need to do it lane by lane. 
+ for lane in 0..info.num_inputs { + let src = src_string(alu.get_src(lane.into()), 0); + let dst = def_string(&alu.def, lane); + self.block + .push(format!("\tmov.b{src0_bit_size} {dst}, {src};")); + } + } + } + _ => unreachable!("Unknown ALU {}", info.name()), + } + } + + fn parse_intrinsic(&mut self, intrin: &nir_intrinsic_instr) { + let vec; + let info = intrin.info(); + let bit_size = intrin.def.bit_size; + let num_components = intrin.def.num_components; + let index = intrin.def.index; + + if info.has_dest && intrin.intrinsic != nir_intrinsic_decl_reg { + self.create_dest_var(&intrin.def); + vec = vec_string(num_components); + } else { + vec = ""; + } + + match intrin.intrinsic { + // registers + nir_intrinsic_decl_reg => { + assert_eq!(intrin.get_const_index(NIR_INTRINSIC_NUM_ARRAY_ELEMS), 0); + assert_eq!(intrin.get_const_index(NIR_INTRINSIC_NUM_COMPONENTS), 1); + + let bit_size = intrin.get_const_index(NIR_INTRINSIC_BIT_SIZE); + + // Create temporary for 8 bit movs + if bit_size == 8 { + self.vars.push(format!("\t.reg .b16 %r{index}store_reg;")); + } + + self.vars.push(if bit_size == 1 { + format!("\t.reg .pred %r{index};") + } else { + format!("\t.reg .b{bit_size} %r{index};") + }); + + self.regs.insert( + index, + Reg { + index: index, + bit_size: bit_size as u8, + }, + ); + } + nir_intrinsic_load_reg => { + assert_eq!(intrin.base(), 0); + + let reg = intrin.get_src(0); + let reg = &self.regs[®.as_def().index]; + let val = def_string(&intrin.def, 0); + + let type_ = if reg.bit_size == 1 { "pred" } else { "b" }; + let bit_size = reg.bit_size; + let bit_size_str = bit_size_string(bit_size); + let reg = reg.src_string(); + + if bit_size == 8 { + self.vars.push(format!("\t.reg .b16 {val}load_reg;")); + self.block.extend_from_slice(&[ + format!("\tmov.b16 {val}load_reg, {{{reg}, {reg}}};"), + format!("\tmov.b16 {{{val}, _}}, {val}load_reg;"), + ]); + } else { + self.block + .push(format!("\tmov.{type_}{bit_size_str} {val}, {reg};")); + } + } + 
nir_intrinsic_store_reg => { + assert_eq!(intrin.base(), 0); + assert_eq!(intrin.write_mask(), 1); + + let reg = intrin.get_src(1); + let reg = &self.regs[®.as_def().index]; + let val = def_string(intrin.get_src(0).as_def(), 0); + + let type_ = if reg.bit_size == 1 { "pred" } else { "b" }; + let bit_size = reg.bit_size; + let bit_size_str = bit_size_string(bit_size); + let reg = reg.src_string(); + + if bit_size == 8 { + self.block.extend_from_slice(&[ + format!("\tmov.b16 {reg}store_reg, {{{val}, {val}}};"), + format!("\tmov.b16 {{{reg}, _}}, {reg}store_reg;"), + ]); + } else { + self.block + .push(format!("\tmov.{type_}{bit_size_str} {reg}, {val};")); + } + } + + // system values + nir_intrinsic_load_local_invocation_id + | nir_intrinsic_load_workgroup_id + | nir_intrinsic_load_workgroup_size => { + let sreg = match intrin.intrinsic { + nir_intrinsic_load_local_invocation_id => "tid", + nir_intrinsic_load_workgroup_id => "ctaid", + nir_intrinsic_load_workgroup_size => "ntid", + _ => unreachable!("unknown intrinsic"), + }; + + self.block.extend_from_slice(&[ + format!("\tmov.b32 %r{index}.x, %{sreg}.x;"), + format!("\tmov.b32 %r{index}.y, %{sreg}.y;"), + format!("\tmov.b32 %r{index}.z, %{sreg}.z;"), + ]); + } + nir_intrinsic_load_subgroup_invocation + | nir_intrinsic_load_subgroup_eq_mask + | nir_intrinsic_load_subgroup_ge_mask + | nir_intrinsic_load_subgroup_gt_mask + | nir_intrinsic_load_subgroup_le_mask + | nir_intrinsic_load_subgroup_lt_mask => { + let sreg = match intrin.intrinsic { + nir_intrinsic_load_subgroup_invocation => "laneid", + nir_intrinsic_load_subgroup_eq_mask => "lanemask_eq", + nir_intrinsic_load_subgroup_ge_mask => "lanemask_ge", + nir_intrinsic_load_subgroup_gt_mask => "lanemask_gt", + nir_intrinsic_load_subgroup_le_mask => "lanemask_le", + nir_intrinsic_load_subgroup_lt_mask => "lanemask_lt", + _ => unreachable!("unknown intrinsic"), + }; + + if num_components != 1 { + self.block + .push(format!("\tmov.b{bit_size} %r{index}.x, %{sreg};")); + 
for comp in 1..num_components { + let def = def_string(&intrin.def, comp); + self.block.push(format!("\tmov.b{bit_size} {def}, 0;")); + } + } else { + self.block + .push(format!("\tmov.b{bit_size} %r{index}, %{sreg};")); + } + } + nir_intrinsic_shader_clock => { + self.block + .push(format!("\tmov.b64 {{%r{index}.x, %r{index}.y}}, %clock64;")); + } + + // memory + nir_intrinsic_global_atomic + | nir_intrinsic_global_atomic_swap + | nir_intrinsic_shared_atomic + | nir_intrinsic_shared_atomic_swap => { + let address = intrin.get_src(0).as_def().index; + let value = intrin.get_src(1).as_def().index; + + let op = match intrin.atomic_op() { + nir_atomic_op_cmpxchg => "cas.b", + nir_atomic_op_iadd => "add.s", + nir_atomic_op_iand => "and.b", + nir_atomic_op_imax => "max.s", + nir_atomic_op_imin => "min.s", + nir_atomic_op_ior => "or.b", + nir_atomic_op_ixor => "xor.b", + nir_atomic_op_umax => "max.u", + nir_atomic_op_umin => "min.u", + nir_atomic_op_xchg => "exch.b", + op => unreachable!("unknown atomic op {op}"), + }; + + let mem = match intrin.intrinsic { + nir_intrinsic_global_atomic | nir_intrinsic_global_atomic_swap => "global", + nir_intrinsic_shared_atomic | nir_intrinsic_shared_atomic_swap => "shared", + _ => unreachable!(""), + }; + + let scope = match intrin.intrinsic { + _ if self.options.target_sm < 60 => "", + nir_intrinsic_global_atomic | nir_intrinsic_global_atomic_swap => ".gpu", + nir_intrinsic_shared_atomic | nir_intrinsic_shared_atomic_swap => ".cta", + _ => unreachable!(""), + }; + + let src1 = match intrin.intrinsic { + nir_intrinsic_global_atomic_swap | nir_intrinsic_shared_atomic_swap => { + let src1 = intrin.get_src(2).as_def().index; + format!(", %r{src1}") + } + _ => "".to_owned(), + }; + + self.block.push(format!( + "\tatom{scope}.{mem}.{op}{bit_size} %r{index}, [%r{address}], %r{value}{src1};" + )); + } + nir_intrinsic_load_kernel_input => { + let src = intrin.get_src(0); + let param = unsafe { (*nir_src_as_const_value(*src)).u64_ }; + + // 
Not sure if this is defined behavior, but we can offset the kernel parameter. + // TODO: only do it for array arguments? + self.block.push(format!( + "\tld.param{vec}.b{bit_size} %r{index}, [param0 + {param}];" + )); + } + nir_intrinsic_load_global + | nir_intrinsic_load_global_constant + | nir_intrinsic_load_shared => { + let address = intrin.get_src(0); + let address = address.as_def().index; + let mem = mem_from_intrinsic(intrin); + + self.block.push(format!( + "\tld.{mem}{vec}.b{bit_size} %r{index}, [%r{address}];" + )); + } + nir_intrinsic_store_global | nir_intrinsic_store_shared => { + let value = intrin.get_src(0); + let address = intrin.get_src(1); + let bit_size = value.bit_size(); + let num_components = value.num_components(); + let address = address.as_def().index; + let index = value.as_def().index; + let mem = mem_from_intrinsic(intrin); + let vec = if num_components > 1 { + format!(".v{num_components}") + } else { + "".to_owned() + }; + + self.block.push(format!( + "\tst.{mem}{vec}.b{bit_size} [%r{address}], %r{index};" + )); + } + nir_intrinsic_load_scratch => { + let address = intrin.get_src(0); + let address = address.as_def().index; + + self.block.extend_from_slice(&[ + format!("\tadd.u32 %LMEM_TMP, %LMEM, %r{address};"), + format!("\tld.local{vec}.b{bit_size} %r{index}, [%LMEM_TMP];"), + ]); + } + nir_intrinsic_store_scratch => { + let value = intrin.get_src(0); + let address = intrin.get_src(1); + let bit_size = value.bit_size(); + let num_components = value.num_components(); + let address = address.as_def().index; + let index = value.as_def().index; + let vec = if num_components > 1 { + format!(".v{num_components}") + } else { + "".to_owned() + }; + + self.block.extend_from_slice(&[ + format!("\tadd.u32 %LMEM_TMP, %LMEM, %r{address};"), + format!("\tst.local{vec}.b{bit_size} [%LMEM_TMP], %r{index};"), + ]); + } + + // barriers + nir_intrinsic_barrier => { + let exec_scope = intrin.execution_scope(); + let mem_scope = intrin.memory_scope(); + 
let mem_sem = intrin.memory_semantics(); + let mem_modes = intrin.memory_modes(); + + match exec_scope { + SCOPE_SUBGROUP => { + self.block.push(format!("\tbar.warp.sync 0xffffffff;")); + } + SCOPE_WORKGROUP => { + self.block.push(format!("\tbar.sync 0;")); + } + _ => {} + } + + if mem_modes & (nir_var_mem_global | nir_var_image | nir_var_mem_shared) != 0 { + if self.options.target_sm >= 70 { + let mem_sem = match mem_sem { + // TODO: bug in the PTX compiler? + NIR_MEMORY_ACQUIRE | //=> "acquire", + NIR_MEMORY_RELEASE | //=> "release", + NIR_MEMORY_ACQ_REL => "acq_rel", + _ => unreachable!("unkown barrier semantics"), + }; + + let mem_scope = match mem_scope { + SCOPE_SUBGROUP | SCOPE_SHADER_CALL | SCOPE_WORKGROUP => "cta", + SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => "gpu", + _ => unreachable!("unkown barrier scope"), + }; + + self.block.push(format!("\tfence.{mem_sem}.{mem_scope};")); + } else { + let level = match mem_scope { + SCOPE_SUBGROUP | SCOPE_SHADER_CALL | SCOPE_WORKGROUP => "cta", + SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => "gl", + _ => unreachable!("unkown barrier scope"), + }; + + self.block.push(format!("\tmembar.{level};")); + } + } + } + + // subgroups + nir_intrinsic_ballot => { + let value = intrin.get_src(0).as_def().index; + + self.block.push(format!( + "\tvote.sync.ballot.b32 %r{index}, %r{value}, 0xffffffff;" + )); + } + // TODO: sm90+ + // nir_intrinsic_elect => { + // self.block.push(format!( + // "\telect.sync _|%r{index}, 0xffffffff;" + // )); + // } + nir_intrinsic_read_invocation + | nir_intrinsic_shuffle + | nir_intrinsic_shuffle_down + | nir_intrinsic_shuffle_up + | nir_intrinsic_shuffle_xor => { + let value = intrin.get_src(0).as_def().index; + let lane = intrin.get_src(1).as_def().index; + + let (mode, val) = match intrin.intrinsic { + nir_intrinsic_shuffle_down => ("down", 0x1f), + nir_intrinsic_shuffle_up => ("up", 0), + nir_intrinsic_shuffle_xor => ("bfly", 0x1f), + nir_intrinsic_read_invocation | nir_intrinsic_shuffle => ("idx", 0x1f), + _ 
=> unreachable!(), + }; + + self.block.push(format!( + "\tshfl.sync.{mode}.b32 %r{index}, %r{value}, %r{lane}, 0x{val:x}, 0xffffffff;" + )); + } + nir_intrinsic_reduce => { + let value = intrin.get_src(0).as_def().index; + + let op = match intrin.reduction_op() { + nir_op_iadd => "add.s", + nir_op_iand => "and.b", + nir_op_imax => "max.s", + nir_op_imin => "min.s", + nir_op_ior => "or.b", + nir_op_ixor => "xor.b", + nir_op_umax => "max.u", + nir_op_umin => "min.u", + op => unreachable!("Unknown reduction op: {op}"), + }; + + self.block.push(format!( + "\tredux.sync.{op}32 %r{index}, %r{value}, 0xffffffff;", + )); + } + nir_intrinsic_vote_all | nir_intrinsic_vote_any => { + let value = intrin.get_src(0).as_def().index; + let mode = match intrin.intrinsic { + nir_intrinsic_vote_all => "all", + nir_intrinsic_vote_any => "any", + _ => unreachable!(""), + }; + + self.block.push(format!( + "\tvote.sync.{mode}.pred %r{index}, %r{value}, 0xffffffff;" + )); + } + nir_intrinsic_vote_ieq => { + let value = intrin.get_src(0).as_def(); + let bit_size = value.bit_size; + let value = value.index; + + self.block.push(format!( + "\tmatch.all.sync.b{bit_size} _|%r{index}, %r{value}, 0xffffffff;" + )); + } + _ => unreachable!("Unknown intrinsic {}", info.name()), + } + } + + fn parse_jump(&mut self, jump: &nir_jump_instr) { + match jump.type_ { + nir_jump_goto => { + let jump_target = jump.target().unwrap().index; + self.block.push(format!("\t\tbra b{jump_target};")); + } + nir_jump_goto_if => { + let jump_target = jump.target().unwrap().index; + let else_target = jump.else_target().unwrap().index; + let cond = &jump.condition.as_def().index; + + self.block.extend_from_slice(&[ + format!("\t@%r{cond}\tbra b{jump_target};"), + format!("\t\tbra b{else_target};"), + ]); + } + _ => unreachable!("Unsupported jump instruction: {}", jump.type_), + } + } + + fn parse_instruction(&mut self, instr: &nir_instr) { + match instr.type_ { + nir_instr_type_load_const => { + let load_const = 
instr.as_load_const().unwrap(); + debug_assert_eq!(load_const.def.num_components, 1); + + let index = load_const.def.index; + let bit_size = load_const.def.bit_size; + + self.create_dest_var(&load_const.def); + let val = match bit_size { + 64 => unsafe { load_const.values()[0].u64_ }, + 32 => unsafe { load_const.values()[0].u32_ }.into(), + 16 => unsafe { load_const.values()[0].u16_ }.into(), + 8 => unsafe { load_const.values()[0].u8_ }.into(), + 1 => unsafe { load_const.values()[0].b }.into(), + _ => unreachable!("unsupported load_const"), + }; + + // Yeah.... + if bit_size == 1 { + self.block.push(format!("\tmov.pred %r{index}, {val};")); + } else if bit_size == 8 { + self.vars.push(format!("\t.reg .b16 %r{index}load_const;")); + self.block.extend_from_slice(&[ + format!("\tmov.b16 %r{index}load_const, {{{val}, 0}};"), + format!("\tmov.b16 {{%r{index}, _}}, %r{index}load_const;"), + ]); + } else { + self.block + .push(format!("\tmov.b{bit_size} %r{index}, {val};")); + } + } + nir_instr_type_undef => { + let undef = instr.as_undef().unwrap(); + self.create_dest_var(&undef.def); + } + nir_instr_type_alu => self.parse_alu(instr.as_alu().unwrap()), + nir_instr_type_intrinsic => self.parse_intrinsic(instr.as_intrinsic().unwrap()), + nir_instr_type_jump => self.parse_jump(instr.as_jump().unwrap()), + _ => unreachable!("Unsupported instruction type {}", instr.type_), + } + } +} + +#[no_mangle] +pub extern "C" fn nir_to_ptx(nir: *const nir_shader, options: NirToPtxOptions) -> NirToPtxOutput { + let mut state = NirToPtxState::new(nir, options); + state.init(); + state.parse().unwrap(); + state.finalize() +} + +#[no_mangle] +pub extern "C" fn nir_to_ptx_free(ptx: NirToPtxOutput) { + unsafe { Vec::from_raw_parts(ptx.ptx, ptx.ptx_len, ptx.ptx_capacity) }; +} diff --git a/src/gallium/drivers/nocl/compiler/meson.build b/src/gallium/drivers/nocl/compiler/meson.build new file mode 100644 index 00000000000..d630a0aeae2 --- /dev/null +++ 
b/src/gallium/drivers/nocl/compiler/meson.build @@ -0,0 +1,44 @@ +# Copyright © 2025 Karol Herbst +# SPDX-License-Identifier: MIT + +prog_cbindgen = find_program( + 'cbindgen', + required : true, + native : true, + version : '>= 0.25' +) + +libnir_to_ptx_h = custom_target( + 'libnir_to_ptx_h', + input : [files('cbindgen.toml'), 'lib.rs'], + output : ['nir_to_ptx.h'], + command : [ + prog_cbindgen, '-q', '--config', '@INPUT0@', '--lang', 'c', + '--output', '@OUTPUT0@', '--depfile', '@DEPFILE@', + '--', '@INPUT1@', + ], + depfile : 'nir_to_ptx.h.d', +) + +files_libnir_to_ptx = files( + 'lib.rs', +) + +libnir_to_ptx = static_library( + 'nir_to_ptx', + [files_libnir_to_ptx], + gnu_symbol_visibility : 'hidden', + rust_abi : 'c', + dependencies : [ + idep_compiler_rs, + ], +) + +idep_nir_to_ptx = declare_dependency( + sources : [ + libnir_to_ptx_h + ], + link_with : [ + libnir_to_ptx, + ] +) diff --git a/src/gallium/drivers/nocl/meson.build b/src/gallium/drivers/nocl/meson.build new file mode 100644 index 00000000000..50d1930a70e --- /dev/null +++ b/src/gallium/drivers/nocl/meson.build @@ -0,0 +1,54 @@ +# Copyright © 2025 Karol Herbst +# SPDX-License-Identifier: MIT + +subdir('compiler') + +files_libnocl = files( + 'nocl_nir.c', + 'nocl_pipe.c', + 'nocl_private.h', +) + +nocl_nir_algebraic_c = custom_target( + 'nocl_nir_algebraic.c', + input : 'nocl_nir_algebraic.py', + output : 'nocl_nir_algebraic.c', + command : [ + prog_python, '@INPUT@', + '-p', dir_compiler_nir, + '--out', '@OUTPUT@', + ], + depend_files : nir_algebraic_depends, +) + +libnocl = static_library( + 'nocl', + [ + files_libnocl, + nocl_nir_algebraic_c, + ], + include_directories : [ + inc_gallium, + inc_gallium_aux, + inc_gallium_winsys, + inc_util, + ], + gnu_symbol_visibility : 'hidden', + dependencies : [ + dep_libcuda, + idep_nir, + idep_nir_to_ptx, + ], +) + +driver_nocl = declare_dependency( + compile_args : '-DGALLIUM_NOCL', + dependencies : [ + dep_libcuda, + idep_libnoclwinsys, + ], + link_with 
: [ + libnocl, + libnir_to_ptx, + ], +) diff --git a/src/gallium/drivers/nocl/nocl_nir.c b/src/gallium/drivers/nocl/nocl_nir.c new file mode 100644 index 00000000000..12f2e029c5d --- /dev/null +++ b/src/gallium/drivers/nocl/nocl_nir.c @@ -0,0 +1,824 @@ +/* + * Copyright © 2022 Collabora, Ltd. + * Copyright © 2025 Karol Herbst + * + * SPDX-License-Identifier: MIT + */ + +#include "nocl_private.h" + +#include "nir.h" +#include "nir_builder.h" + +static void +push_block(nir_builder *b, nir_block *block) +{ + assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl))); + block->cf_node.parent = &b->impl->cf_node; + exec_list_push_tail(&b->impl->body, &block->cf_node.node); + b->cursor = nir_after_block(block); +} + +enum scope_type { + SCOPE_TYPE_SHADER, + SCOPE_TYPE_IF_MERGE, + SCOPE_TYPE_LOOP_BREAK, + SCOPE_TYPE_LOOP_CONT, +}; + +struct scope { + enum scope_type type; + + struct scope *parent; + uint32_t depth; + + nir_block *merge; + nir_def *bar; + + uint32_t escapes; +}; + +static struct scope +push_scope(nir_builder *b, + enum scope_type scope_type, + struct scope *parent, + nir_block *merge_block) +{ + struct scope scope = { + .type = scope_type, + .parent = parent, + .depth = parent->depth + 1, + .merge = merge_block, + }; + + return scope; +} + +static void +pop_scope(nir_builder *b, nir_def *esc_reg, struct scope scope) +{ + if (scope.bar == NULL) + return; + + if (scope.escapes > 0) { + /* Find the nearest scope with a sync. */ + nir_block *parent_merge = b->impl->end_block; + for (struct scope *p = scope.parent; p != NULL; p = p->parent) { + if (p->bar != NULL) { + parent_merge = p->merge; + break; + } + } + + /* No escape is ~0, halt is 0, and we choose outer scope indices such + * that outer scopes always have lower indices than inner scopes. + */ + nir_def *esc = nir_ult_imm(b, nir_load_reg(b, esc_reg), scope.depth); + + /* We have to put the escape in its own block to avoid critical edges. 
+ * If we just did goto_if, we would end up with multiple successors, + * including a jump to the parent's merge block which has multiple + * predecessors. + */ + nir_block *esc_block = nir_block_create(b->shader); + nir_block *next_block = nir_block_create(b->shader); + nir_goto_if(b, esc_block, esc, next_block); + push_block(b, esc_block); + nir_goto(b, parent_merge); + push_block(b, next_block); + } +} + +static enum scope_type +jump_target_scope_type(nir_jump_type jump_type) +{ + switch (jump_type) { + case nir_jump_break: return SCOPE_TYPE_LOOP_BREAK; + case nir_jump_continue: return SCOPE_TYPE_LOOP_CONT; + default: + UNREACHABLE("Unknown jump type"); + } +} + +static void +break_scopes(nir_builder *b, nir_def *esc_reg, + struct scope *current_scope, + nir_jump_type jump_type) +{ + nir_block *first_sync = NULL; + uint32_t target_depth = UINT32_MAX; + enum scope_type target_scope_type = jump_target_scope_type(jump_type); + for (struct scope *scope = current_scope; scope; scope = scope->parent) { + if (first_sync == NULL && scope->bar != NULL) + first_sync = scope->merge; + + if (scope->type == target_scope_type) { + if (first_sync == NULL) { + first_sync = scope->merge; + } else { + /* In order for our cascade to work, we need to have the invariant + * that anything which escapes any scope with a warp sync needs to + * target a scope with a warp sync. 
+ */ + assert(scope->bar != NULL); + } + target_depth = scope->depth; + break; + } else { + scope->escapes++; + } + } + assert(target_depth < UINT32_MAX); + + nir_store_reg(b, nir_imm_int(b, target_depth), esc_reg); + nir_goto(b, first_sync); +} + +static void +normal_exit(nir_builder *b, nir_def *esc_reg, nir_block *merge_block) +{ + assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl))); + nir_block *block = nir_cursor_current_block(b->cursor); + + if (!nir_block_ends_in_jump(block)) { + nir_store_reg(b, nir_imm_int(b, ~0), esc_reg); + nir_goto(b, merge_block); + } +} + +/* This is a heuristic for what instructions are allowed before we sync. + * Annoyingly, we've gotten rid of phis so it's not as simple as "is it a + * phi?". + */ +static bool +instr_is_allowed_before_sync(nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + /* We could probably allow more ALU as long as it doesn't contain + * derivatives but let's be conservative and only allow mov for now. + */ + return alu->op == nir_op_mov; + } + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_load_reg || + intrin->intrinsic == nir_intrinsic_store_reg; + } + + default: + return false; + } +} + +/** Returns true if our successor will sync for us + * + * This is a bit of a heuristic + */ +static bool +parent_scope_will_sync(nir_cf_node *node, struct scope *parent_scope) +{ + /* First search forward to see if there's anything non-trivial after this + * node within the parent scope. + */ + nir_block *block = nir_cf_node_as_block(nir_cf_node_next(node)); + nir_foreach_instr(instr, block) { + if (!instr_is_allowed_before_sync(instr)) + return false; + } + + /* There's another loop or if following and we didn't find a sync */ + if (nir_cf_node_next(&block->cf_node)) + return false; + + /* See if the parent scope will sync for us. 
*/ + if (parent_scope->bar != NULL) + return true; + + switch (parent_scope->type) { + case SCOPE_TYPE_SHADER: + return true; + + case SCOPE_TYPE_IF_MERGE: + return parent_scope_will_sync(block->cf_node.parent, + parent_scope->parent); + + case SCOPE_TYPE_LOOP_CONT: + /* In this case, the loop doesn't have a sync of its own so we're + * expected to be uniform before we hit the continue. + */ + return false; + + case SCOPE_TYPE_LOOP_BREAK: + UNREACHABLE("Loops must have a continue scope"); + + default: + UNREACHABLE("Unknown scope type"); + } +} + +static bool +block_is_merge(const nir_block *block) +{ + /* If it's unreachable, there is no merge */ + if (block->imm_dom == NULL) + return false; + + unsigned num_preds = 0; + set_foreach(&block->predecessors, entry) { + const nir_block *pred = entry->key; + + /* We don't care about unreachable blocks */ + if (pred->imm_dom == NULL) + continue; + + num_preds++; + } + + return num_preds > 1; +} + +static void +lower_cf_list(nir_builder *b, nir_def *esc_reg, struct scope *parent_scope, + struct exec_list *cf_list) +{ + foreach_list_typed_safe(nir_cf_node, node, node, cf_list) { + switch (node->type) { + case nir_cf_node_block: { + nir_block *block = nir_cf_node_as_block(node); + if (exec_list_is_empty(&block->instr_list)) + break; + + nir_cursor start = nir_before_block(block); + nir_cursor end = nir_after_block(block); + + nir_jump_instr *jump = NULL; + nir_instr *last_instr = nir_block_last_instr(block); + if (last_instr->type == nir_instr_type_jump) { + jump = nir_instr_as_jump(last_instr); + end = nir_before_instr(&jump->instr); + } + + nir_cf_list instrs; + nir_cf_extract(&instrs, start, end); + b->cursor = nir_cf_reinsert(&instrs, b->cursor); + + if (jump != NULL) { + if (jump->type == nir_jump_halt) { + /* Halt instructions map to OpExit on NVIDIA hardware and + * exited lanes never block a bsync. 
+ */ + nir_instr_remove(&jump->instr); + nir_builder_instr_insert(b, &jump->instr); + } else { + /* Everything else needs a break cascade */ + break_scopes(b, esc_reg, parent_scope, jump->type); + } + } + break; + } + + case nir_cf_node_if: { + nir_if *nif = nir_cf_node_as_if(node); + + nir_def *cond = nif->condition.ssa; + nir_instr_clear_src(NULL, &nif->condition); + + nir_block *then_block = nir_block_create(b->shader); + nir_block *else_block = nir_block_create(b->shader); + nir_block *merge_block = nir_block_create(b->shader); + + struct scope scope = push_scope(b, SCOPE_TYPE_IF_MERGE, + parent_scope, merge_block); + + nir_goto_if(b, then_block, cond, else_block); + + push_block(b, then_block); + lower_cf_list(b, esc_reg, &scope, &nif->then_list); + normal_exit(b, esc_reg, merge_block); + + push_block(b, else_block); + lower_cf_list(b, esc_reg, &scope, &nif->else_list); + normal_exit(b, esc_reg, merge_block); + + push_block(b, merge_block); + pop_scope(b, esc_reg, scope); + + break; + } + + case nir_cf_node_loop: { + nir_loop *loop = nir_cf_node_as_loop(node); + + nir_block *head_block = nir_block_create(b->shader); + nir_block *break_block = nir_block_create(b->shader); + nir_block *cont_block = nir_block_create(b->shader); + + /* TODO: We can potentially avoid the break sync for loops when the + * parent scope syncs for us. However, we still need to handle the + * continue clause cascading to the break. If there is a + * nir_jump_halt involved, then we have a real cascade where it needs + * to then jump to the next scope. Getting all these cases right + * while avoiding an extra sync for the loop break is tricky at best. 
+ */ + struct scope break_scope = push_scope(b, SCOPE_TYPE_LOOP_BREAK, + parent_scope, + break_block); + + nir_goto(b, head_block); + push_block(b, head_block); + + struct scope cont_scope = push_scope(b, SCOPE_TYPE_LOOP_CONT, + &break_scope, + cont_block); + + lower_cf_list(b, esc_reg, &cont_scope, &loop->body); + normal_exit(b, esc_reg, cont_block); + + push_block(b, cont_block); + + pop_scope(b, esc_reg, cont_scope); + + lower_cf_list(b, esc_reg, &break_scope, &loop->continue_list); + + nir_goto(b, head_block); + push_block(b, break_block); + + pop_scope(b, esc_reg, break_scope); + + break; + } + + default: + UNREACHABLE("Unknown CF node type"); + } + } +} + +static bool +lower_cf_func(nir_function *func) +{ + if (func->impl == NULL) + return false; + + if (exec_list_is_singular(&func->impl->body)) { + return nir_no_progress(func->impl); + } + + nir_function_impl *old_impl = func->impl; + + /* We use this in block_is_merge() */ + nir_metadata_require(old_impl, nir_metadata_dominance | nir_metadata_divergence); + + /* First, we temporarily get rid of SSA. This will make all our block + * motion way easier. Ask the pass to place reg writes directly in the + * immediate predecessors of the phis instead of trying to be clever. + * This will ensure that we never get a write to a uniform register from + * non-uniform control flow and makes our divergence reconstruction for + * phis more reliable. 
+ */ + nir_foreach_block(block, old_impl) + nir_lower_phis_to_regs_block(block, true); + + /* We create a whole new nir_function_impl and copy the contents over */ + func->impl = NULL; + nir_function_impl *new_impl = nir_function_impl_create(func); + new_impl->structured = false; + + /* We copy defs from the old impl */ + new_impl->ssa_alloc = old_impl->ssa_alloc; + + nir_builder b = nir_builder_at(nir_before_impl(new_impl)); + nir_def *esc_reg = nir_decl_reg(&b, 1, 32, 0); + + /* Having a function scope makes everything easier */ + struct scope scope = { + .type = SCOPE_TYPE_SHADER, + .merge = new_impl->end_block, + }; + lower_cf_list(&b, esc_reg, &scope, &old_impl->body); + normal_exit(&b, esc_reg, new_impl->end_block); + + /* Now sort by reverse PDFS and restore SSA + * + * Note: Since we created a new nir_function_impl, there is no metadata, + * dirty or otherwise, so we have no need to call nir_progress(). + */ + nir_sort_unstructured_blocks(new_impl); + + return true; +} + +/* Copied from nak_nir_lower_cf.c but without divergency or barriers */ +bool +nocl_nir_lower_cf(struct nir_shader *nir) +{ + bool progress = false; + + nir_foreach_function(func, nir) { + if (lower_cf_func(func)) + progress = true; + } + + return progress; +} + +nir_shader_compiler_options * +nocl_get_nir_options(struct nocl_screen *nocl) +{ + nir_shader_compiler_options *nir_options = ralloc(nocl, nir_shader_compiler_options); + *nir_options = (nir_shader_compiler_options) { + // TODO: + // .has_bit_test = nocl->ptx >= 60, + // .has_bitfield_select = nocl->ptx >= 43, + // .has_f2i32_rtne = true, + // .has_uclz = true, + // .has_dot_2x16 = true, + // .has_fneo_fcmpu = true, + // .has_fused_comp_and_csel = true, + // .has_pack_32_4x8 = true, + // .has_shfr32 = true, + + .fuse_ffma32 = true, + .has_fsub = true, + .has_imul24 = true, + .has_imad32 = true, + .has_isub = true, + .has_rotate32 = nocl->sm >= 32, + .has_sdot_4x8 = nocl->sm >= 61, + .has_sudot_4x8 = nocl->sm >= 61, + .has_udot_4x8 
= nocl->sm >= 61, + .has_umul24 = true, + .has_umad24 = true, + .lower_hadd = true, + .lower_find_lsb = true, + .lower_flrp16 = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_fisnormal = true, + .lower_fsat = true, + .lower_fsign = true, + .lower_int64_options = + /* _technically PTX supports it, but it fails to compile with vector sources... + * The hardware doesn't have an idiv anyway. */ + nir_lower_divmod64 | + nir_lower_iadd_sat64 | + nir_lower_imul64 | + nir_lower_scan_reduce_bitwise64 | + nir_lower_scan_reduce_iadd64 | + nir_lower_subgroup_shuffle64 | + (nocl->sm < 70 ? nir_lower_vote_ieq64 : 0), + .lower_insert_byte = true, + .lower_insert_word = true, + .lower_isign = true, + .lower_ldexp = true, + .lower_uadd_sat = true, + .lower_usub_sat = true, + .max_unroll_iterations = 32, + .support_16bit_alu = true, + }; + + return nir_options; +} + +static uint8_t +alu_width_cb(const nir_instr *instr, const void *) +{ + return 1; +} + +static unsigned +lower_bit_size_callback(const nir_instr *instr, void *data) +{ + const struct nocl_screen *nocl = data; + + switch (instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + unsigned dest_bit_size = alu->def.bit_size; + unsigned src0_bit_size = alu->src[0].src.ssa->bit_size; + switch (alu->op) { + case nir_op_extract_i8: + case nir_op_extract_i16: + case nir_op_extract_u8: + case nir_op_extract_u16: + case nir_op_bitfield_reverse: + if (dest_bit_size < 32) + return 32; + return 0; + case nir_op_bcsel: + if (dest_bit_size < 16) + return 16; + return 0; + + case nir_op_bit_count: + case nir_op_bitfield_insert: + case nir_op_fdiv: + case nir_op_frsq: + case nir_op_fsqrt: + case nir_op_iadd_sat: + case nir_op_ibitfield_extract: + case nir_op_isub_sat: + case nir_op_ubitfield_extract: + case nir_op_ufind_msb: + if (src0_bit_size < 32) + return 32; + return 0; + + case nir_op_iabs: + case nir_op_iadd: + case nir_op_iand: + case nir_op_idiv: + case nir_op_ieq: + case 
nir_op_ige: + case nir_op_ilt: + case nir_op_imad: + case nir_op_imax: + case nir_op_imin: + case nir_op_imul: + case nir_op_imul_high: + case nir_op_ine: + case nir_op_ineg: + case nir_op_inot: + case nir_op_ior: + case nir_op_irem: + case nir_op_ishl: + case nir_op_ishr: + case nir_op_isub: + case nir_op_ixor: + case nir_op_udiv: + case nir_op_uge: + case nir_op_ult: + case nir_op_umax: + case nir_op_umin: + case nir_op_umod: + case nir_op_umul_high: + case nir_op_ushr: + if (src0_bit_size == 8) + return 16; + return 0; + + case nir_op_fmax: + case nir_op_fmin: + if (src0_bit_size == 16 && nocl->sm < 80) + return 32; + return 0; + + case nir_op_feq: + case nir_op_fequ: + case nir_op_fge: + case nir_op_fgeu: + case nir_op_flt: + case nir_op_fltu: + case nir_op_fneo: + case nir_op_fneu: + if (src0_bit_size == 16 && nocl->sm < 53) + return 32; + return 0; + default: + return 0; + } + } + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_read_invocation: + case nir_intrinsic_reduce: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_xor: + if (intrin->def.bit_size < 32) + return 32; + return 0; + default: + return 0; + } + } + + default: + return 0; + } +} + +static bool +lower_subgroups_filter(const nir_intrinsic_instr *intrin, const void *data) +{ + const struct nocl_screen *nocl = data; + if (intrin->def.num_components > 1) + return true; + + switch (intrin->intrinsic) { + case nir_intrinsic_ballot_bit_count_reduce: + case nir_intrinsic_ballot_bit_count_exclusive: + case nir_intrinsic_ballot_bit_count_inclusive: + case nir_intrinsic_ballot_bitfield_extract: + case nir_intrinsic_ballot_find_lsb: + case nir_intrinsic_ballot_find_msb: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_first_invocation: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_inverse_ballot: + case 
nir_intrinsic_load_subgroup_size: + case nir_intrinsic_quad_vote_all: + case nir_intrinsic_quad_vote_any: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_rotate: + case nir_intrinsic_vote_feq: + /* TODO: exists on SM90+ */ + case nir_intrinsic_elect: + return true; + case nir_intrinsic_vote_ieq: + return nocl->sm < 70; + case nir_intrinsic_reduce: { + if (nocl->sm < 80) + return true; + unsigned cluster_size = nir_intrinsic_cluster_size(intrin); + if (cluster_size != 0 && cluster_size != 32) + return true; + + switch (nir_intrinsic_reduction_op(intrin)) { + case nir_op_fadd: + case nir_op_fmax: + case nir_op_fmin: + case nir_op_fmul: + case nir_op_imul: + return true; + case nir_op_imax: + case nir_op_imin: + case nir_op_umax: + case nir_op_umin: + return intrin->def.bit_size == 64; + default: + /* nir_lower_int64 will handle some of it for us */ + return false; + } + } + case nir_intrinsic_read_invocation: + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_xor: + return intrin->def.bit_size > 32; + default: + return false; + } +} + +static nir_mem_access_size_align +lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin, + uint8_t bytes, + uint8_t bit_size, + uint32_t align, + uint32_t align_offset, + bool offset_is_const, + enum gl_access_qualifier, + const void *cb_data) +{ + align = nir_combined_align(align, align_offset); + bytes = MIN2(MIN2(bytes, align), 16); + bit_size = MIN2(bit_size, bytes * 8); + + unsigned num_components = MIN2(bytes / (bit_size / 8), 4); + if (num_components == 3) + num_components = 2; + + return (nir_mem_access_size_align) { + .bit_size = bit_size, + .num_components = num_components, + .align = align, + .shift = nir_mem_access_shift_method_scalar, + }; +} + +static bool +lower_alu_vec_srcs(nir_builder *b, nir_alu_instr *alu, void *_data) +{ + const nir_op_info *info = &nir_op_infos[alu->op]; + + bool changed = false; + b->cursor = 
nir_before_instr(&alu->instr); + for (int i = 0; i < info->num_inputs; i++) { + if (info->input_sizes[i]) + continue; + + /* We lower everything that is bigger than vec4 _and_ 128 bits */ + nir_def *src_ssa = alu->src[i].src.ssa; + if (src_ssa->bit_size * src_ssa->num_components <= 128 && src_ssa->num_components <= 4) + continue; + + changed = true; + nir_def *comps[NIR_MAX_VEC_COMPONENTS]; + for (int c = 0; c < alu->def.num_components; c++) { + unsigned swizzle = alu->src[i].swizzle[c]; + alu->src[i].swizzle[c] = c; + + nir_const_value *const_val = nir_src_as_const_value(alu->src[i].src); + if (const_val) { + comps[c] = nir_build_imm(b, 1, alu->src[i].src.ssa->bit_size, &const_val[swizzle]); + } else { + comps[c] = nir_swizzle(b, alu->src[i].src.ssa, &swizzle, 1); + } + } + nir_def *src = nir_vec(b, comps, alu->def.num_components); + nir_src_rewrite(&alu->src[i].src, src); + } + + return changed; +} + +/* copied from nir_lower_alu_vec8_16_srcs */ +static bool +nir_lower_alu_vec_srcs(nir_shader *shader) +{ + return nir_shader_alu_pass(shader, lower_alu_vec_srcs, + nir_metadata_control_flow, + NULL); +} + +void +nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir) +{ + struct nocl_screen *nocl = nocl_screen(pscreen); + nir->info.max_subgroup_size = nocl->base.caps.shader_subgroup_size; + nir->info.min_subgroup_size = nocl->base.caps.shader_subgroup_size; + nir->info.api_subgroup_size = nocl->base.caps.shader_subgroup_size; + + struct nir_lower_compute_system_values_options sysval_options = { + .lower_local_invocation_index = true, + .lower_num_subgroups = true, + .lower_subgroup_id = true, + }; + + struct nir_lower_mem_access_bit_sizes_options mem_access_bit_sizes_options = { + .callback = lower_mem_access_bit_sizes_cb, + .cb_data = nocl, + .modes = nir_var_all, + }; + + struct nir_lower_subgroups_options subgroup_options = { + .ballot_bit_size = 32, + .ballot_components = 1, + .subgroup_size = 32, + .lower_elect = true, + 
.lower_first_invocation_to_ballot = true, + .lower_inverse_ballot = true, + .lower_quad_vote = true, + .lower_read_first_invocation = true, + .lower_reduce = true, + .lower_rotate_to_shuffle = true, + .lower_shuffle_to_32bit = true, + .lower_to_scalar = true, + .lower_vote_feq = true, + .lower_vote_ieq = nocl->sm < 70, + .filter = lower_subgroups_filter, + .filter_data = nocl, + }; + + NIR_PASS(_, nir, nir_lower_compute_system_values, &sysval_options); + + /* TODO: keeping 8 bit vectors would help a bit... */ + NIR_PASS(_, nir, nir_lower_all_phis_to_scalar); + NIR_PASS(_, nir, nir_lower_subgroups, &subgroup_options); + + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_opt_shrink_vectors, true); + NIR_PASS(progress, nir, nir_lower_mem_access_bit_sizes, &mem_access_bit_sizes_options); + NIR_PASS(progress, nir, nir_lower_alu_width, alu_width_cb, nocl); + NIR_PASS(progress, nir, nir_lower_alu_vec_srcs); + NIR_PASS(progress, nir, nir_lower_flrp, 16 | 32 | 64, false); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_cse); + } while (progress); + + do { + progress = false; + NIR_PASS(progress, nir, nir_lower_bit_size, lower_bit_size_callback, nocl); + NIR_PASS(progress, nir, nir_opt_algebraic_late); + NIR_PASS(progress, nir, nocl_nir_opt_algebraic_late, nocl); + if (progress) { + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, nir_copy_prop); + NIR_PASS(_, nir, nir_opt_dce); + NIR_PASS(_, nir, nir_opt_cse); + } + } while (progress); + + /* Run only once */ + NIR_PASS(progress, nir, nocl_nir_lower_algebraic_late, nocl); + if (progress) { + NIR_PASS(_, nir, nir_opt_constant_folding); + } + + /* After nir_opt_constant_folding */ + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_opt_dce); +} diff --git a/src/gallium/drivers/nocl/nocl_nir_algebraic.py 
b/src/gallium/drivers/nocl/nocl_nir_algebraic.py new file mode 100644 index 00000000000..756e1e69545 --- /dev/null +++ b/src/gallium/drivers/nocl/nocl_nir_algebraic.py @@ -0,0 +1,65 @@ +# Copyright © 2025 Karol Herbst +# SPDX-License-Identifier: MIT + +import argparse +import sys + +a = 'a' +b = 'b' +c = 'c' +s = 's' + +late_algebraic = [ + (('ior@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)), + (('iadd@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)), + (('ixor@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)), + (('ixor@32', ('iand', a, ('ixor', b, c)), c), ('bitfield_select', a, b, c)), + + (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c)), + (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c)), + (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b)), + (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c)), + (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c)), + (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c)), +] + +late_algebraic_lowering = [ + (('b2i8', a), ('u2u8', ('b2i16', a))), +] + +for bit_size in [16, 32, 64]: + late_algebraic_lowering += [ + ((f'ishl@{bit_size}', a, b), ('ishl', a, ('iand', b, bit_size - 1))), + ((f'ishr@{bit_size}', a, b), ('ishr', a, ('iand', b, bit_size - 1))), + ((f'ushr@{bit_size}', a, b), ('ushr', a, ('iand', b, bit_size - 1))), + ] + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--out', required=True, help='Output file.') + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + + import nir_algebraic # pylint: disable=import-error + + try: + with open(args.out, 'w', encoding='utf-8') as f: + f.write('#include "nocl_private.h"') + f.write(nir_algebraic.AlgebraicPass( + "nocl_nir_opt_algebraic_late", + late_algebraic, + [ + ("const 
struct nocl_screen *", "nocl"), + ]).render()) + f.write(nir_algebraic.AlgebraicPass( + "nocl_nir_lower_algebraic_late", + late_algebraic_lowering, + [ + ("const struct nocl_screen *", "nocl"), + ]).render()) + except Exception: + sys.exit(1) + +if __name__ == '__main__': + main() diff --git a/src/gallium/drivers/nocl/nocl_pipe.c b/src/gallium/drivers/nocl/nocl_pipe.c new file mode 100644 index 00000000000..a3a549af35e --- /dev/null +++ b/src/gallium/drivers/nocl/nocl_pipe.c @@ -0,0 +1,1027 @@ +/* + * Copyright © 2025 Karol Herbst + * SPDX-License-Identifier: MIT + */ + +#include "nocl_private.h" +#include "compiler/nir_to_ptx.h" +#include "util/blob.h" +#include "util/disk_cache.h" +#include "util/helpers.h" +#include "util/mesa-sha1.h" +#include "util/u_debug.h" +#include "util/u_screen.h" +#include "util/u_transfer.h" + +#include "git_sha1.h" +#include "nir.h" +#include "nir_serialize.h" + +/* clang-format off */ +static const struct debug_named_value nocl_debug_options[] = { + {"nir", NOCL_DBG_NIR, "Dump the generated NIRs"}, + {"ptx", NOCL_DBG_PTX, "Dump the PTX and compiler info messages"}, + {"jit_no_opts", NOCL_DBG_JIT_NO_OPTS, "Disables all JIT optimizations"}, + {"verbose", NOCL_DBG_VERBOSE, "Enables verbose logging"}, + DEBUG_NAMED_VALUE_END +}; +/* clang-format on */ + +static void +nocl_bind_compute_state(struct pipe_context *pctx, void *cso) +{ + struct nocl_context *ctx = nocl_context(pctx); + ctx->prog = nocl_program(cso); +} + +static void +nocl_bind_sampler_states(struct pipe_context *pctx, + mesa_shader_stage shader, + unsigned start_slot, unsigned num_samplers, + void **samplers) +{ + assert(!num_samplers); +} + +static void * +nocl_buffer_map(struct pipe_context *pctx, + struct pipe_resource *pres, + unsigned level, + unsigned usage, /* a combination of PIPE_MAP_x */ + const struct pipe_box *box, + struct pipe_transfer **out_transfer) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + struct nocl_buffer *res = 
nocl_buffer(pres); + struct nocl_transfer *xfer = CALLOC_STRUCT(nocl_transfer); + bool sync = !(usage & PIPE_MAP_UNSYNCHRONIZED); + + if (!xfer) + return NULL; + + void *ptr; + if (pres->usage & PIPE_USAGE_STAGING) { + ptr = res->ptr + box->x; + } else { + if (NOCL_CALL(cuMemAllocHost_v2(&xfer->ptr, box->width))) + goto err_alloc_host; + + if (!(usage & (PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_DISCARD_RANGE))) { + if (NOCL_CALL(cuMemcpyDtoHAsync_v2(xfer->ptr, res->dptr + box->x, box->width, ctx->cu_stream))) + goto err_copy_host; + } + + ptr = xfer->ptr; + } + + if (sync) + NOCL_CALL(cuStreamSynchronize(ctx->cu_stream)); + + *out_transfer = &xfer->base; + xfer->base = (struct pipe_transfer) { + .box = *box, + .level = level, + .resource = pres, + .usage = usage, + }; + + return ptr; + +err_copy_host: + NOCL_CALL(cuMemFreeHost(xfer->ptr)); +err_alloc_host: + FREE(xfer); + return NULL; +} + +static void +nocl_buffer_subdata(struct pipe_context *pctx, + struct pipe_resource *pres, + unsigned usage, + unsigned offset, + unsigned size, + const void *data) +{ + nocl_context_and_make_current(pctx); + struct nocl_buffer *res = nocl_buffer(pres); + NOCL_CALL(cuMemcpyHtoD_v2(res->dptr + offset, data, size)); +} + +static void +nocl_buffer_unmap(struct pipe_context *pctx, + struct pipe_transfer *pxfer) +{ + struct nocl_transfer *xfer = nocl_transfer(pxfer); + struct nocl_buffer *res = nocl_buffer(pxfer->resource); + + if (!(res->base.usage & PIPE_USAGE_STAGING)) { + nocl_context_and_make_current(pctx); + if (pxfer->usage & PIPE_MAP_WRITE) { + /* TODO: use async and delay freeing the host alloc until finish? 
*/ + NOCL_CALL(cuMemcpyHtoD_v2(res->dptr + pxfer->box.x, xfer->ptr, pxfer->box.width)); + } + + NOCL_CALL(cuMemFreeHost(xfer->ptr)); + } + + FREE(xfer); +} + +static void +nocl_clear_buffer(struct pipe_context *pctx, + struct pipe_resource *pres, + unsigned offset, + unsigned size, + const void *clear_value, + int clear_value_size) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + struct nocl_buffer *res = nocl_buffer(pres); + CUdeviceptr ptr = res->dptr + offset; + + uint32_t clamped; + + if (clear_value_size > 4) { + if (util_lower_clearsize_to_dword(clear_value, &clear_value_size, &clamped)) + clear_value = &clamped; + } + + switch (clear_value_size) { + case 1: + NOCL_CALL(cuMemsetD8Async(ptr, *(uint8_t*)clear_value, size, ctx->cu_stream)); + break; + case 2: + NOCL_CALL(cuMemsetD16Async(ptr, *(uint16_t*)clear_value, size / 2, ctx->cu_stream)); + break; + case 4: + NOCL_CALL(cuMemsetD32Async(ptr, *(uint32_t*)clear_value, size / 4, ctx->cu_stream)); + break; + default: + /* TODO: accelerate */ + u_default_clear_buffer(pctx, pres, offset, size, clear_value, clear_value_size); + break; + } +} + +static void +nocl_context_destroy(struct pipe_context *pctx) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + NOCL_CALL(cuEventDestroy_v2(ctx->cu_timestamp_start)); + NOCL_CALL(cuStreamSynchronize(ctx->cu_stream)); + NOCL_CALL(cuStreamDestroy_v2(ctx->cu_stream)); + ralloc_free(ctx); +} + +static struct nocl_program * +nocl_compile_nir(struct nocl_screen *nocl, nir_shader *nir) +{ + const char *kernel_name = nir_shader_get_entrypoint(nir)->function->name; + CUlinkState cu_linker = NULL; + void *cubin = NULL; + cache_key key; + + struct nocl_program *prog = rzalloc(nocl, struct nocl_program); + if (!prog) + return NULL; + + unsigned int threads; + if (nir->info.workgroup_size_variable) { + threads = nir->info.cs.workgroup_size_hint[0] + * nir->info.cs.workgroup_size_hint[1] + * nir->info.cs.workgroup_size_hint[2]; + } else { + 
threads = nir->info.workgroup_size[0] + * nir->info.workgroup_size[1] + * nir->info.workgroup_size[2]; + } + + if (nocl->disk_cache) { + struct blob blob; + size_t size; + + blob_init(&blob); + nir_serialize(&blob, nir, false); + disk_cache_compute_key(nocl->disk_cache, blob.data, blob.size, key); + blob_finish(&blob); + + cubin = disk_cache_get(nocl->disk_cache, key, &size); + } + + if (!cubin) { + NIR_PASS(_, nir, nocl_nir_lower_cf); + nir_foreach_function_impl(impl, nir) { + nir_metadata_require(impl, nir_metadata_block_index); + } + + if (nocl->debug & NOCL_DBG_NIR) + nir_print_shader(nir, stdout); + + char *error_log = CALLOC(1, NOCL_COMPILER_LOG_SIZE); + if (!error_log) + goto out_err; + + char *info_log = CALLOC(1, NOCL_COMPILER_LOG_SIZE); + if (!info_log) { + FREE(error_log); + goto out_err; + } + + /* TODO: better occupancy calculation, e.g. take shared_memory usage into account */ + CUjit_option options[] = { + CU_JIT_TARGET_FROM_CUCONTEXT, + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_LOG_VERBOSE, + CU_JIT_OPTIMIZATION_LEVEL, + CU_JIT_MAX_REGISTERS, + CU_JIT_POSITION_INDEPENDENT_CODE, + /* Keep last so we can skip setting it */ + CU_JIT_THREADS_PER_BLOCK, + }; + + void * values[] = { + NULL, + info_log, + (void *)(uintptr_t)NOCL_COMPILER_LOG_SIZE, + error_log, + (void *)(uintptr_t)NOCL_COMPILER_LOG_SIZE, + (void *)(nocl->debug & NOCL_DBG_VERBOSE ? 1ull : 0ull), + (void *)(nocl->debug & NOCL_DBG_JIT_NO_OPTS ? 
0ull : 4ull), + (void *)255, + (void *)0, + (void *)(uintptr_t)threads, + }; + STATIC_ASSERT(ARRAY_SIZE(options) == ARRAY_SIZE(values)); + + unsigned num_options = ARRAY_SIZE(options); + if (threads == 0) + num_options -= 1; + + struct NirToPtxOptions ptx_options = { + .target_sm = nocl->sm, + .target_ptx = nocl->ptx, + }; + struct NirToPtxOutput ptx = nir_to_ptx(nir, ptx_options); + if (nocl->debug & NOCL_DBG_PTX) + printf("PTX input:\n%s\n", ptx.ptx); + + size_t cubin_size; + NOCL_CALL(cuLinkCreate_v2(num_options, options, values, &cu_linker)); + if (!NOCL_CALL(cuLinkAddData_v2(cu_linker, CU_JIT_INPUT_PTX, ptx.ptx, ptx.ptx_len, kernel_name, 0, NULL, NULL))) + NOCL_CALL(cuLinkComplete(cu_linker, &cubin, &cubin_size)); + nir_to_ptx_free(ptx); + + if (nocl->debug & NOCL_DBG_PTX) { + if (strlen(info_log)) + printf("PTXJIT info log:\n%s\n", info_log); + } + + if (strlen(error_log)) + printf("PTXJIT error log:\n%s\n", error_log); + + FREE(info_log); + FREE(error_log); + + if (!cubin) + goto out_err; + + if (nocl->disk_cache) + disk_cache_put(nocl->disk_cache, key, cubin, cubin_size, NULL); + } + + prog->shared_size = nir->info.shared_size; + + CUjit_option options[] = { + CU_JIT_TARGET_FROM_CUCONTEXT, + CU_JIT_OPTIMIZATION_LEVEL, + CU_JIT_MAX_REGISTERS, + CU_JIT_POSITION_INDEPENDENT_CODE, + /* Keep last so we can skip setting it */ + CU_JIT_THREADS_PER_BLOCK, + }; + + unsigned num_options = ARRAY_SIZE(options); + if (threads == 0) + num_options -= 1; + + void * values[] = { + NULL, + (void *)(nocl->debug & NOCL_DBG_JIT_NO_OPTS ? 
0ull : 4ull), + (void *)256, + (void *)0, + (void *)(uintptr_t)threads, + }; + STATIC_ASSERT(ARRAY_SIZE(options) == ARRAY_SIZE(values)); + + NOCL_CALL(cuModuleLoadDataEx(&prog->cu_mod, cubin, num_options, options, values)); + NOCL_CALL(cuModuleGetFunction(&prog->cu_func, prog->cu_mod, kernel_name)); + + /* The linker owns the cubin, so destroy only after creating the lib object */ + if (cu_linker) + NOCL_CALL(cuLinkDestroy(cu_linker)); + +out_err: + return prog; +} + +static void +nocl_delete_compute_state(struct pipe_context *, void *mod_ptr) +{ + struct nocl_program *prog = nocl_program(mod_ptr); + NOCL_CALL(cuModuleUnload(prog->cu_mod)); + ralloc_free(prog); +} + +static void * +nocl_create_compute_state(struct pipe_context *pctx, + const struct pipe_compute_state *comp) +{ + assert(comp->ir_type == PIPE_SHADER_IR_NIR); + + struct nocl_screen *nocl = nocl_screen_and_make_current(pctx->screen); + nir_shader *nir = (void *)comp->prog; + + struct nocl_program *prog = nocl_compile_nir(nocl, nir); + if (!prog->cu_mod || !prog->cu_func) { + nocl_delete_compute_state(pctx, prog); + prog = NULL; + goto out_err; + } + +out_err: + ralloc_free(nir); + return prog; +} + +static struct pipe_query * +nocl_create_query(struct pipe_context *pctx, + unsigned query_type, + unsigned index) +{ + assert(query_type == PIPE_QUERY_TIMESTAMP); + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + + struct pipe_query *query = CALLOC_STRUCT(pipe_query); + if (!query) + return NULL; + + NOCL_CALL(cuEventCreate(&query->cu_event_base, CU_EVENT_DEFAULT)); + NOCL_CALL(cuEventCreate(&query->cu_event, CU_EVENT_BLOCKING_SYNC)); + if (!query->cu_event_base || !query->cu_event) { + if (query->cu_event) + NOCL_CALL(cuEventDestroy(query->cu_event)); + if (query->cu_event_base) + NOCL_CALL(cuEventDestroy(query->cu_event_base)); + FREE(query); + return NULL; + } + + NOCL_CALL(cuEventRecord(query->cu_event_base, ctx->cu_stream)); + return query; +} + +static void * 
+nocl_create_sampler_state(struct pipe_context *, + const struct pipe_sampler_state *) +{ + assert(false); + return NULL; +} + +static void +nocl_delete_sampler_state(struct pipe_context *, void *) +{ + assert(false); +} + +static void +nocl_destroy_query(struct pipe_context *pipe, + struct pipe_query *query) +{ + NOCL_CALL(cuEventDestroy(query->cu_event)); + NOCL_CALL(cuEventDestroy(query->cu_event_base)); + FREE(query); +} + +static bool +nocl_end_query(struct pipe_context *pctx, struct pipe_query *query) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + NOCL_CALL(cuEventRecord(query->cu_event, ctx->cu_stream)); + return true; +} + +static void +nocl_flush(struct pipe_context *pctx, + struct pipe_fence_handle **fence, + unsigned flags) +{ + struct pipe_fence_handle *new_fence = CALLOC_STRUCT(pipe_fence_handle); + if (!new_fence) { + *fence = NULL; + return; + } + + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + if (NOCL_CALL(cuEventCreate(&new_fence->cu_event, CU_EVENT_BLOCKING_SYNC))) { + *fence = NULL; + FREE(new_fence); + } else { + NOCL_CALL(cuEventRecord(new_fence->cu_event, ctx->cu_stream)); + pipe_reference_init(&new_fence->ref, 1); + *fence = new_fence; + } +} + +static void +nocl_get_compute_state_info(struct pipe_context *pctx, void *cso, + struct pipe_compute_state_object_info *info) +{ + struct nocl_screen *nocl = nocl_screen(pctx->screen); + struct nocl_program *prog = nocl_program(cso); + + if (!prog) + return; + + *info = (struct pipe_compute_state_object_info) { + .max_threads = nocl->base.compute_caps.max_threads_per_block, + .preferred_simd_size = nocl->base.caps.shader_subgroup_size, + .simd_sizes = nocl->base.compute_caps.subgroup_sizes, + }; + + NOCL_CALL(cuFuncGetAttribute((int*)&info->private_memory, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, prog->cu_func)); + NOCL_CALL(cuFuncGetAttribute((int*)&info->max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, prog->cu_func)); +} + +static bool 
+nocl_get_query_result(struct pipe_context *pctx, + struct pipe_query *query, + bool wait, + union pipe_query_result *result) +{ + assert(wait); + + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + float time_base, time; + + /* In order to not lose precision over time, we'll use two event objects */ + NOCL_CALL(cuEventSynchronize(query->cu_event)); + NOCL_CALL(cuEventElapsedTime_v1(&time_base, ctx->cu_timestamp_start, query->cu_event_base)); + NOCL_CALL(cuEventElapsedTime_v1(&time, query->cu_event_base, query->cu_event)); + + double hires_time_base = (double)time_base; + double hires_time = (double)time; + + /* the resolution is 0.5us, so let's add half of that that just in case */ + result->u64 = ceil((hires_time_base + hires_time) * 1000000) + ctx->timestamp_start + 250; + return true; +} + +static void +nocl_launch_grid(struct pipe_context *pctx, + const struct pipe_grid_info *info) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + struct nocl_program *prog = ctx->prog; + + void *input[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, (void *)ctx->kernel_input, + CU_LAUNCH_PARAM_BUFFER_SIZE, &ctx->kernel_input_size, + CU_LAUNCH_PARAM_END + }; + + NOCL_CALL(cuLaunchKernel( + prog->cu_func, + info->grid[0], info->grid[1], info->grid[2], + info->block[0], info->block[1], info->block[2], + prog->shared_size + info->variable_shared_mem, + ctx->cu_stream, + NULL, + input + )); +} + +static void +nocl_memory_barrier(struct pipe_context *, unsigned flags) +{ + // assert(false); +} + +static void +nocl_resource_copy_region(struct pipe_context *pctx, + struct pipe_resource *pdst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *psrc, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct nocl_context *ctx = nocl_context_and_make_current(pctx); + struct nocl_buffer *src = nocl_buffer(psrc); + struct nocl_buffer *dst = nocl_buffer(pdst); + + if (psrc->target == PIPE_BUFFER && pdst->target 
== PIPE_BUFFER) { + NOCL_CALL(cuMemcpyAsync(dst->dptr + dstx, src->dptr + src_box->x, src_box->width, ctx->cu_stream)); + } else { + assert(false); + } +} + +static void +nocl_sampler_view_destroy(struct pipe_context *ctx, + struct pipe_sampler_view *view) +{ + assert(false); +} + +static void +nocl_sampler_view_release(struct pipe_context *ctx, + struct pipe_sampler_view *view) +{ + assert(false); +} + +static void +nocl_set_constant_buffer(struct pipe_context *pctx, + mesa_shader_stage shader, uint index, + const struct pipe_constant_buffer *buf) +{ + struct nocl_context *ctx = nocl_context(pctx); + assert(index == 0); + assert(shader == MESA_SHADER_COMPUTE); + + if (buf) { + assert(buf->buffer == NULL); + assert(buf->buffer_offset == 0); + + if (buf->buffer_size > ctx->kernel_input_capacity) { + ralloc_free(ctx->kernel_input); + ctx->kernel_input = ralloc_size(ctx, buf->buffer_size); + } + + ctx->kernel_input_size = buf->buffer_size; + memcpy(ctx->kernel_input, buf->user_buffer, buf->buffer_size); + } else { + ctx->kernel_input_size = 0; + } +} + +static void +nocl_set_global_binding(struct pipe_context *pctx, + unsigned first, unsigned count, + struct pipe_resource **resources, + uint32_t **handles) +{ + if (!resources) + return; + + for (unsigned i = 0; i < count; i++) { + struct nocl_buffer *res = nocl_buffer(resources[i]); + uint64_t handle; + + memcpy(&handle, handles[i], 8); + handle += res->dptr; + memcpy(handles[i], &handle, 8); + } +} + +static void +nocl_set_sampler_views(struct pipe_context *pctx, + mesa_shader_stage shader, + unsigned start_slot, unsigned num_views, + unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views) +{ + assert(!num_views); +} + +static void +nocl_set_shader_images(struct pipe_context *pctx, + mesa_shader_stage shader, + unsigned start_slot, unsigned count, + unsigned unbind_num_trailing_slots, + const struct pipe_image_view *images) +{ + assert(!count); +} + +static void * +nocl_texture_map(struct pipe_context 
*, + struct pipe_resource *resource, + unsigned level, + unsigned usage, /* a combination of PIPE_MAP_x */ + const struct pipe_box *, + struct pipe_transfer **out_transfer) +{ + assert(false); + return NULL; +} + +static void +nocl_texture_subdata(struct pipe_context *, + struct pipe_resource *, + unsigned level, + unsigned usage, /* a combination of PIPE_MAP_x */ + const struct pipe_box *, + const void *data, + unsigned stride, + uintptr_t layer_stride) +{ + assert(false); +} + +static void +nocl_texture_unmap(struct pipe_context *, + struct pipe_transfer *transfer) +{ + assert(false); +} + +static struct pipe_context * +nocl_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) +{ + nocl_screen_and_make_current(pscreen); + struct nocl_context *ctx = rzalloc(pscreen, struct nocl_context); + + ctx->base = (struct pipe_context) { + .bind_compute_state = nocl_bind_compute_state, + .bind_sampler_states = nocl_bind_sampler_states, + .buffer_map = nocl_buffer_map, + .buffer_subdata = nocl_buffer_subdata, + .buffer_unmap = nocl_buffer_unmap, + .clear_buffer = nocl_clear_buffer, + .create_compute_state = nocl_create_compute_state, + .create_query = nocl_create_query, + .create_sampler_state = nocl_create_sampler_state, + .delete_compute_state = nocl_delete_compute_state, + .delete_sampler_state = nocl_delete_sampler_state, + .destroy = nocl_context_destroy, + .destroy_query = nocl_destroy_query, + .end_query = nocl_end_query, + .flush = nocl_flush, + .get_compute_state_info = nocl_get_compute_state_info, + .get_query_result = nocl_get_query_result, + .launch_grid = nocl_launch_grid, + .memory_barrier = nocl_memory_barrier, + .resource_copy_region = nocl_resource_copy_region, + .sampler_view_destroy = nocl_sampler_view_destroy, + .sampler_view_release = nocl_sampler_view_release, + .set_constant_buffer = nocl_set_constant_buffer, + .set_global_binding = nocl_set_global_binding, + .set_sampler_views = nocl_set_sampler_views, + .set_shader_images = 
nocl_set_shader_images, + .screen = pscreen, + .texture_map = nocl_texture_map, + .texture_subdata = nocl_texture_subdata, + .texture_unmap = nocl_texture_unmap, + }; + + /* TODO: support multiple streams */ + ctx->cu_stream = NULL; + // if (nocl_error(cuStreamCreate(&ctx->cu_stream, CU_STREAM_NON_BLOCKING))) { + // FREE(ctx); + // return NULL; + // } + + NOCL_CALL(cuEventCreate(&ctx->cu_timestamp_start, CU_EVENT_BLOCKING_SYNC)); + NOCL_CALL(cuEventRecord(ctx->cu_timestamp_start, ctx->cu_stream)); + NOCL_CALL(cuEventSynchronize(ctx->cu_timestamp_start)); + /* Need to fetch the host time after waiting on the event */ + ctx->timestamp_start = pscreen->get_timestamp(pscreen); + + return &ctx->base; +} + +static void +nocl_destroy(struct pipe_screen *pscreen) +{ + struct nocl_screen *nocl = nocl_screen_and_make_current(pscreen); +#if CUDA_VERSION >= 12090 + NOCL_CALL(cuLogsUnregisterCallback(nocl->verbose_handle)); +#endif + disk_cache_destroy(nocl->disk_cache); + NOCL_CALL(cuCtxDestroy(nocl->cu_ctx)); + ralloc_free(nocl); +} + +#if CUDA_VERSION >= 11040 +static void +nocl_get_device_uuid(struct pipe_screen *pscreen, char *uuid) +{ + struct nocl_screen *nocl = nocl_screen(pscreen); + CUuuid uuid_tmp; + + STATIC_ASSERT(sizeof(uuid_tmp.bytes) == PIPE_UUID_SIZE); + NOCL_CALL(cuDeviceGetUuid_v2(&uuid_tmp, nocl->cu_dev)); + memcpy(uuid, uuid_tmp.bytes, PIPE_UUID_SIZE); +} +#endif + +static void +nocl_get_driver_uuid(struct pipe_screen *screen, char *uuid) +{ + const char* nocl_driver = PACKAGE_VERSION MESA_GIT_SHA1; + struct mesa_sha1 sha1_ctx; + uint8_t sha1[20]; + + _mesa_sha1_init(&sha1_ctx); + _mesa_sha1_update(&sha1_ctx, nocl_driver, strlen(nocl_driver)); + _mesa_sha1_final(&sha1_ctx, sha1); + memcpy(uuid, sha1, PIPE_UUID_SIZE); +} + +static bool +nocl_fence_finish(struct pipe_screen *pscreen, + struct pipe_context *pctx, + struct pipe_fence_handle *fence, + uint64_t timeout) +{ + nocl_screen_and_make_current(pscreen); + return 
!NOCL_CALL(cuEventSynchronize(fence->cu_event)); +} + +static void +nocl_fence_reference(struct pipe_screen *screen, + struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) +{ + if (pipe_reference(*dst ? &(*dst)->ref : NULL, + src ? &src->ref : NULL)) { + nocl_fence_finish(screen, NULL, *dst, OS_TIMEOUT_INFINITE); + NOCL_CALL(cuEventDestroy_v2((*dst)->cu_event)); + FREE(*dst); + } + + *dst = src; +} + +static const char * +nocl_get_device_vendor(struct pipe_screen *) +{ + return "NVIDIA?"; +} + +static struct disk_cache * +nocl_get_disk_shader_cache(struct pipe_screen *pscreen) +{ + return nocl_screen(pscreen)->disk_cache; +} + +static const char * +nocl_get_name(struct pipe_screen *pscreen) +{ + struct nocl_screen *nocl = nocl_screen(pscreen); + return nocl->name; +} + +static bool +nocl_is_format_supported(struct pipe_screen *, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, + unsigned bindings) +{ + return false; +} + +static struct pipe_resource * +nocl_buffer_create(struct pipe_screen *pscreen, + const struct pipe_resource *template) +{ + nocl_screen_and_make_current(pscreen); + struct nocl_buffer *cu_res = CALLOC_STRUCT_CL(nocl_buffer); + if (!cu_res) + return NULL; + + if (template->usage & PIPE_USAGE_STAGING) { + if (NOCL_CALL(cuMemAllocHost_v2(&cu_res->ptr, template->width0))) { + FREE(cu_res); + return NULL; + } + } else { + if (NOCL_CALL(cuMemAlloc_v2(&cu_res->dptr, template->width0))) { + FREE(cu_res); + return NULL; + } + } + + cu_res->base = *template; + cu_res->base.screen = pscreen; + pipe_reference_init(&cu_res->base.reference, 1); + return &cu_res->base; +} + +static struct pipe_resource * +nocl_resource_create(struct pipe_screen *pscreen, + const struct pipe_resource *template) +{ + if (template->target == PIPE_BUFFER) + return nocl_buffer_create(pscreen, template); + return NULL; +} + +static void +nocl_resource_destroy(struct pipe_screen *pscreen, + struct 
pipe_resource *pres) +{ + if (pres->target == PIPE_BUFFER) { + struct nocl_buffer *res = nocl_buffer(pres); + if (pres->usage & PIPE_USAGE_STAGING) { + NOCL_CALL(cuMemFreeHost(res->ptr)); + } else { + NOCL_CALL(cuMemFree_v2(res->dptr)); + } + } + + FREE(pres); +} + +static struct pipe_resource * +nocl_resource_from_user_memory(struct pipe_screen *pscreen, + const struct pipe_resource *pres, + void *user_memory) +{ + /* TODO: use cuMemHostRegister */ + return NULL; +} + +static uint64_t +nocl_resource_get_address(struct pipe_screen *pscreen, + struct pipe_resource *pres) +{ + STATIC_ASSERT(sizeof(CUdeviceptr) == sizeof(void *)); + assert(pres->target == PIPE_BUFFER); + struct nocl_buffer *res = nocl_buffer(pres); + return res->dptr; +} + +#if CUDA_VERSION >= 12090 +static void +nocl_log_callback(void *data, CUlogLevel logLevel, char *message, size_t length) +{ + UNUSED struct nocl_screen *nocl = data; + printf("NOCL-VERBOSE: %s\n", message); +} +#endif + +struct pipe_screen * +nocl_create_screen(struct cuda_pipe_loader_device *dev) +{ + CUdevice cu_dev = dev->dev; + struct nocl_screen *nocl = rzalloc(NULL, struct nocl_screen); + if (!nocl) + return NULL; + + nocl->debug = debug_get_flags_option("NOCL_DEBUG", nocl_debug_options, 0); + +#if CUDA_VERSION >= 12090 + if (nocl->debug & NOCL_DBG_VERBOSE) + NOCL_CALL(cuLogsRegisterCallback(nocl_log_callback, nocl, &nocl->verbose_handle)); +#endif + + NOCL_CALL(cuDeviceGetName(nocl->name, sizeof(nocl->name), cu_dev)); + if (NOCL_CALL(cuCtxCreate_v2(&nocl->cu_ctx, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, cu_dev))) { +#if CUDA_VERSION >= 12090 + NOCL_CALL(cuLogsUnregisterCallback(nocl->verbose_handle)); +#endif + ralloc_free(nocl); + return NULL; + } + + nocl->sm = debug_get_num_option("NOCL_SM_TARGET", 0); + if (!nocl->sm) { + int sm_tmp; + NOCL_CALL(cuDeviceGetAttribute(&sm_tmp, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_dev)); + nocl->sm = sm_tmp * 10; + NOCL_CALL(cuDeviceGetAttribute(&sm_tmp, 
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_dev)); + assert(sm_tmp < 10); + nocl->sm |= sm_tmp; + } + + nocl->ptx = debug_get_num_option("NOCL_PTX_VERSION", 0); + if (!nocl->ptx) { + /* We have to set the highest PTX version to the sm level we enable features upon + * e.g. SM80+ enables fp16 min/max and subgroup reduce, therefore the highest PTX version we + * use is 7.0. If we'd start supporting `match`, we'd need to check for SM90 -> PTX 7.8 + */ + if (dev->cuda_version >= 11000) + nocl->ptx = 70; /* SM80 */ + else if (dev->cuda_version >= 10000) + nocl->ptx = 63; /* SM75 */ + else if (dev->cuda_version >= 9010) + nocl->ptx = 61; /* SM72 */ + else if (dev->cuda_version >= 9000) + nocl->ptx = 60; /* SM70 */ + else if (dev->cuda_version >= 8000) + nocl->ptx = 50; /* SM60/SM61/SM62 */ + else if (dev->cuda_version >= 7500) + nocl->ptx = 43; /* lop3 */ + else if (dev->cuda_version >= 7000) + nocl->ptx = 42; /* SM53 */ + else if (dev->cuda_version >= 6500) + nocl->ptx = 41; + } + + { + /* TODO: also check libcuda */ + const struct build_id_note *note = build_id_find_nhdr_for_addr(nocl_create_screen); + assert(note && build_id_length(note) == 20); /* sha1 */ + const uint8_t *id_sha1 = build_id_data(note); + assert(id_sha1); + + char timestamp[41]; + _mesa_sha1_format(timestamp, id_sha1); + + uint64_t driver_flags = 0; + driver_flags |= nocl->sm; + driver_flags |= nocl->ptx << 16; + if (nocl->debug & NOCL_DBG_JIT_NO_OPTS) + driver_flags |= BITFIELD64_BIT(32); + + nocl->disk_cache = disk_cache_create( + nocl->name, timestamp, driver_flags + ); + } + + nocl->cu_dev = cu_dev; + nocl->base.context_create = nocl_context_create; + nocl->base.destroy = nocl_destroy; +#if CUDA_VERSION >= 11040 + if (nocl_dispatch.cuDeviceGetUuid_v2) + nocl->base.get_device_uuid = nocl_get_device_uuid; +#endif + nocl->base.get_driver_uuid = nocl_get_driver_uuid; + nocl->base.fence_finish = nocl_fence_finish; + nocl->base.fence_reference = nocl_fence_reference; + nocl->base.finalize_nir = 
nocl_finalize_nir; + nocl->base.get_device_vendor = nocl_get_device_vendor; + nocl->base.get_disk_shader_cache = nocl_get_disk_shader_cache; + nocl->base.get_name = nocl_get_name; + nocl->base.get_timestamp = u_default_get_timestamp; + nocl->base.is_format_supported = nocl_is_format_supported; + nocl->base.resource_create = nocl_resource_create; + nocl->base.resource_destroy = nocl_resource_destroy; + nocl->base.resource_from_user_memory = nocl_resource_from_user_memory; + nocl->base.resource_get_address = nocl_resource_get_address; + + struct pipe_caps *caps = (void *)&nocl->base.caps; + struct pipe_compute_caps *compute_caps = (void *)&nocl->base.compute_caps; + struct pipe_shader_caps *shader_caps = (void *)&nocl->base.shader_caps[MESA_SHADER_COMPUTE]; + struct nir_shader_compiler_options *nir_options = nocl_get_nir_options(nocl); + nocl->base.nir_options[MESA_SHADER_COMPUTE] = nir_options; + + caps->accelerated = true; + caps->compute = true; + caps->fp16 = nocl->sm >= 53; + caps->int64 = true; + caps->max_shader_buffer_size = INT32_MAX; + caps->query_timestamp = true; + caps->shader_clock = true; + caps->shareable_shaders = true; + if (nocl->sm >= 30 && nocl->ptx >= 60) { + caps->shader_subgroup_supported_features = PIPE_SHADER_SUBGROUP_FEATURE_MASK; + caps->shader_subgroup_supported_stages = MESA_SHADER_COMPUTE; + } + caps->timer_resolution = 500; + + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->max_constant_buffer_size, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->pci_group, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->pci_bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->pci_device, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cu_dev)); + caps->pci_function = 0; + NOCL_CALL(cuDeviceGetAttribute((int *)&caps->shader_subgroup_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cu_dev)); + + shader_caps->fp16 = caps->fp16; + 
shader_caps->max_const_buffer0_size = caps->max_constant_buffer_size; + shader_caps->max_const_buffers = 8; // TODO: is there a query for it? + shader_caps->supported_irs = 1 << PIPE_SHADER_IR_NIR; + + compute_caps->address_bits = nocl->sm < 20 ? 32 : 64; + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_clock_frequency, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cu_dev)); + compute_caps->max_clock_frequency /= 1000; + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_compute_units, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_block_size[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_block_size[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_block_size[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, cu_dev)); + size_t max_global_size; + NOCL_CALL(cuDeviceTotalMem_v2(&max_global_size, cu_dev)); + compute_caps->max_global_size = max_global_size; + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_grid_size[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_grid_size[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, cu_dev)); + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_grid_size[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, cu_dev)); + compute_caps->max_mem_alloc_size = INT32_MAX; + NOCL_CALL(cuDeviceGetAttribute((int *)&compute_caps->max_local_size, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, cu_dev)); + compute_caps->max_variable_threads_per_block = compute_caps->max_threads_per_block; + + if (nocl->sm >= 30) { + compute_caps->max_subgroups = compute_caps->max_threads_per_block / caps->shader_subgroup_size; + compute_caps->subgroup_sizes = caps->shader_subgroup_size; + } + + return 
&nocl->base; +} diff --git a/src/gallium/drivers/nocl/nocl_private.h b/src/gallium/drivers/nocl/nocl_private.h new file mode 100644 index 00000000000..8dc271b82c4 --- /dev/null +++ b/src/gallium/drivers/nocl/nocl_private.h @@ -0,0 +1,147 @@ +/* + * Copyright © 2025 Karol Herbst + * SPDX-License-Identifier: MIT + */ + +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" + +#include "nocl/nocl_cuda_public.h" + +static const uint32_t NOCL_COMPILER_LOG_SIZE = 0x1000; + +enum nocl_dbg { + NOCL_DBG_32 = BITFIELD_BIT(0), + NOCL_DBG_JIT_NO_OPTS = BITFIELD_BIT(1), + NOCL_DBG_NIR = BITFIELD_BIT(2), + NOCL_DBG_PTX = BITFIELD_BIT(3), + NOCL_DBG_VERBOSE = BITFIELD_BIT(4), +}; + +#define NOCL_CALL(func) nocl_error(nocl_dispatch.func) + +static inline bool +nocl_error(CUresult err) +{ + if (err == CUDA_SUCCESS) + return false; + + const char *err_str; + if (nocl_dispatch.cuGetErrorString(err, &err_str)) { + printf("NOCL-ERROR: UNKNOWN\n"); + } else { + printf("NOCL-ERROR: %s\n", err_str); + } + + return true; +} + +struct nocl_screen { + struct pipe_screen base; +#if CUDA_VERSION >= 12090 + CUlogsCallbackHandle verbose_handle; +#endif + CUdevice cu_dev; + CUcontext cu_ctx; + struct disk_cache *disk_cache; + uint32_t debug; + uint16_t ptx; + uint16_t sm; + char name[128]; +}; + +struct nocl_buffer { + struct pipe_resource base; + union { + CUdeviceptr dptr; + void *ptr; + }; +}; + +struct nocl_program { + CUmodule cu_mod; + CUfunction cu_func; + uint32_t shared_size; +}; + +struct nocl_context { + struct pipe_context base; + void *kernel_input; + uint32_t kernel_input_size; + uint32_t kernel_input_capacity; + CUstream cu_stream; + CUevent cu_timestamp_start; + uint64_t timestamp_start; + struct nocl_program *prog; +}; + +struct pipe_fence_handle { + struct pipe_reference ref; + CUevent cu_event; +}; + +struct nocl_transfer { + struct 
pipe_transfer base; + void *ptr; +}; + +struct pipe_query { + CUevent cu_event_base; + CUevent cu_event; +}; + +static inline struct nocl_screen* +nocl_screen(struct pipe_screen *pscreen) +{ + return (struct nocl_screen *)pscreen; +} + +static inline struct nocl_screen* +nocl_screen_and_make_current(struct pipe_screen *pscreen) +{ + struct nocl_screen *nocl = nocl_screen(pscreen); + NOCL_CALL(cuCtxSetCurrent(nocl->cu_ctx)); + return nocl; +} + +static inline struct nocl_context* +nocl_context(struct pipe_context *pctx) +{ + return (struct nocl_context *)pctx; +} + +static inline struct nocl_context* +nocl_context_and_make_current(struct pipe_context *pctx) +{ + nocl_screen_and_make_current(pctx->screen); + return nocl_context(pctx); +} + +static inline struct nocl_program* +nocl_program(void *prog) +{ + return (struct nocl_program *)prog; +} + +static inline struct nocl_buffer* +nocl_buffer(struct pipe_resource *pres) +{ + return (struct nocl_buffer *)pres; +} + +static inline struct nocl_transfer* +nocl_transfer(struct pipe_transfer *pxfer) +{ + return (struct nocl_transfer *)pxfer; +} + +void nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir); +bool nocl_nir_lower_cf(struct nir_shader *nir); +bool nocl_nir_opt_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl); +bool nocl_nir_lower_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl); +struct nir_shader_compiler_options *nocl_get_nir_options(struct nocl_screen *nocl); diff --git a/src/gallium/frontends/rusticl/mesa/pipe/device.rs b/src/gallium/frontends/rusticl/mesa/pipe/device.rs index edb7af668bc..de1fb5b22a9 100644 --- a/src/gallium/frontends/rusticl/mesa/pipe/device.rs +++ b/src/gallium/frontends/rusticl/mesa/pipe/device.rs @@ -99,6 +99,7 @@ fn get_enabled_devs() -> HashMap { let driver_str = match driver_str[0] { "llvmpipe" | "lp" => "swrast", + "cuda" => "nocl", "freedreno" => "msm", a => a, }; diff --git a/src/gallium/meson.build 
b/src/gallium/meson.build index 9b02fd1189a..2d7d8ec436a 100644 --- a/src/gallium/meson.build +++ b/src/gallium/meson.build @@ -206,6 +206,12 @@ if with_gallium_d3d12 else driver_d3d12 = declare_dependency() endif +if with_gallium_nocl + subdir('winsys/nocl') + subdir('drivers/nocl') +else + driver_nocl = declare_dependency() +endif if with_gallium_rusticl subdir('frontends/rusticl') subdir('targets/rusticl') diff --git a/src/gallium/targets/rusticl/meson.build b/src/gallium/targets/rusticl/meson.build index 2b214adbbf5..06b86b65595 100644 --- a/src/gallium/targets/rusticl/meson.build +++ b/src/gallium/targets/rusticl/meson.build @@ -34,6 +34,7 @@ librusticl = shared_library( ], dependencies : [ driver_asahi, + driver_nocl, driver_freedreno, driver_iris, driver_nouveau, diff --git a/src/gallium/winsys/nocl/meson.build b/src/gallium/winsys/nocl/meson.build new file mode 100644 index 00000000000..19c435948c0 --- /dev/null +++ b/src/gallium/winsys/nocl/meson.build @@ -0,0 +1,25 @@ +# Copyright © 2025 Karol Herbst +# SPDX-License-Identifier: MIT + +libnocl_cuda = static_library( + 'nocl_cuda', + [ + 'nocl_cuda.c', + 'nocl_cuda_public.h', + ], + include_directories : [ + inc_util, + inc_gallium, + inc_gallium_aux, + ], + dependencies : [ + dep_libcuda, + ], +) + +idep_libnoclwinsys = declare_dependency( + include_directories : [ + include_directories('.'), + ], + link_with : [libnocl_cuda], +) diff --git a/src/gallium/winsys/nocl/nocl_cuda.c b/src/gallium/winsys/nocl/nocl_cuda.c new file mode 100644 index 00000000000..611267b054e --- /dev/null +++ b/src/gallium/winsys/nocl/nocl_cuda.c @@ -0,0 +1,212 @@ +/* + * Copyright © 2025 Karol Herbst + * + * SPDX-License-Identifier: MIT + */ + +#include "pipe-loader/pipe_loader.h" +#include "pipe-loader/pipe_loader_priv.h" +#include "target-helpers/inline_debug_helper.h" +#include "util/u_dl.h" +#include "util/u_memory.h" + +#include "cuda.h" +#include "nocl_cuda_public.h" + + +#define cuda_err(err) if (err) { \ + const char 
*err_str; \ + nocl_dispatch.cuGetErrorString(err, &err_str); \ + printf("CUDA returned error: %s\n", err_str); \ + return 0; \ +} + +static struct driOptionDescription nocl_dri_conf = {}; +static struct util_dl_library *libcuda = NULL; +struct cuda_symbol_table nocl_dispatch = {}; + +static struct pipe_screen * +nocl_loader_create_screen(struct pipe_loader_device *dev, + const struct pipe_screen_config *config, + bool sw_vk) +{ + struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)dev; + struct pipe_screen *pscreen = nocl_create_screen(cuda_dev); + return pscreen ? debug_screen_wrap(pscreen) : NULL; +} + +static const struct driOptionDescription * +nocl_get_driconf(struct pipe_loader_device *dev, unsigned *count) +{ + *count = 0; + return &nocl_dri_conf; +} + +static void +nocl_release(struct pipe_loader_device **dev) +{ + struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)*dev; + FREE(cuda_dev); +} + +static struct pipe_loader_ops cuda_loader_ops = { + .create_screen = nocl_loader_create_screen, + .release = nocl_release, + .get_driconf = nocl_get_driconf, +}; + +static int32_t faked_version = 0; + +static bool +nocl_load_pfn(const char *name, void **pfn, int version) +{ + if (version > faked_version) { + *pfn = NULL; + return false; + } + + CUdriverProcAddressQueryResult proc_result; + CUresult res = nocl_dispatch.cuGetProcAddress_v2(name, pfn, version, CU_GET_PROC_ADDRESS_LEGACY_STREAM, &proc_result); + bool success = res == CUDA_SUCCESS && proc_result == CU_GET_PROC_ADDRESS_SUCCESS; + /* Probably always NULL on errors but let's be sure */ + if (!success) + *pfn = NULL; + return success; +} + +int +pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev) +{ + if (!libcuda) { + UNUSED void *func = NULL; + faked_version = debug_get_num_option("NOCL_API_TARGET", INT32_MAX); + libcuda = util_dl_open("libcuda.so"); + if (!libcuda) + return 0; + + /* Needs 12.0 */ + nocl_dispatch.cuGetProcAddress_v2 = 
(void*)util_dl_get_proc_address(libcuda, "cuGetProcAddress_v2"); + if (!nocl_dispatch.cuGetProcAddress_v2) { + return 0; + } + + bool success = true; + success &= nocl_load_pfn("cuCtxCreate", (void**)&nocl_dispatch.cuCtxCreate_v2, 3020); + success &= nocl_load_pfn("cuCtxDestroy", (void**)&nocl_dispatch.cuCtxDestroy_v2, 4000); + success &= nocl_load_pfn("cuCtxSetCurrent", (void**)&nocl_dispatch.cuCtxSetCurrent, 4000); + success &= nocl_load_pfn("cuDeviceGet", (void**)&nocl_dispatch.cuDeviceGet, 2000); + success &= nocl_load_pfn("cuDeviceGetAttribute", (void**)&nocl_dispatch.cuDeviceGetAttribute, 2000); + success &= nocl_load_pfn("cuDeviceGetCount", (void**)&nocl_dispatch.cuDeviceGetCount, 2000); + success &= nocl_load_pfn("cuDeviceGetName", (void**)&nocl_dispatch.cuDeviceGetName, 2000); + success &= nocl_load_pfn("cuDeviceTotalMem", (void**)&nocl_dispatch.cuDeviceTotalMem_v2, 3020); + success &= nocl_load_pfn("cuDriverGetVersion", (void**)&nocl_dispatch.cuDriverGetVersion, 2020); + success &= nocl_load_pfn("cuEventCreate", (void**)&nocl_dispatch.cuEventCreate, 2000); + success &= nocl_load_pfn("cuEventDestroy", (void**)&nocl_dispatch.cuEventDestroy_v2, 4000); + success &= nocl_load_pfn("cuEventElapsedTime", (void**)&nocl_dispatch.cuEventElapsedTime_v1, 2000); + success &= nocl_load_pfn("cuEventRecord", (void**)&nocl_dispatch.cuEventRecord, 2000); + success &= nocl_load_pfn("cuEventSynchronize", (void**)&nocl_dispatch.cuEventSynchronize, 2000); + success &= nocl_load_pfn("cuFuncGetAttribute", (void**)&nocl_dispatch.cuFuncGetAttribute, 2020); + success &= nocl_load_pfn("cuGetErrorString", (void**)&nocl_dispatch.cuGetErrorString, 6000); + success &= nocl_load_pfn("cuInit", (void**)&nocl_dispatch.cuInit, 2000); + success &= nocl_load_pfn("cuLaunchKernel", (void**)&nocl_dispatch.cuLaunchKernel, 4000); + success &= nocl_load_pfn("cuLinkAddData", (void**)&nocl_dispatch.cuLinkAddData_v2, 6050); + success &= nocl_load_pfn("cuLinkComplete", 
(void**)&nocl_dispatch.cuLinkComplete, 5050); + success &= nocl_load_pfn("cuLinkCreate", (void**)&nocl_dispatch.cuLinkCreate_v2, 6050); + success &= nocl_load_pfn("cuLinkDestroy", (void**)&nocl_dispatch.cuLinkDestroy, 5050); + success &= nocl_load_pfn("cuMemAlloc", (void**)&nocl_dispatch.cuMemAlloc_v2, 3020); + success &= nocl_load_pfn("cuMemAllocHost", (void**)&nocl_dispatch.cuMemAllocHost_v2, 3020); + success &= nocl_load_pfn("cuMemcpy", (void**)&nocl_dispatch.cuMemcpy, 4000); + success &= nocl_load_pfn("cuMemcpyAsync", (void**)&nocl_dispatch.cuMemcpyAsync, 4000); + success &= nocl_load_pfn("cuMemcpyDtoHAsync", (void**)&nocl_dispatch.cuMemcpyDtoHAsync_v2, 3020); + success &= nocl_load_pfn("cuMemcpyHtoD", (void**)&nocl_dispatch.cuMemcpyHtoD_v2, 3020); + success &= nocl_load_pfn("cuMemFree", (void**)&nocl_dispatch.cuMemFree_v2, 3020); + success &= nocl_load_pfn("cuMemFreeHost", (void**)&nocl_dispatch.cuMemFreeHost, 2000); + success &= nocl_load_pfn("cuMemsetD8Async", (void**)&nocl_dispatch.cuMemsetD8Async, 3020); + success &= nocl_load_pfn("cuMemsetD16Async", (void**)&nocl_dispatch.cuMemsetD16Async, 3020); + success &= nocl_load_pfn("cuMemsetD32Async", (void**)&nocl_dispatch.cuMemsetD32Async, 3020); + success &= nocl_load_pfn("cuModuleGetFunction", (void**)&nocl_dispatch.cuModuleGetFunction, 2000); + success &= nocl_load_pfn("cuModuleLoadDataEx", (void**)&nocl_dispatch.cuModuleLoadDataEx, 2010); + success &= nocl_load_pfn("cuModuleUnload", (void**)&nocl_dispatch.cuModuleUnload, 2000); + success &= nocl_load_pfn("cuStreamDestroy", (void**)&nocl_dispatch.cuStreamDestroy_v2, 4000); + success &= nocl_load_pfn("cuStreamSynchronize", (void**)&nocl_dispatch.cuStreamSynchronize, 2000); + if (!success) + return 0; + + /* Only documented difference appears to be that v2 won't return errors on previous async + * commands, so just overwrite the previously fetched pointer */ + nocl_load_pfn("cuEventElapsedTime", &func, 12080); + if (func) + nocl_dispatch.cuEventElapsedTime_v1 = 
func; + + nocl_dispatch.all_required_available = true; +#if CUDA_VERSION >= 11040 + success &= nocl_load_pfn("cuDeviceGetUuid", (void**)&nocl_dispatch.cuDeviceGetUuid_v2, 11040); +#endif +#if CUDA_VERSION >= 12090 + nocl_load_pfn("cuLogsRegisterCallback", (void**)&nocl_dispatch.cuLogsRegisterCallback, 12090); + nocl_load_pfn("cuLogsUnregisterCallback", (void**)&nocl_dispatch.cuLogsUnregisterCallback, 12090); +#endif + } + + if (!nocl_dispatch.all_required_available) + return 0; + + int driver_version; + CUresult res = nocl_dispatch.cuInit(0); + cuda_err(res); + + res = nocl_dispatch.cuDriverGetVersion(&driver_version); + /* Our highest version requirement across all required APIs */ + if (driver_version < 8000) + return 0; + + if (!ndev) { + res = nocl_dispatch.cuDeviceGetCount(&ndev); + cuda_err(res); + return ndev; + } + + if (!devs) + return 0; + + int cnt; + res = nocl_dispatch.cuDeviceGetCount(&cnt); + cuda_err(res); + for (int i = 0; i < ndev; i++) { + if (i >= cnt) + return i; + + CUdevice cuda_dev; + res = nocl_dispatch.cuDeviceGet(&cuda_dev, i); + if (res) + return i; + + int pci_device_id; +#if CUDA_VERSION >= 12080 + res = nocl_dispatch.cuDeviceGetAttribute(&pci_device_id, CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID, cuda_dev); + cuda_err(res); +#else + /* TODO: we might need to get the device id from somewhere, but so far + * nothing needs it */ + pci_device_id = 0x10de; +#endif + + struct cuda_pipe_loader_device *dev = CALLOC_STRUCT(cuda_pipe_loader_device); + *dev = (struct cuda_pipe_loader_device) { + .base = { + .driver_name = "nocl", + .ops = &cuda_loader_ops, + /* TODO: tegra */ + .type = PIPE_LOADER_DEVICE_PCI, + .u.pci.chip_id = ((unsigned int)pci_device_id) >> 16, + .u.pci.vendor_id = pci_device_id & 0xffff, + }, + .dev = cuda_dev, + .cuda_version = driver_version, + }; + + devs[i] = &dev->base; + } + + return ndev; +} diff --git a/src/gallium/winsys/nocl/nocl_cuda_public.h b/src/gallium/winsys/nocl/nocl_cuda_public.h new file mode 100644 index 
00000000000..c8a9b800d3e --- /dev/null +++ b/src/gallium/winsys/nocl/nocl_cuda_public.h @@ -0,0 +1,73 @@ +/* + * Copyright © 2025 Karol Herbst + * + * SPDX-License-Identifier: MIT + */ + +#ifndef __NOCL_DRM_PUBLIC_H__ +#define __NOCL_DRM_PUBLIC_H__ + +#include <cudaTypedefs.h> + +#include "pipe-loader/pipe_loader.h" + +struct cuda_pipe_loader_device { + struct pipe_loader_device base; + CUdevice dev; + int cuda_version; +}; + +extern struct cuda_symbol_table { + PFN_cuCtxCreate_v3020 cuCtxCreate_v2; + PFN_cuCtxDestroy_v4000 cuCtxDestroy_v2; + PFN_cuCtxSetCurrent_v4000 cuCtxSetCurrent; + PFN_cuDeviceGet_v2000 cuDeviceGet; + PFN_cuDeviceGetAttribute_v2000 cuDeviceGetAttribute; + PFN_cuDeviceGetCount_v2000 cuDeviceGetCount; + PFN_cuDeviceGetName_v2000 cuDeviceGetName; + PFN_cuDeviceTotalMem_v3020 cuDeviceTotalMem_v2; + PFN_cuDriverGetVersion_v2020 cuDriverGetVersion; + PFN_cuEventCreate_v2000 cuEventCreate; + PFN_cuEventDestroy_v4000 cuEventDestroy_v2; + PFN_cuEventElapsedTime_v2000 cuEventElapsedTime_v1; + PFN_cuEventRecord_v2000 cuEventRecord; + PFN_cuEventSynchronize_v2000 cuEventSynchronize; + PFN_cuFuncGetAttribute_v2020 cuFuncGetAttribute; + PFN_cuGetErrorString_v6000 cuGetErrorString; + PFN_cuGetProcAddress_v12000 cuGetProcAddress_v2; + PFN_cuInit_v2000 cuInit; + PFN_cuLaunchKernel_v4000 cuLaunchKernel; + PFN_cuLinkAddData_v6050 cuLinkAddData_v2; + PFN_cuLinkComplete_v5050 cuLinkComplete; + PFN_cuLinkCreate_v6050 cuLinkCreate_v2; + PFN_cuLinkDestroy_v5050 cuLinkDestroy; + PFN_cuMemAlloc_v3020 cuMemAlloc_v2; + PFN_cuMemAllocHost_v3020 cuMemAllocHost_v2; + PFN_cuMemcpy_v4000 cuMemcpy; + PFN_cuMemcpyAsync_v4000 cuMemcpyAsync; + PFN_cuMemcpyDtoHAsync_v3020 cuMemcpyDtoHAsync_v2; + PFN_cuMemcpyHtoD_v3020 cuMemcpyHtoD_v2; + PFN_cuMemFree_v3020 cuMemFree_v2; + PFN_cuMemFreeHost_v2000 cuMemFreeHost; + PFN_cuMemsetD8Async_v3020 cuMemsetD8Async; + PFN_cuMemsetD16Async_v3020 cuMemsetD16Async; + PFN_cuMemsetD32Async_v3020 cuMemsetD32Async; + PFN_cuModuleGetFunction_v2000
cuModuleGetFunction; + PFN_cuModuleLoadDataEx_v2010 cuModuleLoadDataEx; + PFN_cuModuleUnload_v2000 cuModuleUnload; + PFN_cuStreamDestroy_v4000 cuStreamDestroy_v2; + PFN_cuStreamSynchronize_v2000 cuStreamSynchronize; +#if CUDA_VERSION >= 11040 + PFN_cuDeviceGetUuid_v11040 cuDeviceGetUuid_v2; +#endif +#if CUDA_VERSION >= 12090 + PFN_cuLogsRegisterCallback_v12090 cuLogsRegisterCallback; + PFN_cuLogsUnregisterCallback_v12090 cuLogsUnregisterCallback; +#endif + bool all_required_available; +} nocl_dispatch; + +typedef int CUdevice; +struct pipe_screen *nocl_create_screen(struct cuda_pipe_loader_device *dev); + +#endif /* __NOCL_DRM_PUBLIC_H__ */