Merge branch 'rusticl/cuda/what_the_fuck' into 'main'

Add a new Gallium compute-only driver for NVIDIA hardware, named nocl.

See merge request mesa/mesa!37831

Commit 74e785ff41

30 changed files with 3916 additions and 28 deletions
docs/drivers/nocl.rst  (new file, 13 lines)
@@ -0,0 +1,13 @@
NoCL
====

Overview
--------

The NoCL driver is a Gallium driver that targets the CUDA driver API and
therefore can be used to layer OpenCL on top of the NVIDIA proprietary driver.

The minimum supported CUDA API version is 6.5.

It translates NIR to PTX and makes use of the CUDA internal JIT compiler to
generate device code.
@@ -2161,6 +2161,29 @@ Freedreno driver environment variables
 Other Gallium drivers have their own environment variables. These may
 change frequently so the source code should be consulted for details.
 
+NoCL driver environment variables
+---------------------------------
+
+.. envvar:: NOCL_DEBUG
+
+   Debug flags for the NoCL driver.
+
+.. envvar:: NOCL_API_TARGET
+
+   Pretend that a lower CUDA driver version than the one loaded at runtime is
+   in use. Useful to ensure that code runs on older driver versions. For
+   development only.
+
+.. envvar:: NOCL_PTX_VERSION
+
+   Overrides the PTX target version with the specified value, written without
+   the dot. Can be used to ensure code compiles successfully on various CUDA
+   versions. For development only.
+
+.. envvar:: NOCL_SM_TARGET
+
+   Overrides the PTX target SM level with the specified value. Can be used to
+   ensure code compiles successfully on various GPUs. For development only.
+
 Vulkan loader environment variables
 -----------------------------------
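As an illustration of the "without the dot" convention (example values are assumptions, not documented defaults): NOCL_PTX_VERSION=83 would select PTX ISA 8.3, and NOCL_SM_TARGET=75 would target sm_75.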
@@ -820,7 +820,7 @@ Rusticl Optional OpenCL 2.x Features:
   Device and host timer synchronization     DONE (freedreno, iris, llvmpipe, radeonsi, zink)
   OpenCL C 2.0                              in progress
   - Memory Consistency Model (atomics)      not started
-  - Sub-groups                              DONE (iris, llvmpipe, radeonsi, asahi)
+  - Sub-groups                              DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
   - Work-group Collective Functions         not started
   - Generic Address Space                   in progress
   cl_khr_il_program                         DONE
@@ -840,7 +840,7 @@ Rusticl extensions:
   cl_khr_d3d10_sharing                      not started
   cl_khr_d3d11_sharing                      not started
   cl_khr_device_enqueue_local_arg_types     not started
-  cl_khr_device_uuid                        DONE (freedreno, iris, llvmpipe, radeonsi, zink)
+  cl_khr_device_uuid                        DONE (freedreno, iris, llvmpipe, nocl, radeonsi, zink)
   cl_khr_dx9_media_sharing                  not started
   cl_khr_egl_event                          not started
   cl_khr_egl_image                          not started
@@ -855,7 +855,7 @@ Rusticl extensions:
   cl_khr_external_semaphore_opaque_fd       not started
   cl_khr_external_semaphore_sync_fd         DONE (radeonsi, zink)
   cl_khr_external_semaphore_win32           not started
-  cl_khr_fp16                               DONE (asahi, freedreno, llvmpipe, panfrost, radeonsi, zink)
+  cl_khr_fp16                               DONE (asahi, freedreno, llvmpipe, nocl, panfrost, radeonsi, zink)
   cl_khr_gl_depth_images                    not started
   cl_khr_gl_event                           not started
   cl_khr_gl_msaa_sharing                    not started
@@ -865,10 +865,10 @@ Rusticl extensions:
   cl_khr_int64_base_atomics                 not started
   cl_khr_int64_extended_atomics             not started
   cl_khr_integer_dot_product                DONE
-  cl_khr_kernel_clock                       DONE (freedreno, iris, llvmpipe, nvc0, panfrost, radeonsi, zink, needs llvm-19)
+  cl_khr_kernel_clock                       DONE (freedreno, iris, llvmpipe, nocl, nvc0, panfrost, radeonsi, zink, needs llvm-19)
   cl_khr_mipmap_image                       not started
   cl_khr_mipmap_image_writes                not started
-  cl_khr_pci_bus_info                       DONE (iris, nvc0, radeonsi, zink)
+  cl_khr_pci_bus_info                       DONE (iris, nocl, nvc0, radeonsi, zink)
   cl_khr_priority_hints                     DONE (asahi, freedreno, iris, panfrost, radeonsi)
   cl_khr_semaphore                          DONE (radeonsi, zink)
   cl_khr_spirv_extended_debug_info          not started
@@ -876,21 +876,21 @@ Rusticl extensions:
   cl_khr_spirv_no_integer_wrap_decoration   DONE
   cl_khr_spirv_queries                      DONE
   cl_khr_srgb_image_writes                  not started
-  cl_khr_subgroup_ballot                    not started
-  cl_khr_subgroup_clustered_reduce          not started
-  cl_khr_subgroup_extended_types            not started
+  cl_khr_subgroup_ballot                    DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_clustered_reduce          DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_extended_types            DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
   cl_khr_subgroup_named_barrier             not started
-  cl_khr_subgroup_non_uniform_arithmetic    not started
-  cl_khr_subgroup_non_uniform_vote          not started
-  cl_khr_subgroup_rotate                    not started
-  cl_khr_subgroup_shuffle                   DONE (iris, llvmpipe, radeonsi, asahi)
-  cl_khr_subgroup_shuffle_relative          DONE (iris, llvmpipe, radeonsi, asahi)
+  cl_khr_subgroup_non_uniform_arithmetic    DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_non_uniform_vote          DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_rotate                    DONE (iris, llvmpipe, zink)
+  cl_khr_subgroup_shuffle                   DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_shuffle_relative          DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
   cl_khr_subgroups                          in progress
   cl_khr_suggested_local_work_size          DONE
   cl_khr_terminate_context                  not started
   cl_khr_throttle_hints                     not started
   cl_khr_work_group_uniform_arithmetic      not started
-  cl_ext_buffer_device_address              DONE (iris, llvmpipe, radeonsi, zink)
+  cl_ext_buffer_device_address              DONE (iris, llvmpipe, nocl, radeonsi, zink)
   cl_ext_cxx_for_opencl                     not started
   cl_ext_device_fission                     not started
   cl_ext_float_atomics                      not started
@@ -86,6 +86,7 @@ Linux, FreeBSD, and other operating systems.
    drivers/kosmickrisp
    drivers/lima
    drivers/llvmpipe
+   drivers/nocl
    drivers/nvk
    drivers/panfrost
    drivers/powervr
meson.build  (13 changed lines)
@@ -221,6 +221,7 @@ with_gallium_d3d12 = gallium_drivers.contains('d3d12')
 with_gallium_asahi = gallium_drivers.contains('asahi')
 with_gallium_rocket = gallium_drivers.contains('rocket')
 with_gallium_ethosu = gallium_drivers.contains('ethosu')
+with_gallium_nocl = gallium_drivers.contains('nocl')
 foreach gallium_driver : gallium_drivers
   pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper())
 endforeach
@@ -778,7 +779,7 @@ if with_gallium_rusticl
 endif
 
 with_virtgpu_kumquat = get_option('virtgpu_kumquat') and with_gfxstream_vk
-if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat
+if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat or with_gallium_nocl
   # rust.bindgen() does not pass `--rust-target` to bindgen until 1.7.0.
   if meson.version().version_compare('< 1.7.0')
     error('Mesa Rust support requires Meson 1.7.0 or newer')
@@ -1641,6 +1642,16 @@ elif with_shader_cache
   error('Shader Cache requires compression')
 endif
 
+if with_gallium_nocl
+  dep_libcuda = dependency(
+    'cuda',
+    # By default meson searches for the cudart, which we don't want to use. We want to use the
+    # CUDA driver library.
+    modules : ['cuda'],
+    version : '>= 6.5',
+  )
+endif
+
 if host_machine.system() == 'windows'
   # For MSVC and MinGW we aren't using pthreads, and dependency('threads') will add linkage
   # to pthread for MinGW, so leave the dependency null_dep for Windows. For Windows linking to
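Design note: Meson's CUDA dependency defaults to the runtime libraries, so the explicit modules : ['cuda'] request is what pins the build to libcuda, the driver API library that the comment above refers to.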
@@ -87,7 +87,7 @@ option(
   choices : [
     'all', 'auto',
     'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris',
-    'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
+    'lima', 'llvmpipe', 'nocl', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
     'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
   ],
   description : 'List of gallium drivers to build. If this is set to auto ' +
@@ -72,9 +72,15 @@ struct clc_optional_features {
     * progress
     */
    bool subgroups_ifp;
+   bool subgroups_ballot;
+   bool subgroups_clustered;
+   bool subgroups_extended_types;
+   bool subgroups_named_barrier;
+   bool subgroups_non_uniform_arithmetic;
+   bool subgroups_non_uniform_vote;
+   bool subgroups_rotate;
    bool subgroups_shuffle;
    bool subgroups_shuffle_relative;
-   bool subgroups_ballot;
 };
 
 struct clc_compile_args {
@@ -1024,15 +1024,33 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx,
    }
    if (args->features.subgroups) {
       c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_subgroups");
+      if (args->features.subgroups_ballot) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1");
+      }
+      if (args->features.subgroups_clustered) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_clustered_reduce=1");
+      }
+      if (args->features.subgroups_extended_types) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_extended_types=1");
+      }
+      if (args->features.subgroups_named_barrier) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_named_barrier=1");
+      }
+      if (args->features.subgroups_non_uniform_arithmetic) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_arithmetic=1");
+      }
+      if (args->features.subgroups_non_uniform_vote) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_vote=1");
+      }
+      if (args->features.subgroups_rotate) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_rotate=1");
+      }
       if (args->features.subgroups_shuffle) {
          c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle=1");
       }
       if (args->features.subgroups_shuffle_relative) {
         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle_relative=1");
       }
-      if (args->features.subgroups_ballot) {
-         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1");
-      }
    }
    if (args->features.subgroups_ifp) {
       assert(args->features.subgroups);
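With these macros defined, kernels can feature-test the subgroup extensions at compile time. A minimal OpenCL C sketch (the kernel and buffer names are made up for illustration):

   __kernel void active_lane_count(__global uint *out) {
   #ifdef cl_khr_subgroup_ballot
      /* sub_group_ballot() is provided by cl_khr_subgroup_ballot */
      uint4 ballot = sub_group_ballot(true);
      if (get_sub_group_local_id() == 0)
         out[get_group_id(0)] = popcount(ballot.x);
   #endif
   }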
@@ -29,6 +29,7 @@
 #include <assert.h>
 #include <stdio.h>
 
+#include "compiler/builtin_types.h"
 #include "shader_enums.h"
 #include "c11/threads.h"
 #include "util/blob.h"
@@ -73,6 +73,6 @@ if with_gallium
 endif
 subdir('isaspec')
 
-if with_nouveau_vk
+if with_nouveau_vk or with_gallium_nocl
   subdir('rust')
 endif
@@ -5745,6 +5745,8 @@ typedef struct nir_lower_compute_system_values_options {
    bool lower_local_invocation_index : 1;
    bool lower_cs_local_id_to_index : 1;
    bool lower_workgroup_id_to_index : 1;
+   bool lower_num_subgroups : 1;
+   bool lower_subgroup_id : 1;
    bool global_id_is_32bit : 1;
    /* At shader execution time, check if WorkGroupId should be 1D
     * and compute it quickly. Fall back to slow computation if not.
@@ -794,6 +794,30 @@ lower_compute_system_value_instr(nir_builder *b,
    case nir_intrinsic_load_shader_index:
       return nir_imm_int(b, b->shader->info.cs.shader_index);
 
+   case nir_intrinsic_load_num_subgroups: {
+      if (!options || !options->lower_num_subgroups)
+         return NULL;
+
+      nir_def *group_size = nir_load_workgroup_size(b);
+      nir_def *threads = nir_imul(b, nir_channel(b, group_size, 0),
+                                  nir_channel(b, group_size, 1));
+      threads = nir_imul(b, threads, nir_channel(b, group_size, 2));
+
+      /* DIV_ROUND_UP(A, B) = ((A + B - 1) / B) */
+      nir_def *subgroup_size = nir_load_subgroup_size(b);
+      nir_def *subgroup_size_m1 = nir_iadd_imm(b, subgroup_size, -1);
+      nir_def *numerator = nir_iadd(b, threads, subgroup_size_m1);
+      return nir_udiv(b, numerator, subgroup_size);
+   }
+
+   case nir_intrinsic_load_subgroup_id: {
+      if (!options || !options->lower_subgroup_id)
+         return NULL;
+
+      nir_def *tid = nir_load_local_invocation_index(b);
+      return nir_udiv(b, tid, nir_load_subgroup_size(b));
+   }
+
    default:
       return NULL;
    }
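As a concrete check of the lowering above: an 8x8x1 workgroup has 64 invocations, so with a subgroup size of 32, load_num_subgroups becomes (64 + 32 - 1) / 32 = 2, and the invocation with local index 37 gets subgroup id 37 / 32 = 1.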
@@ -44,6 +44,9 @@
 static int (*backends[])(struct pipe_loader_device **, int) = {
 #ifdef HAVE_LIBDRM
    &pipe_loader_drm_probe,
 #endif
+#ifdef HAVE_NOCL
+   &pipe_loader_cuda_probe,
+#endif
    &pipe_loader_sw_probe
 };
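Since backends[] is walked in order, DRM devices are probed first, the CUDA backend next, and the software probe remains the final fallback.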
@@ -215,6 +215,11 @@ pipe_loader_sw_probe_wrapped(struct pipe_loader_device **dev,
 int
 pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev);
 
+#ifdef HAVE_NOCL
+int
+pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev);
+#endif
+
 /**
  * Get a list of known DRM accel devices.
  *
src/gallium/drivers/nocl/compiler/cbindgen.toml  (new file, 12 lines)

@@ -0,0 +1,12 @@
language = "C"

includes = ["nir.h"]
autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */"
include_guard = "NIR_TO_PTX_H"
usize_is_size_t = true
style = "tag"

[export]
include = ["NirToPtxOutput"]
prefix = ""
renaming_overrides_prefixing = true
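From this configuration, cbindgen should produce a header shaped roughly like the sketch below; the struct fields here are hypothetical, since lib.rs is suppressed in this diff:

   /* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */

   #ifndef NIR_TO_PTX_H
   #define NIR_TO_PTX_H

   #include "nir.h"

   /* style = "tag" keeps plain struct tags instead of typedefs */
   struct NirToPtxOutput {
      char *ptx;       /* hypothetical field */
      size_t ptx_len;  /* hypothetical field; usize maps to size_t */
   };

   #endif /* NIR_TO_PTX_H */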
src/gallium/drivers/nocl/compiler/lib.rs  (new file, 1191 lines)

(File diff suppressed because it is too large.)
src/gallium/drivers/nocl/compiler/meson.build  (new file, 44 lines)

@@ -0,0 +1,44 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT

prog_cbindgen = find_program(
  'cbindgen',
  required : true,
  native : true,
  version : '>= 0.25'
)

libnir_to_ptx_h = custom_target(
  'libnir_to_ptx_h',
  input : [files('cbindgen.toml'), 'lib.rs'],
  output : ['nir_to_ptx.h'],
  command : [
    prog_cbindgen, '-q', '--config', '@INPUT0@', '--lang', 'c',
    '--output', '@OUTPUT0@', '--depfile', '@DEPFILE@',
    '--', '@INPUT1@',
  ],
  depfile : 'nir_to_ptx.h.d',
)

files_libnir_to_ptx = files(
  'lib.rs',
)

libnir_to_ptx = static_library(
  'nir_to_ptx',
  [files_libnir_to_ptx],
  gnu_symbol_visibility : 'hidden',
  rust_abi : 'c',
  dependencies : [
    idep_compiler_rs,
  ],
)

idep_nir_to_ptx = declare_dependency(
  sources : [
    libnir_to_ptx_h
  ],
  link_with : [
    libnir_to_ptx,
  ]
)
src/gallium/drivers/nocl/meson.build  (new file, 54 lines)

@@ -0,0 +1,54 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT

subdir('compiler')

files_libnocl = files(
  'nocl_nir.c',
  'nocl_pipe.c',
  'nocl_private.h',
)

nocl_nir_algebraic_c = custom_target(
  'nocl_nir_algebraic.c',
  input : 'nocl_nir_algebraic.py',
  output : 'nocl_nir_algebraic.c',
  command : [
    prog_python, '@INPUT@',
    '-p', dir_compiler_nir,
    '--out', '@OUTPUT@',
  ],
  depend_files : nir_algebraic_depends,
)

libnocl = static_library(
  'nocl',
  [
    files_libnocl,
    nocl_nir_algebraic_c,
  ],
  include_directories : [
    inc_gallium,
    inc_gallium_aux,
    inc_gallium_winsys,
    inc_util,
  ],
  gnu_symbol_visibility : 'hidden',
  dependencies : [
    dep_libcuda,
    idep_nir,
    idep_nir_to_ptx,
  ],
)

driver_nocl = declare_dependency(
  compile_args : '-DGALLIUM_NOCL',
  dependencies : [
    dep_libcuda,
    idep_libnoclwinsys,
  ],
  link_with : [
    libnocl,
    libnir_to_ptx,
  ],
)
src/gallium/drivers/nocl/nocl_nir.c  (new file, 824 lines)

@@ -0,0 +1,824 @@
/*
 * Copyright © 2022 Collabora, Ltd.
 * Copyright © 2025 Karol Herbst
 *
 * SPDX-License-Identifier: MIT
 */

#include "nocl_private.h"

#include "nir.h"
#include "nir_builder.h"

static void
push_block(nir_builder *b, nir_block *block)
{
   assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl)));
   block->cf_node.parent = &b->impl->cf_node;
   exec_list_push_tail(&b->impl->body, &block->cf_node.node);
   b->cursor = nir_after_block(block);
}

enum scope_type {
   SCOPE_TYPE_SHADER,
   SCOPE_TYPE_IF_MERGE,
   SCOPE_TYPE_LOOP_BREAK,
   SCOPE_TYPE_LOOP_CONT,
};

struct scope {
   enum scope_type type;

   struct scope *parent;
   uint32_t depth;

   nir_block *merge;
   nir_def *bar;

   uint32_t escapes;
};

static struct scope
push_scope(nir_builder *b,
           enum scope_type scope_type,
           struct scope *parent,
           nir_block *merge_block)
{
   struct scope scope = {
      .type = scope_type,
      .parent = parent,
      .depth = parent->depth + 1,
      .merge = merge_block,
   };

   return scope;
}

static void
pop_scope(nir_builder *b, nir_def *esc_reg, struct scope scope)
{
   if (scope.bar == NULL)
      return;

   if (scope.escapes > 0) {
      /* Find the nearest scope with a sync. */
      nir_block *parent_merge = b->impl->end_block;
      for (struct scope *p = scope.parent; p != NULL; p = p->parent) {
         if (p->bar != NULL) {
            parent_merge = p->merge;
            break;
         }
      }

      /* No escape is ~0, halt is 0, and we choose outer scope indices such
       * that outer scopes always have lower indices than inner scopes.
       */
      nir_def *esc = nir_ult_imm(b, nir_load_reg(b, esc_reg), scope.depth);

      /* We have to put the escape in its own block to avoid critical edges.
       * If we just did goto_if, we would end up with multiple successors,
       * including a jump to the parent's merge block which has multiple
       * predecessors.
       */
      nir_block *esc_block = nir_block_create(b->shader);
      nir_block *next_block = nir_block_create(b->shader);
      nir_goto_if(b, esc_block, esc, next_block);
      push_block(b, esc_block);
      nir_goto(b, parent_merge);
      push_block(b, next_block);
   }
}

static enum scope_type
jump_target_scope_type(nir_jump_type jump_type)
{
   switch (jump_type) {
   case nir_jump_break:    return SCOPE_TYPE_LOOP_BREAK;
   case nir_jump_continue: return SCOPE_TYPE_LOOP_CONT;
   default:
      UNREACHABLE("Unknown jump type");
   }
}

static void
break_scopes(nir_builder *b, nir_def *esc_reg,
             struct scope *current_scope,
             nir_jump_type jump_type)
{
   nir_block *first_sync = NULL;
   uint32_t target_depth = UINT32_MAX;
   enum scope_type target_scope_type = jump_target_scope_type(jump_type);
   for (struct scope *scope = current_scope; scope; scope = scope->parent) {
      if (first_sync == NULL && scope->bar != NULL)
         first_sync = scope->merge;

      if (scope->type == target_scope_type) {
         if (first_sync == NULL) {
            first_sync = scope->merge;
         } else {
            /* In order for our cascade to work, we need to have the invariant
             * that anything which escapes any scope with a warp sync needs to
             * target a scope with a warp sync.
             */
            assert(scope->bar != NULL);
         }
         target_depth = scope->depth;
         break;
      } else {
         scope->escapes++;
      }
   }
   assert(target_depth < UINT32_MAX);

   nir_store_reg(b, nir_imm_int(b, target_depth), esc_reg);
   nir_goto(b, first_sync);
}
static void
normal_exit(nir_builder *b, nir_def *esc_reg, nir_block *merge_block)
{
   assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl)));
   nir_block *block = nir_cursor_current_block(b->cursor);

   if (!nir_block_ends_in_jump(block)) {
      nir_store_reg(b, nir_imm_int(b, ~0), esc_reg);
      nir_goto(b, merge_block);
   }
}

/* This is a heuristic for what instructions are allowed before we sync.
 * Annoyingly, we've gotten rid of phis so it's not as simple as "is it a
 * phi?".
 */
static bool
instr_is_allowed_before_sync(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      /* We could probably allow more ALU as long as it doesn't contain
       * derivatives but let's be conservative and only allow mov for now.
       */
      return alu->op == nir_op_mov;
   }

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      return intrin->intrinsic == nir_intrinsic_load_reg ||
             intrin->intrinsic == nir_intrinsic_store_reg;
   }

   default:
      return false;
   }
}

/** Returns true if our successor will sync for us
 *
 * This is a bit of a heuristic
 */
static bool
parent_scope_will_sync(nir_cf_node *node, struct scope *parent_scope)
{
   /* First search forward to see if there's anything non-trivial after this
    * node within the parent scope.
    */
   nir_block *block = nir_cf_node_as_block(nir_cf_node_next(node));
   nir_foreach_instr(instr, block) {
      if (!instr_is_allowed_before_sync(instr))
         return false;
   }

   /* There's another loop or if following and we didn't find a sync */
   if (nir_cf_node_next(&block->cf_node))
      return false;

   /* See if the parent scope will sync for us. */
   if (parent_scope->bar != NULL)
      return true;

   switch (parent_scope->type) {
   case SCOPE_TYPE_SHADER:
      return true;

   case SCOPE_TYPE_IF_MERGE:
      return parent_scope_will_sync(block->cf_node.parent,
                                    parent_scope->parent);

   case SCOPE_TYPE_LOOP_CONT:
      /* In this case, the loop doesn't have a sync of its own so we're
       * expected to be uniform before we hit the continue.
       */
      return false;

   case SCOPE_TYPE_LOOP_BREAK:
      UNREACHABLE("Loops must have a continue scope");

   default:
      UNREACHABLE("Unknown scope type");
   }
}

static bool
block_is_merge(const nir_block *block)
{
   /* If it's unreachable, there is no merge */
   if (block->imm_dom == NULL)
      return false;

   unsigned num_preds = 0;
   set_foreach(&block->predecessors, entry) {
      const nir_block *pred = entry->key;

      /* We don't care about unreachable blocks */
      if (pred->imm_dom == NULL)
         continue;

      num_preds++;
   }

   return num_preds > 1;
}

static void
lower_cf_list(nir_builder *b, nir_def *esc_reg, struct scope *parent_scope,
              struct exec_list *cf_list)
{
   foreach_list_typed_safe(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         if (exec_list_is_empty(&block->instr_list))
            break;

         nir_cursor start = nir_before_block(block);
         nir_cursor end = nir_after_block(block);

         nir_jump_instr *jump = NULL;
         nir_instr *last_instr = nir_block_last_instr(block);
         if (last_instr->type == nir_instr_type_jump) {
            jump = nir_instr_as_jump(last_instr);
            end = nir_before_instr(&jump->instr);
         }

         nir_cf_list instrs;
         nir_cf_extract(&instrs, start, end);
         b->cursor = nir_cf_reinsert(&instrs, b->cursor);

         if (jump != NULL) {
            if (jump->type == nir_jump_halt) {
               /* Halt instructions map to OpExit on NVIDIA hardware and
                * exited lanes never block a bsync.
                */
               nir_instr_remove(&jump->instr);
               nir_builder_instr_insert(b, &jump->instr);
            } else {
               /* Everything else needs a break cascade */
               break_scopes(b, esc_reg, parent_scope, jump->type);
            }
         }
         break;
      }

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);

         nir_def *cond = nif->condition.ssa;
         nir_instr_clear_src(NULL, &nif->condition);

         nir_block *then_block = nir_block_create(b->shader);
         nir_block *else_block = nir_block_create(b->shader);
         nir_block *merge_block = nir_block_create(b->shader);

         struct scope scope = push_scope(b, SCOPE_TYPE_IF_MERGE,
                                         parent_scope, merge_block);

         nir_goto_if(b, then_block, cond, else_block);

         push_block(b, then_block);
         lower_cf_list(b, esc_reg, &scope, &nif->then_list);
         normal_exit(b, esc_reg, merge_block);

         push_block(b, else_block);
         lower_cf_list(b, esc_reg, &scope, &nif->else_list);
         normal_exit(b, esc_reg, merge_block);

         push_block(b, merge_block);
         pop_scope(b, esc_reg, scope);

         break;
      }

      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);

         nir_block *head_block = nir_block_create(b->shader);
         nir_block *break_block = nir_block_create(b->shader);
         nir_block *cont_block = nir_block_create(b->shader);

         /* TODO: We can potentially avoid the break sync for loops when the
          * parent scope syncs for us. However, we still need to handle the
          * continue clause cascading to the break. If there is a
          * nir_jump_halt involved, then we have a real cascade where it needs
          * to then jump to the next scope. Getting all these cases right
          * while avoiding an extra sync for the loop break is tricky at best.
          */
         struct scope break_scope = push_scope(b, SCOPE_TYPE_LOOP_BREAK,
                                               parent_scope,
                                               break_block);

         nir_goto(b, head_block);
         push_block(b, head_block);

         struct scope cont_scope = push_scope(b, SCOPE_TYPE_LOOP_CONT,
                                              &break_scope,
                                              cont_block);

         lower_cf_list(b, esc_reg, &cont_scope, &loop->body);
         normal_exit(b, esc_reg, cont_block);

         push_block(b, cont_block);

         pop_scope(b, esc_reg, cont_scope);

         lower_cf_list(b, esc_reg, &break_scope, &loop->continue_list);

         nir_goto(b, head_block);
         push_block(b, break_block);

         pop_scope(b, esc_reg, break_scope);

         break;
      }

      default:
         UNREACHABLE("Unknown CF node type");
      }
   }
}
static bool
lower_cf_func(nir_function *func)
{
   if (func->impl == NULL)
      return false;

   if (exec_list_is_singular(&func->impl->body)) {
      return nir_no_progress(func->impl);
   }

   nir_function_impl *old_impl = func->impl;

   /* We use this in block_is_merge() */
   nir_metadata_require(old_impl, nir_metadata_dominance | nir_metadata_divergence);

   /* First, we temporarily get rid of SSA. This will make all our block
    * motion way easier. Ask the pass to place reg writes directly in the
    * immediate predecessors of the phis instead of trying to be clever.
    * This will ensure that we never get a write to a uniform register from
    * non-uniform control flow and makes our divergence reconstruction for
    * phis more reliable.
    */
   nir_foreach_block(block, old_impl)
      nir_lower_phis_to_regs_block(block, true);

   /* We create a whole new nir_function_impl and copy the contents over */
   func->impl = NULL;
   nir_function_impl *new_impl = nir_function_impl_create(func);
   new_impl->structured = false;

   /* We copy defs from the old impl */
   new_impl->ssa_alloc = old_impl->ssa_alloc;

   nir_builder b = nir_builder_at(nir_before_impl(new_impl));
   nir_def *esc_reg = nir_decl_reg(&b, 1, 32, 0);

   /* Having a function scope makes everything easier */
   struct scope scope = {
      .type = SCOPE_TYPE_SHADER,
      .merge = new_impl->end_block,
   };
   lower_cf_list(&b, esc_reg, &scope, &old_impl->body);
   normal_exit(&b, esc_reg, new_impl->end_block);

   /* Now sort by reverse PDFS and restore SSA
    *
    * Note: Since we created a new nir_function_impl, there is no metadata,
    * dirty or otherwise, so we have no need to call nir_progress().
    */
   nir_sort_unstructured_blocks(new_impl);

   return true;
}

/* Copied from nak_nir_lower_cf.c but without divergence or barriers */
bool
nocl_nir_lower_cf(struct nir_shader *nir)
{
   bool progress = false;

   nir_foreach_function(func, nir) {
      if (lower_cf_func(func))
         progress = true;
   }

   return progress;
}

nir_shader_compiler_options *
nocl_get_nir_options(struct nocl_screen *nocl)
{
   nir_shader_compiler_options *nir_options = ralloc(nocl, nir_shader_compiler_options);
   *nir_options = (nir_shader_compiler_options) {
      // TODO:
      // .has_bit_test = nocl->ptx >= 60,
      // .has_bitfield_select = nocl->ptx >= 43,
      // .has_f2i32_rtne = true,
      // .has_uclz = true,
      // .has_dot_2x16 = true,
      // .has_fneo_fcmpu = true,
      // .has_fused_comp_and_csel = true,
      // .has_pack_32_4x8 = true,
      // .has_shfr32 = true,

      .fuse_ffma32 = true,
      .has_fsub = true,
      .has_imul24 = true,
      .has_imad32 = true,
      .has_isub = true,
      .has_rotate32 = nocl->sm >= 32,
      .has_sdot_4x8 = nocl->sm >= 61,
      .has_sudot_4x8 = nocl->sm >= 61,
      .has_udot_4x8 = nocl->sm >= 61,
      .has_umul24 = true,
      .has_umad24 = true,
      .lower_hadd = true,
      .lower_find_lsb = true,
      .lower_flrp16 = true,
      .lower_flrp32 = true,
      .lower_flrp64 = true,
      .lower_fisnormal = true,
      .lower_fsat = true,
      .lower_fsign = true,
      .lower_int64_options =
         /* Technically PTX supports it, but it fails to compile with vector
          * sources... The hardware doesn't have an idiv anyway. */
         nir_lower_divmod64 |
         nir_lower_iadd_sat64 |
         nir_lower_imul64 |
         nir_lower_scan_reduce_bitwise64 |
         nir_lower_scan_reduce_iadd64 |
         nir_lower_subgroup_shuffle64 |
         (nocl->sm < 70 ? nir_lower_vote_ieq64 : 0),
      .lower_insert_byte = true,
      .lower_insert_word = true,
      .lower_isign = true,
      .lower_ldexp = true,
      .lower_uadd_sat = true,
      .lower_usub_sat = true,
      .max_unroll_iterations = 32,
      .support_16bit_alu = true,
   };

   return nir_options;
}
static uint8_t
alu_width_cb(const nir_instr *instr, const void *)
{
   return 1;
}

static unsigned
lower_bit_size_callback(const nir_instr *instr, void *data)
{
   const struct nocl_screen *nocl = data;

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned dest_bit_size = alu->def.bit_size;
      unsigned src0_bit_size = alu->src[0].src.ssa->bit_size;
      switch (alu->op) {
      case nir_op_extract_i8:
      case nir_op_extract_i16:
      case nir_op_extract_u8:
      case nir_op_extract_u16:
      case nir_op_bitfield_reverse:
         if (dest_bit_size < 32)
            return 32;
         return 0;
      case nir_op_bcsel:
         if (dest_bit_size < 16)
            return 16;
         return 0;

      case nir_op_bit_count:
      case nir_op_bitfield_insert:
      case nir_op_fdiv:
      case nir_op_frsq:
      case nir_op_fsqrt:
      case nir_op_iadd_sat:
      case nir_op_ibitfield_extract:
      case nir_op_isub_sat:
      case nir_op_ubitfield_extract:
      case nir_op_ufind_msb:
         if (src0_bit_size < 32)
            return 32;
         return 0;

      case nir_op_iabs:
      case nir_op_iadd:
      case nir_op_iand:
      case nir_op_idiv:
      case nir_op_ieq:
      case nir_op_ige:
      case nir_op_ilt:
      case nir_op_imad:
      case nir_op_imax:
      case nir_op_imin:
      case nir_op_imul:
      case nir_op_imul_high:
      case nir_op_ine:
      case nir_op_ineg:
      case nir_op_inot:
      case nir_op_ior:
      case nir_op_irem:
      case nir_op_ishl:
      case nir_op_ishr:
      case nir_op_isub:
      case nir_op_ixor:
      case nir_op_udiv:
      case nir_op_uge:
      case nir_op_ult:
      case nir_op_umax:
      case nir_op_umin:
      case nir_op_umod:
      case nir_op_umul_high:
      case nir_op_ushr:
         if (src0_bit_size == 8)
            return 16;
         return 0;

      case nir_op_fmax:
      case nir_op_fmin:
         if (src0_bit_size == 16 && nocl->sm < 80)
            return 32;
         return 0;

      case nir_op_feq:
      case nir_op_fequ:
      case nir_op_fge:
      case nir_op_fgeu:
      case nir_op_flt:
      case nir_op_fltu:
      case nir_op_fneo:
      case nir_op_fneu:
         if (src0_bit_size == 16 && nocl->sm < 53)
            return 32;
         return 0;
      default:
         return 0;
      }
   }

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_read_invocation:
      case nir_intrinsic_reduce:
      case nir_intrinsic_shuffle:
      case nir_intrinsic_shuffle_down:
      case nir_intrinsic_shuffle_up:
      case nir_intrinsic_shuffle_xor:
         if (intrin->def.bit_size < 32)
            return 32;
         return 0;
      default:
         return 0;
      }
   }

   default:
      return 0;
   }
}
static bool
lower_subgroups_filter(const nir_intrinsic_instr *intrin, const void *data)
{
   const struct nocl_screen *nocl = data;
   if (intrin->def.num_components > 1)
      return true;

   switch (intrin->intrinsic) {
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive:
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb:
   case nir_intrinsic_exclusive_scan:
   case nir_intrinsic_first_invocation:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_inverse_ballot:
   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_quad_vote_all:
   case nir_intrinsic_quad_vote_any:
   case nir_intrinsic_read_first_invocation:
   case nir_intrinsic_rotate:
   case nir_intrinsic_vote_feq:
   /* TODO: exists on SM90+ */
   case nir_intrinsic_elect:
      return true;
   case nir_intrinsic_vote_ieq:
      return nocl->sm < 70;
   case nir_intrinsic_reduce: {
      if (nocl->sm < 80)
         return true;
      unsigned cluster_size = nir_intrinsic_cluster_size(intrin);
      if (cluster_size != 0 && cluster_size != 32)
         return true;

      switch (nir_intrinsic_reduction_op(intrin)) {
      case nir_op_fadd:
      case nir_op_fmax:
      case nir_op_fmin:
      case nir_op_fmul:
      case nir_op_imul:
         return true;
      case nir_op_imax:
      case nir_op_imin:
      case nir_op_umax:
      case nir_op_umin:
         return intrin->def.bit_size == 64;
      default:
         /* nir_lower_int64 will handle some of it for us */
         return false;
      }
   }
   case nir_intrinsic_read_invocation:
   case nir_intrinsic_shuffle:
   case nir_intrinsic_shuffle_down:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_xor:
      return intrin->def.bit_size > 32;
   default:
      return false;
   }
}
static nir_mem_access_size_align
lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin,
                              uint8_t bytes,
                              uint8_t bit_size,
                              uint32_t align,
                              uint32_t align_offset,
                              bool offset_is_const,
                              enum gl_access_qualifier,
                              const void *cb_data)
{
   align = nir_combined_align(align, align_offset);
   bytes = MIN2(MIN2(bytes, align), 16);
   bit_size = MIN2(bit_size, bytes * 8);

   unsigned num_components = MIN2(bytes / (bit_size / 8), 4);
   if (num_components == 3)
      num_components = 2;

   return (nir_mem_access_size_align) {
      .bit_size = bit_size,
      .num_components = num_components,
      .align = align,
      .shift = nir_mem_access_shift_method_scalar,
   };
}

static bool
lower_alu_vec_srcs(nir_builder *b, nir_alu_instr *alu, void *_data)
{
   const nir_op_info *info = &nir_op_infos[alu->op];

   bool changed = false;
   b->cursor = nir_before_instr(&alu->instr);
   for (int i = 0; i < info->num_inputs; i++) {
      if (info->input_sizes[i])
         continue;

      /* We lower everything that is bigger than vec4 _and_ 128 bits */
      nir_def *src_ssa = alu->src[i].src.ssa;
      if (src_ssa->bit_size * src_ssa->num_components <= 128 && src_ssa->num_components <= 4)
         continue;

      changed = true;
      nir_def *comps[NIR_MAX_VEC_COMPONENTS];
      for (int c = 0; c < alu->def.num_components; c++) {
         unsigned swizzle = alu->src[i].swizzle[c];
         alu->src[i].swizzle[c] = c;

         nir_const_value *const_val = nir_src_as_const_value(alu->src[i].src);
         if (const_val) {
            comps[c] = nir_build_imm(b, 1, alu->src[i].src.ssa->bit_size, &const_val[swizzle]);
         } else {
            comps[c] = nir_swizzle(b, alu->src[i].src.ssa, &swizzle, 1);
         }
      }
      nir_def *src = nir_vec(b, comps, alu->def.num_components);
      nir_src_rewrite(&alu->src[i].src, src);
   }

   return changed;
}

/* copied from nir_lower_alu_vec8_16_srcs */
static bool
nir_lower_alu_vec_srcs(nir_shader *shader)
{
   return nir_shader_alu_pass(shader, lower_alu_vec_srcs,
                              nir_metadata_control_flow,
                              NULL);
}
void
nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir)
{
   struct nocl_screen *nocl = nocl_screen(pscreen);
   nir->info.max_subgroup_size = nocl->base.caps.shader_subgroup_size;
   nir->info.min_subgroup_size = nocl->base.caps.shader_subgroup_size;
   nir->info.api_subgroup_size = nocl->base.caps.shader_subgroup_size;

   struct nir_lower_compute_system_values_options sysval_options = {
      .lower_local_invocation_index = true,
      .lower_num_subgroups = true,
      .lower_subgroup_id = true,
   };

   struct nir_lower_mem_access_bit_sizes_options mem_access_bit_sizes_options = {
      .callback = lower_mem_access_bit_sizes_cb,
      .cb_data = nocl,
      .modes = nir_var_all,
   };

   struct nir_lower_subgroups_options subgroup_options = {
      .ballot_bit_size = 32,
      .ballot_components = 1,
      .subgroup_size = 32,
      .lower_elect = true,
      .lower_first_invocation_to_ballot = true,
      .lower_inverse_ballot = true,
      .lower_quad_vote = true,
      .lower_read_first_invocation = true,
      .lower_reduce = true,
      .lower_rotate_to_shuffle = true,
      .lower_shuffle_to_32bit = true,
      .lower_to_scalar = true,
      .lower_vote_feq = true,
      .lower_vote_ieq = nocl->sm < 70,
      .filter = lower_subgroups_filter,
      .filter_data = nocl,
   };

   NIR_PASS(_, nir, nir_lower_compute_system_values, &sysval_options);

   /* TODO: keeping 8 bit vectors would help a bit... */
   NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
   NIR_PASS(_, nir, nir_lower_subgroups, &subgroup_options);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
      NIR_PASS(progress, nir, nir_lower_mem_access_bit_sizes, &mem_access_bit_sizes_options);
      NIR_PASS(progress, nir, nir_lower_alu_width, alu_width_cb, nocl);
      NIR_PASS(progress, nir, nir_lower_alu_vec_srcs);
      NIR_PASS(progress, nir, nir_lower_flrp, 16 | 32 | 64, false);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_cse);
   } while (progress);

   do {
      progress = false;
      NIR_PASS(progress, nir, nir_lower_bit_size, lower_bit_size_callback, nocl);
      NIR_PASS(progress, nir, nir_opt_algebraic_late);
      NIR_PASS(progress, nir, nocl_nir_opt_algebraic_late, nocl);
      if (progress) {
         NIR_PASS(_, nir, nir_opt_constant_folding);
         NIR_PASS(_, nir, nir_copy_prop);
         NIR_PASS(_, nir, nir_opt_dce);
         NIR_PASS(_, nir, nir_opt_cse);
      }
   } while (progress);

   /* Run only once */
   NIR_PASS(progress, nir, nocl_nir_lower_algebraic_late, nocl);
   if (progress) {
      NIR_PASS(_, nir, nir_opt_constant_folding);
   }

   /* After nir_opt_constant_folding */
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
   NIR_PASS(_, nir, nir_opt_dce);
}
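Three worked examples for the passes in this file (all values chosen for illustration): First, the escape cascade in nocl_nir_lower_cf: a break inside an if inside a loop stores the loop-break scope's depth into esc_reg and jumps to the nearest syncing merge block, the if's; pop_scope there finds the stored depth below its own and forwards execution to the next syncing parent merge, repeating until the loop's break block is reached, where the comparison fails and execution continues (the encoding reserves ~0 for "no escape" and 0 for halt, per the comment in pop_scope). Second, bit-size lowering: an 8-bit iadd widens to 16 bits, a sub-32-bit bitfield_reverse to 32 bits, 16-bit fmin/fmax stay 16-bit only on SM >= 80, and half-precision comparisons need SM >= 53. Third, memory access sizing: a 12-byte load with combined alignment 4 clamps bytes to MIN2(MIN2(12, 4), 16) = 4 and becomes a single 32-bit component, while a 16-byte, 16-byte-aligned load keeps MIN2(16 / 4, 4) = 4 components of 32 bits; a would-be 3-component result is rounded down to 2.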
src/gallium/drivers/nocl/nocl_nir_algebraic.py  (new file, 65 lines)

@@ -0,0 +1,65 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT

import argparse
import sys

a = 'a'
b = 'b'
c = 'c'
s = 's'

late_algebraic = [
    (('ior@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)),
    (('iadd@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)),
    (('ixor@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)),
    (('ixor@32', ('iand', a, ('ixor', b, c)), c), ('bitfield_select', a, b, c)),

    (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c)),
    (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c)),
    (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b)),
    (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c)),
    (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c)),
    (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c)),
]

late_algebraic_lowering = [
    (('b2i8', a), ('u2u8', ('b2i16', a))),
]

for bit_size in [16, 32, 64]:
    late_algebraic_lowering += [
        ((f'ishl@{bit_size}', a, b), ('ishl', a, ('iand', b, bit_size - 1))),
        ((f'ishr@{bit_size}', a, b), ('ishr', a, ('iand', b, bit_size - 1))),
        ((f'ushr@{bit_size}', a, b), ('ushr', a, ('iand', b, bit_size - 1))),
    ]

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', required=True, help='Output file.')
    parser.add_argument('-p', '--import-path', required=True)
    args = parser.parse_args()
    sys.path.insert(0, args.import_path)

    import nir_algebraic  # pylint: disable=import-error

    try:
        with open(args.out, 'w', encoding='utf-8') as f:
            f.write('#include "nocl_private.h"')
            f.write(nir_algebraic.AlgebraicPass(
                "nocl_nir_opt_algebraic_late",
                late_algebraic,
                [
                    ("const struct nocl_screen *", "nocl"),
                ]).render())
            f.write(nir_algebraic.AlgebraicPass(
                "nocl_nir_lower_algebraic_late",
                late_algebraic_lowering,
                [
                    ("const struct nocl_screen *", "nocl"),
                ]).render())
    except Exception:
        sys.exit(1)

if __name__ == '__main__':
    main()
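The shift rules above mask the shift count to bit_size - 1, presumably because NIR defines shift counts modulo the bit size while PTX clamps oversized counts; e.g. a 32-bit ishl by 33 becomes a shift by 33 & 31 = 1.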
src/gallium/drivers/nocl/nocl_pipe.c  (new file, 1027 lines)

(File diff suppressed because it is too large.)
src/gallium/drivers/nocl/nocl_private.h  (new file, 147 lines)

@@ -0,0 +1,147 @@
/*
 * Copyright © 2025 Karol Herbst
 * SPDX-License-Identifier: MIT
 */

#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"

#include "nocl/nocl_cuda_public.h"

static const uint32_t NOCL_COMPILER_LOG_SIZE = 0x1000;

enum nocl_dbg {
   NOCL_DBG_32 = BITFIELD_BIT(0),
   NOCL_DBG_JIT_NO_OPTS = BITFIELD_BIT(1),
   NOCL_DBG_NIR = BITFIELD_BIT(2),
   NOCL_DBG_PTX = BITFIELD_BIT(3),
   NOCL_DBG_VERBOSE = BITFIELD_BIT(4),
};

#define NOCL_CALL(func) nocl_error(nocl_dispatch.func)

static inline bool
nocl_error(CUresult err)
{
   if (err == CUDA_SUCCESS)
      return false;

   const char *err_str;
   if (nocl_dispatch.cuGetErrorString(err, &err_str)) {
      printf("NOCL-ERROR: UNKNOWN\n");
   } else {
      printf("NOCL-ERROR: %s\n", err_str);
   }

   return true;
}

struct nocl_screen {
   struct pipe_screen base;
#if CUDA_VERSION >= 12090
   CUlogsCallbackHandle verbose_handle;
#endif
   CUdevice cu_dev;
   CUcontext cu_ctx;
   struct disk_cache *disk_cache;
   uint32_t debug;
   uint16_t ptx;
   uint16_t sm;
   char name[128];
};

struct nocl_buffer {
   struct pipe_resource base;
   union {
      CUdeviceptr dptr;
      void *ptr;
   };
};

struct nocl_program {
   CUmodule cu_mod;
   CUfunction cu_func;
   uint32_t shared_size;
};

struct nocl_context {
   struct pipe_context base;
   void *kernel_input;
   uint32_t kernel_input_size;
   uint32_t kernel_input_capacity;
   CUstream cu_stream;
   CUevent cu_timestamp_start;
   uint64_t timestamp_start;
   struct nocl_program *prog;
};

struct pipe_fence_handle {
   struct pipe_reference ref;
   CUevent cu_event;
};

struct nocl_transfer {
   struct pipe_transfer base;
   void *ptr;
};

struct pipe_query {
   CUevent cu_event_base;
   CUevent cu_event;
};

static inline struct nocl_screen*
nocl_screen(struct pipe_screen *pscreen)
{
   return (struct nocl_screen *)pscreen;
}

static inline struct nocl_screen*
nocl_screen_and_make_current(struct pipe_screen *pscreen)
{
   struct nocl_screen *nocl = nocl_screen(pscreen);
   NOCL_CALL(cuCtxSetCurrent(nocl->cu_ctx));
   return nocl;
}

static inline struct nocl_context*
nocl_context(struct pipe_context *pctx)
{
   return (struct nocl_context *)pctx;
}

static inline struct nocl_context*
nocl_context_and_make_current(struct pipe_context *pctx)
{
   nocl_screen_and_make_current(pctx->screen);
   return nocl_context(pctx);
}

static inline struct nocl_program*
nocl_program(void *prog)
{
   return (struct nocl_program *)prog;
}

static inline struct nocl_buffer*
nocl_buffer(struct pipe_resource *pres)
{
   return (struct nocl_buffer *)pres;
}

static inline struct nocl_transfer*
nocl_transfer(struct pipe_transfer *pxfer)
{
   return (struct nocl_transfer *)pxfer;
}

void nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir);
bool nocl_nir_lower_cf(struct nir_shader *nir);
bool nocl_nir_opt_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl);
bool nocl_nir_lower_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl);
struct nir_shader_compiler_options *nocl_get_nir_options(struct nocl_screen *nocl);
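NOCL_CALL routes a call through the dispatch table and funnels the result into nocl_error(), so call sites stay one-liners. A minimal sketch reusing the cuCtxSetCurrent call from the header above (the wrapper function itself is hypothetical):

   static bool
   make_ctx_current(struct nocl_screen *nocl)
   {
      /* Expands to nocl_error(nocl_dispatch.cuCtxSetCurrent(nocl->cu_ctx)):
       * prints the CUDA error string once and returns true on failure. */
      if (NOCL_CALL(cuCtxSetCurrent(nocl->cu_ctx)))
         return false;
      return true;
   }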
@@ -740,17 +740,50 @@ impl DeviceBase {
        }

        if self.subgroups_supported() {
            add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle);
            add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative);
            add_cap(SpvCapability::SpvCapabilityGroups);
            add_cap(SpvCapability::SpvCapabilitySubgroupDispatch);
            // requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS
            //add_ext(1, 0, 0, "cl_khr_subgroups");
            add_ext(1, 0, 0, "cl_khr_subgroup_extended_types");
            add_feat(1, 0, 0, "__opencl_c_subgroups");

            // we have lowering in `nir_lower_subgroups`, drivers can just use that
            if self.subgroup_ballot_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformBallot);
                add_ext(1, 0, 0, "cl_khr_subgroup_ballot");
            }

            if self.subgroup_clustered_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformClustered);
                add_ext(1, 0, 0, "cl_khr_subgroup_clustered_reduce");
            }

            if self.subgroup_non_uniform_arithmetic_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformArithmetic);
                add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_arithmetic");
            }

            if self.subgroup_non_uniform_vote_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniform);
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformVote);
                add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_vote");
            }

            if self.subgroup_rotate_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformRotateKHR);
                add_ext(1, 0, 0, "cl_khr_subgroup_rotate");
                add_spirv(c"SPV_KHR_subgroup_rotate");
            }

            if self.subgroup_shuffle_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle);
                add_ext(1, 0, 0, "cl_khr_subgroup_shuffle");
            }

            if self.subgroup_shuffle_relative_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative);
                add_ext(1, 0, 0, "cl_khr_subgroup_shuffle_relative");
            }

            if self.intel_subgroups_supported() {
                // add_cap(SpvCapability::SpvCapabilitySubgroupBufferBlockIOINTEL);
                // add_cap(SpvCapability::SpvCapabilitySubgroupImageBlockIOINTEL);
@@ -1173,6 +1206,58 @@ impl DeviceBase {
        // supported, doing it without shareable shaders isn't practical
        self.max_subgroups() > 0
            && (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders()))
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_BASIC
                != 0
    }

    pub fn subgroup_ballot_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_BALLOT
                != 0
    }

    pub fn subgroup_clustered_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED
                != 0
    }

    pub fn subgroup_non_uniform_arithmetic_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_ARITHMETIC
                != 0
    }

    pub fn subgroup_non_uniform_vote_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_VOTE
                != 0
    }

    pub fn subgroup_rotate_supported(&self) -> bool {
        let mask =
            PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED;
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features & mask == mask
    }

    pub fn subgroup_shuffle_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE
                != 0
    }

    pub fn subgroup_shuffle_relative_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE_RELATIVE
                != 0
    }

    pub fn system_svm_supported(&self) -> bool {
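Note that subgroup_rotate_supported() is stricter than its siblings: it requires both the ROTATE and ROTATE_CLUSTERED feature bits (mask == mask), whereas the other queries test a single bit with != 0.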
@@ -1255,8 +1340,14 @@ impl DeviceBase {
             intel_subgroups: self.intel_subgroups_supported(),
             kernel_clock: self.kernel_clock_supported(),
             subgroups: subgroups_supported,
-            subgroups_shuffle: subgroups_supported,
-            subgroups_shuffle_relative: subgroups_supported,
+            subgroups_ballot: self.subgroup_ballot_supported(),
+            subgroups_clustered: self.subgroup_clustered_supported(),
+            subgroups_extended_types: subgroups_supported,
+            subgroups_non_uniform_arithmetic: self.subgroup_non_uniform_arithmetic_supported(),
+            subgroups_non_uniform_vote: self.subgroup_non_uniform_vote_supported(),
+            subgroups_rotate: self.subgroup_rotate_supported(),
+            subgroups_shuffle: self.subgroup_shuffle_supported(),
+            subgroups_shuffle_relative: self.subgroup_shuffle_relative_supported(),
             ..Default::default()
         }
     }
@@ -99,6 +99,7 @@ fn get_enabled_devs() -> HashMap<String, u32> {
 
         let driver_str = match driver_str[0] {
             "llvmpipe" | "lp" => "swrast",
+            "cuda" => "nocl",
             "freedreno" => "msm",
             a => a,
         };
@@ -698,6 +698,14 @@ enum pipe_conservative_raster_mode
 #define PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED (1 << 6)
 #define PIPE_SHADER_SUBGROUP_FEATURE_QUAD      (1 << 7)
 #define PIPE_SHADER_SUBGROUP_NUM_FEATURES      8
+/* VK_SUBGROUP_FEATURE_ROTATE_BIT */
+#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE (1 << 9)
+/* VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT */
+#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED (1 << 10)
+#define PIPE_SHADER_SUBGROUP_FEATURE_MASK \
+   (BITFIELD_MASK(PIPE_SHADER_SUBGROUP_NUM_FEATURES) | \
+    PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | \
+    PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED)
 
 enum pipe_point_size_lower_mode {
    PIPE_POINT_SIZE_LOWER_ALWAYS,
@@ -206,6 +206,12 @@ if with_gallium_d3d12
 else
   driver_d3d12 = declare_dependency()
 endif
+if with_gallium_nocl
+  subdir('winsys/nocl')
+  subdir('drivers/nocl')
+else
+  driver_nocl = declare_dependency()
+endif
 if with_gallium_rusticl
   subdir('frontends/rusticl')
   subdir('targets/rusticl')
|
||||
|
|
|
|||
|
|
@@ -34,6 +34,7 @@ librusticl = shared_library(
  ],
  dependencies : [
    driver_asahi,
    driver_nocl,
    driver_freedreno,
    driver_iris,
    driver_nouveau,
25
src/gallium/winsys/nocl/meson.build
Normal file
@@ -0,0 +1,25 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT

libnocl_cuda = static_library(
  'nocl_cuda',
  [
    'nocl_cuda.c',
    'nocl_cuda_public.h',
  ],
  include_directories : [
    inc_util,
    inc_gallium,
    inc_gallium_aux,
  ],
  dependencies : [
    dep_libcuda,
  ],
)

idep_libnoclwinsys = declare_dependency(
  include_directories : [
    include_directories('.'),
  ],
  link_with : [libnocl_cuda],
)
212
src/gallium/winsys/nocl/nocl_cuda.c
Normal file
@@ -0,0 +1,212 @@
/*
 * Copyright © 2025 Karol Herbst
 *
 * SPDX-License-Identifier: MIT
 */

#include "pipe-loader/pipe_loader.h"
#include "pipe-loader/pipe_loader_priv.h"
#include "target-helpers/inline_debug_helper.h"
#include "util/u_dl.h"
#include "util/u_memory.h"

#include "cuda.h"
#include "nocl_cuda_public.h"


#define cuda_err(err) if (err) { \
   const char *err_str; \
   nocl_dispatch.cuGetErrorString(err, &err_str); \
   printf("CUDA returned error: %s\n", err_str); \
   return 0; \
}

static struct driOptionDescription nocl_dri_conf = {};
static struct util_dl_library *libcuda = NULL;
struct cuda_symbol_table nocl_dispatch = {};

static struct pipe_screen *
nocl_loader_create_screen(struct pipe_loader_device *dev,
                          const struct pipe_screen_config *config,
                          bool sw_vk)
{
   struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)dev;
   struct pipe_screen *pscreen = nocl_create_screen(cuda_dev);
   return pscreen ? debug_screen_wrap(pscreen) : NULL;
}

static const struct driOptionDescription *
nocl_get_driconf(struct pipe_loader_device *dev, unsigned *count)
{
   *count = 0;
   return &nocl_dri_conf;
}

static void
nocl_release(struct pipe_loader_device **dev)
{
   struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)*dev;
   FREE(cuda_dev);
}

static struct pipe_loader_ops cuda_loader_ops = {
   .create_screen = nocl_loader_create_screen,
   .release = nocl_release,
   .get_driconf = nocl_get_driconf,
};

static int32_t faked_version = 0;

static bool
nocl_load_pfn(const char *name, void **pfn, int version)
{
   if (version > faked_version) {
      *pfn = NULL;
      return false;
   }

   CUdriverProcAddressQueryResult proc_result;
   CUresult res = nocl_dispatch.cuGetProcAddress_v2(name, pfn, version, CU_GET_PROC_ADDRESS_LEGACY_STREAM, &proc_result);
   bool success = res == CUDA_SUCCESS && proc_result == CU_GET_PROC_ADDRESS_SUCCESS;
   /* Probably always NULL on errors but let's be sure */
   if (!success)
      *pfn = NULL;
   return success;
}
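For reference, and not part of the commit: cuGetProcAddress resolves a driver symbol at a requested ABI version, so asking for "cuMemAlloc" at version 3020 hands back the cuMemAlloc_v2 entry point; that is why the dispatch slots loaded below carry _v2 suffixes while the lookup names do not. A minimal standalone sketch of the same lookup, assuming CUDA 12 headers and linking directly against libcuda:

#include <stdio.h>

#include <cuda.h>
#include <cudaTypedefs.h>

int main(void)
{
   PFN_cuMemAlloc_v3020 mem_alloc = NULL;
   CUdriverProcAddressQueryResult query_result;

   /* ask the driver for the 3020 ABI of cuMemAlloc, i.e. the cuMemAlloc_v2 export */
   CUresult res = cuGetProcAddress("cuMemAlloc", (void **)&mem_alloc, 3020,
                                   CU_GET_PROC_ADDRESS_DEFAULT, &query_result);
   if (res == CUDA_SUCCESS && query_result == CU_GET_PROC_ADDRESS_SUCCESS)
      printf("resolved cuMemAlloc (3020 ABI) at %p\n", (void *)mem_alloc);
   return 0;
}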
int
pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev)
{
   if (!libcuda) {
      UNUSED void *func = NULL;
      faked_version = debug_get_num_option("NOCL_API_TARGET", INT32_MAX);
      libcuda = util_dl_open("libcuda.so");
      if (!libcuda)
         return 0;

      /* Needs 12.0 */
      nocl_dispatch.cuGetProcAddress_v2 = (void*)util_dl_get_proc_address(libcuda, "cuGetProcAddress_v2");
      if (!nocl_dispatch.cuGetProcAddress_v2) {
         return 0;
      }

      bool success = true;
      success &= nocl_load_pfn("cuCtxCreate", (void**)&nocl_dispatch.cuCtxCreate_v2, 3020);
      success &= nocl_load_pfn("cuCtxDestroy", (void**)&nocl_dispatch.cuCtxDestroy_v2, 4000);
      success &= nocl_load_pfn("cuCtxSetCurrent", (void**)&nocl_dispatch.cuCtxSetCurrent, 4000);
      success &= nocl_load_pfn("cuDeviceGet", (void**)&nocl_dispatch.cuDeviceGet, 2000);
      success &= nocl_load_pfn("cuDeviceGetAttribute", (void**)&nocl_dispatch.cuDeviceGetAttribute, 2000);
      success &= nocl_load_pfn("cuDeviceGetCount", (void**)&nocl_dispatch.cuDeviceGetCount, 2000);
      success &= nocl_load_pfn("cuDeviceGetName", (void**)&nocl_dispatch.cuDeviceGetName, 2000);
      success &= nocl_load_pfn("cuDeviceTotalMem", (void**)&nocl_dispatch.cuDeviceTotalMem_v2, 3020);
      success &= nocl_load_pfn("cuDriverGetVersion", (void**)&nocl_dispatch.cuDriverGetVersion, 2020);
      success &= nocl_load_pfn("cuEventCreate", (void**)&nocl_dispatch.cuEventCreate, 2000);
      success &= nocl_load_pfn("cuEventDestroy", (void**)&nocl_dispatch.cuEventDestroy_v2, 4000);
      success &= nocl_load_pfn("cuEventElapsedTime", (void**)&nocl_dispatch.cuEventElapsedTime_v1, 2000);
      success &= nocl_load_pfn("cuEventRecord", (void**)&nocl_dispatch.cuEventRecord, 2000);
      success &= nocl_load_pfn("cuEventSynchronize", (void**)&nocl_dispatch.cuEventSynchronize, 2000);
      success &= nocl_load_pfn("cuFuncGetAttribute", (void**)&nocl_dispatch.cuFuncGetAttribute, 2020);
      success &= nocl_load_pfn("cuGetErrorString", (void**)&nocl_dispatch.cuGetErrorString, 6000);
      success &= nocl_load_pfn("cuInit", (void**)&nocl_dispatch.cuInit, 2000);
      success &= nocl_load_pfn("cuLaunchKernel", (void**)&nocl_dispatch.cuLaunchKernel, 4000);
      success &= nocl_load_pfn("cuLinkAddData", (void**)&nocl_dispatch.cuLinkAddData_v2, 6050);
      success &= nocl_load_pfn("cuLinkComplete", (void**)&nocl_dispatch.cuLinkComplete, 5050);
      success &= nocl_load_pfn("cuLinkCreate", (void**)&nocl_dispatch.cuLinkCreate_v2, 6050);
      success &= nocl_load_pfn("cuLinkDestroy", (void**)&nocl_dispatch.cuLinkDestroy, 5050);
      success &= nocl_load_pfn("cuMemAlloc", (void**)&nocl_dispatch.cuMemAlloc_v2, 3020);
      success &= nocl_load_pfn("cuMemAllocHost", (void**)&nocl_dispatch.cuMemAllocHost_v2, 3020);
      success &= nocl_load_pfn("cuMemcpy", (void**)&nocl_dispatch.cuMemcpy, 4000);
      success &= nocl_load_pfn("cuMemcpyAsync", (void**)&nocl_dispatch.cuMemcpyAsync, 4000);
      success &= nocl_load_pfn("cuMemcpyDtoHAsync", (void**)&nocl_dispatch.cuMemcpyDtoHAsync_v2, 3020);
      success &= nocl_load_pfn("cuMemcpyHtoD", (void**)&nocl_dispatch.cuMemcpyHtoD_v2, 3020);
      success &= nocl_load_pfn("cuMemFree", (void**)&nocl_dispatch.cuMemFree_v2, 3020);
      success &= nocl_load_pfn("cuMemFreeHost", (void**)&nocl_dispatch.cuMemFreeHost, 2000);
      success &= nocl_load_pfn("cuMemsetD8Async", (void**)&nocl_dispatch.cuMemsetD8Async, 3020);
      success &= nocl_load_pfn("cuMemsetD16Async", (void**)&nocl_dispatch.cuMemsetD16Async, 3020);
      success &= nocl_load_pfn("cuMemsetD32Async", (void**)&nocl_dispatch.cuMemsetD32Async, 3020);
      success &= nocl_load_pfn("cuModuleGetFunction", (void**)&nocl_dispatch.cuModuleGetFunction, 2000);
      success &= nocl_load_pfn("cuModuleLoadDataEx", (void**)&nocl_dispatch.cuModuleLoadDataEx, 2010);
      success &= nocl_load_pfn("cuModuleUnload", (void**)&nocl_dispatch.cuModuleUnload, 2000);
      success &= nocl_load_pfn("cuStreamDestroy", (void**)&nocl_dispatch.cuStreamDestroy_v2, 4000);
      success &= nocl_load_pfn("cuStreamSynchronize", (void**)&nocl_dispatch.cuStreamSynchronize, 2000);
      if (!success)
         return 0;

      /* Only documented difference appears to be that v2 won't return errors on previous async
       * commands, so just overwrite the previously fetched pointer */
      nocl_load_pfn("cuEventElapsedTime", &func, 12080);
      if (func)
         nocl_dispatch.cuEventElapsedTime_v1 = func;

      nocl_dispatch.all_required_available = true;
#if CUDA_VERSION >= 11040
      success &= nocl_load_pfn("cuDeviceGetUuid", (void**)&nocl_dispatch.cuDeviceGetUuid_v2, 11040);
#endif
#if CUDA_VERSION >= 12090
      nocl_load_pfn("cuLogsRegisterCallback", (void**)&nocl_dispatch.cuLogsRegisterCallback, 12090);
      nocl_load_pfn("cuLogsUnregisterCallback", (void**)&nocl_dispatch.cuLogsUnregisterCallback, 12090);
#endif
   }
   if (!nocl_dispatch.all_required_available)
      return 0;

   int driver_version;
   CUresult res = nocl_dispatch.cuInit(0);
   cuda_err(res);

   res = nocl_dispatch.cuDriverGetVersion(&driver_version);
   cuda_err(res);
   /* Our highest version requirement across all required APIs */
   if (driver_version < 8000)
      return 0;

   if (!ndev) {
      res = nocl_dispatch.cuDeviceGetCount(&ndev);
      cuda_err(res);
      return ndev;
   }

   if (!devs)
      return 0;

   int cnt;
   res = nocl_dispatch.cuDeviceGetCount(&cnt);
   cuda_err(res);
   for (int i = 0; i < ndev; i++) {
      if (i >= cnt)
         return i;

      CUdevice cuda_dev;
      res = nocl_dispatch.cuDeviceGet(&cuda_dev, i);
      if (res)
         return i;

      int pci_device_id;
#if CUDA_VERSION >= 12080
      res = nocl_dispatch.cuDeviceGetAttribute(&pci_device_id, CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID, cuda_dev);
      cuda_err(res);
#else
      /* TODO: we might need to get the device id from somewhere, but so far
       * nothing needs it */
      pci_device_id = 0x10de;
#endif

      struct cuda_pipe_loader_device *dev = CALLOC_STRUCT(cuda_pipe_loader_device);
      *dev = (struct cuda_pipe_loader_device) {
         .base = {
            .driver_name = "nocl",
            .ops = &cuda_loader_ops,
            /* TODO: tegra */
            .type = PIPE_LOADER_DEVICE_PCI,
            .u.pci.chip_id = ((unsigned int)pci_device_id) >> 16,
            .u.pci.vendor_id = pci_device_id & 0xffff,
         },
         .dev = cuda_dev,
         .cuda_version = driver_version,
      };

      devs[i] = &dev->base;
   }

   return ndev;
}
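Not part of the commit, but for orientation: the probe follows the common pipe-loader two-call pattern. Called with no device array it only reports how many CUDA devices exist; called again with an array it fills it in. A hypothetical caller (helper name and error handling invented; assumes the probe's prototype is visible via the pipe-loader headers) could look like:

#include <stdlib.h>

#include "pipe-loader/pipe_loader.h"

/* hypothetical: enumerate all CUDA devices via pipe_loader_cuda_probe() */
static int
enumerate_nocl_devices(struct pipe_loader_device ***out_devs)
{
   /* first call: no array, just ask for the device count */
   int n = pipe_loader_cuda_probe(NULL, 0);
   if (n <= 0)
      return 0;

   struct pipe_loader_device **devs = calloc(n, sizeof(*devs));
   if (!devs)
      return 0;

   /* second call: fill the array with up to n probed devices */
   n = pipe_loader_cuda_probe(devs, n);
   *out_devs = devs;
   return n;
}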
73
src/gallium/winsys/nocl/nocl_cuda_public.h
Normal file
@@ -0,0 +1,73 @@
/*
 * Copyright © 2025 Karol Herbst
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef __NOCL_DRM_PUBLIC_H__
#define __NOCL_DRM_PUBLIC_H__

#include <cudaTypedefs.h>

#include "pipe-loader/pipe_loader.h"

struct cuda_pipe_loader_device {
   struct pipe_loader_device base;
   CUdevice dev;
   int cuda_version;
};

extern struct cuda_symbol_table {
   PFN_cuCtxCreate_v3020 cuCtxCreate_v2;
   PFN_cuCtxDestroy_v4000 cuCtxDestroy_v2;
   PFN_cuCtxSetCurrent_v4000 cuCtxSetCurrent;
   PFN_cuDeviceGet_v2000 cuDeviceGet;
   PFN_cuDeviceGetAttribute_v2000 cuDeviceGetAttribute;
   PFN_cuDeviceGetCount_v2000 cuDeviceGetCount;
   PFN_cuDeviceGetName_v2000 cuDeviceGetName;
   PFN_cuDeviceTotalMem_v3020 cuDeviceTotalMem_v2;
   PFN_cuDriverGetVersion_v2020 cuDriverGetVersion;
   PFN_cuEventCreate_v2000 cuEventCreate;
   PFN_cuEventDestroy_v4000 cuEventDestroy_v2;
   PFN_cuEventElapsedTime_v2000 cuEventElapsedTime_v1;
   PFN_cuEventRecord_v2000 cuEventRecord;
   PFN_cuEventSynchronize_v2000 cuEventSynchronize;
   PFN_cuFuncGetAttribute_v2020 cuFuncGetAttribute;
   PFN_cuGetErrorString_v6000 cuGetErrorString;
   PFN_cuGetProcAddress_v12000 cuGetProcAddress_v2;
   PFN_cuInit_v2000 cuInit;
   PFN_cuLaunchKernel_v4000 cuLaunchKernel;
   PFN_cuLinkAddData_v6050 cuLinkAddData_v2;
   PFN_cuLinkComplete_v5050 cuLinkComplete;
   PFN_cuLinkCreate_v6050 cuLinkCreate_v2;
   PFN_cuLinkDestroy_v5050 cuLinkDestroy;
   PFN_cuMemAlloc_v3020 cuMemAlloc_v2;
   PFN_cuMemAllocHost_v3020 cuMemAllocHost_v2;
   PFN_cuMemcpy_v4000 cuMemcpy;
   PFN_cuMemcpyAsync_v4000 cuMemcpyAsync;
   PFN_cuMemcpyDtoHAsync_v3020 cuMemcpyDtoHAsync_v2;
   PFN_cuMemcpyHtoD_v3020 cuMemcpyHtoD_v2;
   PFN_cuMemFree_v3020 cuMemFree_v2;
   PFN_cuMemFreeHost_v2000 cuMemFreeHost;
   PFN_cuMemsetD8Async_v3020 cuMemsetD8Async;
   PFN_cuMemsetD16Async_v3020 cuMemsetD16Async;
   PFN_cuMemsetD32Async_v3020 cuMemsetD32Async;
   PFN_cuModuleGetFunction_v2000 cuModuleGetFunction;
   PFN_cuModuleLoadDataEx_v2010 cuModuleLoadDataEx;
   PFN_cuModuleUnload_v2000 cuModuleUnload;
   PFN_cuStreamDestroy_v4000 cuStreamDestroy_v2;
   PFN_cuStreamSynchronize_v2000 cuStreamSynchronize;
#if CUDA_VERSION >= 11040
   PFN_cuDeviceGetUuid_v11040 cuDeviceGetUuid_v2;
#endif
#if CUDA_VERSION >= 12090
   PFN_cuLogsRegisterCallback_v12090 cuLogsRegisterCallback;
   PFN_cuLogsUnregisterCallback_v12090 cuLogsUnregisterCallback;
#endif
   bool all_required_available;
} nocl_dispatch;

typedef int CUdevice;
struct pipe_screen *nocl_create_screen(struct cuda_pipe_loader_device *dev);

#endif /* __NOCL_DRM_PUBLIC_H__ */
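A final illustration, again not from the commit: once the probe has populated nocl_dispatch, driver code is expected to reach CUDA only through this table rather than through libcuda's exported symbols. A hypothetical helper built on the declarations above (name invented; assumes a successfully probed device):

#include <stdio.h>

#include "nocl_cuda_public.h"

/* hypothetical: print the marketing name of an already-probed device */
static void
nocl_print_device_name(const struct cuda_pipe_loader_device *dev)
{
   char name[256];
   if (nocl_dispatch.cuDeviceGetName(name, sizeof(name), dev->dev) == CUDA_SUCCESS)
      printf("NoCL device: %s (CUDA driver version %d)\n", name, dev->cuda_version);
}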