Merge branch 'rusticl/cuda/what_the_fuck' into 'main'

add a new gallium compute-only driver for NVIDIA hardware, named nocl

See merge request mesa/mesa!37831
Karol Herbst 2025-12-20 00:49:03 +00:00
commit 74e785ff41
30 changed files with 3916 additions and 28 deletions

docs/drivers/nocl.rst Normal file

@ -0,0 +1,13 @@
NoCL
====
Overview
--------
The NoCL driver is a Gallium driver that targets the CUDA driver API and
therefore can be used to layer OpenCL on top of the NVIDIA proprietary driver.
The minimum supported CUDA API version is 6.5.
It translates NIR to PTX and relies on the CUDA driver's internal JIT compiler to
generate device code.
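
For illustration only, the flow such a layer builds on top of the CUDA driver API
looks roughly like the sketch below. The function name ``run_ptx``, the kernel
entry name ``"entry_main"`` and the launch dimensions are hypothetical
placeholders, not part of the driver, and error handling is reduced to a minimum.

.. code-block:: c

   #include <stddef.h>
   #include <cuda.h>

   /* Sketch: JIT a PTX string with the driver's internal compiler and launch it. */
   static CUresult run_ptx(CUcontext ctx, const char *ptx, size_t ptx_len, CUdeviceptr buf)
   {
      CUlinkState link;
      void *cubin;
      size_t cubin_size;
      CUmodule mod;
      CUfunction func;

      cuCtxSetCurrent(ctx);

      /* Feed the PTX to the in-driver JIT and get back device code. */
      CUresult res = cuLinkCreate(0, NULL, NULL, &link);
      if (res != CUDA_SUCCESS)
         return res;
      cuLinkAddData(link, CU_JIT_INPUT_PTX, (void *)ptx, ptx_len, "kernel.ptx",
                    0, NULL, NULL);
      cuLinkComplete(link, &cubin, &cubin_size);

      /* Load the device code and look up the (hypothetical) kernel entry point. */
      cuModuleLoadDataEx(&mod, cubin, 0, NULL, NULL);
      cuLinkDestroy(link);
      cuModuleGetFunction(&func, mod, "entry_main");

      /* Kernel parameters are passed as an array of pointers to the arguments. */
      void *params[] = { &buf };
      res = cuLaunchKernel(func, 64, 1, 1,   /* grid dimensions  */
                           256, 1, 1,        /* block dimensions */
                           0, NULL,          /* shared memory, stream */
                           params, NULL);
      cuStreamSynchronize(NULL);
      return res;
   }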


@ -2161,6 +2161,29 @@ Freedreno driver environment variables
Other Gallium drivers have their own environment variables. These may
change frequently so the source code should be consulted for details.
NoCL driver environment variables
---------------------------------
.. envvar:: NOCL_DEBUG
Debug flags for the NoCL driver.
.. envvar:: NOCL_API_TARGET
Makes the driver assume a lower CUDA driver API version than the one actually
loaded at runtime. Useful to ensure that code runs on older driver versions.
For development only.
.. envvar:: NOCL_PTX_VERSION
Overrides the PTX target version with the specified value, given without the
dot. Can be used to ensure code compiles successfully on various CUDA versions.
For development only.
.. envvar:: NOCL_SM_TARGET
Overrides the PTX target SM level with the specified value. Can be used to
ensure code compiles successfully on various GPUs. For development only.
Vulkan loader environment variables
-----------------------------------


@ -820,7 +820,7 @@ Rusticl Optional OpenCL 2.x Features:
Device and host timer synchronization DONE (freedreno, iris, llvmpipe, radeonsi, zink)
OpenCL C 2.0 in progress
- Memory Consistency Model (atomics) not started
- Sub-groups DONE (iris, llvmpipe, radeonsi, asahi)
- Sub-groups DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
- Work-group Collective Functions not started
- Generic Address Space in progress
cl_khr_il_program DONE
@ -840,7 +840,7 @@ Rusticl extensions:
cl_khr_d3d10_sharing not started
cl_khr_d3d11_sharing not started
cl_khr_device_enqueue_local_arg_types not started
cl_khr_device_uuid DONE (freedreno, iris, llvmpipe, radeonsi, zink)
cl_khr_device_uuid DONE (freedreno, iris, llvmpipe, nocl, radeonsi, zink)
cl_khr_dx9_media_sharing not started
cl_khr_egl_event not started
cl_khr_egl_image not started
@ -855,7 +855,7 @@ Rusticl extensions:
cl_khr_external_semaphore_opaque_fd not started
cl_khr_external_semaphore_sync_fd DONE (radeonsi, zink)
cl_khr_external_semaphore_win32 not started
cl_khr_fp16 DONE (asahi, freedreno, llvmpipe, panfrost, radeonsi, zink)
cl_khr_fp16 DONE (asahi, freedreno, llvmpipe, nocl, panfrost, radeonsi, zink)
cl_khr_gl_depth_images not started
cl_khr_gl_event not started
cl_khr_gl_msaa_sharing not started
@ -865,10 +865,10 @@ Rusticl extensions:
cl_khr_int64_base_atomics not started
cl_khr_int64_extended_atomics not started
cl_khr_integer_dot_product DONE
cl_khr_kernel_clock DONE (freedreno, iris, llvmpipe, nvc0, panfrost, radeonsi, zink, needs llvm-19)
cl_khr_kernel_clock DONE (freedreno, iris, llvmpipe, nocl, nvc0, panfrost, radeonsi, zink, needs llvm-19)
cl_khr_mipmap_image not started
cl_khr_mipmap_image_writes not started
cl_khr_pci_bus_info DONE (iris, nvc0, radeonsi, zink)
cl_khr_pci_bus_info DONE (iris, nocl, nvc0, radeonsi, zink)
cl_khr_priority_hints DONE (asahi, freedreno, iris, panfrost, radeonsi)
cl_khr_semaphore DONE (radeonsi, zink)
cl_khr_spirv_extended_debug_info not started
@ -876,21 +876,21 @@ Rusticl extensions:
cl_khr_spirv_no_integer_wrap_decoration DONE
cl_khr_spirv_queries DONE
cl_khr_srgb_image_writes not started
cl_khr_subgroup_ballot not started
cl_khr_subgroup_clustered_reduce not started
cl_khr_subgroup_extended_types not started
cl_khr_subgroup_ballot DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
cl_khr_subgroup_clustered_reduce DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
cl_khr_subgroup_extended_types DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
cl_khr_subgroup_named_barrier not started
cl_khr_subgroup_non_uniform_arithmetic not started
cl_khr_subgroup_non_uniform_vote not started
cl_khr_subgroup_rotate not started
cl_khr_subgroup_shuffle DONE (iris, llvmpipe, radeonsi, asahi)
cl_khr_subgroup_shuffle_relative DONE (iris, llvmpipe, radeonsi, asahi)
cl_khr_subgroup_non_uniform_arithmetic DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
cl_khr_subgroup_non_uniform_vote DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
cl_khr_subgroup_rotate DONE (iris, llvmpipe, zink)
cl_khr_subgroup_shuffle DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
cl_khr_subgroup_shuffle_relative DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
cl_khr_subgroups in progress
cl_khr_suggested_local_work_size DONE
cl_khr_terminate_context not started
cl_khr_throttle_hints not started
cl_khr_work_group_uniform_arithmetic not started
cl_ext_buffer_device_address DONE (iris, llvmpipe, radeonsi, zink)
cl_ext_buffer_device_address DONE (iris, llvmpipe, nocl, radeonsi, zink)
cl_ext_cxx_for_opencl not started
cl_ext_device_fission not started
cl_ext_float_atomics not started


@ -86,6 +86,7 @@ Linux, FreeBSD, and other operating systems.
drivers/kosmickrisp
drivers/lima
drivers/llvmpipe
drivers/nocl
drivers/nvk
drivers/panfrost
drivers/powervr


@ -221,6 +221,7 @@ with_gallium_d3d12 = gallium_drivers.contains('d3d12')
with_gallium_asahi = gallium_drivers.contains('asahi')
with_gallium_rocket = gallium_drivers.contains('rocket')
with_gallium_ethosu = gallium_drivers.contains('ethosu')
with_gallium_nocl = gallium_drivers.contains('nocl')
foreach gallium_driver : gallium_drivers
pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper())
endforeach
@ -778,7 +779,7 @@ if with_gallium_rusticl
endif
with_virtgpu_kumquat = get_option('virtgpu_kumquat') and with_gfxstream_vk
if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat
if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat or with_gallium_nocl
# rust.bindgen() does not pass `--rust-target` to bindgen until 1.7.0.
if meson.version().version_compare('< 1.7.0')
error('Mesa Rust support requires Meson 1.7.0 or newer')
@ -1641,6 +1642,16 @@ elif with_shader_cache
error('Shader Cache requires compression')
endif
if with_gallium_nocl
dep_libcuda = dependency(
'cuda',
# By default meson searches for cudart, which we don't want to use. We want to use the
# CUDA driver library.
modules : ['cuda'],
version : '>= 6.5',
)
endif
if host_machine.system() == 'windows'
# For MSVC and MinGW we aren't using pthreads, and dependency('threads') will add linkage
# to pthread for MinGW, so leave the dependency null_dep for Windows. For Windows linking to


@ -87,7 +87,7 @@ option(
choices : [
'all', 'auto',
'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris',
'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
'lima', 'llvmpipe', 'nocl', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
],
description : 'List of gallium drivers to build. If this is set to auto ' +


@ -72,9 +72,15 @@ struct clc_optional_features {
* progress
*/
bool subgroups_ifp;
bool subgroups_ballot;
bool subgroups_clustered;
bool subgroups_extended_types;
bool subgroups_named_barrier;
bool subgroups_non_uniform_arithmetic;
bool subgroups_non_uniform_vote;
bool subgroups_rotate;
bool subgroups_shuffle;
bool subgroups_shuffle_relative;
bool subgroups_ballot;
};
struct clc_compile_args {


@ -1024,15 +1024,33 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx,
}
if (args->features.subgroups) {
c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_subgroups");
if (args->features.subgroups_ballot) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1");
}
if (args->features.subgroups_clustered) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_clustered_reduce=1");
}
if (args->features.subgroups_extended_types) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_extended_types=1");
}
if (args->features.subgroups_named_barrier) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_named_barrier=1");
}
if (args->features.subgroups_non_uniform_arithmetic) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_arithmetic=1");
}
if (args->features.subgroups_non_uniform_vote) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_vote=1");
}
if (args->features.subgroups_rotate) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_rotate=1");
}
if (args->features.subgroups_shuffle) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle=1");
}
if (args->features.subgroups_shuffle_relative) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle_relative=1");
}
if (args->features.subgroups_ballot) {
c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1");
}
}
if (args->features.subgroups_ifp) {
assert(args->features.subgroups);


@ -29,6 +29,7 @@
#include <assert.h>
#include <stdio.h>
#include "compiler/builtin_types.h"
#include "shader_enums.h"
#include "c11/threads.h"
#include "util/blob.h"


@ -73,6 +73,6 @@ if with_gallium
endif
subdir('isaspec')
if with_nouveau_vk
if with_nouveau_vk or with_gallium_nocl
subdir('rust')
endif


@ -5745,6 +5745,8 @@ typedef struct nir_lower_compute_system_values_options {
bool lower_local_invocation_index : 1;
bool lower_cs_local_id_to_index : 1;
bool lower_workgroup_id_to_index : 1;
bool lower_num_subgroups : 1;
bool lower_subgroup_id : 1;
bool global_id_is_32bit : 1;
/* At shader execution time, check if WorkGroupId should be 1D
* and compute it quickly. Fall back to slow computation if not.


@ -794,6 +794,30 @@ lower_compute_system_value_instr(nir_builder *b,
case nir_intrinsic_load_shader_index:
return nir_imm_int(b, b->shader->info.cs.shader_index);
case nir_intrinsic_load_num_subgroups: {
if (!options || !options->lower_num_subgroups)
return NULL;
nir_def *group_size = nir_load_workgroup_size(b);
nir_def *threads = nir_imul(b, nir_channel(b, group_size, 0),
nir_channel(b, group_size, 1));
threads = nir_imul(b, threads, nir_channel(b, group_size, 2));
/* DIV_ROUND_UP(A, B) = ((A + B - 1) / B) */
nir_def *subgroup_size = nir_load_subgroup_size(b);
nir_def *subgroup_size_m1 = nir_iadd_imm(b, subgroup_size, -1);
nir_def *numerator = nir_iadd(b, threads, subgroup_size_m1);
return nir_udiv(b, numerator, subgroup_size);
}
case nir_intrinsic_load_subgroup_id: {
if (!options || !options->lower_subgroup_id)
return NULL;
nir_def *tid = nir_load_local_invocation_index(b);
return nir_udiv(b, tid, nir_load_subgroup_size(b));
}
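/* Worked example (comment only, assuming a subgroup size of 32): a 48x1x1
 * workgroup gets num_subgroups = (48 + 31) / 32 = 2, and the invocation with
 * local index 40 gets subgroup_id = 40 / 32 = 1.
 */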
default:
return NULL;
}


@ -44,6 +44,9 @@
static int (*backends[])(struct pipe_loader_device **, int) = {
#ifdef HAVE_LIBDRM
&pipe_loader_drm_probe,
#endif
#ifdef HAVE_NOCL
&pipe_loader_cuda_probe,
#endif
&pipe_loader_sw_probe
};


@ -215,6 +215,11 @@ pipe_loader_sw_probe_wrapped(struct pipe_loader_device **dev,
int
pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev);
#ifdef HAVE_NOCL
int
pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev);
#endif
/**
* Get a list of known DRM accel devices.
*


@ -0,0 +1,12 @@
language = "C"
includes = ["nir.h"]
autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */"
include_guard = "NIR_TO_PTX_H"
usize_is_size_t = true
style = "tag"
[export]
include = ["NirToPtxOutput"]
prefix = ""
renaming_overrides_prefixing = true

File diff suppressed because it is too large.


@ -0,0 +1,44 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT
prog_cbindgen = find_program(
'cbindgen',
required : true,
native : true,
version : '>= 0.25'
)
libnir_to_ptx_h = custom_target(
'libnir_to_ptx_h',
input : [files('cbindgen.toml'), 'lib.rs'],
output : ['nir_to_ptx.h'],
command : [
prog_cbindgen, '-q', '--config', '@INPUT0@', '--lang', 'c',
'--output', '@OUTPUT0@', '--depfile', '@DEPFILE@',
'--', '@INPUT1@',
],
depfile : 'nir_to_ptx.h.d',
)
files_libnir_to_ptx = files(
'lib.rs',
)
libnir_to_ptx = static_library(
'nir_to_ptx',
[files_libnir_to_ptx],
gnu_symbol_visibility : 'hidden',
rust_abi : 'c',
dependencies : [
idep_compiler_rs,
],
)
idep_nir_to_ptx = declare_dependency(
sources : [
libnir_to_ptx_h
],
link_with : [
libnir_to_ptx,
]
)


@ -0,0 +1,54 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT
subdir('compiler')
files_libnocl = files(
'nocl_nir.c',
'nocl_pipe.c',
'nocl_private.h',
)
nocl_nir_algebraic_c = custom_target(
'nocl_nir_algebraic.c',
input : 'nocl_nir_algebraic.py',
output : 'nocl_nir_algebraic.c',
command : [
prog_python, '@INPUT@',
'-p', dir_compiler_nir,
'--out', '@OUTPUT@',
],
depend_files : nir_algebraic_depends,
)
libnocl = static_library(
'nocl',
[
files_libnocl,
nocl_nir_algebraic_c,
],
include_directories : [
inc_gallium,
inc_gallium_aux,
inc_gallium_winsys,
inc_util,
],
gnu_symbol_visibility : 'hidden',
dependencies : [
dep_libcuda,
idep_nir,
idep_nir_to_ptx,
],
)
driver_nocl = declare_dependency(
compile_args : '-DGALLIUM_NOCL',
dependencies : [
dep_libcuda,
idep_libnoclwinsys,
],
link_with : [
libnocl,
libnir_to_ptx,
],
)


@ -0,0 +1,824 @@
/*
* Copyright © 2022 Collabora, Ltd.
* Copyright © 2025 Karol Herbst
*
* SPDX-License-Identifier: MIT
*/
#include "nocl_private.h"
#include "nir.h"
#include "nir_builder.h"
static void
push_block(nir_builder *b, nir_block *block)
{
assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl)));
block->cf_node.parent = &b->impl->cf_node;
exec_list_push_tail(&b->impl->body, &block->cf_node.node);
b->cursor = nir_after_block(block);
}
enum scope_type {
SCOPE_TYPE_SHADER,
SCOPE_TYPE_IF_MERGE,
SCOPE_TYPE_LOOP_BREAK,
SCOPE_TYPE_LOOP_CONT,
};
struct scope {
enum scope_type type;
struct scope *parent;
uint32_t depth;
nir_block *merge;
nir_def *bar;
uint32_t escapes;
};
static struct scope
push_scope(nir_builder *b,
enum scope_type scope_type,
struct scope *parent,
nir_block *merge_block)
{
struct scope scope = {
.type = scope_type,
.parent = parent,
.depth = parent->depth + 1,
.merge = merge_block,
};
return scope;
}
static void
pop_scope(nir_builder *b, nir_def *esc_reg, struct scope scope)
{
if (scope.bar == NULL)
return;
if (scope.escapes > 0) {
/* Find the nearest scope with a sync. */
nir_block *parent_merge = b->impl->end_block;
for (struct scope *p = scope.parent; p != NULL; p = p->parent) {
if (p->bar != NULL) {
parent_merge = p->merge;
break;
}
}
/* No escape is ~0, halt is 0, and we choose outer scope indices such
* that outer scopes always have lower indices than inner scopes.
*/
nir_def *esc = nir_ult_imm(b, nir_load_reg(b, esc_reg), scope.depth);
/* We have to put the escape in its own block to avoid critical edges.
* If we just did goto_if, we would end up with multiple successors,
* including a jump to the parent's merge block which has multiple
* predecessors.
*/
nir_block *esc_block = nir_block_create(b->shader);
nir_block *next_block = nir_block_create(b->shader);
nir_goto_if(b, esc_block, esc, next_block);
push_block(b, esc_block);
nir_goto(b, parent_merge);
push_block(b, next_block);
}
}
static enum scope_type
jump_target_scope_type(nir_jump_type jump_type)
{
switch (jump_type) {
case nir_jump_break: return SCOPE_TYPE_LOOP_BREAK;
case nir_jump_continue: return SCOPE_TYPE_LOOP_CONT;
default:
UNREACHABLE("Unknown jump type");
}
}
static void
break_scopes(nir_builder *b, nir_def *esc_reg,
struct scope *current_scope,
nir_jump_type jump_type)
{
nir_block *first_sync = NULL;
uint32_t target_depth = UINT32_MAX;
enum scope_type target_scope_type = jump_target_scope_type(jump_type);
for (struct scope *scope = current_scope; scope; scope = scope->parent) {
if (first_sync == NULL && scope->bar != NULL)
first_sync = scope->merge;
if (scope->type == target_scope_type) {
if (first_sync == NULL) {
first_sync = scope->merge;
} else {
/* In order for our cascade to work, we need to have the invariant
* that anything which escapes any scope with a warp sync needs to
* target a scope with a warp sync.
*/
assert(scope->bar != NULL);
}
target_depth = scope->depth;
break;
} else {
scope->escapes++;
}
}
assert(target_depth < UINT32_MAX);
nir_store_reg(b, nir_imm_int(b, target_depth), esc_reg);
nir_goto(b, first_sync);
}
static void
normal_exit(nir_builder *b, nir_def *esc_reg, nir_block *merge_block)
{
assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl)));
nir_block *block = nir_cursor_current_block(b->cursor);
if (!nir_block_ends_in_jump(block)) {
nir_store_reg(b, nir_imm_int(b, ~0), esc_reg);
nir_goto(b, merge_block);
}
}
/* This is a heuristic for what instructions are allowed before we sync.
* Annoyingly, we've gotten rid of phis so it's not as simple as "is it a
* phi?".
*/
static bool
instr_is_allowed_before_sync(nir_instr *instr)
{
switch (instr->type) {
case nir_instr_type_alu: {
nir_alu_instr *alu = nir_instr_as_alu(instr);
/* We could probably allow more ALU as long as it doesn't contain
* derivatives but let's be conservative and only allow mov for now.
*/
return alu->op == nir_op_mov;
}
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
return intrin->intrinsic == nir_intrinsic_load_reg ||
intrin->intrinsic == nir_intrinsic_store_reg;
}
default:
return false;
}
}
/** Returns true if our successor will sync for us
*
* This is a bit of a heuristic
*/
static bool
parent_scope_will_sync(nir_cf_node *node, struct scope *parent_scope)
{
/* First search forward to see if there's anything non-trivial after this
* node within the parent scope.
*/
nir_block *block = nir_cf_node_as_block(nir_cf_node_next(node));
nir_foreach_instr(instr, block) {
if (!instr_is_allowed_before_sync(instr))
return false;
}
/* There's another loop or if following and we didn't find a sync */
if (nir_cf_node_next(&block->cf_node))
return false;
/* See if the parent scope will sync for us. */
if (parent_scope->bar != NULL)
return true;
switch (parent_scope->type) {
case SCOPE_TYPE_SHADER:
return true;
case SCOPE_TYPE_IF_MERGE:
return parent_scope_will_sync(block->cf_node.parent,
parent_scope->parent);
case SCOPE_TYPE_LOOP_CONT:
/* In this case, the loop doesn't have a sync of its own so we're
* expected to be uniform before we hit the continue.
*/
return false;
case SCOPE_TYPE_LOOP_BREAK:
UNREACHABLE("Loops must have a continue scope");
default:
UNREACHABLE("Unknown scope type");
}
}
static bool
block_is_merge(const nir_block *block)
{
/* If it's unreachable, there is no merge */
if (block->imm_dom == NULL)
return false;
unsigned num_preds = 0;
set_foreach(&block->predecessors, entry) {
const nir_block *pred = entry->key;
/* We don't care about unreachable blocks */
if (pred->imm_dom == NULL)
continue;
num_preds++;
}
return num_preds > 1;
}
static void
lower_cf_list(nir_builder *b, nir_def *esc_reg, struct scope *parent_scope,
struct exec_list *cf_list)
{
foreach_list_typed_safe(nir_cf_node, node, node, cf_list) {
switch (node->type) {
case nir_cf_node_block: {
nir_block *block = nir_cf_node_as_block(node);
if (exec_list_is_empty(&block->instr_list))
break;
nir_cursor start = nir_before_block(block);
nir_cursor end = nir_after_block(block);
nir_jump_instr *jump = NULL;
nir_instr *last_instr = nir_block_last_instr(block);
if (last_instr->type == nir_instr_type_jump) {
jump = nir_instr_as_jump(last_instr);
end = nir_before_instr(&jump->instr);
}
nir_cf_list instrs;
nir_cf_extract(&instrs, start, end);
b->cursor = nir_cf_reinsert(&instrs, b->cursor);
if (jump != NULL) {
if (jump->type == nir_jump_halt) {
/* Halt instructions map to OpExit on NVIDIA hardware and
* exited lanes never block a bsync.
*/
nir_instr_remove(&jump->instr);
nir_builder_instr_insert(b, &jump->instr);
} else {
/* Everything else needs a break cascade */
break_scopes(b, esc_reg, parent_scope, jump->type);
}
}
break;
}
case nir_cf_node_if: {
nir_if *nif = nir_cf_node_as_if(node);
nir_def *cond = nif->condition.ssa;
nir_instr_clear_src(NULL, &nif->condition);
nir_block *then_block = nir_block_create(b->shader);
nir_block *else_block = nir_block_create(b->shader);
nir_block *merge_block = nir_block_create(b->shader);
struct scope scope = push_scope(b, SCOPE_TYPE_IF_MERGE,
parent_scope, merge_block);
nir_goto_if(b, then_block, cond, else_block);
push_block(b, then_block);
lower_cf_list(b, esc_reg, &scope, &nif->then_list);
normal_exit(b, esc_reg, merge_block);
push_block(b, else_block);
lower_cf_list(b, esc_reg, &scope, &nif->else_list);
normal_exit(b, esc_reg, merge_block);
push_block(b, merge_block);
pop_scope(b, esc_reg, scope);
break;
}
case nir_cf_node_loop: {
nir_loop *loop = nir_cf_node_as_loop(node);
nir_block *head_block = nir_block_create(b->shader);
nir_block *break_block = nir_block_create(b->shader);
nir_block *cont_block = nir_block_create(b->shader);
/* TODO: We can potentially avoid the break sync for loops when the
* parent scope syncs for us. However, we still need to handle the
* continue clause cascading to the break. If there is a
* nir_jump_halt involved, then we have a real cascade where it needs
* to then jump to the next scope. Getting all these cases right
* while avoiding an extra sync for the loop break is tricky at best.
*/
struct scope break_scope = push_scope(b, SCOPE_TYPE_LOOP_BREAK,
parent_scope,
break_block);
nir_goto(b, head_block);
push_block(b, head_block);
struct scope cont_scope = push_scope(b, SCOPE_TYPE_LOOP_CONT,
&break_scope,
cont_block);
lower_cf_list(b, esc_reg, &cont_scope, &loop->body);
normal_exit(b, esc_reg, cont_block);
push_block(b, cont_block);
pop_scope(b, esc_reg, cont_scope);
lower_cf_list(b, esc_reg, &break_scope, &loop->continue_list);
nir_goto(b, head_block);
push_block(b, break_block);
pop_scope(b, esc_reg, break_scope);
break;
}
default:
UNREACHABLE("Unknown CF node type");
}
}
}
static bool
lower_cf_func(nir_function *func)
{
if (func->impl == NULL)
return false;
if (exec_list_is_singular(&func->impl->body)) {
return nir_no_progress(func->impl);
}
nir_function_impl *old_impl = func->impl;
/* We use this in block_is_merge() */
nir_metadata_require(old_impl, nir_metadata_dominance | nir_metadata_divergence);
/* First, we temporarily get rid of SSA. This will make all our block
* motion way easier. Ask the pass to place reg writes directly in the
* immediate predecessors of the phis instead of trying to be clever.
* This will ensure that we never get a write to a uniform register from
* non-uniform control flow and makes our divergence reconstruction for
* phis more reliable.
*/
nir_foreach_block(block, old_impl)
nir_lower_phis_to_regs_block(block, true);
/* We create a whole new nir_function_impl and copy the contents over */
func->impl = NULL;
nir_function_impl *new_impl = nir_function_impl_create(func);
new_impl->structured = false;
/* We copy defs from the old impl */
new_impl->ssa_alloc = old_impl->ssa_alloc;
nir_builder b = nir_builder_at(nir_before_impl(new_impl));
nir_def *esc_reg = nir_decl_reg(&b, 1, 32, 0);
/* Having a function scope makes everything easier */
struct scope scope = {
.type = SCOPE_TYPE_SHADER,
.merge = new_impl->end_block,
};
lower_cf_list(&b, esc_reg, &scope, &old_impl->body);
normal_exit(&b, esc_reg, new_impl->end_block);
/* Now sort by reverse PDFS and restore SSA
*
* Note: Since we created a new nir_function_impl, there is no metadata,
* dirty or otherwise, so we have no need to call nir_progress().
*/
nir_sort_unstructured_blocks(new_impl);
return true;
}
/* Copied from nak_nir_lower_cf.c but without divergence handling or barriers */
bool
nocl_nir_lower_cf(struct nir_shader *nir)
{
bool progress = false;
nir_foreach_function(func, nir) {
if (lower_cf_func(func))
progress = true;
}
return progress;
}
nir_shader_compiler_options *
nocl_get_nir_options(struct nocl_screen *nocl)
{
nir_shader_compiler_options *nir_options = ralloc(nocl, nir_shader_compiler_options);
*nir_options = (nir_shader_compiler_options) {
// TODO:
// .has_bit_test = nocl->ptx >= 60,
// .has_bitfield_select = nocl->ptx >= 43,
// .has_f2i32_rtne = true,
// .has_uclz = true,
// .has_dot_2x16 = true,
// .has_fneo_fcmpu = true,
// .has_fused_comp_and_csel = true,
// .has_pack_32_4x8 = true,
// .has_shfr32 = true,
.fuse_ffma32 = true,
.has_fsub = true,
.has_imul24 = true,
.has_imad32 = true,
.has_isub = true,
.has_rotate32 = nocl->sm >= 32,
.has_sdot_4x8 = nocl->sm >= 61,
.has_sudot_4x8 = nocl->sm >= 61,
.has_udot_4x8 = nocl->sm >= 61,
.has_umul24 = true,
.has_umad24 = true,
.lower_hadd = true,
.lower_find_lsb = true,
.lower_flrp16 = true,
.lower_flrp32 = true,
.lower_flrp64 = true,
.lower_fisnormal = true,
.lower_fsat = true,
.lower_fsign = true,
.lower_int64_options =
/* Technically PTX supports it, but it fails to compile with vector sources...
* The hardware doesn't have an idiv anyway. */
nir_lower_divmod64 |
nir_lower_iadd_sat64 |
nir_lower_imul64 |
nir_lower_scan_reduce_bitwise64 |
nir_lower_scan_reduce_iadd64 |
nir_lower_subgroup_shuffle64 |
(nocl->sm < 70 ? nir_lower_vote_ieq64 : 0),
.lower_insert_byte = true,
.lower_insert_word = true,
.lower_isign = true,
.lower_ldexp = true,
.lower_uadd_sat = true,
.lower_usub_sat = true,
.max_unroll_iterations = 32,
.support_16bit_alu = true,
};
return nir_options;
}
static uint8_t
alu_width_cb(const nir_instr *instr, const void *)
{
return 1;
}
static unsigned
lower_bit_size_callback(const nir_instr *instr, void *data)
{
const struct nocl_screen *nocl = data;
switch (instr->type) {
case nir_instr_type_alu: {
nir_alu_instr *alu = nir_instr_as_alu(instr);
unsigned dest_bit_size = alu->def.bit_size;
unsigned src0_bit_size = alu->src[0].src.ssa->bit_size;
switch (alu->op) {
case nir_op_extract_i8:
case nir_op_extract_i16:
case nir_op_extract_u8:
case nir_op_extract_u16:
case nir_op_bitfield_reverse:
if (dest_bit_size < 32)
return 32;
return 0;
case nir_op_bcsel:
if (dest_bit_size < 16)
return 16;
return 0;
case nir_op_bit_count:
case nir_op_bitfield_insert:
case nir_op_fdiv:
case nir_op_frsq:
case nir_op_fsqrt:
case nir_op_iadd_sat:
case nir_op_ibitfield_extract:
case nir_op_isub_sat:
case nir_op_ubitfield_extract:
case nir_op_ufind_msb:
if (src0_bit_size < 32)
return 32;
return 0;
case nir_op_iabs:
case nir_op_iadd:
case nir_op_iand:
case nir_op_idiv:
case nir_op_ieq:
case nir_op_ige:
case nir_op_ilt:
case nir_op_imad:
case nir_op_imax:
case nir_op_imin:
case nir_op_imul:
case nir_op_imul_high:
case nir_op_ine:
case nir_op_ineg:
case nir_op_inot:
case nir_op_ior:
case nir_op_irem:
case nir_op_ishl:
case nir_op_ishr:
case nir_op_isub:
case nir_op_ixor:
case nir_op_udiv:
case nir_op_uge:
case nir_op_ult:
case nir_op_umax:
case nir_op_umin:
case nir_op_umod:
case nir_op_umul_high:
case nir_op_ushr:
if (src0_bit_size == 8)
return 16;
return 0;
case nir_op_fmax:
case nir_op_fmin:
if (src0_bit_size == 16 && nocl->sm < 80)
return 32;
return 0;
case nir_op_feq:
case nir_op_fequ:
case nir_op_fge:
case nir_op_fgeu:
case nir_op_flt:
case nir_op_fltu:
case nir_op_fneo:
case nir_op_fneu:
if (src0_bit_size == 16 && nocl->sm < 53)
return 32;
return 0;
default:
return 0;
}
}
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_read_invocation:
case nir_intrinsic_reduce:
case nir_intrinsic_shuffle:
case nir_intrinsic_shuffle_down:
case nir_intrinsic_shuffle_up:
case nir_intrinsic_shuffle_xor:
if (intrin->def.bit_size < 32)
return 32;
return 0;
default:
return 0;
}
}
default:
return 0;
}
}
static bool
lower_subgroups_filter(const nir_intrinsic_instr *intrin, const void *data)
{
const struct nocl_screen *nocl = data;
if (intrin->def.num_components > 1)
return true;
switch (intrin->intrinsic) {
case nir_intrinsic_ballot_bit_count_reduce:
case nir_intrinsic_ballot_bit_count_exclusive:
case nir_intrinsic_ballot_bit_count_inclusive:
case nir_intrinsic_ballot_bitfield_extract:
case nir_intrinsic_ballot_find_lsb:
case nir_intrinsic_ballot_find_msb:
case nir_intrinsic_exclusive_scan:
case nir_intrinsic_first_invocation:
case nir_intrinsic_inclusive_scan:
case nir_intrinsic_inverse_ballot:
case nir_intrinsic_load_subgroup_size:
case nir_intrinsic_quad_vote_all:
case nir_intrinsic_quad_vote_any:
case nir_intrinsic_read_first_invocation:
case nir_intrinsic_rotate:
case nir_intrinsic_vote_feq:
/* TODO: exists on SM90+ */
case nir_intrinsic_elect:
return true;
case nir_intrinsic_vote_ieq:
return nocl->sm < 70;
case nir_intrinsic_reduce: {
if (nocl->sm < 80)
return true;
unsigned cluster_size = nir_intrinsic_cluster_size(intrin);
if (cluster_size != 0 && cluster_size != 32)
return true;
switch (nir_intrinsic_reduction_op(intrin)) {
case nir_op_fadd:
case nir_op_fmax:
case nir_op_fmin:
case nir_op_fmul:
case nir_op_imul:
return true;
case nir_op_imax:
case nir_op_imin:
case nir_op_umax:
case nir_op_umin:
return intrin->def.bit_size == 64;
default:
/* nir_lower_int64 will handle some of it for us */
return false;
}
}
case nir_intrinsic_read_invocation:
case nir_intrinsic_shuffle:
case nir_intrinsic_shuffle_down:
case nir_intrinsic_shuffle_up:
case nir_intrinsic_shuffle_xor:
return intrin->def.bit_size > 32;
default:
return false;
}
}
static nir_mem_access_size_align
lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin,
uint8_t bytes,
uint8_t bit_size,
uint32_t align,
uint32_t align_offset,
bool offset_is_const,
enum gl_access_qualifier,
const void *cb_data)
{
align = nir_combined_align(align, align_offset);
bytes = MIN2(MIN2(bytes, align), 16);
bit_size = MIN2(bit_size, bytes * 8);
unsigned num_components = MIN2(bytes / (bit_size / 8), 4);
if (num_components == 3)
num_components = 2;
return (nir_mem_access_size_align) {
.bit_size = bit_size,
.num_components = num_components,
.align = align,
.shift = nir_mem_access_shift_method_scalar,
};
}
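/* Worked example for lower_mem_access_bit_sizes_cb above (comment only): a
 * 12-byte, 32-bit access with align = 4 and align_offset = 0 is clamped to
 * bytes = 4, bit_size = 32 and num_components = 1, i.e. it is split into
 * 32-bit scalar accesses, while a 16-byte access aligned to 16 stays a single
 * 4x32-bit (128-bit) access.
 */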
static bool
lower_alu_vec_srcs(nir_builder *b, nir_alu_instr *alu, void *_data)
{
const nir_op_info *info = &nir_op_infos[alu->op];
bool changed = false;
b->cursor = nir_before_instr(&alu->instr);
for (int i = 0; i < info->num_inputs; i++) {
if (info->input_sizes[i])
continue;
/* We lower everything that is bigger than vec4 _and_ 128 bits */
nir_def *src_ssa = alu->src[i].src.ssa;
if (src_ssa->bit_size * src_ssa->num_components <= 128 && src_ssa->num_components <= 4)
continue;
changed = true;
nir_def *comps[NIR_MAX_VEC_COMPONENTS];
for (int c = 0; c < alu->def.num_components; c++) {
unsigned swizzle = alu->src[i].swizzle[c];
alu->src[i].swizzle[c] = c;
nir_const_value *const_val = nir_src_as_const_value(alu->src[i].src);
if (const_val) {
comps[c] = nir_build_imm(b, 1, alu->src[i].src.ssa->bit_size, &const_val[swizzle]);
} else {
comps[c] = nir_swizzle(b, alu->src[i].src.ssa, &swizzle, 1);
}
}
nir_def *src = nir_vec(b, comps, alu->def.num_components);
nir_src_rewrite(&alu->src[i].src, src);
}
return changed;
}
/* copied from nir_lower_alu_vec8_16_srcs */
static bool
nir_lower_alu_vec_srcs(nir_shader *shader)
{
return nir_shader_alu_pass(shader, lower_alu_vec_srcs,
nir_metadata_control_flow,
NULL);
}
void
nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir)
{
struct nocl_screen *nocl = nocl_screen(pscreen);
nir->info.max_subgroup_size = nocl->base.caps.shader_subgroup_size;
nir->info.min_subgroup_size = nocl->base.caps.shader_subgroup_size;
nir->info.api_subgroup_size = nocl->base.caps.shader_subgroup_size;
struct nir_lower_compute_system_values_options sysval_options = {
.lower_local_invocation_index = true,
.lower_num_subgroups = true,
.lower_subgroup_id = true,
};
struct nir_lower_mem_access_bit_sizes_options mem_access_bit_sizes_options = {
.callback = lower_mem_access_bit_sizes_cb,
.cb_data = nocl,
.modes = nir_var_all,
};
struct nir_lower_subgroups_options subgroup_options = {
.ballot_bit_size = 32,
.ballot_components = 1,
.subgroup_size = 32,
.lower_elect = true,
.lower_first_invocation_to_ballot = true,
.lower_inverse_ballot = true,
.lower_quad_vote = true,
.lower_read_first_invocation = true,
.lower_reduce = true,
.lower_rotate_to_shuffle = true,
.lower_shuffle_to_32bit = true,
.lower_to_scalar = true,
.lower_vote_feq = true,
.lower_vote_ieq = nocl->sm < 70,
.filter = lower_subgroups_filter,
.filter_data = nocl,
};
NIR_PASS(_, nir, nir_lower_compute_system_values, &sysval_options);
/* TODO: keeping 8 bit vectors would help a bit... */
NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
NIR_PASS(_, nir, nir_lower_subgroups, &subgroup_options);
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
NIR_PASS(progress, nir, nir_lower_mem_access_bit_sizes, &mem_access_bit_sizes_options);
NIR_PASS(progress, nir, nir_lower_alu_width, alu_width_cb, nocl);
NIR_PASS(progress, nir, nir_lower_alu_vec_srcs);
NIR_PASS(progress, nir, nir_lower_flrp, 16 | 32 | 64, false);
NIR_PASS(progress, nir, nir_opt_algebraic);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_cse);
} while (progress);
do {
progress = false;
NIR_PASS(progress, nir, nir_lower_bit_size, lower_bit_size_callback, nocl);
NIR_PASS(progress, nir, nir_opt_algebraic_late);
NIR_PASS(progress, nir, nocl_nir_opt_algebraic_late, nocl);
if (progress) {
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, nir_copy_prop);
NIR_PASS(_, nir, nir_opt_dce);
NIR_PASS(_, nir, nir_opt_cse);
}
} while (progress);
/* Run only once */
NIR_PASS(progress, nir, nocl_nir_lower_algebraic_late, nocl);
if (progress) {
NIR_PASS(_, nir, nir_opt_constant_folding);
}
/* After nir_opt_constant_folding */
NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
NIR_PASS(_, nir, nir_opt_dce);
}


@ -0,0 +1,65 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT
import argparse
import sys
a = 'a'
b = 'b'
c = 'c'
s = 's'
late_algebraic = [
(('ior@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)),
(('iadd@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)),
(('ixor@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)),
(('ixor@32', ('iand', a, ('ixor', b, c)), c), ('bitfield_select', a, b, c)),
(('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c)),
(('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c)),
(('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b)),
(('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c)),
(('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c)),
(('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c)),
]
late_algebraic_lowering = [
(('b2i8', a), ('u2u8', ('b2i16', a))),
]
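# Note (comment only): NIR defines shift counts as masked to the bit size
# (src1 & (bit_size - 1)), while PTX shifts clamp out-of-range shift amounts,
# so the masking is made explicit below before the shifts reach PTX.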
for bit_size in [16, 32, 64]:
late_algebraic_lowering += [
((f'ishl@{bit_size}', a, b), ('ishl', a, ('iand', b, bit_size - 1))),
((f'ishr@{bit_size}', a, b), ('ishr', a, ('iand', b, bit_size - 1))),
((f'ushr@{bit_size}', a, b), ('ushr', a, ('iand', b, bit_size - 1))),
]
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--out', required=True, help='Output file.')
parser.add_argument('-p', '--import-path', required=True)
args = parser.parse_args()
sys.path.insert(0, args.import_path)
import nir_algebraic # pylint: disable=import-error
try:
with open(args.out, 'w', encoding='utf-8') as f:
f.write('#include "nocl_private.h"')
f.write(nir_algebraic.AlgebraicPass(
"nocl_nir_opt_algebraic_late",
late_algebraic,
[
("const struct nocl_screen *", "nocl"),
]).render())
f.write(nir_algebraic.AlgebraicPass(
"nocl_nir_lower_algebraic_late",
late_algebraic_lowering,
[
("const struct nocl_screen *", "nocl"),
]).render())
except Exception:
sys.exit(1)
if __name__ == '__main__':
main()

File diff suppressed because it is too large.


@ -0,0 +1,147 @@
/*
* Copyright © 2025 Karol Herbst
* SPDX-License-Identifier: MIT
*/
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "nocl/nocl_cuda_public.h"
static const uint32_t NOCL_COMPILER_LOG_SIZE = 0x1000;
enum nocl_dbg {
NOCL_DBG_32 = BITFIELD_BIT(0),
NOCL_DBG_JIT_NO_OPTS = BITFIELD_BIT(1),
NOCL_DBG_NIR = BITFIELD_BIT(2),
NOCL_DBG_PTX = BITFIELD_BIT(3),
NOCL_DBG_VERBOSE = BITFIELD_BIT(4),
};
#define NOCL_CALL(func) nocl_error(nocl_dispatch.func)
static inline bool
nocl_error(CUresult err)
{
if (err == CUDA_SUCCESS)
return false;
const char *err_str;
if (nocl_dispatch.cuGetErrorString(err, &err_str)) {
printf("NOCL-ERROR: UNKNOWN\n");
} else {
printf("NOCL-ERROR: %s\n", err_str);
}
return true;
}
struct nocl_screen {
struct pipe_screen base;
#if CUDA_VERSION >= 12090
CUlogsCallbackHandle verbose_handle;
#endif
CUdevice cu_dev;
CUcontext cu_ctx;
struct disk_cache *disk_cache;
uint32_t debug;
uint16_t ptx;
uint16_t sm;
char name[128];
};
struct nocl_buffer {
struct pipe_resource base;
union {
CUdeviceptr dptr;
void *ptr;
};
};
struct nocl_program {
CUmodule cu_mod;
CUfunction cu_func;
uint32_t shared_size;
};
struct nocl_context {
struct pipe_context base;
void *kernel_input;
uint32_t kernel_input_size;
uint32_t kernel_input_capacity;
CUstream cu_stream;
CUevent cu_timestamp_start;
uint64_t timestamp_start;
struct nocl_program *prog;
};
struct pipe_fence_handle {
struct pipe_reference ref;
CUevent cu_event;
};
struct nocl_transfer {
struct pipe_transfer base;
void *ptr;
};
struct pipe_query {
CUevent cu_event_base;
CUevent cu_event;
};
static inline struct nocl_screen*
nocl_screen(struct pipe_screen *pscreen)
{
return (struct nocl_screen *)pscreen;
}
static inline struct nocl_screen*
nocl_screen_and_make_current(struct pipe_screen *pscreen)
{
struct nocl_screen *nocl = nocl_screen(pscreen);
NOCL_CALL(cuCtxSetCurrent(nocl->cu_ctx));
return nocl;
}
static inline struct nocl_context*
nocl_context(struct pipe_context *pctx)
{
return (struct nocl_context *)pctx;
}
static inline struct nocl_context*
nocl_context_and_make_current(struct pipe_context *pctx)
{
nocl_screen_and_make_current(pctx->screen);
return nocl_context(pctx);
}
static inline struct nocl_program*
nocl_program(void *prog)
{
return (struct nocl_program *)prog;
}
static inline struct nocl_buffer*
nocl_buffer(struct pipe_resource *pres)
{
return (struct nocl_buffer *)pres;
}
static inline struct nocl_transfer*
nocl_transfer(struct pipe_transfer *pxfer)
{
return (struct nocl_transfer *)pxfer;
}
void nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir);
bool nocl_nir_lower_cf(struct nir_shader *nir);
bool nocl_nir_opt_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl);
bool nocl_nir_lower_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl);
struct nir_shader_compiler_options *nocl_get_nir_options(struct nocl_screen *nocl);


@ -740,17 +740,50 @@ impl DeviceBase {
}
if self.subgroups_supported() {
add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle);
add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative);
add_cap(SpvCapability::SpvCapabilityGroups);
add_cap(SpvCapability::SpvCapabilitySubgroupDispatch);
// requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS
//add_ext(1, 0, 0, "cl_khr_subgroups");
add_ext(1, 0, 0, "cl_khr_subgroup_extended_types");
add_feat(1, 0, 0, "__opencl_c_subgroups");
// we have lowering in `nir_lower_subgroups`, drivers can just use that
add_ext(1, 0, 0, "cl_khr_subgroup_shuffle");
add_ext(1, 0, 0, "cl_khr_subgroup_shuffle_relative");
if self.subgroup_ballot_supported() {
add_cap(SpvCapability::SpvCapabilityGroupNonUniformBallot);
add_ext(1, 0, 0, "cl_khr_subgroup_ballot");
}
if self.subgroup_clustered_supported() {
add_cap(SpvCapability::SpvCapabilityGroupNonUniformClustered);
add_ext(1, 0, 0, "cl_khr_subgroup_clustered_reduce");
}
if self.subgroup_non_uniform_arithmetic_supported() {
add_cap(SpvCapability::SpvCapabilityGroupNonUniformArithmetic);
add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_arithmetic");
}
if self.subgroup_non_uniform_vote_supported() {
add_cap(SpvCapability::SpvCapabilityGroupNonUniform);
add_cap(SpvCapability::SpvCapabilityGroupNonUniformVote);
add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_vote");
}
if self.subgroup_rotate_supported() {
add_cap(SpvCapability::SpvCapabilityGroupNonUniformRotateKHR);
add_ext(1, 0, 0, "cl_khr_subgroup_rotate");
add_spirv(c"SPV_KHR_subgroup_rotate");
}
if self.subgroup_shuffle_supported() {
add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle);
add_ext(1, 0, 0, "cl_khr_subgroup_shuffle");
}
if self.subgroup_shuffle_relative_supported() {
add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative);
add_ext(1, 0, 0, "cl_khr_subgroup_shuffle_relative");
}
if self.intel_subgroups_supported() {
// add_cap(SpvCapability::SpvCapabilitySubgroupBufferBlockIOINTEL);
// add_cap(SpvCapability::SpvCapabilitySubgroupImageBlockIOINTEL);
@ -1173,6 +1206,58 @@ impl DeviceBase {
// supported, doing it without shareable shaders isn't practical
self.max_subgroups() > 0
&& (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders()))
&& self.screen().caps().shader_subgroup_supported_features
& PIPE_SHADER_SUBGROUP_FEATURE_BASIC
!= 0
}
pub fn subgroup_ballot_supported(&self) -> bool {
self.subgroups_supported()
&& self.screen().caps().shader_subgroup_supported_features
& PIPE_SHADER_SUBGROUP_FEATURE_BALLOT
!= 0
}
pub fn subgroup_clustered_supported(&self) -> bool {
self.subgroups_supported()
&& self.screen().caps().shader_subgroup_supported_features
& PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED
!= 0
}
pub fn subgroup_non_uniform_arithmetic_supported(&self) -> bool {
self.subgroups_supported()
&& self.screen().caps().shader_subgroup_supported_features
& PIPE_SHADER_SUBGROUP_FEATURE_ARITHMETIC
!= 0
}
pub fn subgroup_non_uniform_vote_supported(&self) -> bool {
self.subgroups_supported()
&& self.screen().caps().shader_subgroup_supported_features
& PIPE_SHADER_SUBGROUP_FEATURE_VOTE
!= 0
}
pub fn subgroup_rotate_supported(&self) -> bool {
let mask =
PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED;
self.subgroups_supported()
&& self.screen().caps().shader_subgroup_supported_features & mask == mask
}
pub fn subgroup_shuffle_supported(&self) -> bool {
self.subgroups_supported()
&& self.screen().caps().shader_subgroup_supported_features
& PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE
!= 0
}
pub fn subgroup_shuffle_relative_supported(&self) -> bool {
self.subgroups_supported()
&& self.screen().caps().shader_subgroup_supported_features
& PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE_RELATIVE
!= 0
}
pub fn system_svm_supported(&self) -> bool {
@ -1255,8 +1340,14 @@ impl DeviceBase {
intel_subgroups: self.intel_subgroups_supported(),
kernel_clock: self.kernel_clock_supported(),
subgroups: subgroups_supported,
subgroups_shuffle: subgroups_supported,
subgroups_shuffle_relative: subgroups_supported,
subgroups_ballot: self.subgroup_ballot_supported(),
subgroups_clustered: self.subgroup_clustered_supported(),
subgroups_extended_types: subgroups_supported,
subgroups_non_uniform_arithmetic: self.subgroup_non_uniform_arithmetic_supported(),
subgroups_non_uniform_vote: self.subgroup_non_uniform_vote_supported(),
subgroups_rotate: self.subgroup_rotate_supported(),
subgroups_shuffle: self.subgroup_shuffle_supported(),
subgroups_shuffle_relative: self.subgroup_shuffle_relative_supported(),
..Default::default()
}
}


@ -99,6 +99,7 @@ fn get_enabled_devs() -> HashMap<String, u32> {
let driver_str = match driver_str[0] {
"llvmpipe" | "lp" => "swrast",
"cuda" => "nocl",
"freedreno" => "msm",
a => a,
};


@ -698,6 +698,14 @@ enum pipe_conservative_raster_mode
#define PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED (1 << 6)
#define PIPE_SHADER_SUBGROUP_FEATURE_QUAD (1 << 7)
#define PIPE_SHADER_SUBGROUP_NUM_FEATURES 8
/* VK_SUBGROUP_FEATURE_ROTATE_BIT */
#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE (1 << 9)
/* VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT */
#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED (1 << 10)
#define PIPE_SHADER_SUBGROUP_FEATURE_MASK \
(BITFIELD_MASK(PIPE_SHADER_SUBGROUP_NUM_FEATURES) | \
PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | \
PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED)
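/* Illustrative only (not part of this change): a driver advertising warp-wide
 * rotates would set the new bits alongside the existing ones, e.g.
 *
 *    screen->caps.shader_subgroup_supported_features =
 *       PIPE_SHADER_SUBGROUP_FEATURE_BASIC |
 *       PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE |
 *       PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE_RELATIVE |
 *       PIPE_SHADER_SUBGROUP_FEATURE_ROTATE |
 *       PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED;
 *
 * rusticl only exposes cl_khr_subgroup_rotate when both ROTATE bits are set.
 */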
enum pipe_point_size_lower_mode {
PIPE_POINT_SIZE_LOWER_ALWAYS,


@ -206,6 +206,12 @@ if with_gallium_d3d12
else
driver_d3d12 = declare_dependency()
endif
if with_gallium_nocl
subdir('winsys/nocl')
subdir('drivers/nocl')
else
driver_nocl = declare_dependency()
endif
if with_gallium_rusticl
subdir('frontends/rusticl')
subdir('targets/rusticl')


@ -34,6 +34,7 @@ librusticl = shared_library(
],
dependencies : [
driver_asahi,
driver_nocl,
driver_freedreno,
driver_iris,
driver_nouveau,


@ -0,0 +1,25 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT
libnocl_cuda = static_library(
'nocl_cuda',
[
'nocl_cuda.c',
'nocl_cuda_public.h',
],
include_directories : [
inc_util,
inc_gallium,
inc_gallium_aux,
],
dependencies : [
dep_libcuda,
],
)
idep_libnoclwinsys = declare_dependency(
include_directories : [
include_directories('.'),
],
link_with : [libnocl_cuda],
)


@ -0,0 +1,212 @@
/*
* Copyright © 2025 Karol Herbst
*
* SPDX-License-Identifier: MIT
*/
#include "pipe-loader/pipe_loader.h"
#include "pipe-loader/pipe_loader_priv.h"
#include "target-helpers/inline_debug_helper.h"
#include "util/u_dl.h"
#include "util/u_memory.h"
#include "cuda.h"
#include "nocl_cuda_public.h"
#define cuda_err(err) if (err) { \
const char *err_str; \
nocl_dispatch.cuGetErrorString(err, &err_str); \
printf("CUDA returned error: %s\n", err_str); \
return 0; \
}
static struct driOptionDescription nocl_dri_conf = {};
static struct util_dl_library *libcuda = NULL;
struct cuda_symbol_table nocl_dispatch = {};
static struct pipe_screen *
nocl_loader_create_screen(struct pipe_loader_device *dev,
const struct pipe_screen_config *config,
bool sw_vk)
{
struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)dev;
struct pipe_screen *pscreen = nocl_create_screen(cuda_dev);
return pscreen ? debug_screen_wrap(pscreen) : NULL;
}
static const struct driOptionDescription *
nocl_get_driconf(struct pipe_loader_device *dev, unsigned *count)
{
*count = 0;
return &nocl_dri_conf;
}
static void
nocl_release(struct pipe_loader_device **dev)
{
struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)*dev;
FREE(cuda_dev);
}
static struct pipe_loader_ops cuda_loader_ops = {
.create_screen = nocl_loader_create_screen,
.release = nocl_release,
.get_driconf = nocl_get_driconf,
};
static int32_t faked_version = 0;
static bool
nocl_load_pfn(const char *name, void **pfn, int version)
{
if (version > faked_version) {
*pfn = NULL;
return false;
}
CUdriverProcAddressQueryResult proc_result;
CUresult res = nocl_dispatch.cuGetProcAddress_v2(name, pfn, version, CU_GET_PROC_ADDRESS_LEGACY_STREAM, &proc_result);
bool success = res == CUDA_SUCCESS && proc_result == CU_GET_PROC_ADDRESS_SUCCESS;
/* Probably always NULL on errors but let's be sure */
if (!success)
*pfn = NULL;
return success;
}
int
pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev)
{
if (!libcuda) {
UNUSED void *func = NULL;
faked_version = debug_get_num_option("NOCL_API_TARGET", INT32_MAX);
libcuda = util_dl_open("libcuda.so");
if (!libcuda)
return 0;
/* Needs 12.0 */
nocl_dispatch.cuGetProcAddress_v2 = (void*)util_dl_get_proc_address(libcuda, "cuGetProcAddress_v2");
if (!nocl_dispatch.cuGetProcAddress_v2) {
return 0;
}
bool success = true;
success &= nocl_load_pfn("cuCtxCreate", (void**)&nocl_dispatch.cuCtxCreate_v2, 3020);
success &= nocl_load_pfn("cuCtxDestroy", (void**)&nocl_dispatch.cuCtxDestroy_v2, 4000);
success &= nocl_load_pfn("cuCtxSetCurrent", (void**)&nocl_dispatch.cuCtxSetCurrent, 4000);
success &= nocl_load_pfn("cuDeviceGet", (void**)&nocl_dispatch.cuDeviceGet, 2000);
success &= nocl_load_pfn("cuDeviceGetAttribute", (void**)&nocl_dispatch.cuDeviceGetAttribute, 2000);
success &= nocl_load_pfn("cuDeviceGetCount", (void**)&nocl_dispatch.cuDeviceGetCount, 2000);
success &= nocl_load_pfn("cuDeviceGetName", (void**)&nocl_dispatch.cuDeviceGetName, 2000);
success &= nocl_load_pfn("cuDeviceTotalMem", (void**)&nocl_dispatch.cuDeviceTotalMem_v2, 3020);
success &= nocl_load_pfn("cuDriverGetVersion", (void**)&nocl_dispatch.cuDriverGetVersion, 2020);
success &= nocl_load_pfn("cuEventCreate", (void**)&nocl_dispatch.cuEventCreate, 2000);
success &= nocl_load_pfn("cuEventDestroy", (void**)&nocl_dispatch.cuEventDestroy_v2, 4000);
success &= nocl_load_pfn("cuEventElapsedTime", (void**)&nocl_dispatch.cuEventElapsedTime_v1, 2000);
success &= nocl_load_pfn("cuEventRecord", (void**)&nocl_dispatch.cuEventRecord, 2000);
success &= nocl_load_pfn("cuEventSynchronize", (void**)&nocl_dispatch.cuEventSynchronize, 2000);
success &= nocl_load_pfn("cuFuncGetAttribute", (void**)&nocl_dispatch.cuFuncGetAttribute, 2020);
success &= nocl_load_pfn("cuGetErrorString", (void**)&nocl_dispatch.cuGetErrorString, 6000);
success &= nocl_load_pfn("cuInit", (void**)&nocl_dispatch.cuInit, 2000);
success &= nocl_load_pfn("cuLaunchKernel", (void**)&nocl_dispatch.cuLaunchKernel, 4000);
success &= nocl_load_pfn("cuLinkAddData", (void**)&nocl_dispatch.cuLinkAddData_v2, 6050);
success &= nocl_load_pfn("cuLinkComplete", (void**)&nocl_dispatch.cuLinkComplete, 5050);
success &= nocl_load_pfn("cuLinkCreate", (void**)&nocl_dispatch.cuLinkCreate_v2, 6050);
success &= nocl_load_pfn("cuLinkDestroy", (void**)&nocl_dispatch.cuLinkDestroy, 5050);
success &= nocl_load_pfn("cuMemAlloc", (void**)&nocl_dispatch.cuMemAlloc_v2, 3020);
success &= nocl_load_pfn("cuMemAllocHost", (void**)&nocl_dispatch.cuMemAllocHost_v2, 3020);
success &= nocl_load_pfn("cuMemcpy", (void**)&nocl_dispatch.cuMemcpy, 4000);
success &= nocl_load_pfn("cuMemcpyAsync", (void**)&nocl_dispatch.cuMemcpyAsync, 4000);
success &= nocl_load_pfn("cuMemcpyDtoHAsync", (void**)&nocl_dispatch.cuMemcpyDtoHAsync_v2, 3020);
success &= nocl_load_pfn("cuMemcpyHtoD", (void**)&nocl_dispatch.cuMemcpyHtoD_v2, 3020);
success &= nocl_load_pfn("cuMemFree", (void**)&nocl_dispatch.cuMemFree_v2, 3020);
success &= nocl_load_pfn("cuMemFreeHost", (void**)&nocl_dispatch.cuMemFreeHost, 2000);
success &= nocl_load_pfn("cuMemsetD8Async", (void**)&nocl_dispatch.cuMemsetD8Async, 3020);
success &= nocl_load_pfn("cuMemsetD16Async", (void**)&nocl_dispatch.cuMemsetD16Async, 3020);
success &= nocl_load_pfn("cuMemsetD32Async", (void**)&nocl_dispatch.cuMemsetD32Async, 3020);
success &= nocl_load_pfn("cuModuleGetFunction", (void**)&nocl_dispatch.cuModuleGetFunction, 2000);
success &= nocl_load_pfn("cuModuleLoadDataEx", (void**)&nocl_dispatch.cuModuleLoadDataEx, 2010);
success &= nocl_load_pfn("cuModuleUnload", (void**)&nocl_dispatch.cuModuleUnload, 2000);
success &= nocl_load_pfn("cuStreamDestroy", (void**)&nocl_dispatch.cuStreamDestroy_v2, 4000);
success &= nocl_load_pfn("cuStreamSynchronize", (void**)&nocl_dispatch.cuStreamSynchronize, 2000);
if (!success)
return 0;
/* Only documented difference appears to be that v2 won't return errors on previous async
* commands, so just overwrite the previously fetched pointer */
nocl_load_pfn("cuEventElapsedTime", &func, 12080);
if (func)
nocl_dispatch.cuEventElapsedTime_v1 = func;
nocl_dispatch.all_required_available = true;
#if CUDA_VERSION >= 11040
success &= nocl_load_pfn("cuDeviceGetUuid", (void**)&nocl_dispatch.cuDeviceGetUuid_v2, 11040);
#endif
#if CUDA_VERSION >= 12090
nocl_load_pfn("cuLogsRegisterCallback", (void**)&nocl_dispatch.cuLogsRegisterCallback, 12090);
nocl_load_pfn("cuLogsUnregisterCallback", (void**)&nocl_dispatch.cuLogsUnregisterCallback, 12090);
#endif
}
if (!nocl_dispatch.all_required_available)
return 0;
int driver_version;
CUresult res = nocl_dispatch.cuInit(0);
cuda_err(res);
res = nocl_dispatch.cuDriverGetVersion(&driver_version);
/* Our highest version requirement across all required APIs */
if (driver_version < 8000)
return 0;
if (!ndev) {
res = nocl_dispatch.cuDeviceGetCount(&ndev);
cuda_err(res);
return ndev;
}
if (!devs)
return 0;
int cnt;
res = nocl_dispatch.cuDeviceGetCount(&cnt);
cuda_err(res);
for (int i = 0; i < ndev; i++) {
if (i >= cnt)
return i;
CUdevice cuda_dev;
res = nocl_dispatch.cuDeviceGet(&cuda_dev, i);
if (res)
return i;
int pci_device_id;
#if CUDA_VERSION >= 12080
res = nocl_dispatch.cuDeviceGetAttribute(&pci_device_id, CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID, cuda_dev);
cuda_err(res);
#else
/* TODO: we might need to get the device id from somewhere, but so far
* nothing needs it */
pci_device_id = 0x10de;
#endif
struct cuda_pipe_loader_device *dev = CALLOC_STRUCT(cuda_pipe_loader_device);
*dev = (struct cuda_pipe_loader_device) {
.base = {
.driver_name = "nocl",
.ops = &cuda_loader_ops,
/* TODO: tegra */
.type = PIPE_LOADER_DEVICE_PCI,
.u.pci.chip_id = ((unsigned int)pci_device_id) >> 16,
.u.pci.vendor_id = pci_device_id & 0xffff,
},
.dev = cuda_dev,
.cuda_version = driver_version,
};
devs[i] = &dev->base;
}
return ndev;
}
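
For context, a hypothetical caller of this probe (illustrative only; the real entry point is the generic pipe-loader code) would use the usual two-step pattern: query the count with ndev == 0, then fill a caller-allocated array.

#include <stdlib.h>
#include "pipe-loader/pipe_loader.h"

/* Hypothetical caller sketch: count CUDA devices, then probe them. */
static int probe_cuda_devices(struct pipe_loader_device ***out_devs)
{
   /* ndev == 0: pipe_loader_cuda_probe() only returns the device count. */
   int n = pipe_loader_cuda_probe(NULL, 0);
   if (n <= 0)
      return 0;

   struct pipe_loader_device **devs = calloc(n, sizeof(*devs));
   if (!devs)
      return 0;

   *out_devs = devs;
   /* The second call fills up to n entries and returns how many were created. */
   return pipe_loader_cuda_probe(devs, n);
}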


@ -0,0 +1,73 @@
/*
* Copyright © 2025 Karol Herbst
*
* SPDX-License-Identifier: MIT
*/
#ifndef __NOCL_DRM_PUBLIC_H__
#define __NOCL_DRM_PUBLIC_H__
#include <cudaTypedefs.h>
#include "pipe-loader/pipe_loader.h"
struct cuda_pipe_loader_device {
struct pipe_loader_device base;
CUdevice dev;
int cuda_version;
};
extern struct cuda_symbol_table {
PFN_cuCtxCreate_v3020 cuCtxCreate_v2;
PFN_cuCtxDestroy_v4000 cuCtxDestroy_v2;
PFN_cuCtxSetCurrent_v4000 cuCtxSetCurrent;
PFN_cuDeviceGet_v2000 cuDeviceGet;
PFN_cuDeviceGetAttribute_v2000 cuDeviceGetAttribute;
PFN_cuDeviceGetCount_v2000 cuDeviceGetCount;
PFN_cuDeviceGetName_v2000 cuDeviceGetName;
PFN_cuDeviceTotalMem_v3020 cuDeviceTotalMem_v2;
PFN_cuDriverGetVersion_v2020 cuDriverGetVersion;
PFN_cuEventCreate_v2000 cuEventCreate;
PFN_cuEventDestroy_v4000 cuEventDestroy_v2;
PFN_cuEventElapsedTime_v2000 cuEventElapsedTime_v1;
PFN_cuEventRecord_v2000 cuEventRecord;
PFN_cuEventSynchronize_v2000 cuEventSynchronize;
PFN_cuFuncGetAttribute_v2020 cuFuncGetAttribute;
PFN_cuGetErrorString_v6000 cuGetErrorString;
PFN_cuGetProcAddress_v12000 cuGetProcAddress_v2;
PFN_cuInit_v2000 cuInit;
PFN_cuLaunchKernel_v4000 cuLaunchKernel;
PFN_cuLinkAddData_v6050 cuLinkAddData_v2;
PFN_cuLinkComplete_v5050 cuLinkComplete;
PFN_cuLinkCreate_v6050 cuLinkCreate_v2;
PFN_cuLinkDestroy_v5050 cuLinkDestroy;
PFN_cuMemAlloc_v3020 cuMemAlloc_v2;
PFN_cuMemAllocHost_v3020 cuMemAllocHost_v2;
PFN_cuMemcpy_v4000 cuMemcpy;
PFN_cuMemcpyAsync_v4000 cuMemcpyAsync;
PFN_cuMemcpyDtoHAsync_v3020 cuMemcpyDtoHAsync_v2;
PFN_cuMemcpyHtoD_v3020 cuMemcpyHtoD_v2;
PFN_cuMemFree_v3020 cuMemFree_v2;
PFN_cuMemFreeHost_v2000 cuMemFreeHost;
PFN_cuMemsetD8Async_v3020 cuMemsetD8Async;
PFN_cuMemsetD16Async_v3020 cuMemsetD16Async;
PFN_cuMemsetD32Async_v3020 cuMemsetD32Async;
PFN_cuModuleGetFunction_v2000 cuModuleGetFunction;
PFN_cuModuleLoadDataEx_v2010 cuModuleLoadDataEx;
PFN_cuModuleUnload_v2000 cuModuleUnload;
PFN_cuStreamDestroy_v4000 cuStreamDestroy_v2;
PFN_cuStreamSynchronize_v2000 cuStreamSynchronize;
#if CUDA_VERSION >= 11040
PFN_cuDeviceGetUuid_v11040 cuDeviceGetUuid_v2;
#endif
#if CUDA_VERSION >= 12090
PFN_cuLogsRegisterCallback_v12090 cuLogsRegisterCallback;
PFN_cuLogsUnregisterCallback_v12090 cuLogsUnregisterCallback;
#endif
bool all_required_available;
} nocl_dispatch;
typedef int CUdevice;
struct pipe_screen *nocl_create_screen(struct cuda_pipe_loader_device *dev);
#endif /* __NOCL_DRM_PUBLIC_H__ */