Merge branch 'rusticl/cuda/what_the_fuck' into 'main'

Add a new Gallium compute-only driver for NVIDIA hardware, named nocl.

See merge request mesa/mesa!37831

Commit 74e785ff41

30 changed files with 3916 additions and 28 deletions
docs/drivers/nocl.rst  (new file, 13 lines)
@@ -0,0 +1,13 @@
NoCL
====

Overview
--------

The NoCL driver is a Gallium driver that targets the CUDA driver API and
therefore can be used to layer OpenCL on top of the NVIDIA proprietary driver.

The minimum supported CUDA API version is 6.5.

It translates NIR to PTX and makes use of the CUDA internal JIT compiler to
generate device code.
@@ -2161,6 +2161,29 @@ Freedreno driver environment variables
 Other Gallium drivers have their own environment variables. These may
 change frequently so the source code should be consulted for details.
 
+NoCL driver environment variables
+---------------------------------
+
+.. envvar:: NOCL_DEBUG
+
+   Debug flags for the NoCL driver.
+
+.. envvar:: NOCL_API_TARGET
+
+   Pretend that a lower CUDA driver version than the one loaded at runtime is
+   in use. Useful to ensure that code runs on older driver versions. For
+   development only.
+
+.. envvar:: NOCL_PTX_VERSION
+
+   Overrides the PTX target version with the specified value, written without
+   the dot. Can be used to ensure code compiles successfully on various CUDA
+   versions. For development only.
+
+.. envvar:: NOCL_SM_TARGET
+
+   Overrides the PTX target SM level with the specified value. Can be used to
+   ensure code compiles successfully on various GPUs. For development only.
+
 Vulkan loader environment variables
 -----------------------------------
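As an illustration of the "without the dot" convention (example values are assumptions, not documented defaults): NOCL_PTX_VERSION=83 would select PTX ISA 8.3, and NOCL_SM_TARGET=75 would target sm_75.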
@@ -820,7 +820,7 @@ Rusticl Optional OpenCL 2.x Features:
   Device and host timer synchronization     DONE (freedreno, iris, llvmpipe, radeonsi, zink)
   OpenCL C 2.0                              in progress
   - Memory Consistency Model (atomics)      not started
-  - Sub-groups                              DONE (iris, llvmpipe, radeonsi, asahi)
+  - Sub-groups                              DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
   - Work-group Collective Functions         not started
   - Generic Address Space                   in progress
   cl_khr_il_program                         DONE
@@ -840,7 +840,7 @@ Rusticl extensions:
   cl_khr_d3d10_sharing                      not started
   cl_khr_d3d11_sharing                      not started
   cl_khr_device_enqueue_local_arg_types     not started
-  cl_khr_device_uuid                        DONE (freedreno, iris, llvmpipe, radeonsi, zink)
+  cl_khr_device_uuid                        DONE (freedreno, iris, llvmpipe, nocl, radeonsi, zink)
   cl_khr_dx9_media_sharing                  not started
   cl_khr_egl_event                          not started
   cl_khr_egl_image                          not started
@@ -855,7 +855,7 @@ Rusticl extensions:
   cl_khr_external_semaphore_opaque_fd       not started
   cl_khr_external_semaphore_sync_fd         DONE (radeonsi, zink)
   cl_khr_external_semaphore_win32           not started
-  cl_khr_fp16                               DONE (asahi, freedreno, llvmpipe, panfrost, radeonsi, zink)
+  cl_khr_fp16                               DONE (asahi, freedreno, llvmpipe, nocl, panfrost, radeonsi, zink)
   cl_khr_gl_depth_images                    not started
   cl_khr_gl_event                           not started
   cl_khr_gl_msaa_sharing                    not started
@@ -865,10 +865,10 @@ Rusticl extensions:
   cl_khr_int64_base_atomics                 not started
   cl_khr_int64_extended_atomics             not started
   cl_khr_integer_dot_product                DONE
-  cl_khr_kernel_clock                       DONE (freedreno, iris, llvmpipe, nvc0, panfrost, radeonsi, zink, needs llvm-19)
+  cl_khr_kernel_clock                       DONE (freedreno, iris, llvmpipe, nocl, nvc0, panfrost, radeonsi, zink, needs llvm-19)
   cl_khr_mipmap_image                       not started
   cl_khr_mipmap_image_writes                not started
-  cl_khr_pci_bus_info                       DONE (iris, nvc0, radeonsi, zink)
+  cl_khr_pci_bus_info                       DONE (iris, nocl, nvc0, radeonsi, zink)
   cl_khr_priority_hints                     DONE (asahi, freedreno, iris, panfrost, radeonsi)
   cl_khr_semaphore                          DONE (radeonsi, zink)
   cl_khr_spirv_extended_debug_info          not started
@@ -876,21 +876,21 @@ Rusticl extensions:
   cl_khr_spirv_no_integer_wrap_decoration   DONE
   cl_khr_spirv_queries                      DONE
   cl_khr_srgb_image_writes                  not started
-  cl_khr_subgroup_ballot                    not started
-  cl_khr_subgroup_clustered_reduce          not started
-  cl_khr_subgroup_extended_types            not started
+  cl_khr_subgroup_ballot                    DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_clustered_reduce          DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_extended_types            DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
   cl_khr_subgroup_named_barrier             not started
-  cl_khr_subgroup_non_uniform_arithmetic    not started
-  cl_khr_subgroup_non_uniform_vote          not started
-  cl_khr_subgroup_rotate                    not started
-  cl_khr_subgroup_shuffle                   DONE (iris, llvmpipe, radeonsi, asahi)
-  cl_khr_subgroup_shuffle_relative          DONE (iris, llvmpipe, radeonsi, asahi)
+  cl_khr_subgroup_non_uniform_arithmetic    DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_non_uniform_vote          DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_rotate                    DONE (iris, llvmpipe, zink)
+  cl_khr_subgroup_shuffle                   DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
+  cl_khr_subgroup_shuffle_relative          DONE (asahi, iris, llvmpipe, nocl, radeonsi, zink)
   cl_khr_subgroups                          in progress
   cl_khr_suggested_local_work_size          DONE
   cl_khr_terminate_context                  not started
   cl_khr_throttle_hints                     not started
   cl_khr_work_group_uniform_arithmetic      not started
-  cl_ext_buffer_device_address              DONE (iris, llvmpipe, radeonsi, zink)
+  cl_ext_buffer_device_address              DONE (iris, llvmpipe, nocl, radeonsi, zink)
   cl_ext_cxx_for_opencl                     not started
   cl_ext_device_fission                     not started
   cl_ext_float_atomics                      not started
@@ -86,6 +86,7 @@ Linux, FreeBSD, and other operating systems.
    drivers/kosmickrisp
    drivers/lima
    drivers/llvmpipe
+   drivers/nocl
    drivers/nvk
    drivers/panfrost
    drivers/powervr
meson.build  (13 changed lines)
@@ -221,6 +221,7 @@ with_gallium_d3d12 = gallium_drivers.contains('d3d12')
 with_gallium_asahi = gallium_drivers.contains('asahi')
 with_gallium_rocket = gallium_drivers.contains('rocket')
 with_gallium_ethosu = gallium_drivers.contains('ethosu')
+with_gallium_nocl = gallium_drivers.contains('nocl')
 foreach gallium_driver : gallium_drivers
   pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper())
 endforeach
@@ -778,7 +779,7 @@ if with_gallium_rusticl
 endif
 
 with_virtgpu_kumquat = get_option('virtgpu_kumquat') and with_gfxstream_vk
-if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat
+if with_gallium_rusticl or with_nouveau_vk or with_tools.contains('etnaviv') or with_virtgpu_kumquat or with_gallium_nocl
   # rust.bindgen() does not pass `--rust-target` to bindgen until 1.7.0.
   if meson.version().version_compare('< 1.7.0')
     error('Mesa Rust support requires Meson 1.7.0 or newer')
@@ -1641,6 +1642,16 @@ elif with_shader_cache
   error('Shader Cache requires compression')
 endif
 
+if with_gallium_nocl
+  dep_libcuda = dependency(
+    'cuda',
+    # By default meson searches for the cudart, which we don't want to use. We want to use the
+    # CUDA driver library.
+    modules : ['cuda'],
+    version : '>= 6.5',
+  )
+endif
+
 if host_machine.system() == 'windows'
   # For MSVC and MinGW we aren't using pthreads, and dependency('threads') will add linkage
   # to pthread for MinGW, so leave the dependency null_dep for Windows. For Windows linking to
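Design note: Meson's CUDA dependency defaults to the runtime libraries, so the explicit modules : ['cuda'] request is what pins the build to libcuda, the driver API library that the comment above refers to.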
@@ -87,7 +87,7 @@ option(
   choices : [
     'all', 'auto',
     'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris',
-    'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
+    'lima', 'llvmpipe', 'nocl', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
     'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
   ],
   description : 'List of gallium drivers to build. If this is set to auto ' +
@@ -72,9 +72,15 @@ struct clc_optional_features {
     * progress
     */
    bool subgroups_ifp;
+   bool subgroups_ballot;
+   bool subgroups_clustered;
+   bool subgroups_extended_types;
+   bool subgroups_named_barrier;
+   bool subgroups_non_uniform_arithmetic;
+   bool subgroups_non_uniform_vote;
+   bool subgroups_rotate;
    bool subgroups_shuffle;
    bool subgroups_shuffle_relative;
-   bool subgroups_ballot;
 };
 
 struct clc_compile_args {
@@ -1024,15 +1024,33 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx,
    }
    if (args->features.subgroups) {
       c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_subgroups");
+      if (args->features.subgroups_ballot) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1");
+      }
+      if (args->features.subgroups_clustered) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_clustered_reduce=1");
+      }
+      if (args->features.subgroups_extended_types) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_extended_types=1");
+      }
+      if (args->features.subgroups_named_barrier) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_named_barrier=1");
+      }
+      if (args->features.subgroups_non_uniform_arithmetic) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_arithmetic=1");
+      }
+      if (args->features.subgroups_non_uniform_vote) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_non_uniform_vote=1");
+      }
+      if (args->features.subgroups_rotate) {
+         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_rotate=1");
+      }
       if (args->features.subgroups_shuffle) {
          c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle=1");
       }
       if (args->features.subgroups_shuffle_relative) {
         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_shuffle_relative=1");
       }
-      if (args->features.subgroups_ballot) {
-         c->getPreprocessorOpts().addMacroDef("cl_khr_subgroup_ballot=1");
-      }
    }
    if (args->features.subgroups_ifp) {
       assert(args->features.subgroups);
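With these macros defined, kernels can feature-test the subgroup extensions at compile time. A minimal OpenCL C sketch (the kernel and buffer names are made up for illustration):

   __kernel void active_lane_count(__global uint *out) {
   #ifdef cl_khr_subgroup_ballot
      /* sub_group_ballot() is provided by cl_khr_subgroup_ballot */
      uint4 ballot = sub_group_ballot(true);
      if (get_sub_group_local_id() == 0)
         out[get_group_id(0)] = popcount(ballot.x);
   #endif
   }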
@@ -29,6 +29,7 @@
 #include <assert.h>
 #include <stdio.h>
 
+#include "compiler/builtin_types.h"
 #include "shader_enums.h"
 #include "c11/threads.h"
 #include "util/blob.h"
@@ -73,6 +73,6 @@ if with_gallium
 endif
 subdir('isaspec')
 
-if with_nouveau_vk
+if with_nouveau_vk or with_gallium_nocl
   subdir('rust')
 endif
@@ -5745,6 +5745,8 @@ typedef struct nir_lower_compute_system_values_options {
    bool lower_local_invocation_index : 1;
    bool lower_cs_local_id_to_index : 1;
    bool lower_workgroup_id_to_index : 1;
+   bool lower_num_subgroups : 1;
+   bool lower_subgroup_id : 1;
    bool global_id_is_32bit : 1;
    /* At shader execution time, check if WorkGroupId should be 1D
     * and compute it quickly. Fall back to slow computation if not.
@@ -794,6 +794,30 @@ lower_compute_system_value_instr(nir_builder *b,
    case nir_intrinsic_load_shader_index:
       return nir_imm_int(b, b->shader->info.cs.shader_index);
 
+   case nir_intrinsic_load_num_subgroups: {
+      if (!options || !options->lower_num_subgroups)
+         return NULL;
+
+      nir_def *group_size = nir_load_workgroup_size(b);
+      nir_def *threads = nir_imul(b, nir_channel(b, group_size, 0),
+                                  nir_channel(b, group_size, 1));
+      threads = nir_imul(b, threads, nir_channel(b, group_size, 2));
+
+      /* DIV_ROUND_UP(A, B) = ((A + B - 1) / B) */
+      nir_def *subgroup_size = nir_load_subgroup_size(b);
+      nir_def *subgroup_size_m1 = nir_iadd_imm(b, subgroup_size, -1);
+      nir_def *numerator = nir_iadd(b, threads, subgroup_size_m1);
+      return nir_udiv(b, numerator, subgroup_size);
+   }
+
+   case nir_intrinsic_load_subgroup_id: {
+      if (!options || !options->lower_subgroup_id)
+         return NULL;
+
+      nir_def *tid = nir_load_local_invocation_index(b);
+      return nir_udiv(b, tid, nir_load_subgroup_size(b));
+   }
+
    default:
       return NULL;
    }
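As a concrete check of the lowering above: an 8x8x1 workgroup has 64 invocations, so with a subgroup size of 32, load_num_subgroups becomes (64 + 32 - 1) / 32 = 2, and the invocation with local index 37 gets subgroup id 37 / 32 = 1.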
@@ -44,6 +44,9 @@
 static int (*backends[])(struct pipe_loader_device **, int) = {
 #ifdef HAVE_LIBDRM
    &pipe_loader_drm_probe,
 #endif
+#ifdef HAVE_NOCL
+   &pipe_loader_cuda_probe,
+#endif
    &pipe_loader_sw_probe
 };
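Since backends[] is walked in order, DRM devices are probed first, the CUDA backend next, and the software probe remains the final fallback.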
@@ -215,6 +215,11 @@ pipe_loader_sw_probe_wrapped(struct pipe_loader_device **dev,
 int
 pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev);
 
+#ifdef HAVE_NOCL
+int
+pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev);
+#endif
+
 /**
  * Get a list of known DRM accel devices.
  *
src/gallium/drivers/nocl/compiler/cbindgen.toml  (new file, 12 lines)

@@ -0,0 +1,12 @@
language = "C"

includes = ["nir.h"]
autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */"
include_guard = "NIR_TO_PTX_H"
usize_is_size_t = true
style = "tag"

[export]
include = ["NirToPtxOutput"]
prefix = ""
renaming_overrides_prefixing = true
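From this configuration, cbindgen should produce a header shaped roughly like the sketch below; the struct fields here are hypothetical, since lib.rs is suppressed in this diff:

   /* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */

   #ifndef NIR_TO_PTX_H
   #define NIR_TO_PTX_H

   #include "nir.h"

   /* style = "tag" keeps plain struct tags instead of typedefs */
   struct NirToPtxOutput {
      char *ptx;       /* hypothetical field */
      size_t ptx_len;  /* hypothetical field; usize maps to size_t */
   };

   #endif /* NIR_TO_PTX_H */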
src/gallium/drivers/nocl/compiler/lib.rs  (new file, 1191 lines)

(File diff suppressed because it is too large.)
src/gallium/drivers/nocl/compiler/meson.build  (new file, 44 lines)

@@ -0,0 +1,44 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT

prog_cbindgen = find_program(
  'cbindgen',
  required : true,
  native : true,
  version : '>= 0.25'
)

libnir_to_ptx_h = custom_target(
  'libnir_to_ptx_h',
  input : [files('cbindgen.toml'), 'lib.rs'],
  output : ['nir_to_ptx.h'],
  command : [
    prog_cbindgen, '-q', '--config', '@INPUT0@', '--lang', 'c',
    '--output', '@OUTPUT0@', '--depfile', '@DEPFILE@',
    '--', '@INPUT1@',
  ],
  depfile : 'nir_to_ptx.h.d',
)

files_libnir_to_ptx = files(
  'lib.rs',
)

libnir_to_ptx = static_library(
  'nir_to_ptx',
  [files_libnir_to_ptx],
  gnu_symbol_visibility : 'hidden',
  rust_abi : 'c',
  dependencies : [
    idep_compiler_rs,
  ],
)

idep_nir_to_ptx = declare_dependency(
  sources : [
    libnir_to_ptx_h
  ],
  link_with : [
    libnir_to_ptx,
  ]
)
src/gallium/drivers/nocl/meson.build  (new file, 54 lines)

@@ -0,0 +1,54 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT

subdir('compiler')

files_libnocl = files(
  'nocl_nir.c',
  'nocl_pipe.c',
  'nocl_private.h',
)

nocl_nir_algebraic_c = custom_target(
  'nocl_nir_algebraic.c',
  input : 'nocl_nir_algebraic.py',
  output : 'nocl_nir_algebraic.c',
  command : [
    prog_python, '@INPUT@',
    '-p', dir_compiler_nir,
    '--out', '@OUTPUT@',
  ],
  depend_files : nir_algebraic_depends,
)

libnocl = static_library(
  'nocl',
  [
    files_libnocl,
    nocl_nir_algebraic_c,
  ],
  include_directories : [
    inc_gallium,
    inc_gallium_aux,
    inc_gallium_winsys,
    inc_util,
  ],
  gnu_symbol_visibility : 'hidden',
  dependencies : [
    dep_libcuda,
    idep_nir,
    idep_nir_to_ptx,
  ],
)

driver_nocl = declare_dependency(
  compile_args : '-DGALLIUM_NOCL',
  dependencies : [
    dep_libcuda,
    idep_libnoclwinsys,
  ],
  link_with : [
    libnocl,
    libnir_to_ptx,
  ],
)
src/gallium/drivers/nocl/nocl_nir.c  (new file, 824 lines)

@@ -0,0 +1,824 @@
/*
 * Copyright © 2022 Collabora, Ltd.
 * Copyright © 2025 Karol Herbst
 *
 * SPDX-License-Identifier: MIT
 */

#include "nocl_private.h"

#include "nir.h"
#include "nir_builder.h"

static void
push_block(nir_builder *b, nir_block *block)
{
   assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl)));
   block->cf_node.parent = &b->impl->cf_node;
   exec_list_push_tail(&b->impl->body, &block->cf_node.node);
   b->cursor = nir_after_block(block);
}

enum scope_type {
   SCOPE_TYPE_SHADER,
   SCOPE_TYPE_IF_MERGE,
   SCOPE_TYPE_LOOP_BREAK,
   SCOPE_TYPE_LOOP_CONT,
};

struct scope {
   enum scope_type type;

   struct scope *parent;
   uint32_t depth;

   nir_block *merge;
   nir_def *bar;

   uint32_t escapes;
};

static struct scope
push_scope(nir_builder *b,
           enum scope_type scope_type,
           struct scope *parent,
           nir_block *merge_block)
{
   struct scope scope = {
      .type = scope_type,
      .parent = parent,
      .depth = parent->depth + 1,
      .merge = merge_block,
   };

   return scope;
}

static void
pop_scope(nir_builder *b, nir_def *esc_reg, struct scope scope)
{
   if (scope.bar == NULL)
      return;

   if (scope.escapes > 0) {
      /* Find the nearest scope with a sync. */
      nir_block *parent_merge = b->impl->end_block;
      for (struct scope *p = scope.parent; p != NULL; p = p->parent) {
         if (p->bar != NULL) {
            parent_merge = p->merge;
            break;
         }
      }

      /* No escape is ~0, halt is 0, and we choose outer scope indices such
       * that outer scopes always have lower indices than inner scopes.
       */
      nir_def *esc = nir_ult_imm(b, nir_load_reg(b, esc_reg), scope.depth);

      /* We have to put the escape in its own block to avoid critical edges.
       * If we just did goto_if, we would end up with multiple successors,
       * including a jump to the parent's merge block which has multiple
       * predecessors.
       */
      nir_block *esc_block = nir_block_create(b->shader);
      nir_block *next_block = nir_block_create(b->shader);
      nir_goto_if(b, esc_block, esc, next_block);
      push_block(b, esc_block);
      nir_goto(b, parent_merge);
      push_block(b, next_block);
   }
}

static enum scope_type
jump_target_scope_type(nir_jump_type jump_type)
{
   switch (jump_type) {
   case nir_jump_break:    return SCOPE_TYPE_LOOP_BREAK;
   case nir_jump_continue: return SCOPE_TYPE_LOOP_CONT;
   default:
      UNREACHABLE("Unknown jump type");
   }
}

static void
break_scopes(nir_builder *b, nir_def *esc_reg,
             struct scope *current_scope,
             nir_jump_type jump_type)
{
   nir_block *first_sync = NULL;
   uint32_t target_depth = UINT32_MAX;
   enum scope_type target_scope_type = jump_target_scope_type(jump_type);
   for (struct scope *scope = current_scope; scope; scope = scope->parent) {
      if (first_sync == NULL && scope->bar != NULL)
         first_sync = scope->merge;

      if (scope->type == target_scope_type) {
         if (first_sync == NULL) {
            first_sync = scope->merge;
         } else {
            /* In order for our cascade to work, we need to have the invariant
             * that anything which escapes any scope with a warp sync needs to
             * target a scope with a warp sync.
             */
            assert(scope->bar != NULL);
         }
         target_depth = scope->depth;
         break;
      } else {
         scope->escapes++;
      }
   }
   assert(target_depth < UINT32_MAX);

   nir_store_reg(b, nir_imm_int(b, target_depth), esc_reg);
   nir_goto(b, first_sync);
}
static void
normal_exit(nir_builder *b, nir_def *esc_reg, nir_block *merge_block)
{
   assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl)));
   nir_block *block = nir_cursor_current_block(b->cursor);

   if (!nir_block_ends_in_jump(block)) {
      nir_store_reg(b, nir_imm_int(b, ~0), esc_reg);
      nir_goto(b, merge_block);
   }
}

/* This is a heuristic for what instructions are allowed before we sync.
 * Annoyingly, we've gotten rid of phis so it's not as simple as "is it a
 * phi?".
 */
static bool
instr_is_allowed_before_sync(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      /* We could probably allow more ALU as long as it doesn't contain
       * derivatives but let's be conservative and only allow mov for now.
       */
      return alu->op == nir_op_mov;
   }

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      return intrin->intrinsic == nir_intrinsic_load_reg ||
             intrin->intrinsic == nir_intrinsic_store_reg;
   }

   default:
      return false;
   }
}

/** Returns true if our successor will sync for us
 *
 * This is a bit of a heuristic
 */
static bool
parent_scope_will_sync(nir_cf_node *node, struct scope *parent_scope)
{
   /* First search forward to see if there's anything non-trivial after this
    * node within the parent scope.
    */
   nir_block *block = nir_cf_node_as_block(nir_cf_node_next(node));
   nir_foreach_instr(instr, block) {
      if (!instr_is_allowed_before_sync(instr))
         return false;
   }

   /* There's another loop or if following and we didn't find a sync */
   if (nir_cf_node_next(&block->cf_node))
      return false;

   /* See if the parent scope will sync for us. */
   if (parent_scope->bar != NULL)
      return true;

   switch (parent_scope->type) {
   case SCOPE_TYPE_SHADER:
      return true;

   case SCOPE_TYPE_IF_MERGE:
      return parent_scope_will_sync(block->cf_node.parent,
                                    parent_scope->parent);

   case SCOPE_TYPE_LOOP_CONT:
      /* In this case, the loop doesn't have a sync of its own so we're
       * expected to be uniform before we hit the continue.
       */
      return false;

   case SCOPE_TYPE_LOOP_BREAK:
      UNREACHABLE("Loops must have a continue scope");

   default:
      UNREACHABLE("Unknown scope type");
   }
}

static bool
block_is_merge(const nir_block *block)
{
   /* If it's unreachable, there is no merge */
   if (block->imm_dom == NULL)
      return false;

   unsigned num_preds = 0;
   set_foreach(&block->predecessors, entry) {
      const nir_block *pred = entry->key;

      /* We don't care about unreachable blocks */
      if (pred->imm_dom == NULL)
         continue;

      num_preds++;
   }

   return num_preds > 1;
}

static void
lower_cf_list(nir_builder *b, nir_def *esc_reg, struct scope *parent_scope,
              struct exec_list *cf_list)
{
   foreach_list_typed_safe(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         if (exec_list_is_empty(&block->instr_list))
            break;

         nir_cursor start = nir_before_block(block);
         nir_cursor end = nir_after_block(block);

         nir_jump_instr *jump = NULL;
         nir_instr *last_instr = nir_block_last_instr(block);
         if (last_instr->type == nir_instr_type_jump) {
            jump = nir_instr_as_jump(last_instr);
            end = nir_before_instr(&jump->instr);
         }

         nir_cf_list instrs;
         nir_cf_extract(&instrs, start, end);
         b->cursor = nir_cf_reinsert(&instrs, b->cursor);

         if (jump != NULL) {
            if (jump->type == nir_jump_halt) {
               /* Halt instructions map to OpExit on NVIDIA hardware and
                * exited lanes never block a bsync.
                */
               nir_instr_remove(&jump->instr);
               nir_builder_instr_insert(b, &jump->instr);
            } else {
               /* Everything else needs a break cascade */
               break_scopes(b, esc_reg, parent_scope, jump->type);
            }
         }
         break;
      }

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);

         nir_def *cond = nif->condition.ssa;
         nir_instr_clear_src(NULL, &nif->condition);

         nir_block *then_block = nir_block_create(b->shader);
         nir_block *else_block = nir_block_create(b->shader);
         nir_block *merge_block = nir_block_create(b->shader);

         struct scope scope = push_scope(b, SCOPE_TYPE_IF_MERGE,
                                         parent_scope, merge_block);

         nir_goto_if(b, then_block, cond, else_block);

         push_block(b, then_block);
         lower_cf_list(b, esc_reg, &scope, &nif->then_list);
         normal_exit(b, esc_reg, merge_block);

         push_block(b, else_block);
         lower_cf_list(b, esc_reg, &scope, &nif->else_list);
         normal_exit(b, esc_reg, merge_block);

         push_block(b, merge_block);
         pop_scope(b, esc_reg, scope);

         break;
      }

      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);

         nir_block *head_block = nir_block_create(b->shader);
         nir_block *break_block = nir_block_create(b->shader);
         nir_block *cont_block = nir_block_create(b->shader);

         /* TODO: We can potentially avoid the break sync for loops when the
          * parent scope syncs for us. However, we still need to handle the
          * continue clause cascading to the break. If there is a
          * nir_jump_halt involved, then we have a real cascade where it needs
          * to then jump to the next scope. Getting all these cases right
          * while avoiding an extra sync for the loop break is tricky at best.
          */
         struct scope break_scope = push_scope(b, SCOPE_TYPE_LOOP_BREAK,
                                               parent_scope,
                                               break_block);

         nir_goto(b, head_block);
         push_block(b, head_block);

         struct scope cont_scope = push_scope(b, SCOPE_TYPE_LOOP_CONT,
                                              &break_scope,
                                              cont_block);

         lower_cf_list(b, esc_reg, &cont_scope, &loop->body);
         normal_exit(b, esc_reg, cont_block);

         push_block(b, cont_block);

         pop_scope(b, esc_reg, cont_scope);

         lower_cf_list(b, esc_reg, &break_scope, &loop->continue_list);

         nir_goto(b, head_block);
         push_block(b, break_block);

         pop_scope(b, esc_reg, break_scope);

         break;
      }

      default:
         UNREACHABLE("Unknown CF node type");
      }
   }
}
static bool
lower_cf_func(nir_function *func)
{
   if (func->impl == NULL)
      return false;

   if (exec_list_is_singular(&func->impl->body)) {
      return nir_no_progress(func->impl);
   }

   nir_function_impl *old_impl = func->impl;

   /* We use this in block_is_merge() */
   nir_metadata_require(old_impl, nir_metadata_dominance | nir_metadata_divergence);

   /* First, we temporarily get rid of SSA. This will make all our block
    * motion way easier. Ask the pass to place reg writes directly in the
    * immediate predecessors of the phis instead of trying to be clever.
    * This will ensure that we never get a write to a uniform register from
    * non-uniform control flow and makes our divergence reconstruction for
    * phis more reliable.
    */
   nir_foreach_block(block, old_impl)
      nir_lower_phis_to_regs_block(block, true);

   /* We create a whole new nir_function_impl and copy the contents over */
   func->impl = NULL;
   nir_function_impl *new_impl = nir_function_impl_create(func);
   new_impl->structured = false;

   /* We copy defs from the old impl */
   new_impl->ssa_alloc = old_impl->ssa_alloc;

   nir_builder b = nir_builder_at(nir_before_impl(new_impl));
   nir_def *esc_reg = nir_decl_reg(&b, 1, 32, 0);

   /* Having a function scope makes everything easier */
   struct scope scope = {
      .type = SCOPE_TYPE_SHADER,
      .merge = new_impl->end_block,
   };
   lower_cf_list(&b, esc_reg, &scope, &old_impl->body);
   normal_exit(&b, esc_reg, new_impl->end_block);

   /* Now sort by reverse PDFS and restore SSA
    *
    * Note: Since we created a new nir_function_impl, there is no metadata,
    * dirty or otherwise, so we have no need to call nir_progress().
    */
   nir_sort_unstructured_blocks(new_impl);

   return true;
}

/* Copied from nak_nir_lower_cf.c but without divergence or barriers */
bool
nocl_nir_lower_cf(struct nir_shader *nir)
{
   bool progress = false;

   nir_foreach_function(func, nir) {
      if (lower_cf_func(func))
         progress = true;
   }

   return progress;
}

nir_shader_compiler_options *
nocl_get_nir_options(struct nocl_screen *nocl)
{
   nir_shader_compiler_options *nir_options = ralloc(nocl, nir_shader_compiler_options);
   *nir_options = (nir_shader_compiler_options) {
      // TODO:
      // .has_bit_test = nocl->ptx >= 60,
      // .has_bitfield_select = nocl->ptx >= 43,
      // .has_f2i32_rtne = true,
      // .has_uclz = true,
      // .has_dot_2x16 = true,
      // .has_fneo_fcmpu = true,
      // .has_fused_comp_and_csel = true,
      // .has_pack_32_4x8 = true,
      // .has_shfr32 = true,

      .fuse_ffma32 = true,
      .has_fsub = true,
      .has_imul24 = true,
      .has_imad32 = true,
      .has_isub = true,
      .has_rotate32 = nocl->sm >= 32,
      .has_sdot_4x8 = nocl->sm >= 61,
      .has_sudot_4x8 = nocl->sm >= 61,
      .has_udot_4x8 = nocl->sm >= 61,
      .has_umul24 = true,
      .has_umad24 = true,
      .lower_hadd = true,
      .lower_find_lsb = true,
      .lower_flrp16 = true,
      .lower_flrp32 = true,
      .lower_flrp64 = true,
      .lower_fisnormal = true,
      .lower_fsat = true,
      .lower_fsign = true,
      .lower_int64_options =
         /* Technically PTX supports it, but it fails to compile with vector
          * sources... The hardware doesn't have an idiv anyway. */
         nir_lower_divmod64 |
         nir_lower_iadd_sat64 |
         nir_lower_imul64 |
         nir_lower_scan_reduce_bitwise64 |
         nir_lower_scan_reduce_iadd64 |
         nir_lower_subgroup_shuffle64 |
         (nocl->sm < 70 ? nir_lower_vote_ieq64 : 0),
      .lower_insert_byte = true,
      .lower_insert_word = true,
      .lower_isign = true,
      .lower_ldexp = true,
      .lower_uadd_sat = true,
      .lower_usub_sat = true,
      .max_unroll_iterations = 32,
      .support_16bit_alu = true,
   };

   return nir_options;
}
static uint8_t
alu_width_cb(const nir_instr *instr, const void *)
{
   return 1;
}

static unsigned
lower_bit_size_callback(const nir_instr *instr, void *data)
{
   const struct nocl_screen *nocl = data;

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned dest_bit_size = alu->def.bit_size;
      unsigned src0_bit_size = alu->src[0].src.ssa->bit_size;
      switch (alu->op) {
      case nir_op_extract_i8:
      case nir_op_extract_i16:
      case nir_op_extract_u8:
      case nir_op_extract_u16:
      case nir_op_bitfield_reverse:
         if (dest_bit_size < 32)
            return 32;
         return 0;
      case nir_op_bcsel:
         if (dest_bit_size < 16)
            return 16;
         return 0;

      case nir_op_bit_count:
      case nir_op_bitfield_insert:
      case nir_op_fdiv:
      case nir_op_frsq:
      case nir_op_fsqrt:
      case nir_op_iadd_sat:
      case nir_op_ibitfield_extract:
      case nir_op_isub_sat:
      case nir_op_ubitfield_extract:
      case nir_op_ufind_msb:
         if (src0_bit_size < 32)
            return 32;
         return 0;

      case nir_op_iabs:
      case nir_op_iadd:
      case nir_op_iand:
      case nir_op_idiv:
      case nir_op_ieq:
      case nir_op_ige:
      case nir_op_ilt:
      case nir_op_imad:
      case nir_op_imax:
      case nir_op_imin:
      case nir_op_imul:
      case nir_op_imul_high:
      case nir_op_ine:
      case nir_op_ineg:
      case nir_op_inot:
      case nir_op_ior:
      case nir_op_irem:
      case nir_op_ishl:
      case nir_op_ishr:
      case nir_op_isub:
      case nir_op_ixor:
      case nir_op_udiv:
      case nir_op_uge:
      case nir_op_ult:
      case nir_op_umax:
      case nir_op_umin:
      case nir_op_umod:
      case nir_op_umul_high:
      case nir_op_ushr:
         if (src0_bit_size == 8)
            return 16;
         return 0;

      case nir_op_fmax:
      case nir_op_fmin:
         if (src0_bit_size == 16 && nocl->sm < 80)
            return 32;
         return 0;

      case nir_op_feq:
      case nir_op_fequ:
      case nir_op_fge:
      case nir_op_fgeu:
      case nir_op_flt:
      case nir_op_fltu:
      case nir_op_fneo:
      case nir_op_fneu:
         if (src0_bit_size == 16 && nocl->sm < 53)
            return 32;
         return 0;
      default:
         return 0;
      }
   }

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_read_invocation:
      case nir_intrinsic_reduce:
      case nir_intrinsic_shuffle:
      case nir_intrinsic_shuffle_down:
      case nir_intrinsic_shuffle_up:
      case nir_intrinsic_shuffle_xor:
         if (intrin->def.bit_size < 32)
            return 32;
         return 0;
      default:
         return 0;
      }
   }

   default:
      return 0;
   }
}
static bool
lower_subgroups_filter(const nir_intrinsic_instr *intrin, const void *data)
{
   const struct nocl_screen *nocl = data;
   if (intrin->def.num_components > 1)
      return true;

   switch (intrin->intrinsic) {
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive:
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb:
   case nir_intrinsic_exclusive_scan:
   case nir_intrinsic_first_invocation:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_inverse_ballot:
   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_quad_vote_all:
   case nir_intrinsic_quad_vote_any:
   case nir_intrinsic_read_first_invocation:
   case nir_intrinsic_rotate:
   case nir_intrinsic_vote_feq:
   /* TODO: exists on SM90+ */
   case nir_intrinsic_elect:
      return true;
   case nir_intrinsic_vote_ieq:
      return nocl->sm < 70;
   case nir_intrinsic_reduce: {
      if (nocl->sm < 80)
         return true;
      unsigned cluster_size = nir_intrinsic_cluster_size(intrin);
      if (cluster_size != 0 && cluster_size != 32)
         return true;

      switch (nir_intrinsic_reduction_op(intrin)) {
      case nir_op_fadd:
      case nir_op_fmax:
      case nir_op_fmin:
      case nir_op_fmul:
      case nir_op_imul:
         return true;
      case nir_op_imax:
      case nir_op_imin:
      case nir_op_umax:
      case nir_op_umin:
         return intrin->def.bit_size == 64;
      default:
         /* nir_lower_int64 will handle some of it for us */
         return false;
      }
   }
   case nir_intrinsic_read_invocation:
   case nir_intrinsic_shuffle:
   case nir_intrinsic_shuffle_down:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_xor:
      return intrin->def.bit_size > 32;
   default:
      return false;
   }
}
static nir_mem_access_size_align
lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin,
                              uint8_t bytes,
                              uint8_t bit_size,
                              uint32_t align,
                              uint32_t align_offset,
                              bool offset_is_const,
                              enum gl_access_qualifier,
                              const void *cb_data)
{
   align = nir_combined_align(align, align_offset);
   bytes = MIN2(MIN2(bytes, align), 16);
   bit_size = MIN2(bit_size, bytes * 8);

   unsigned num_components = MIN2(bytes / (bit_size / 8), 4);
   if (num_components == 3)
      num_components = 2;

   return (nir_mem_access_size_align) {
      .bit_size = bit_size,
      .num_components = num_components,
      .align = align,
      .shift = nir_mem_access_shift_method_scalar,
   };
}

static bool
lower_alu_vec_srcs(nir_builder *b, nir_alu_instr *alu, void *_data)
{
   const nir_op_info *info = &nir_op_infos[alu->op];

   bool changed = false;
   b->cursor = nir_before_instr(&alu->instr);
   for (int i = 0; i < info->num_inputs; i++) {
      if (info->input_sizes[i])
         continue;

      /* We lower everything that is bigger than vec4 _and_ 128 bits */
      nir_def *src_ssa = alu->src[i].src.ssa;
      if (src_ssa->bit_size * src_ssa->num_components <= 128 && src_ssa->num_components <= 4)
         continue;

      changed = true;
      nir_def *comps[NIR_MAX_VEC_COMPONENTS];
      for (int c = 0; c < alu->def.num_components; c++) {
         unsigned swizzle = alu->src[i].swizzle[c];
         alu->src[i].swizzle[c] = c;

         nir_const_value *const_val = nir_src_as_const_value(alu->src[i].src);
         if (const_val) {
            comps[c] = nir_build_imm(b, 1, alu->src[i].src.ssa->bit_size, &const_val[swizzle]);
         } else {
            comps[c] = nir_swizzle(b, alu->src[i].src.ssa, &swizzle, 1);
         }
      }
      nir_def *src = nir_vec(b, comps, alu->def.num_components);
      nir_src_rewrite(&alu->src[i].src, src);
   }

   return changed;
}

/* copied from nir_lower_alu_vec8_16_srcs */
static bool
nir_lower_alu_vec_srcs(nir_shader *shader)
{
   return nir_shader_alu_pass(shader, lower_alu_vec_srcs,
                              nir_metadata_control_flow,
                              NULL);
}
void
nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir)
{
   struct nocl_screen *nocl = nocl_screen(pscreen);
   nir->info.max_subgroup_size = nocl->base.caps.shader_subgroup_size;
   nir->info.min_subgroup_size = nocl->base.caps.shader_subgroup_size;
   nir->info.api_subgroup_size = nocl->base.caps.shader_subgroup_size;

   struct nir_lower_compute_system_values_options sysval_options = {
      .lower_local_invocation_index = true,
      .lower_num_subgroups = true,
      .lower_subgroup_id = true,
   };

   struct nir_lower_mem_access_bit_sizes_options mem_access_bit_sizes_options = {
      .callback = lower_mem_access_bit_sizes_cb,
      .cb_data = nocl,
      .modes = nir_var_all,
   };

   struct nir_lower_subgroups_options subgroup_options = {
      .ballot_bit_size = 32,
      .ballot_components = 1,
      .subgroup_size = 32,
      .lower_elect = true,
      .lower_first_invocation_to_ballot = true,
      .lower_inverse_ballot = true,
      .lower_quad_vote = true,
      .lower_read_first_invocation = true,
      .lower_reduce = true,
      .lower_rotate_to_shuffle = true,
      .lower_shuffle_to_32bit = true,
      .lower_to_scalar = true,
      .lower_vote_feq = true,
      .lower_vote_ieq = nocl->sm < 70,
      .filter = lower_subgroups_filter,
      .filter_data = nocl,
   };

   NIR_PASS(_, nir, nir_lower_compute_system_values, &sysval_options);

   /* TODO: keeping 8 bit vectors would help a bit... */
   NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
   NIR_PASS(_, nir, nir_lower_subgroups, &subgroup_options);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
      NIR_PASS(progress, nir, nir_lower_mem_access_bit_sizes, &mem_access_bit_sizes_options);
      NIR_PASS(progress, nir, nir_lower_alu_width, alu_width_cb, nocl);
      NIR_PASS(progress, nir, nir_lower_alu_vec_srcs);
      NIR_PASS(progress, nir, nir_lower_flrp, 16 | 32 | 64, false);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_cse);
   } while (progress);

   do {
      progress = false;
      NIR_PASS(progress, nir, nir_lower_bit_size, lower_bit_size_callback, nocl);
      NIR_PASS(progress, nir, nir_opt_algebraic_late);
      NIR_PASS(progress, nir, nocl_nir_opt_algebraic_late, nocl);
      if (progress) {
         NIR_PASS(_, nir, nir_opt_constant_folding);
         NIR_PASS(_, nir, nir_copy_prop);
         NIR_PASS(_, nir, nir_opt_dce);
         NIR_PASS(_, nir, nir_opt_cse);
      }
   } while (progress);

   /* Run only once */
   NIR_PASS(progress, nir, nocl_nir_lower_algebraic_late, nocl);
   if (progress) {
      NIR_PASS(_, nir, nir_opt_constant_folding);
   }

   /* After nir_opt_constant_folding */
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
   NIR_PASS(_, nir, nir_opt_dce);
}
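Three worked examples for the passes in this file (all values chosen for illustration): First, the escape cascade in nocl_nir_lower_cf: a break inside an if inside a loop stores the loop-break scope's depth into esc_reg and jumps to the nearest syncing merge block, the if's; pop_scope there finds the stored depth below its own and forwards execution to the next syncing parent merge, repeating until the loop's break block is reached, where the comparison fails and execution continues (the encoding reserves ~0 for "no escape" and 0 for halt, per the comment in pop_scope). Second, bit-size lowering: an 8-bit iadd widens to 16 bits, a sub-32-bit bitfield_reverse to 32 bits, 16-bit fmin/fmax stay 16-bit only on SM >= 80, and half-precision comparisons need SM >= 53. Third, memory access sizing: a 12-byte load with combined alignment 4 clamps bytes to MIN2(MIN2(12, 4), 16) = 4 and becomes a single 32-bit component, while a 16-byte, 16-byte-aligned load keeps MIN2(16 / 4, 4) = 4 components of 32 bits; a would-be 3-component result is rounded down to 2.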
src/gallium/drivers/nocl/nocl_nir_algebraic.py  (new file, 65 lines)

@@ -0,0 +1,65 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT

import argparse
import sys

a = 'a'
b = 'b'
c = 'c'
s = 's'

late_algebraic = [
    (('ior@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)),
    (('iadd@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)),
    (('ixor@32', ('iand', a, b), ('iand', ('inot', a), c)), ('bitfield_select', a, b, c)),
    (('ixor@32', ('iand', a, ('ixor', b, c)), c), ('bitfield_select', a, b, c)),

    (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c)),
    (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c)),
    (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b)),
    (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c)),
    (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c)),
    (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c)),
]

late_algebraic_lowering = [
    (('b2i8', a), ('u2u8', ('b2i16', a))),
]

for bit_size in [16, 32, 64]:
    late_algebraic_lowering += [
        ((f'ishl@{bit_size}', a, b), ('ishl', a, ('iand', b, bit_size - 1))),
        ((f'ishr@{bit_size}', a, b), ('ishr', a, ('iand', b, bit_size - 1))),
        ((f'ushr@{bit_size}', a, b), ('ushr', a, ('iand', b, bit_size - 1))),
    ]

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', required=True, help='Output file.')
    parser.add_argument('-p', '--import-path', required=True)
    args = parser.parse_args()
    sys.path.insert(0, args.import_path)

    import nir_algebraic  # pylint: disable=import-error

    try:
        with open(args.out, 'w', encoding='utf-8') as f:
            f.write('#include "nocl_private.h"')
            f.write(nir_algebraic.AlgebraicPass(
                "nocl_nir_opt_algebraic_late",
                late_algebraic,
                [
                    ("const struct nocl_screen *", "nocl"),
                ]).render())
            f.write(nir_algebraic.AlgebraicPass(
                "nocl_nir_lower_algebraic_late",
                late_algebraic_lowering,
                [
                    ("const struct nocl_screen *", "nocl"),
                ]).render())
    except Exception:
        sys.exit(1)

if __name__ == '__main__':
    main()
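The shift rules above mask the shift count to bit_size - 1, presumably because NIR defines shift counts modulo the bit size while PTX clamps oversized counts; e.g. a 32-bit ishl by 33 becomes a shift by 33 & 31 = 1.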
src/gallium/drivers/nocl/nocl_pipe.c  (new file, 1027 lines)

(File diff suppressed because it is too large.)
src/gallium/drivers/nocl/nocl_private.h  (new file, 147 lines)

@@ -0,0 +1,147 @@
/*
 * Copyright © 2025 Karol Herbst
 * SPDX-License-Identifier: MIT
 */

#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"

#include "nocl/nocl_cuda_public.h"

static const uint32_t NOCL_COMPILER_LOG_SIZE = 0x1000;

enum nocl_dbg {
   NOCL_DBG_32 = BITFIELD_BIT(0),
   NOCL_DBG_JIT_NO_OPTS = BITFIELD_BIT(1),
   NOCL_DBG_NIR = BITFIELD_BIT(2),
   NOCL_DBG_PTX = BITFIELD_BIT(3),
   NOCL_DBG_VERBOSE = BITFIELD_BIT(4),
};

#define NOCL_CALL(func) nocl_error(nocl_dispatch.func)

static inline bool
nocl_error(CUresult err)
{
   if (err == CUDA_SUCCESS)
      return false;

   const char *err_str;
   if (nocl_dispatch.cuGetErrorString(err, &err_str)) {
      printf("NOCL-ERROR: UNKNOWN\n");
   } else {
      printf("NOCL-ERROR: %s\n", err_str);
   }

   return true;
}

struct nocl_screen {
   struct pipe_screen base;
#if CUDA_VERSION >= 12090
   CUlogsCallbackHandle verbose_handle;
#endif
   CUdevice cu_dev;
   CUcontext cu_ctx;
   struct disk_cache *disk_cache;
   uint32_t debug;
   uint16_t ptx;
   uint16_t sm;
   char name[128];
};

struct nocl_buffer {
   struct pipe_resource base;
   union {
      CUdeviceptr dptr;
      void *ptr;
   };
};

struct nocl_program {
   CUmodule cu_mod;
   CUfunction cu_func;
   uint32_t shared_size;
};

struct nocl_context {
   struct pipe_context base;
   void *kernel_input;
   uint32_t kernel_input_size;
   uint32_t kernel_input_capacity;
   CUstream cu_stream;
   CUevent cu_timestamp_start;
   uint64_t timestamp_start;
   struct nocl_program *prog;
};

struct pipe_fence_handle {
   struct pipe_reference ref;
   CUevent cu_event;
};

struct nocl_transfer {
   struct pipe_transfer base;
   void *ptr;
};

struct pipe_query {
   CUevent cu_event_base;
   CUevent cu_event;
};

static inline struct nocl_screen*
nocl_screen(struct pipe_screen *pscreen)
{
   return (struct nocl_screen *)pscreen;
}

static inline struct nocl_screen*
nocl_screen_and_make_current(struct pipe_screen *pscreen)
{
   struct nocl_screen *nocl = nocl_screen(pscreen);
   NOCL_CALL(cuCtxSetCurrent(nocl->cu_ctx));
   return nocl;
}

static inline struct nocl_context*
nocl_context(struct pipe_context *pctx)
{
   return (struct nocl_context *)pctx;
}

static inline struct nocl_context*
nocl_context_and_make_current(struct pipe_context *pctx)
{
   nocl_screen_and_make_current(pctx->screen);
   return nocl_context(pctx);
}

static inline struct nocl_program*
nocl_program(void *prog)
{
   return (struct nocl_program *)prog;
}

static inline struct nocl_buffer*
nocl_buffer(struct pipe_resource *pres)
{
   return (struct nocl_buffer *)pres;
}

static inline struct nocl_transfer*
nocl_transfer(struct pipe_transfer *pxfer)
{
   return (struct nocl_transfer *)pxfer;
}

void nocl_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir);
bool nocl_nir_lower_cf(struct nir_shader *nir);
bool nocl_nir_opt_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl);
bool nocl_nir_lower_algebraic_late(struct nir_shader *nir, const struct nocl_screen *nocl);
struct nir_shader_compiler_options *nocl_get_nir_options(struct nocl_screen *nocl);
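NOCL_CALL routes a call through the dispatch table and funnels the result into nocl_error(), so call sites stay one-liners. A minimal sketch reusing the cuCtxSetCurrent call from the header above (the wrapper function itself is hypothetical):

   static bool
   make_ctx_current(struct nocl_screen *nocl)
   {
      /* Expands to nocl_error(nocl_dispatch.cuCtxSetCurrent(nocl->cu_ctx)):
       * prints the CUDA error string once and returns true on failure. */
      if (NOCL_CALL(cuCtxSetCurrent(nocl->cu_ctx)))
         return false;
      return true;
   }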
@@ -740,17 +740,50 @@ impl DeviceBase {
        }

        if self.subgroups_supported() {
            add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle);
            add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative);
            add_cap(SpvCapability::SpvCapabilityGroups);
            add_cap(SpvCapability::SpvCapabilitySubgroupDispatch);
            // requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS
            //add_ext(1, 0, 0, "cl_khr_subgroups");
            add_ext(1, 0, 0, "cl_khr_subgroup_extended_types");
            add_feat(1, 0, 0, "__opencl_c_subgroups");

            // we have lowering in `nir_lower_subgroups`, drivers can just use that
            if self.subgroup_ballot_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformBallot);
                add_ext(1, 0, 0, "cl_khr_subgroup_ballot");
            }

            if self.subgroup_clustered_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformClustered);
                add_ext(1, 0, 0, "cl_khr_subgroup_clustered_reduce");
            }

            if self.subgroup_non_uniform_arithmetic_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformArithmetic);
                add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_arithmetic");
            }

            if self.subgroup_non_uniform_vote_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniform);
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformVote);
                add_ext(1, 0, 0, "cl_khr_subgroup_non_uniform_vote");
            }

            if self.subgroup_rotate_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformRotateKHR);
                add_ext(1, 0, 0, "cl_khr_subgroup_rotate");
                add_spirv(c"SPV_KHR_subgroup_rotate");
            }

            if self.subgroup_shuffle_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffle);
                add_ext(1, 0, 0, "cl_khr_subgroup_shuffle");
            }

            if self.subgroup_shuffle_relative_supported() {
                add_cap(SpvCapability::SpvCapabilityGroupNonUniformShuffleRelative);
                add_ext(1, 0, 0, "cl_khr_subgroup_shuffle_relative");
            }

            if self.intel_subgroups_supported() {
                // add_cap(SpvCapability::SpvCapabilitySubgroupBufferBlockIOINTEL);
                // add_cap(SpvCapability::SpvCapabilitySubgroupImageBlockIOINTEL);
@@ -1173,6 +1206,58 @@ impl DeviceBase {
        // supported, doing it without shareable shaders isn't practical
        self.max_subgroups() > 0
            && (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders()))
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_BASIC
                != 0
    }

    pub fn subgroup_ballot_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_BALLOT
                != 0
    }

    pub fn subgroup_clustered_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED
                != 0
    }

    pub fn subgroup_non_uniform_arithmetic_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_ARITHMETIC
                != 0
    }

    pub fn subgroup_non_uniform_vote_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_VOTE
                != 0
    }

    pub fn subgroup_rotate_supported(&self) -> bool {
        let mask =
            PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED;
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features & mask == mask
    }

    pub fn subgroup_shuffle_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE
                != 0
    }

    pub fn subgroup_shuffle_relative_supported(&self) -> bool {
        self.subgroups_supported()
            && self.screen().caps().shader_subgroup_supported_features
                & PIPE_SHADER_SUBGROUP_FEATURE_SHUFFLE_RELATIVE
                != 0
    }

    pub fn system_svm_supported(&self) -> bool {
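Note that subgroup_rotate_supported() is stricter than its siblings: it requires both the ROTATE and ROTATE_CLUSTERED feature bits (mask == mask), whereas the other queries test a single bit with != 0.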
@@ -1255,8 +1340,14 @@ impl DeviceBase {
             intel_subgroups: self.intel_subgroups_supported(),
             kernel_clock: self.kernel_clock_supported(),
             subgroups: subgroups_supported,
-            subgroups_shuffle: subgroups_supported,
-            subgroups_shuffle_relative: subgroups_supported,
+            subgroups_ballot: self.subgroup_ballot_supported(),
+            subgroups_clustered: self.subgroup_clustered_supported(),
+            subgroups_extended_types: subgroups_supported,
+            subgroups_non_uniform_arithmetic: self.subgroup_non_uniform_arithmetic_supported(),
+            subgroups_non_uniform_vote: self.subgroup_non_uniform_vote_supported(),
+            subgroups_rotate: self.subgroup_rotate_supported(),
+            subgroups_shuffle: self.subgroup_shuffle_supported(),
+            subgroups_shuffle_relative: self.subgroup_shuffle_relative_supported(),
             ..Default::default()
         }
     }
@@ -99,6 +99,7 @@ fn get_enabled_devs() -> HashMap<String, u32> {
 
         let driver_str = match driver_str[0] {
             "llvmpipe" | "lp" => "swrast",
+            "cuda" => "nocl",
             "freedreno" => "msm",
             a => a,
         };
@@ -698,6 +698,14 @@ enum pipe_conservative_raster_mode
 #define PIPE_SHADER_SUBGROUP_FEATURE_CLUSTERED (1 << 6)
 #define PIPE_SHADER_SUBGROUP_FEATURE_QUAD      (1 << 7)
 #define PIPE_SHADER_SUBGROUP_NUM_FEATURES      8
+/* VK_SUBGROUP_FEATURE_ROTATE_BIT */
+#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE (1 << 9)
+/* VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT */
+#define PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED (1 << 10)
+#define PIPE_SHADER_SUBGROUP_FEATURE_MASK \
+   (BITFIELD_MASK(PIPE_SHADER_SUBGROUP_NUM_FEATURES) | \
+    PIPE_SHADER_SUBGROUP_FEATURE_ROTATE | \
+    PIPE_SHADER_SUBGROUP_FEATURE_ROTATE_CLUSTERED)
 
 enum pipe_point_size_lower_mode {
    PIPE_POINT_SIZE_LOWER_ALWAYS,
@@ -206,6 +206,12 @@ if with_gallium_d3d12
 else
   driver_d3d12 = declare_dependency()
 endif
+if with_gallium_nocl
+  subdir('winsys/nocl')
+  subdir('drivers/nocl')
+else
+  driver_nocl = declare_dependency()
+endif
 if with_gallium_rusticl
   subdir('frontends/rusticl')
   subdir('targets/rusticl')
|
||||
|
|
|
|||
|
|
@@ -34,6 +34,7 @@ librusticl = shared_library(
  ],
  dependencies : [
    driver_asahi,
    driver_nocl,
    driver_freedreno,
    driver_iris,
    driver_nouveau,
25
src/gallium/winsys/nocl/meson.build
Normal file
@@ -0,0 +1,25 @@
# Copyright © 2025 Karol Herbst
# SPDX-License-Identifier: MIT

libnocl_cuda = static_library(
  'nocl_cuda',
  [
    'nocl_cuda.c',
    'nocl_cuda_public.h',
  ],
  include_directories : [
    inc_util,
    inc_gallium,
    inc_gallium_aux,
  ],
  dependencies : [
    dep_libcuda,
  ],
)

idep_libnoclwinsys = declare_dependency(
  include_directories : [
    include_directories('.'),
  ],
  link_with : [libnocl_cuda],
)
212
src/gallium/winsys/nocl/nocl_cuda.c
Normal file
@@ -0,0 +1,212 @@
/*
 * Copyright © 2025 Karol Herbst
 *
 * SPDX-License-Identifier: MIT
 */

#include "pipe-loader/pipe_loader.h"
#include "pipe-loader/pipe_loader_priv.h"
#include "target-helpers/inline_debug_helper.h"
#include "util/u_dl.h"
#include "util/u_memory.h"

#include "cuda.h"
#include "nocl_cuda_public.h"


#define cuda_err(err) if (err) { \
   const char *err_str; \
   nocl_dispatch.cuGetErrorString(err, &err_str); \
   printf("CUDA returned error: %s\n", err_str); \
   return 0; \
}

static struct driOptionDescription nocl_dri_conf = {};
static struct util_dl_library *libcuda = NULL;
struct cuda_symbol_table nocl_dispatch = {};

static struct pipe_screen *
nocl_loader_create_screen(struct pipe_loader_device *dev,
                          const struct pipe_screen_config *config,
                          bool sw_vk)
{
   struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)dev;
   struct pipe_screen *pscreen = nocl_create_screen(cuda_dev);
   return pscreen ? debug_screen_wrap(pscreen) : NULL;
}

static const struct driOptionDescription *
nocl_get_driconf(struct pipe_loader_device *dev, unsigned *count)
{
   *count = 0;
   return &nocl_dri_conf;
}

static void
nocl_release(struct pipe_loader_device **dev)
{
   struct cuda_pipe_loader_device *cuda_dev = (struct cuda_pipe_loader_device *)*dev;
   FREE(cuda_dev);
}

static struct pipe_loader_ops cuda_loader_ops = {
   .create_screen = nocl_loader_create_screen,
   .release = nocl_release,
   .get_driconf = nocl_get_driconf,
};

static int32_t faked_version = 0;

static bool
nocl_load_pfn(const char *name, void **pfn, int version)
{
   if (version > faked_version) {
      *pfn = NULL;
      return false;
   }

   CUdriverProcAddressQueryResult proc_result;
   CUresult res = nocl_dispatch.cuGetProcAddress_v2(name, pfn, version, CU_GET_PROC_ADDRESS_LEGACY_STREAM, &proc_result);
   bool success = res == CUDA_SUCCESS && proc_result == CU_GET_PROC_ADDRESS_SUCCESS;
   /* Probably always NULL on errors but let's be sure */
   if (!success)
      *pfn = NULL;
   return success;
}
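For reference, and not part of the commit: cuGetProcAddress resolves a driver symbol at a requested ABI version, so asking for "cuMemAlloc" at version 3020 hands back the cuMemAlloc_v2 entry point; that is why the dispatch slots loaded below carry _v2 suffixes while the lookup names do not. A minimal standalone sketch of the same lookup, assuming CUDA 12 headers and linking directly against libcuda:

#include <stdio.h>

#include <cuda.h>
#include <cudaTypedefs.h>

int main(void)
{
   PFN_cuMemAlloc_v3020 mem_alloc = NULL;
   CUdriverProcAddressQueryResult query_result;

   /* ask the driver for the 3020 ABI of cuMemAlloc, i.e. the cuMemAlloc_v2 export */
   CUresult res = cuGetProcAddress("cuMemAlloc", (void **)&mem_alloc, 3020,
                                   CU_GET_PROC_ADDRESS_DEFAULT, &query_result);
   if (res == CUDA_SUCCESS && query_result == CU_GET_PROC_ADDRESS_SUCCESS)
      printf("resolved cuMemAlloc (3020 ABI) at %p\n", (void *)mem_alloc);
   return 0;
}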
int
pipe_loader_cuda_probe(struct pipe_loader_device **devs, int ndev)
{
   if (!libcuda) {
      UNUSED void *func = NULL;
      faked_version = debug_get_num_option("NOCL_API_TARGET", INT32_MAX);
      libcuda = util_dl_open("libcuda.so");
      if (!libcuda)
         return 0;

      /* Needs 12.0 */
      nocl_dispatch.cuGetProcAddress_v2 = (void*)util_dl_get_proc_address(libcuda, "cuGetProcAddress_v2");
      if (!nocl_dispatch.cuGetProcAddress_v2) {
         return 0;
      }

      bool success = true;
      success &= nocl_load_pfn("cuCtxCreate", (void**)&nocl_dispatch.cuCtxCreate_v2, 3020);
      success &= nocl_load_pfn("cuCtxDestroy", (void**)&nocl_dispatch.cuCtxDestroy_v2, 4000);
      success &= nocl_load_pfn("cuCtxSetCurrent", (void**)&nocl_dispatch.cuCtxSetCurrent, 4000);
      success &= nocl_load_pfn("cuDeviceGet", (void**)&nocl_dispatch.cuDeviceGet, 2000);
      success &= nocl_load_pfn("cuDeviceGetAttribute", (void**)&nocl_dispatch.cuDeviceGetAttribute, 2000);
      success &= nocl_load_pfn("cuDeviceGetCount", (void**)&nocl_dispatch.cuDeviceGetCount, 2000);
      success &= nocl_load_pfn("cuDeviceGetName", (void**)&nocl_dispatch.cuDeviceGetName, 2000);
      success &= nocl_load_pfn("cuDeviceTotalMem", (void**)&nocl_dispatch.cuDeviceTotalMem_v2, 3020);
      success &= nocl_load_pfn("cuDriverGetVersion", (void**)&nocl_dispatch.cuDriverGetVersion, 2020);
      success &= nocl_load_pfn("cuEventCreate", (void**)&nocl_dispatch.cuEventCreate, 2000);
      success &= nocl_load_pfn("cuEventDestroy", (void**)&nocl_dispatch.cuEventDestroy_v2, 4000);
      success &= nocl_load_pfn("cuEventElapsedTime", (void**)&nocl_dispatch.cuEventElapsedTime_v1, 2000);
      success &= nocl_load_pfn("cuEventRecord", (void**)&nocl_dispatch.cuEventRecord, 2000);
      success &= nocl_load_pfn("cuEventSynchronize", (void**)&nocl_dispatch.cuEventSynchronize, 2000);
      success &= nocl_load_pfn("cuFuncGetAttribute", (void**)&nocl_dispatch.cuFuncGetAttribute, 2020);
      success &= nocl_load_pfn("cuGetErrorString", (void**)&nocl_dispatch.cuGetErrorString, 6000);
      success &= nocl_load_pfn("cuInit", (void**)&nocl_dispatch.cuInit, 2000);
      success &= nocl_load_pfn("cuLaunchKernel", (void**)&nocl_dispatch.cuLaunchKernel, 4000);
      success &= nocl_load_pfn("cuLinkAddData", (void**)&nocl_dispatch.cuLinkAddData_v2, 6050);
      success &= nocl_load_pfn("cuLinkComplete", (void**)&nocl_dispatch.cuLinkComplete, 5050);
      success &= nocl_load_pfn("cuLinkCreate", (void**)&nocl_dispatch.cuLinkCreate_v2, 6050);
      success &= nocl_load_pfn("cuLinkDestroy", (void**)&nocl_dispatch.cuLinkDestroy, 5050);
      success &= nocl_load_pfn("cuMemAlloc", (void**)&nocl_dispatch.cuMemAlloc_v2, 3020);
      success &= nocl_load_pfn("cuMemAllocHost", (void**)&nocl_dispatch.cuMemAllocHost_v2, 3020);
      success &= nocl_load_pfn("cuMemcpy", (void**)&nocl_dispatch.cuMemcpy, 4000);
      success &= nocl_load_pfn("cuMemcpyAsync", (void**)&nocl_dispatch.cuMemcpyAsync, 4000);
      success &= nocl_load_pfn("cuMemcpyDtoHAsync", (void**)&nocl_dispatch.cuMemcpyDtoHAsync_v2, 3020);
      success &= nocl_load_pfn("cuMemcpyHtoD", (void**)&nocl_dispatch.cuMemcpyHtoD_v2, 3020);
      success &= nocl_load_pfn("cuMemFree", (void**)&nocl_dispatch.cuMemFree_v2, 3020);
      success &= nocl_load_pfn("cuMemFreeHost", (void**)&nocl_dispatch.cuMemFreeHost, 2000);
      success &= nocl_load_pfn("cuMemsetD8Async", (void**)&nocl_dispatch.cuMemsetD8Async, 3020);
      success &= nocl_load_pfn("cuMemsetD16Async", (void**)&nocl_dispatch.cuMemsetD16Async, 3020);
      success &= nocl_load_pfn("cuMemsetD32Async", (void**)&nocl_dispatch.cuMemsetD32Async, 3020);
      success &= nocl_load_pfn("cuModuleGetFunction", (void**)&nocl_dispatch.cuModuleGetFunction, 2000);
      success &= nocl_load_pfn("cuModuleLoadDataEx", (void**)&nocl_dispatch.cuModuleLoadDataEx, 2010);
      success &= nocl_load_pfn("cuModuleUnload", (void**)&nocl_dispatch.cuModuleUnload, 2000);
      success &= nocl_load_pfn("cuStreamDestroy", (void**)&nocl_dispatch.cuStreamDestroy_v2, 4000);
      success &= nocl_load_pfn("cuStreamSynchronize", (void**)&nocl_dispatch.cuStreamSynchronize, 2000);
      if (!success)
         return 0;

      /* Only documented difference appears to be that v2 won't return errors on previous async
       * commands, so just overwrite the previously fetched pointer */
      nocl_load_pfn("cuEventElapsedTime", &func, 12080);
      if (func)
         nocl_dispatch.cuEventElapsedTime_v1 = func;

      nocl_dispatch.all_required_available = true;
#if CUDA_VERSION >= 11040
      success &= nocl_load_pfn("cuDeviceGetUuid", (void**)&nocl_dispatch.cuDeviceGetUuid_v2, 11040);
#endif
#if CUDA_VERSION >= 12090
      nocl_load_pfn("cuLogsRegisterCallback", (void**)&nocl_dispatch.cuLogsRegisterCallback, 12090);
      nocl_load_pfn("cuLogsUnregisterCallback", (void**)&nocl_dispatch.cuLogsUnregisterCallback, 12090);
#endif
   }
   if (!nocl_dispatch.all_required_available)
      return 0;

   int driver_version;
   CUresult res = nocl_dispatch.cuInit(0);
   cuda_err(res);

   res = nocl_dispatch.cuDriverGetVersion(&driver_version);
   cuda_err(res);
   /* Our highest version requirement across all required APIs */
   if (driver_version < 8000)
      return 0;

   if (!ndev) {
      res = nocl_dispatch.cuDeviceGetCount(&ndev);
      cuda_err(res);
      return ndev;
   }

   if (!devs)
      return 0;

   int cnt;
   res = nocl_dispatch.cuDeviceGetCount(&cnt);
   cuda_err(res);
   for (int i = 0; i < ndev; i++) {
      if (i >= cnt)
         return i;

      CUdevice cuda_dev;
      res = nocl_dispatch.cuDeviceGet(&cuda_dev, i);
      if (res)
         return i;

      int pci_device_id;
#if CUDA_VERSION >= 12080
      res = nocl_dispatch.cuDeviceGetAttribute(&pci_device_id, CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID, cuda_dev);
      cuda_err(res);
#else
      /* TODO: we might need to get the device id from somewhere, but so far
       * nothing needs it */
      pci_device_id = 0x10de;
#endif

      struct cuda_pipe_loader_device *dev = CALLOC_STRUCT(cuda_pipe_loader_device);
      *dev = (struct cuda_pipe_loader_device) {
         .base = {
            .driver_name = "nocl",
            .ops = &cuda_loader_ops,
            /* TODO: tegra */
            .type = PIPE_LOADER_DEVICE_PCI,
            .u.pci.chip_id = ((unsigned int)pci_device_id) >> 16,
            .u.pci.vendor_id = pci_device_id & 0xffff,
         },
         .dev = cuda_dev,
         .cuda_version = driver_version,
      };

      devs[i] = &dev->base;
   }

   return ndev;
}
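Not part of the commit, but for orientation: the probe follows the common pipe-loader two-call pattern. Called with no device array it only reports how many CUDA devices exist; called again with an array it fills it in. A hypothetical caller (helper name and error handling invented; assumes the probe's prototype is visible via the pipe-loader headers) could look like:

#include <stdlib.h>

#include "pipe-loader/pipe_loader.h"

/* hypothetical: enumerate all CUDA devices via pipe_loader_cuda_probe() */
static int
enumerate_nocl_devices(struct pipe_loader_device ***out_devs)
{
   /* first call: no array, just ask for the device count */
   int n = pipe_loader_cuda_probe(NULL, 0);
   if (n <= 0)
      return 0;

   struct pipe_loader_device **devs = calloc(n, sizeof(*devs));
   if (!devs)
      return 0;

   /* second call: fill the array with up to n probed devices */
   n = pipe_loader_cuda_probe(devs, n);
   *out_devs = devs;
   return n;
}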
73
src/gallium/winsys/nocl/nocl_cuda_public.h
Normal file
@@ -0,0 +1,73 @@
/*
 * Copyright © 2025 Karol Herbst
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef __NOCL_DRM_PUBLIC_H__
#define __NOCL_DRM_PUBLIC_H__

#include <cudaTypedefs.h>

#include "pipe-loader/pipe_loader.h"

struct cuda_pipe_loader_device {
   struct pipe_loader_device base;
   CUdevice dev;
   int cuda_version;
};

extern struct cuda_symbol_table {
   PFN_cuCtxCreate_v3020 cuCtxCreate_v2;
   PFN_cuCtxDestroy_v4000 cuCtxDestroy_v2;
   PFN_cuCtxSetCurrent_v4000 cuCtxSetCurrent;
   PFN_cuDeviceGet_v2000 cuDeviceGet;
   PFN_cuDeviceGetAttribute_v2000 cuDeviceGetAttribute;
   PFN_cuDeviceGetCount_v2000 cuDeviceGetCount;
   PFN_cuDeviceGetName_v2000 cuDeviceGetName;
   PFN_cuDeviceTotalMem_v3020 cuDeviceTotalMem_v2;
   PFN_cuDriverGetVersion_v2020 cuDriverGetVersion;
   PFN_cuEventCreate_v2000 cuEventCreate;
   PFN_cuEventDestroy_v4000 cuEventDestroy_v2;
   PFN_cuEventElapsedTime_v2000 cuEventElapsedTime_v1;
   PFN_cuEventRecord_v2000 cuEventRecord;
   PFN_cuEventSynchronize_v2000 cuEventSynchronize;
   PFN_cuFuncGetAttribute_v2020 cuFuncGetAttribute;
   PFN_cuGetErrorString_v6000 cuGetErrorString;
   PFN_cuGetProcAddress_v12000 cuGetProcAddress_v2;
   PFN_cuInit_v2000 cuInit;
   PFN_cuLaunchKernel_v4000 cuLaunchKernel;
   PFN_cuLinkAddData_v6050 cuLinkAddData_v2;
   PFN_cuLinkComplete_v5050 cuLinkComplete;
   PFN_cuLinkCreate_v6050 cuLinkCreate_v2;
   PFN_cuLinkDestroy_v5050 cuLinkDestroy;
   PFN_cuMemAlloc_v3020 cuMemAlloc_v2;
   PFN_cuMemAllocHost_v3020 cuMemAllocHost_v2;
   PFN_cuMemcpy_v4000 cuMemcpy;
   PFN_cuMemcpyAsync_v4000 cuMemcpyAsync;
   PFN_cuMemcpyDtoHAsync_v3020 cuMemcpyDtoHAsync_v2;
   PFN_cuMemcpyHtoD_v3020 cuMemcpyHtoD_v2;
   PFN_cuMemFree_v3020 cuMemFree_v2;
   PFN_cuMemFreeHost_v2000 cuMemFreeHost;
   PFN_cuMemsetD8Async_v3020 cuMemsetD8Async;
   PFN_cuMemsetD16Async_v3020 cuMemsetD16Async;
   PFN_cuMemsetD32Async_v3020 cuMemsetD32Async;
   PFN_cuModuleGetFunction_v2000 cuModuleGetFunction;
   PFN_cuModuleLoadDataEx_v2010 cuModuleLoadDataEx;
   PFN_cuModuleUnload_v2000 cuModuleUnload;
   PFN_cuStreamDestroy_v4000 cuStreamDestroy_v2;
   PFN_cuStreamSynchronize_v2000 cuStreamSynchronize;
#if CUDA_VERSION >= 11040
   PFN_cuDeviceGetUuid_v11040 cuDeviceGetUuid_v2;
#endif
#if CUDA_VERSION >= 12090
   PFN_cuLogsRegisterCallback_v12090 cuLogsRegisterCallback;
   PFN_cuLogsUnregisterCallback_v12090 cuLogsUnregisterCallback;
#endif
   bool all_required_available;
} nocl_dispatch;

typedef int CUdevice;
struct pipe_screen *nocl_create_screen(struct cuda_pipe_loader_device *dev);

#endif /* __NOCL_DRM_PUBLIC_H__ */
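A final illustration, again not from the commit: once the probe has populated nocl_dispatch, driver code is expected to reach CUDA only through this table rather than through libcuda's exported symbols. A hypothetical helper built on the declarations above (name invented; assumes a successfully probed device):

#include <stdio.h>

#include "nocl_cuda_public.h"

/* hypothetical: print the marketing name of an already-probed device */
static void
nocl_print_device_name(const struct cuda_pipe_loader_device *dev)
{
   char name[256];
   if (nocl_dispatch.cuDeviceGetName(name, sizeof(name), dev->dev) == CUDA_SUCCESS)
      printf("NoCL device: %s (CUDA driver version %d)\n", name, dev->cuda_version);
}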