kk: Add clc in a similar fashion to other drivers like HK

OpenCL shaders are now compiled and translated to MSL offline at build
time, then added to a static table as binary data that is compiled once
when devices are initialized. This allows for easier integration of
OpenCL shaders for things such as tessellation, since we no longer have
to manually add these shaders to the precompiled device library in code.

Signed-off-by: Aitor Camacho <aitor@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40436>
This commit is contained in:
Aitor Camacho 2026-03-08 14:16:26 +09:00 committed by Marge Bot
parent 3aea42e656
commit e08da4e928
20 changed files with 557 additions and 323 deletions

View file

@ -0,0 +1,325 @@
/*
* Copyright 2026 LunarG, Inc.
* Copyright 2026 Google LLC
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2020 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "kosmickrisp/compiler/nir_to_msl.h"

#include "compiler/glsl_types.h"
#include "compiler/spirv/nir_spirv.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
#include "nir_precompiled.h"
#include "shader_enums.h"
#include "util/macros.h"

#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
/* spirv_to_nir options for ingesting the OpenCL SPIR-V library. The 62-bit
 * generic address formats match the nir_lower_explicit_io calls done later
 * in compile()/main(); create_library keeps all exported kernels around so
 * each can be specialized per variant.
 */
static const struct spirv_to_nir_options spirv_options = {
   .environment = NIR_SPIRV_OPENCL,
   .shared_addr_format = nir_address_format_62bit_generic,
   .global_addr_format = nir_address_format_62bit_generic,
   .temp_addr_format = nir_address_format_62bit_generic,
   .constant_addr_format = nir_address_format_64bit_global,
   .create_library = true,
   .printf = true,
};
/* Standard optimization loop: run the usual NIR cleanup passes repeatedly
 * until a full iteration makes no further progress.
 */
static void
optimize(nir_shader *nir)
{
   bool progress;
   do {
      progress = false;

      NIR_PASS(progress, nir, nir_split_var_copies);
      NIR_PASS(progress, nir, nir_split_struct_vars, nir_var_function_temp);
      NIR_PASS(progress, nir, nir_lower_var_copies);
      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);

      NIR_PASS(progress, nir, nir_opt_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_lower_all_phis_to_scalar);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_dead_cf);
      NIR_PASS(progress, nir, nir_opt_cse);

      nir_opt_peephole_select_options peephole_select_options = {
         .limit = 64,
         .expensive_alu_ok = true,
      };
      NIR_PASS(progress, nir, nir_opt_peephole_select,
               &peephole_select_options);
      NIR_PASS(progress, nir, nir_opt_phi_precision);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      NIR_PASS(progress, nir, nir_opt_deref);
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_lower_undef_to_zero);

      NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
      NIR_PASS(progress, nir, nir_opt_loop_unroll);
   } while (progress);
}
/* Translate the OpenCL SPIR-V library into a cleaned-up NIR library shader
 * whose exported functions are later specialized per variant in main().
 * The returned shader is owned by memctx (via ralloc_steal).
 */
static nir_shader *
compile(void *memctx, const uint32_t *spirv, size_t spirv_size)
{
   const nir_shader_compiler_options *nir_options = &kk_nir_options;

   /* SPIR-V is a stream of 32-bit words. */
   assert(spirv_size % 4 == 0);
   nir_shader *nir =
      spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
                   "library", &spirv_options, nir_options);
   nir_validate_shader(nir, "after spirv_to_nir");
   ralloc_steal(memctx, nir);

   nir_fixup_is_exported(nir);

   NIR_PASS(_, nir, nir_lower_system_values);
   NIR_PASS(_, nir, nir_lower_calls_to_builtins);

   nir_lower_compute_system_values_options cs = {.global_id_is_32bit = true};
   NIR_PASS(_, nir, nir_lower_compute_system_values, &cs);

   /* We have to lower away local constant initializers right before we
    * inline functions. That way they get properly initialized at the top
    * of the function and not at the top of its caller.
    */
   NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_function_temp);
   NIR_PASS(_, nir, nir_lower_returns);
   NIR_PASS(_, nir, nir_inline_functions);
   nir_remove_non_exported(nir);
   NIR_PASS(_, nir, nir_opt_copy_prop);
   NIR_PASS(_, nir, nir_opt_deref);

   /* We can't deal with constant data, get rid of it */
   nir_lower_constant_to_temp(nir);

   /* We can go ahead and lower the rest of the constant initializers. We do
    * this here so that nir_remove_dead_variables and split_per_member_structs
    * below see the corresponding stores.
    */
   NIR_PASS(_, nir, nir_lower_variable_initializers, ~0);

   /* LLVM loves to take advantage of the fact that vec3s in OpenCL are 16B
    * aligned and so it can just read/write them as vec4s. This results in a
    * LOT of vec4->vec3 casts on loads and stores. One solution to this
    * problem is to get rid of all vec3 variables.
    */
   NIR_PASS(_, nir, nir_lower_vec3_to_vec4,
            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
               nir_var_mem_global | nir_var_mem_constant);

   /* We assign explicit types early so that the optimizer can take advantage
    * of that information and hopefully get rid of some of our memcpys.
    */
   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
            nir_var_uniform | nir_var_shader_temp | nir_var_function_temp |
               nir_var_mem_shared | nir_var_mem_global,
            glsl_get_cl_type_size_align);

   optimize(nir);

   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);

   /* Lower again, this time after dead-variables to get more compact variable
    * layouts.
    */
   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
               nir_var_mem_global | nir_var_mem_constant,
            glsl_get_cl_type_size_align);
   assert(nir->constant_data_size == 0);

   NIR_PASS(_, nir, nir_lower_memcpy);

   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_constant,
            nir_address_format_64bit_global);

   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_uniform,
            nir_address_format_32bit_offset_as_64bit);

   /* Note: we cannot lower explicit I/O here, because we need derefs intact
    * for function calls into the library to work.
    */

   NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);
   NIR_PASS(_, nir, nir_opt_if, 0);
   NIR_PASS(_, nir, nir_opt_idiv_const, 16);

   msl_lower_textures(nir);
   msl_lower_nir_late(nir);

   optimize(nir);

   return nir;
}
/* Serialize one compiled variant: a small header holding the kernel's
 * workgroup size, immediately followed by the NUL-terminated MSL source,
 * padded up to a multiple of 4 bytes so it can be emitted as a uint32_t
 * blob by nir_precomp_print_blob().
 */
static void
print_shader(FILE *fp, const char *name, const char *suffix, uint32_t variant,
             const char *msl, nir_shader *nir)
{
   /* size_t, not uint32_t: strlen() can in principle exceed 32 bits. */
   size_t msl_length = strlen(msl) + 1; /* include the NUL terminator */
   size_t workgroup_size_size = sizeof(nir->info.workgroup_size);
   size_t sz_B = workgroup_size_size + msl_length;
   size_t sz_el = DIV_ROUND_UP(sz_B, 4);

   uint32_t *mem = calloc(sz_el, 4);
   if (mem == NULL) {
      /* Build-time tool: a truncated table would silently break the driver,
       * so bail out hard on allocation failure.
       */
      fprintf(stderr, "Out of memory serializing %s\n", name);
      exit(1);
   }

   memcpy(mem, nir->info.workgroup_size, workgroup_size_size);
   memcpy((uint8_t *)mem + workgroup_size_size, msl, msl_length);

   nir_precomp_print_blob(fp, name, suffix, variant, mem, sz_B, true);
   free(mem);
}
/* Callback for nir_precompiled_build_variant: load a kernel argument.
 * The root descriptor's address is bound at buffer binding 0 and each
 * argument lives at a fixed byte offset past it.
 */
static nir_def *
load_kernel_input(nir_builder *b, unsigned num_components, unsigned bit_size,
                  unsigned offset_B)
{
   nir_def *root_addr = nir_load_buffer_ptr_kk(b, 1, 64, .binding = 0);
   nir_def *arg_addr = nir_iadd(b, root_addr, nir_imm_int64(b, offset_B));

   return nir_build_load_global_constant(b, num_components, bit_size, arg_addr,
                                         .align_mul = bit_size,
                                         .access = ACCESS_CAN_SPECULATE);
}
/* I/O type-size callback: number of attribute (vec4) slots a type occupies.
 * The bindless flag is deliberately ignored here.
 */
static int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}
/* Offline entry point: read the SPIR-V library, compile it to NIR, build
 * every variant of every entrypoint, translate each to MSL, and emit a
 * C header/source pair containing the serialized shader table.
 */
int
main(int argc, char **argv)
{
   if (argc != 4) {
      fprintf(stderr, "Usage: %s [input spir-v] [output header] [output C]\n",
              argv[0]);
      return 1;
   }

   const char *infile = argv[1];
   const char *outh_file = argv[2];
   const char *outc_file = argv[3];

   void *mem_ctx = ralloc_context(NULL);

   int fd = open(infile, O_RDONLY);
   if (fd < 0) {
      fprintf(stderr, "Failed to open %s\n", infile);
      ralloc_free(mem_ctx);
      return 1;
   }

   off_t spirv_len = lseek(fd, 0, SEEK_END);
   if (spirv_len < 0) {
      fprintf(stderr, "Failed to seek %s: errno=%d, %s\n", infile, errno,
              strerror(errno));
      close(fd);
      ralloc_free(mem_ctx);
      return 1;
   }

   const void *spirv_map = mmap(NULL, spirv_len, PROT_READ, MAP_PRIVATE, fd, 0);
   close(fd);
   if (spirv_map == MAP_FAILED) {
      fprintf(stderr, "Failed to mmap the file: errno=%d, %s\n", errno,
              strerror(errno));
      ralloc_free(mem_ctx);
      return 1;
   }

   FILE *fp_h = fopen(outh_file, "w");
   FILE *fp_c = fopen(outc_file, "w");
   if (fp_h == NULL || fp_c == NULL) {
      fprintf(stderr, "Failed to open output files: errno=%d, %s\n", errno,
              strerror(errno));
      if (fp_h != NULL)
         fclose(fp_h);
      if (fp_c != NULL)
         fclose(fp_c);
      munmap((void *)spirv_map, spirv_len);
      ralloc_free(mem_ctx);
      return 1;
   }

   glsl_type_singleton_init_or_ref();

   nir_precomp_print_header(fp_c, fp_h, "KosmicKrisp Contributors",
                            "libkk_shaders.h");

   nir_shader *nir = compile(mem_ctx, spirv_map, spirv_len);

   /* load_preamble works at 16-bit granularity */
   struct nir_precomp_opts opt = {.arg_align_B = 2};
   const char *target = "AppleSilicon";

   nir_foreach_entrypoint(libfunc, nir) {
      libfunc->pass_flags = 0;
      unsigned nr_vars = nir_precomp_nr_variants(libfunc);

      nir_precomp_print_layout_struct(fp_h, &opt, libfunc);

      for (unsigned v = 0; v < nr_vars; ++v) {
         nir_shader *s = nir_precompiled_build_variant(
            libfunc, MESA_SHADER_COMPUTE, v, &kk_nir_options, &opt,
            load_kernel_input);

         nir_link_shader_functions(s, nir);
         NIR_PASS(_, s, nir_inline_functions);
         nir_remove_non_entrypoints(s);
         NIR_PASS(_, s, nir_opt_deref);
         NIR_PASS(_, s, nir_lower_vars_to_ssa);
         NIR_PASS(_, s, nir_remove_dead_derefs);
         NIR_PASS(_, s, nir_remove_dead_variables,
                  nir_var_function_temp | nir_var_shader_temp, NULL);
         NIR_PASS(_, s, nir_lower_vars_to_explicit_types,
                  nir_var_shader_temp | nir_var_function_temp,
                  glsl_get_cl_type_size_align);
         NIR_PASS(_, s, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
                  glsl_get_cl_type_size_align);
         NIR_PASS(_, s, nir_lower_explicit_io, nir_var_mem_shared,
                  nir_address_format_62bit_generic);

         msl_preprocess_nir(s);
         /* Optimize the variant we are about to emit, not the library. */
         msl_optimize_nir(s);

         NIR_PASS(_, s, nir_opt_deref);
         NIR_PASS(_, s, nir_lower_vars_to_ssa);
         NIR_PASS(_, s, nir_lower_explicit_io,
                  nir_var_shader_temp | nir_var_function_temp |
                     nir_var_mem_shared | nir_var_mem_global,
                  nir_address_format_62bit_generic);

         /* nir_lower_explicit_io can create phis we need to get rid of */
         NIR_PASS(_, s, nir_convert_from_ssa, true, true);
         NIR_PASS(_, s, nir_trivialize_registers);

         /* nir_lower_explicit_io will create unpack_64 we need to lower */
         NIR_PASS(_, s, nir_opt_algebraic);

         nir_shader_gather_info(s, nir_shader_get_entrypoint(s));

         struct nir_to_msl_options options = {};
         const char *msl = nir_to_msl(s, &options);
         print_shader(fp_c, libfunc->name, target, v, msl, s);
         ralloc_free((void *)msl);
         ralloc_free(s);
      }
   }

   nir_precomp_print_program_enum(fp_h, nir, "libkk");
   nir_precomp_print_dispatch_macros(fp_h, &opt, nir);

   /* For each target, generate a table mapping programs to binaries */
   nir_precomp_print_extern_binary_map(fp_h, "libkk", target);
   nir_precomp_print_binary_map(fp_c, nir, "libkk", target, NULL);

   glsl_type_singleton_decref();
   fclose(fp_c);
   fclose(fp_h);
   munmap((void *)spirv_map, spirv_len);
   ralloc_free(mem_ctx);
   return 0;
}

View file

@ -0,0 +1,23 @@
/*
 * Copyright 2026 LunarG, Inc.
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: MIT
 */

#ifndef KK_PRECOMPILED_SHADER_H
#define KK_PRECOMPILED_SHADER_H 1

#include "kosmickrisp/bridge/mtl_types.h"

#include <stdint.h>

/* Metadata stored by kk_clc in front of each precompiled MSL blob. */
struct kk_precompiled_info {
   /* Workgroup size the kernel was built with (x, y, z). */
   uint16_t workgroup_size[3];
};

/* A precompiled library kernel after device initialization: its build-time
 * info plus the Metal compute pipeline compiled from the embedded MSL.
 */
struct kk_precompiled_shader {
   struct kk_precompiled_info info;
   mtl_compute_pipeline_state *pipeline;
};

#endif /* KK_PRECOMPILED_SHADER_H */

View file

@ -0,0 +1,13 @@
# Copyright 2026 LunarG, Inc.
# Copyright 2026 Google LLC
# SPDX-License-Identifier: MIT
# Build-time tool (see kk_clc.c) that translates the precompiled SPIR-V
# library into MSL sources embedded as C tables.
prog_kk_clc = executable(
  'kk_clc',
  ['kk_clc.c'],
  link_with : [libmsl_compiler],
  include_directories : [inc_include, inc_src],
  c_args : [pre_args, no_override_init_args],
  link_args : [ld_args_build_id],
  dependencies : [idep_vtn, idep_nir, idep_mesautil],
)

View file

@ -1986,11 +1986,13 @@ msl_preprocess_nir(struct nir_shader *nir)
nir_var_function_temp | nir_var_shader_in | nir_var_shader_out);
NIR_PASS(_, nir, nir_lower_alu_to_scalar, kk_scalarize_filter, NULL);
/* If we do 256 here MSL compiler crashes with
* dEQP-VK.graphicsfuzz.stable-binarysearch-tree-nested-if-and-conditional */
NIR_PASS(_, nir, nir_lower_vars_to_scratch, 32,
glsl_get_natural_size_align_bytes, glsl_get_word_size_align_bytes);
NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees,
nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
NIR_PASS(_, nir, nir_lower_vars_to_scratch, 0,
glsl_get_natural_size_align_bytes,
glsl_get_natural_size_align_bytes);
nir_var_function_temp | nir_var_shader_in | nir_var_shader_out,
UINT32_MAX);
nir_lower_compute_system_values_options csv_options = {
.has_base_global_invocation_id = 0,
@ -2043,7 +2045,7 @@ msl_optimize_nir(struct nir_shader *nir)
NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
NIR_PASS(_, nir, msl_nir_lower_algebraic_late);
NIR_PASS(_, nir, nir_convert_from_ssa, true, false);
nir_trivialize_registers(nir);
NIR_PASS(_, nir, nir_trivialize_registers);
NIR_PASS(_, nir, nir_opt_copy_prop);
return progress;

View file

@ -72,3 +72,37 @@ bool msl_nir_vs_io_types(nir_shader *nir);
bool msl_nir_fake_guard_for_discards(struct nir_shader *nir);
bool msl_nir_lower_sample_shading(nir_shader *nir);
void msl_lower_nir_late(nir_shader *nir);
/* NIR compiler options shared between the runtime driver
 * (kk_get_nir_options) and the offline kk_clc tool, so both produce NIR the
 * MSL backend can consume.
 */
static const nir_shader_compiler_options kk_nir_options = {
   .lower_fdph = true,
   .has_fsub = true,
   .has_isub = true,
   .lower_extract_word = true,
   .lower_extract_byte = true,
   .lower_insert_word = true,
   .lower_insert_byte = true,
   .lower_fmod = true,
   .discard_is_demote = true,
   .instance_id_includes_base_index = true,
   .lower_device_index_to_zero = true,
   .lower_pack_64_2x32_split = true,
   .lower_unpack_64_2x32_split = true,
   .lower_pack_64_2x32 = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_split = true,
   .lower_unpack_half_2x16 = true,
   .has_cs_global_id = true,
   .lower_fquantize2f16 = true,
   .lower_scmp = true,
   .lower_ifind_msb = true,
   .lower_ufind_msb = true,
   .lower_find_lsb = true,
   .has_uclz = true,
   .lower_mul_2x32_64 = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   /* Metal does not support double. */
   .lower_doubles_options = (nir_lower_doubles_options)(~0),
   .lower_int64_options = nir_lower_ufind_msb64 | nir_lower_subgroup_shuffle64,
   .io_options = nir_io_mediump_is_32bit,
};

View file

@ -10,13 +10,13 @@
#include "kk_query.h"
/* One workgroup per queued write: store each 64-bit value to its target
 * address, indexed by the workgroup id.
 */
void
KERNEL(1)
libkk_write_u64(global struct libkk_imm_write *write_array)
{
   *write_array[cl_group_id.x].address = write_array[cl_group_id.x].value;
}
void
KERNEL(1)
libkk_copy_queries(global uint64_t *availability, global uint64_t *results,
global uint16_t *oq_index, uint64_t dst_addr,
uint64_t dst_stride, uint32_t first_query,

View file

@ -79,7 +79,7 @@ libkk_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
*
* Here we assume the first vertex is provoking, the Vulkan default.
*/
uint offsets[6] = {
const uint offsets[6] = {
0,
first ? 1 : (even ? -2 : 3),
even_or_first ? 2 : 4,
@ -212,9 +212,8 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch)
return (first_group * 32) + off;
}
// TODO_KOSMICKRISP
// KERNEL(1024)
void
KERNEL(1)
libkk_unroll_geometry_and_restart(
constant uint8_t *index_buffer, global uint8_t *out_ptr,
constant uint32_t *in_draw, global uint32_t *out_draw,

View file

@ -0,0 +1,54 @@
# Copyright 2026 LunarG, Inc.
# Copyright 2026 Google LLC
# SPDX-License-Identifier: MIT
# OpenCL sources that make up the device library.
libkk_shader_files = files(
  'kk_triangle_fan.cl',
  'kk_query.cl',
)

# Compile the whole library to a single SPIR-V module.
libkk_spv = custom_target(
  input : libkk_shader_files,
  output : 'libkk.spv',
  command : [
    prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@',
    libkk_shader_files, '--',
    '-I' + join_paths(meson.project_source_root(), 'include'),
    '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
    '-I' + join_paths(meson.current_source_dir(), '.'),
    '-I' + join_paths(meson.current_source_dir(), '../../'),
    '-I' + join_paths(meson.current_source_dir(), 'shaders'),
    cl_args,
  ],
  depfile : 'libkk_shaders.h.d',
)

# C bindings for calling library functions from NIR builders.
libkk = custom_target(
  input : libkk_spv,
  output : ['libkk.cpp', 'libkk.h'],
  command : [prog_vtn_bindgen2, libkk_spv, '@OUTPUT0@', '@OUTPUT1@'],
)

idep_libkk = declare_dependency(
  sources : [libkk],
  include_directories : include_directories('.'),
)

# Offline MSL translation of the library (see kk_clc.c).
libkk_shaders = custom_target(
  input : libkk_spv,
  output : ['libkk_shaders.h', 'libkk_shaders.c'],
  command : [prog_kk_clc, libkk_spv, '@OUTPUT0@', '@OUTPUT1@']
)

# TODO_KOSMICKRISP We can get rid of this library once we remove the generated entrypoints for the trampoline
libkk_shaders_lib = static_library(
  'libkk_shaders',
  sources : [libkk_shaders],
  dependencies : [idep_mesautil],
)

idep_libkk_shaders_h = declare_dependency(
  sources : [libkk_shaders[0]],
  include_directories : include_directories('.'),
  link_with : libkk_shaders_lib,
)

View file

@ -5,6 +5,8 @@
subdir('bridge')
subdir('compiler')
subdir('util')
subdir('clc')
subdir('libkk')
subdir('vulkan')
executable(

View file

@ -421,6 +421,31 @@ kk_cmd_buffer_flush_push_descriptors(struct kk_cmd_buffer *cmd,
desc->push_dirty = 0;
}
void
kk_dispatch_precomp(struct kk_cmd_buffer *cmd, struct mtl_size grid,
bool pre_gfx, enum libkk_program idx, void *data,
size_t data_size)
{
struct kk_device *dev = kk_cmd_buffer_device(cmd);
struct kk_precompiled_shader *prog = &dev->precompiled_cache.shaders[idx];
mtl_compute_encoder *encoder =
pre_gfx ? kk_encoder_pre_gfx_encoder(cmd) : kk_compute_encoder(cmd);
struct kk_bo *bo = kk_cmd_allocate_buffer(cmd, data_size, 4u);
memcpy(bo->cpu, data, data_size);
mtl_compute_set_buffer(encoder, bo->map, 0, 0);
mtl_compute_set_pipeline_state(encoder, prog->pipeline);
struct mtl_size local_size = {
.x = prog->info.workgroup_size[0],
.y = prog->info.workgroup_size[1],
.z = prog->info.workgroup_size[2],
};
mtl_dispatch_threads(encoder, grid, local_size);
}
void
kk_cmd_write(struct kk_cmd_buffer *cmd, mtl_buffer *buffer, uint64_t addr,
uint64_t value)

View file

@ -254,14 +254,13 @@ uint64_t kk_upload_descriptor_root(struct kk_cmd_buffer *cmd,
void kk_cmd_buffer_flush_push_descriptors(struct kk_cmd_buffer *cmd,
struct kk_descriptor_state *desc);
void kk_dispatch_precomp(struct kk_cmd_buffer *cmd, struct mtl_size grid,
bool pre_gfx, enum libkk_program idx, void *data,
size_t data_size);
#define MESA_DISPATCH_PRECOMP kk_dispatch_precomp
void kk_cmd_write(struct kk_cmd_buffer *cmd, mtl_buffer *buffer, uint64_t addr,
uint64_t value);
void kk_cmd_dispatch_pipeline(struct kk_cmd_buffer *cmd,
mtl_compute_encoder *encoder,
mtl_compute_pipeline_state *pipeline,
const void *push_data, size_t push_size,
uint32_t groupCountX, uint32_t groupCountY,
uint32_t groupCountZ);
#endif

View file

@ -18,44 +18,6 @@
#include "vk_common_entrypoints.h"
/* Dispatch a compute pipeline: copy the push data into a freshly allocated
 * root descriptor table, bind it at buffer slot 0, and dispatch a
 * {groupCountX, groupCountY, groupCountZ} grid with a 1x1x1 local size.
 */
void
kk_cmd_dispatch_pipeline(struct kk_cmd_buffer *cmd,
                         mtl_compute_encoder *encoder,
                         mtl_compute_pipeline_state *pipeline,
                         const void *push_data, size_t push_size,
                         uint32_t groupCountX, uint32_t groupCountY,
                         uint32_t groupCountZ)
{
   struct kk_root_descriptor_table *root = NULL;
   struct kk_bo *bo = kk_cmd_allocate_buffer(cmd, sizeof(*root), 8u);

   /* kk_cmd_allocate_buffer already sets the error, we can just exit */
   if (!bo)
      return;

   root = bo->cpu;
   assert(push_size <= sizeof(root->push));
   memcpy(root->push, push_data, push_size);
   /* TODO_KOSMICKRISP This is hard-coded because we know this is the size we
    * create them with */
   root->cs.base_group[0] = 1;
   root->cs.base_group[1] = 1;
   root->cs.base_group[2] = 1;

   mtl_compute_set_buffer(encoder, bo->map, 0, 0);
   mtl_compute_set_pipeline_state(encoder, pipeline);

   struct mtl_size grid_size = {
      .x = groupCountX,
      .y = groupCountY,
      .z = groupCountZ,
   };
   struct mtl_size local_size = {
      .x = 1,
      .y = 1,
      .z = 1,
   };
   mtl_dispatch_threads(encoder, grid_size, local_size);
}
VKAPI_ATTR void VKAPI_CALL
kk_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX,
uint32_t groupCountY, uint32_t groupCountZ)

View file

@ -15,6 +15,9 @@
#include "kosmickrisp/bridge/mtl_types.h"
#include "kosmickrisp/clc/kk_precompiled_shader.h"
#include "libkk_shaders.h"
#include "util/u_dynarray.h"
#include "vk_device.h"
@ -25,13 +28,6 @@ struct kk_bo;
struct kk_physical_device;
struct vk_pipeline_cache;
enum kk_device_lib_pipeline {
KK_LIB_IMM_WRITE = 0,
KK_LIB_COPY_QUERY,
KK_LIB_TRIANGLE_FAN,
KK_LIB_COUNT,
};
struct kk_residency_set {
simple_mtx_t mutex;
mtl_residency_set *handle;
@ -75,6 +71,10 @@ struct kk_sampler_heap {
struct hash_table *ht;
};
struct kk_precompiled_cache {
struct kk_precompiled_shader shaders[LIBKK_NUM_PROGRAMS];
};
struct kk_device {
struct vk_device vk;
@ -91,7 +91,7 @@ struct kk_device {
* recording as required by Metal. */
struct kk_residency_set residency_set;
mtl_compute_pipeline_state *lib_pipelines[KK_LIB_COUNT];
struct kk_precompiled_cache precompiled_cache;
struct kk_queue queue;
@ -103,14 +103,6 @@ struct kk_device {
VK_DEFINE_HANDLE_CASTS(kk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
static inline mtl_compute_pipeline_state *
kk_device_lib_pipeline(const struct kk_device *dev,
enum kk_device_lib_pipeline pipeline)
{
assert(pipeline < KK_LIB_COUNT);
return dev->lib_pipelines[pipeline];
}
static inline struct kk_physical_device *
kk_device_physical(const struct kk_device *dev)
{

View file

@ -8,184 +8,65 @@
#include "kk_shader.h"
#include "kkcl.h"
#include "kosmickrisp/clc/kk_precompiled_shader.h"
#include "libkk_shaders.h"
#include "kosmickrisp/bridge/mtl_bridge.h"
#include "nir/nir.h"
#include "nir/nir_builder.h"
/* Load a single field out of a struct-typed variable. */
static nir_def *
load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
{
   nir_deref_instr *var_deref = nir_build_deref_var(b, var);
   nir_deref_instr *field_deref = nir_build_deref_struct(b, var_deref, field);
   return nir_load_deref(b, field_deref);
}
/* Build the NIR wrapper that dispatches libkk_write_u64 with its single
 * push-constant argument (the address of the write array). Uses (void)
 * instead of the deprecated unprototyped () declarator.
 */
static nir_shader *
create_imm_write_shader(void)
{
   nir_builder build = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                                      "kk-meta-imm-write-u64");
   nir_builder *b = &build;

   struct glsl_struct_field push_fields[] = {
      {.type = glsl_uint64_t_type(), .name = "buffer_address", .offset = 0},
   };
   const struct glsl_type *push_iface_type = glsl_interface_type(
      push_fields, ARRAY_SIZE(push_fields), GLSL_INTERFACE_PACKING_STD140,
      false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   b->shader->info.workgroup_size[0] = 1;
   b->shader->info.workgroup_size[1] = 1;
   b->shader->info.workgroup_size[2] = 1;

   libkk_write_u64(b, load_struct_var(b, push, 0));

   return build.shader;
}
/* Build the NIR wrapper that dispatches libkk_copy_queries, passing each of
 * its eight arguments from a push-constant interface block. Uses (void)
 * instead of the deprecated unprototyped () declarator.
 */
static nir_shader *
create_copy_query_shader(void)
{
   nir_builder build = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                                      "kk-meta-copy-queries");
   nir_builder *b = &build;

   struct glsl_struct_field push_fields[] = {
      {.type = glsl_uint64_t_type(), .name = "availability", .offset = 0},
      {.type = glsl_uint64_t_type(), .name = "results", .offset = 8},
      {.type = glsl_uint64_t_type(), .name = "indices", .offset = 16},
      {.type = glsl_uint64_t_type(), .name = "dst_addr", .offset = 24},
      {.type = glsl_uint64_t_type(), .name = "dst_stride", .offset = 32},
      {.type = glsl_uint_type(), .name = "first_query", .offset = 40},
      {.type = glsl_uint_type(), .name = "flags", .offset = 44},
      {.type = glsl_uint16_t_type(), .name = "reports_per_query", .offset = 48},
   };
   /* TODO_KOSMICKRISP Don't use push constants and directly bind the buffer to
    * the binding index. This requires compiler work first to remove the
    * hard-coded buffer0 value. Same applies to other creation functions.
    */
   const struct glsl_type *push_iface_type = glsl_interface_type(
      push_fields, ARRAY_SIZE(push_fields), GLSL_INTERFACE_PACKING_STD140,
      false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   b->shader->info.workgroup_size[0] = 1;
   b->shader->info.workgroup_size[1] = 1;
   b->shader->info.workgroup_size[2] = 1;

   libkk_copy_queries(b, load_struct_var(b, push, 0),
                      load_struct_var(b, push, 1), load_struct_var(b, push, 2),
                      load_struct_var(b, push, 3), load_struct_var(b, push, 4),
                      load_struct_var(b, push, 5), load_struct_var(b, push, 6),
                      load_struct_var(b, push, 7));

   return build.shader;
}
/* Build the NIR wrapper that dispatches libkk_unroll_geometry_and_restart
 * with its ten arguments from a push-constant interface block. Fixes the
 * stray trailing commas inside the "in_el_size_B"/"out_el_size_B" field-name
 * strings, the "geomtry" typo in the shader debug name, and uses (void)
 * instead of the deprecated unprototyped () declarator.
 */
static nir_shader *
create_triangle_fan_shader(void)
{
   nir_builder build = nir_builder_init_simple_shader(
      MESA_SHADER_COMPUTE, NULL, "kk-device-unroll-geometry-and-restart");
   nir_builder *b = &build;

   struct glsl_struct_field push_fields[] = {
      {.type = glsl_uint64_t_type(), .name = "index_buffer", .offset = 0},
      {.type = glsl_uint64_t_type(), .name = "out_ptr", .offset = 8},
      {.type = glsl_uint64_t_type(), .name = "indirect_in", .offset = 16},
      {.type = glsl_uint64_t_type(), .name = "indirect_out", .offset = 24},
      {.type = glsl_uint_type(), .name = "restart_index", .offset = 32},
      {.type = glsl_uint_type(), .name = "index_buffer_size_el", .offset = 36},
      {.type = glsl_uint_type(), .name = "in_el_size_B", .offset = 40},
      {.type = glsl_uint_type(), .name = "out_el_size_B", .offset = 44},
      {.type = glsl_uint_type(), .name = "flatshade_first", .offset = 48},
      {.type = glsl_uint_type(), .name = "mode", .offset = 52},
   };
   const struct glsl_type *push_iface_type = glsl_interface_type(
      push_fields, ARRAY_SIZE(push_fields), GLSL_INTERFACE_PACKING_STD140,
      false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   b->shader->info.workgroup_size[0] = 1;
   b->shader->info.workgroup_size[1] = 1;
   b->shader->info.workgroup_size[2] = 1;

   libkk_unroll_geometry_and_restart(
      b, load_struct_var(b, push, 0), load_struct_var(b, push, 1),
      load_struct_var(b, push, 2), load_struct_var(b, push, 3),
      load_struct_var(b, push, 4), load_struct_var(b, push, 5),
      load_struct_var(b, push, 6), load_struct_var(b, push, 7),
      load_struct_var(b, push, 8), load_struct_var(b, push, 9));

   return build.shader;
}
/* Table mapping each device-library pipeline slot to the function that
 * builds its NIR shader. The pointer type uses a (void) prototype rather
 * than the deprecated unprototyped ().
 */
static struct {
   enum kk_device_lib_pipeline ndx;
   nir_shader *(*create_shader_fn)(void);
} lib_shaders[KK_LIB_COUNT] = {
   {KK_LIB_IMM_WRITE, create_imm_write_shader},
   {KK_LIB_COPY_QUERY, create_copy_query_shader},
   {KK_LIB_TRIANGLE_FAN, create_triangle_fan_shader},
};
static_assert(ARRAY_SIZE(lib_shaders) == KK_LIB_COUNT,
              "Device lib shader count and created shader count mismatch");
VkResult
kk_device_init_lib(struct kk_device *dev)
static VkResult
build_precompiled_shaders(struct kk_device *dev)
{
VkResult result = VK_SUCCESS;
uint32_t i = 0u;
for (; i < KK_LIB_COUNT; ++i) {
nir_shader *s = lib_shaders[i].create_shader_fn();
if (!s)
goto fail;
for (; i < LIBKK_NUM_PROGRAMS; ++i) {
const uint32_t *bin = libkk_AppleSilicon[i];
const struct kk_precompiled_info *info = (void *)bin;
const char *msl = (const char *)bin + sizeof(*info);
struct kk_shader *shader = NULL;
result = kk_compile_nir_shader(dev, s, &dev->vk.alloc, &shader);
if (result != VK_SUCCESS)
goto fail;
mtl_library *library = mtl_new_library(dev->mtl_handle, shader->msl_code);
mtl_library *library = mtl_new_library(dev->mtl_handle, msl);
if (library == NULL)
goto fail;
uint32_t local_size_threads = shader->info.cs.local_size.x *
shader->info.cs.local_size.y *
shader->info.cs.local_size.z;
struct kk_precompiled_shader *shader = &dev->precompiled_cache.shaders[i];
/* TODO_KOSMICKRISP Do not hardcode the entrypoint */
mtl_function *function =
mtl_new_function_with_name(library, shader->entrypoint_name);
dev->lib_pipelines[i] = mtl_new_compute_pipeline_state(
mtl_new_function_with_name(library, "main_entrypoint");
uint32_t local_size_threads = info->workgroup_size[0] *
info->workgroup_size[1] *
info->workgroup_size[2];
shader->pipeline = mtl_new_compute_pipeline_state(
dev->mtl_handle, function, local_size_threads);
mtl_release(function);
mtl_release(library);
/* We no longer need the shader. Although it may be useful to keep it
* alive for the info maybe? */
shader->vk.ops->destroy(&dev->vk, &shader->vk, &dev->vk.alloc);
if (!dev->lib_pipelines[i])
if (!shader->pipeline)
goto fail;
memcpy(&shader->info, info, sizeof(*info));
}
return result;
fail:
for (uint32_t j = 0u; j < i; ++j)
mtl_release(dev->lib_pipelines[j]);
mtl_release(dev->precompiled_cache.shaders[j].pipeline);
return vk_error(dev, result);
}
/* Device-init hook: build the Metal pipelines for the precompiled device
 * library (see build_precompiled_shaders).
 */
VkResult
kk_device_init_lib(struct kk_device *dev)
{
   return build_precompiled_shaders(dev);
}
void
kk_device_finish_lib(struct kk_device *dev)
{
for (uint32_t i = 0; i < KK_LIB_COUNT; ++i)
mtl_release(dev->lib_pipelines[i]);
for (uint32_t i = 0; i < LIBKK_NUM_PROGRAMS; ++i)
mtl_release(dev->precompiled_cache.shaders[i].pipeline);
}

View file

@ -13,7 +13,8 @@
#include "kosmickrisp/bridge/mtl_bridge.h"
#include "kosmickrisp/bridge/vk_to_mtl_map.h"
#include "cl/kk_query.h"
#include "kosmickrisp/libkk/kk_query.h"
#include "libkk_shaders.h"
static void
kk_encoder_start_internal(struct kk_encoder_internal *encoder,
@ -134,8 +135,6 @@ upload_queue_writes(struct kk_cmd_buffer *cmd)
enc->copy_query_pool_result_infos.size == 0u)
return;
struct kk_device *dev = kk_cmd_buffer_device(cmd);
mtl_compute_encoder *compute = kk_compute_encoder(cmd);
uint32_t count = util_dynarray_num_elements(&enc->imm_writes, uint64_t) / 2u;
if (count != 0) {
struct kk_bo *bo = kk_cmd_allocate_buffer(cmd, enc->imm_writes.size, 8u);
@ -143,27 +142,23 @@ upload_queue_writes(struct kk_cmd_buffer *cmd)
if (!bo)
return;
memcpy(bo->cpu, enc->imm_writes.data, enc->imm_writes.size);
struct kk_imm_write_push push_data = {
.buffer_address = bo->gpu,
.count = count,
};
kk_cmd_dispatch_pipeline(cmd, compute,
kk_device_lib_pipeline(dev, KK_LIB_IMM_WRITE),
&push_data, sizeof(push_data), count, 1, 1);
struct mtl_size grid = {count, 1, 1};
libkk_write_u64(cmd, grid, false, bo->gpu);
enc->imm_writes.size = 0u;
}
count = util_dynarray_num_elements(&enc->copy_query_pool_result_infos,
struct kk_copy_query_pool_results_info);
struct libkk_copy_queries_args);
if (count != 0u) {
for (uint32_t i = 0u; i < count; ++i) {
struct kk_copy_query_pool_results_info *push_data =
util_dynarray_element(&enc->copy_query_pool_result_infos,
struct kk_copy_query_pool_results_info, i);
kk_cmd_dispatch_pipeline(
cmd, compute, kk_device_lib_pipeline(dev, KK_LIB_COPY_QUERY),
push_data, sizeof(*push_data), push_data->query_count, 1, 1);
struct mtl_size grid = {push_data->query_count, 1, 1};
const struct libkk_copy_queries_args *data =
(const struct libkk_copy_queries_args *)push_data;
libkk_copy_queries_struct(cmd, grid, false, *data);
}
enc->copy_query_pool_result_infos.size = 0u;
}
@ -330,7 +325,7 @@ kk_blit_encoder(struct kk_cmd_buffer *cmd)
return (mtl_blit_encoder *)encoder->encoder;
}
static mtl_compute_encoder *
mtl_compute_encoder *
kk_encoder_pre_gfx_encoder(struct kk_cmd_buffer *cmd)
{
struct kk_encoder *encoder = cmd->encoder;
@ -352,26 +347,12 @@ kk_encoder_pre_gfx_encoder(struct kk_cmd_buffer *cmd)
return encoder->pre_gfx.encoder;
}
struct kk_triangle_fan_info {
uint64_t index_buffer;
uint64_t out_ptr;
uint64_t in_draw;
uint64_t out_draw;
uint32_t restart_index;
uint32_t index_buffer_size_el;
uint32_t in_el_size_B;
uint32_t out_el_size_B;
uint32_t flatshade_first;
uint32_t mode;
};
static void
kk_encoder_render_triangle_fan_common(struct kk_cmd_buffer *cmd,
struct kk_triangle_fan_info *info,
mtl_buffer *indirect, mtl_buffer *index,
uint32_t index_count,
uint32_t in_el_size_B,
uint32_t out_el_size_B)
kk_encoder_render_triangle_fan_common(
struct kk_cmd_buffer *cmd,
struct libkk_unroll_geometry_and_restart_args *info, mtl_buffer *indirect,
mtl_buffer *index, uint32_t index_count, uint32_t in_el_size_B,
uint32_t out_el_size_B)
{
uint32_t index_buffer_size_B = index_count * out_el_size_B;
uint32_t buffer_size_B =
@ -387,12 +368,9 @@ kk_encoder_render_triangle_fan_common(struct kk_cmd_buffer *cmd,
info->in_el_size_B = in_el_size_B;
info->out_el_size_B = out_el_size_B;
info->flatshade_first = true;
mtl_compute_encoder *encoder = kk_encoder_pre_gfx_encoder(cmd);
struct kk_device *dev = kk_cmd_buffer_device(cmd);
kk_cmd_dispatch_pipeline(cmd, encoder,
kk_device_lib_pipeline(dev, KK_LIB_TRIANGLE_FAN),
info, sizeof(*info), 1u, 1u, 1u);
struct mtl_size grid = {1, 1, 1};
libkk_unroll_geometry_and_restart_struct(cmd, grid, true, *info);
enum mtl_index_type index_type =
index_size_in_bytes_to_mtl_index_type(out_el_size_B);
@ -411,7 +389,7 @@ kk_encoder_render_triangle_fan_indirect(struct kk_cmd_buffer *cmd,
u_decomposed_prims_for_vertices(mode, cmd->state.gfx.vb.max_vertices) *
mesa_vertices_per_prim(mode);
uint32_t el_size_B = decomposed_index_count < UINT16_MAX ? 2u : 4u;
struct kk_triangle_fan_info info = {
struct libkk_unroll_geometry_and_restart_args info = {
.in_draw = mtl_buffer_get_gpu_address(indirect) + offset,
.restart_index = UINT32_MAX, /* No restart */
.mode = mode,
@ -437,7 +415,7 @@ kk_encoder_render_triangle_fan_indexed_indirect(struct kk_cmd_buffer *cmd,
u_decomposed_prims_for_vertices(mode, max_index_count) *
mesa_vertices_per_prim(mode);
struct kk_triangle_fan_info info = {
struct libkk_unroll_geometry_and_restart_args info = {
.index_buffer = mtl_buffer_get_gpu_address(cmd->state.gfx.index.handle) +
cmd->state.gfx.index.offset,
.in_draw = mtl_buffer_get_gpu_address(indirect) + offset,

View file

@ -101,6 +101,8 @@ mtl_compute_encoder *kk_compute_encoder(struct kk_cmd_buffer *cmd);
mtl_blit_encoder *kk_blit_encoder(struct kk_cmd_buffer *cmd);
mtl_compute_encoder *kk_encoder_pre_gfx_encoder(struct kk_cmd_buffer *cmd);
void upload_queue_writes(struct kk_cmd_buffer *cmd);
void kk_encoder_render_triangle_fan_indirect(struct kk_cmd_buffer *cmd,

View file

@ -17,7 +17,6 @@
#include "kk_entrypoints.h"
#include "kk_physical_device.h"
#include "kk_query_table.h"
#include "kkcl.h"
struct kk_query_report {
uint64_t value;

View file

@ -33,41 +33,7 @@ static const nir_shader_compiler_options *
kk_get_nir_options(struct vk_physical_device *vk_pdev, mesa_shader_stage stage,
UNUSED const struct vk_pipeline_robustness_state *rs)
{
static nir_shader_compiler_options options = {
.lower_fdph = true,
.has_fsub = true,
.has_isub = true,
.lower_extract_word = true,
.lower_extract_byte = true,
.lower_insert_word = true,
.lower_insert_byte = true,
.lower_fmod = true,
.discard_is_demote = true,
.instance_id_includes_base_index = true,
.lower_device_index_to_zero = true,
.lower_pack_64_2x32_split = true,
.lower_unpack_64_2x32_split = true,
.lower_pack_64_2x32 = true,
.lower_pack_half_2x16 = true,
.lower_pack_split = true,
.lower_unpack_half_2x16 = true,
.has_cs_global_id = true,
.lower_fquantize2f16 = true,
.lower_scmp = true,
.lower_ifind_msb = true,
.lower_ufind_msb = true,
.lower_find_lsb = true,
.has_uclz = true,
.lower_mul_2x32_64 = true,
.lower_uadd_carry = true,
.lower_usub_borrow = true,
/* Metal does not support double. */
.lower_doubles_options = (nir_lower_doubles_options)(~0),
.lower_int64_options =
nir_lower_ufind_msb64 | nir_lower_subgroup_shuffle64,
.io_options = nir_io_mediump_is_32bit,
};
return &options;
return &kk_nir_options;
}
/* TODO_KOSMICKRISP Once we support robustness2, update these values. */

View file

@ -18,6 +18,7 @@ kk_device_dispatch_table_dependencies_list = [
idep_vulkan_util,
idep_vulkan_wsi,
idep_vulkan_wsi_headers,
idep_libkk_shaders_h,
]
libkk_device_dispatch_table = static_library(
@ -100,11 +101,6 @@ kk_files = files(
'kk_wsi.c',
)
kkcl_files = files(
'cl/kk_query.cl',
'cl/kk_triangle_fan.cl',
)
kk_entrypoints = custom_target(
'kk_entrypoints',
input : [vk_entrypoints_gen, vk_api_xml],
@ -119,25 +115,6 @@ kk_entrypoints = custom_target(
relative_dir = fs.relative_to(meson.global_source_root(), meson.global_build_root())
kkcl_spv = custom_target(
input : kkcl_files,
output : 'kkcl.spv',
command : [
prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@', kkcl_files, '--',
'-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
'-I' + join_paths(meson.current_source_dir(), '.'),
'-I' + join_paths(meson.project_source_root(), 'src'),
cl_args,
],
depfile : 'libkk_shaders.h.d',
)
kkcl = custom_target(
input : kkcl_spv,
output : ['kkcl.cpp', 'kkcl.h'],
command : [prog_vtn_bindgen2, '@INPUT@', '@OUTPUT@'],
)
kk_deps = [
idep_nir,
idep_mesautil,
@ -148,7 +125,9 @@ kk_deps = [
kk_device_dispatch_table_dependencies,
idep_kk_device_dispatch_table,
idep_msl_to_nir,
idep_mtl_bridge
idep_mtl_bridge,
idep_libkk,
idep_libkk_shaders_h,
]
libkk = static_library(
@ -156,7 +135,6 @@ libkk = static_library(
[
kk_files,
kk_entrypoints,
kkcl,
sha1_h,
],
include_directories : [