mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-25 02:10:11 +01:00
nir: add helpers for precompiled shaders
v2: generalize function signatures. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com> [v1] Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com> [v1] Acked-by: Mary Guillemard <mary.guillemard@collabora.com> [v2] Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32339>
This commit is contained in:
parent
4c3809e7fc
commit
e3001352ad
2 changed files with 675 additions and 0 deletions
|
|
@ -166,6 +166,9 @@ ForEachMacros:
|
|||
- nir_foreach_block_reverse_safe
|
||||
- nir_foreach_block_safe
|
||||
|
||||
- nir_precomp_foreach_arg
|
||||
- nir_precomp_foreach_variant_param
|
||||
|
||||
# egl
|
||||
- egl_relax
|
||||
|
||||
|
|
|
|||
672
src/compiler/nir/nir_precompiled.h
Normal file
672
src/compiler/nir/nir_precompiled.h
Normal file
|
|
@ -0,0 +1,672 @@
|
|||
/*
|
||||
* Copyright 2024 Valve Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ctype.h>
|
||||
#include "nir.h"
|
||||
#include "nir_builder.h"
|
||||
#include "nir_serialize.h"
|
||||
|
||||
/*
|
||||
* This file contains helpers for precompiling OpenCL kernels with a Mesa driver
|
||||
* and dispatching them from within the driver. It is a grab bag of utility
|
||||
* functions, rather than an all-in-one solution, to give drivers flexibility to
|
||||
* customize the compile pipeline. See asahi_clc for how the pieces fit
|
||||
* together, and see libagx for real world examples of this infrastructure.
|
||||
*
|
||||
* Why OpenCL C?
|
||||
*
|
||||
* 1. Mesa drivers are generally written in C. OpenCL C is close enough to C11
|
||||
* that we can share driver code between host and device. This is the "killer
|
||||
* feature" and enables implementing device-generated commands in a sane way.
|
||||
* Both generated (e.g. GenXML) headers and entire complex driver logic may
|
||||
* be shared for a major maintenance win.
|
||||
*
|
||||
* 2. OpenCL C has significant better ergonomics than GLSL, particularly around
|
||||
* raw pointers. Plainly, GLSL was never designed as a systems language. What
|
||||
* we need for implementing driver features on-device is a systems language,
|
||||
* not a shading language.
|
||||
*
|
||||
* 3. OpenCL is the compute standard, and it is supported in Mesa via rusticl.
|
||||
* Using OpenCL in our drivers is a way of "eating our own dog food". If Mesa
|
||||
* based OpenCL isn't good enough for us, it's not good enough for our users
|
||||
* either.
|
||||
*
|
||||
* 4. OpenCL C has enough affordances for GPUs that it is suitable for GPU use,
|
||||
* unlike pure C11.
|
||||
*
|
||||
* Why precompile?
|
||||
*
|
||||
* 1. Precompiling lets us do build-time reflection on internal shaders to
|
||||
* generate data layouts and dispatch macros automatically. The precompile
|
||||
* pipeline implemented in this file offers significantly better ergonomics
|
||||
* than handrolling kernels at runtime.
|
||||
*
|
||||
* 2. Compiling internal shaders at draw-time can introduce jank. Compiling
|
||||
* internal shaders with application shaders slows down application shader
|
||||
* compile time (and might still introduce jank in a hash-and-cache scheme).
|
||||
* Compiling shaders at device creation time slows down initialization. The
|
||||
* only time we can compile with no performance impact is when building the
|
||||
* driver ahead-of-time.
|
||||
*
|
||||
* 3. Mesa is built (on developer and packager machines) far less often than it
|
||||
* is run (on user machines). Compiling at build-time is simply more
|
||||
* efficient in a global sense.
|
||||
*
|
||||
* 4. Compiling /all/ internal shaders with the Mesa build can turn runtime
|
||||
* assertion fails into build failures, allowing for backend compilers to be
|
||||
* smoke-tested without hardware testing and hence allowing regressions to be
|
||||
* caught sooner.
|
||||
*
|
||||
* At a high level, a library of kernels is compiled to SPIR-V. That SPIR-V is
|
||||
* then translated to NIR and optimized, leaving many entrypoints. Each NIR
|
||||
* entrypoint represents one `kernel` to be precompiled.
|
||||
*
|
||||
* Kernels generally have arguments. Arguments may be either scalars or
|
||||
* pointers. It is not necessary to explicitly define a data layout for the
|
||||
* arguments. You simply declare arguments to the OpenCL side kernel:
|
||||
*
|
||||
* KERNEL(1) void foo(int x, int y) { .. }
|
||||
*
|
||||
* The data layout is automatically derived from the function signature
|
||||
* (nir_precomp_derive_layout). The data layout is exposed to the CPU as
|
||||
* structures (nir_precomp_print_layout_struct).
|
||||
*
|
||||
* struct foo_args {
|
||||
* uint32_t x;
|
||||
* uint32_t y;
|
||||
* } PACKED;
|
||||
*
|
||||
* The data is expected to be mapped to something like Vulkan push constants in
|
||||
* the hardware. The driver defines a callback to load an argument given a byte
|
||||
* offset (e.g. via load_push_constant intrinsics). When building a variant,
|
||||
* nir_precomp_build_variant will load the arguments according to the chosen
|
||||
* layout:
|
||||
*
|
||||
* %0 = load_push_constant 0
|
||||
* %1 = load_push_constant 4
|
||||
* ...
|
||||
*
|
||||
* This ensures that data layouts match between CPU and GPU, without any
|
||||
* boilerplate, while giving drivers control over exactly how arguments are
|
||||
* passed. (This can save an indirection compared to stuffing in a UBO.)
|
||||
*
|
||||
* To dispatch kernels from the driver, the kernel is "called" like a function:
|
||||
*
|
||||
* foo(cmdbuf, grid(4, 4, 1), x, y);
|
||||
*
|
||||
* This resolves to generated dispatch macros
|
||||
* (nir_precomp_print_dispatch_macros), which lay out their arguments according
|
||||
* to the derived layout and then call the driver-specific dispatch. To
|
||||
* implement that mechanism, a driver must implement the following function
|
||||
* signature:
|
||||
*
|
||||
* MESA_DISPATCH_PRECOMP(context, grid, kernel index, argument pointer,
|
||||
* size of arguments)
|
||||
*
|
||||
* The exact types used are determined by the driver. context is something like
|
||||
* a Vulkan command buffer. grid represents the 3D dispatch size. kernel index
|
||||
* is the index of the precompiled kernel (nir_precomp_index). argument pointer
|
||||
* is a host pointer to the sized argument structure, which the driver must
|
||||
* upload and bind (e.g. as push constants).
|
||||
*
|
||||
* Because the types are ambiguous here, the same mechanism works for both
|
||||
* Gallium and Vulkan drivers.
|
||||
*
|
||||
* Although the generated header could be consumed by OpenCL code,
|
||||
* MESA_DISPATCH_PRECOMP is not intended to be implemented on the device side.
|
||||
* Instead, an analogous mechanism can be implemented for device-side enqueue
|
||||
* with automatic data layout handling. Device-side enqueue of precompiled
|
||||
* kernels has various applications, most obviously for implementing
|
||||
* device-generated commands.
|
||||
*
|
||||
* All precompiled kernels for a given target are zero-indexed and referenced in
|
||||
* an array of binaries. These indices are enum values, generated by
|
||||
* nir_precomp_print_program_enum. The array of kernels is generated by
|
||||
* nir_precomp_print_binary_map. There is generally an array for each hardware
|
||||
* target supported by a driver. On device creation, the driver would select the
|
||||
* array of binaries for the probed hardware.
|
||||
*
|
||||
* Sometimes a single binary can be used for multiple targets. In this case, the
|
||||
* driver should compile it only once and remap the binary arrays with the
|
||||
* callback passed to nir_precomp_print_binary_map.
|
||||
*
|
||||
* A single entrypoint may have multiple variants, as a small shader key. To
|
||||
* support this, kernel parameters suffixed with __n will automatically vary
|
||||
* from 0 to n - 1. This mechanism is controlled by
|
||||
* nir_precomp_parse_variant_param. For example:
|
||||
*
|
||||
* KERNEL(1) void bar(uchar *x, int variant__4) {
|
||||
* for (uint i = 0; i <= variant__4; ++i)
|
||||
* x[i]++;
|
||||
* }
|
||||
*
|
||||
* will generate 4 binaries with 1, 2, 3, and 4 additions respectively. This
|
||||
* mechanism (sigil suffixing) is kinda ugly, but I can't figure out a nicer way
|
||||
* to attach metadata to the argument in standard OpenCL.
|
||||
*
|
||||
* Internally, all variants of a given kernel have a flat index. The bijection
|
||||
* between n variant parameters and 1 flat index is given in the
|
||||
* nir_precomp_decode_variant_index comment.
|
||||
*
|
||||
* Kernels must declare their workgroup size with
|
||||
* __attribute__((reqd_work_group_size(...))) for two reasons. First, variable
|
||||
* workgroup sizes have tricky register allocation problems in several backends,
|
||||
* avoided here. Second, it makes more sense to attach the workgroup size to the
|
||||
* kernel than to the caller so this improves ergonomics of the dispatch macros.
|
||||
*/
|
||||
|
||||
#define NIR_PRECOMP_MAX_ARGS (64)
|
||||
|
||||
struct nir_precomp_layout {
|
||||
unsigned size_B;
|
||||
unsigned offset_B[NIR_PRECOMP_MAX_ARGS];
|
||||
bool prepadded[NIR_PRECOMP_MAX_ARGS];
|
||||
};
|
||||
|
||||
static inline unsigned
|
||||
nir_precomp_parse_variant_param(const nir_function *f, unsigned p)
|
||||
{
|
||||
assert(p < f->num_params);
|
||||
|
||||
const char *token = "__";
|
||||
const char *q = strstr(f->params[p].name, token);
|
||||
if (q == NULL)
|
||||
return 0;
|
||||
|
||||
int n = atoi(q + strlen(token));
|
||||
|
||||
/* Ensure the number is something reasonable */
|
||||
assert(n > 1 && n < 32 && "sanity check");
|
||||
return n;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
nir_precomp_is_variant_param(const nir_function *f, unsigned p)
|
||||
{
|
||||
return nir_precomp_parse_variant_param(f, p) != 0;
|
||||
}
|
||||
|
||||
#define nir_precomp_foreach_arg(f, p) \
|
||||
for (unsigned p = 0; p < f->num_params; ++p) \
|
||||
if (!nir_precomp_is_variant_param(f, p))
|
||||
|
||||
#define nir_precomp_foreach_variant_param(f, p) \
|
||||
for (unsigned p = 0; p < f->num_params; ++p) \
|
||||
if (nir_precomp_is_variant_param(f, p))
|
||||
|
||||
static inline unsigned
|
||||
nir_precomp_nr_variants(const nir_function *f)
|
||||
{
|
||||
unsigned nr = 1;
|
||||
|
||||
nir_precomp_foreach_variant_param(f, p) {
|
||||
nr *= nir_precomp_parse_variant_param(f, p);
|
||||
}
|
||||
|
||||
return nr;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
nir_precomp_has_variants(const nir_function *f)
|
||||
{
|
||||
return nir_precomp_nr_variants(f) > 1;
|
||||
}
|
||||
|
||||
static inline struct nir_precomp_layout
|
||||
nir_precomp_derive_layout(const nir_function *f)
|
||||
{
|
||||
struct nir_precomp_layout l = { 0 };
|
||||
|
||||
nir_precomp_foreach_arg(f, a) {
|
||||
nir_parameter param = f->params[a];
|
||||
assert(a < ARRAY_SIZE(l.offset_B));
|
||||
|
||||
/* Align members naturally */
|
||||
l.offset_B[a] = ALIGN_POT(l.size_B, param.bit_size / 8);
|
||||
l.prepadded[a] = (l.offset_B[a] != l.size_B);
|
||||
l.size_B = l.offset_B[a] + (param.num_components * param.bit_size) / 8;
|
||||
}
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
nir_precomp_index(const nir_shader *lib, const nir_function *func)
|
||||
{
|
||||
unsigned i = 0;
|
||||
|
||||
nir_foreach_entrypoint(candidate, lib) {
|
||||
if (candidate == func)
|
||||
return i;
|
||||
|
||||
i += nir_precomp_nr_variants(candidate);
|
||||
}
|
||||
|
||||
unreachable("function must be in library");
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_print_uppercase(FILE *fp, const char *str)
|
||||
{
|
||||
for (unsigned i = 0; i < strlen(str); ++i) {
|
||||
fputc(toupper(str[i]), fp);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_enum_value(FILE *fp, const nir_function *func)
|
||||
{
|
||||
nir_print_uppercase(fp, func->name);
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_enum_variant_value(FILE *fp, const nir_function *func, unsigned v)
|
||||
{
|
||||
nir_precomp_print_enum_value(fp, func);
|
||||
|
||||
if (nir_precomp_has_variants(func)) {
|
||||
fprintf(fp, "_%u", v);
|
||||
} else {
|
||||
assert(v == 0);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_variant_params(FILE *fp, nir_function *func, bool with_types)
|
||||
{
|
||||
if (nir_precomp_has_variants(func)) {
|
||||
fprintf(fp, "(");
|
||||
|
||||
bool first = true;
|
||||
nir_precomp_foreach_variant_param(func, p) {
|
||||
fprintf(fp, "%s%s%s", first ? "" : ", ", with_types ? "unsigned " : "",
|
||||
func->params[p].name);
|
||||
first = false;
|
||||
}
|
||||
|
||||
fprintf(fp, ")");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a flattened 1D index, extract the i'th coordinate of the original N-D
|
||||
* vector. The forward map is:
|
||||
*
|
||||
* I = sum(t=1...n) [x_t product(j=1...(t-1)) [k_j]]
|
||||
*
|
||||
* It can be shown that
|
||||
*
|
||||
* I < product_(j=1...n)[k_j]
|
||||
*
|
||||
* x_i = floor(I / product(j=1...(i-1)) [k_j]) mod k_i
|
||||
*
|
||||
* The inequality is by induction on n. The equivalence follows from the
|
||||
* inequality by splitting the sum of I at t=i, showing the smaller terms get
|
||||
* killed by the floor and the higher terms get killed by the modulus leaving
|
||||
* just x_i.
|
||||
*
|
||||
* The forward map is emitted in nir_precomp_print_program_enum. The inverse is
|
||||
* calculated here.
|
||||
*/
|
||||
static inline unsigned
|
||||
nir_precomp_decode_variant_index(const nir_function *func, unsigned I,
|
||||
unsigned i)
|
||||
{
|
||||
unsigned product = 1;
|
||||
|
||||
nir_precomp_foreach_variant_param(func, j) {
|
||||
if (j >= i)
|
||||
break;
|
||||
|
||||
unsigned k_j = nir_precomp_parse_variant_param(func, j);
|
||||
product *= k_j;
|
||||
}
|
||||
|
||||
unsigned k_i = nir_precomp_parse_variant_param(func, i);
|
||||
return (I / product) % k_i;
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_program_enum(FILE *fp, const nir_shader *lib, const char *prefix)
|
||||
{
|
||||
/* Generate an enum indexing all binaries */
|
||||
fprintf(fp, "enum %s_program {\n", prefix);
|
||||
nir_foreach_entrypoint(func, lib) {
|
||||
unsigned index = nir_precomp_index(lib, func);
|
||||
|
||||
for (unsigned v = 0; v < nir_precomp_nr_variants(func); ++v) {
|
||||
fprintf(fp, " ");
|
||||
nir_precomp_print_enum_variant_value(fp, func, v);
|
||||
fprintf(fp, " = %u,\n", index + v);
|
||||
}
|
||||
}
|
||||
fprintf(fp, " ");
|
||||
nir_print_uppercase(fp, prefix);
|
||||
fprintf(fp, "_NUM_PROGRAMS,\n");
|
||||
fprintf(fp, "};\n\n");
|
||||
|
||||
/* Generate indexing variants */
|
||||
nir_foreach_entrypoint(func, lib) {
|
||||
if (nir_precomp_has_variants(func)) {
|
||||
fprintf(fp, "static inline unsigned\n");
|
||||
nir_precomp_print_enum_value(fp, func);
|
||||
nir_precomp_print_variant_params(fp, func, true);
|
||||
fprintf(fp, "\n");
|
||||
fprintf(fp, "{\n");
|
||||
|
||||
nir_precomp_foreach_variant_param(func, p) {
|
||||
/* Assert indices are in bounds. These provides some safety. */
|
||||
fprintf(fp, " assert(%s < %u);\n", func->params[p].name,
|
||||
nir_precomp_parse_variant_param(func, p));
|
||||
}
|
||||
|
||||
/* Flatten an N-D index into a 1D index using the standard mapping.
|
||||
*
|
||||
* We iterate parameters backwards so we can do a single multiply-add
|
||||
* each step for simplicity (similar to Horner's method).
|
||||
*/
|
||||
fprintf(fp, "\n");
|
||||
bool first = true;
|
||||
for (signed p = func->num_params - 1; p >= 0; --p) {
|
||||
if (!nir_precomp_is_variant_param(func, p))
|
||||
continue;
|
||||
|
||||
if (first) {
|
||||
fprintf(fp, " unsigned idx = %s;\n", func->params[p].name);
|
||||
} else {
|
||||
fprintf(fp, " idx = (idx * %u) + %s;\n",
|
||||
nir_precomp_parse_variant_param(func, p),
|
||||
func->params[p].name);
|
||||
}
|
||||
|
||||
first = false;
|
||||
}
|
||||
|
||||
/* Post-condition: flattened index is in bounds. */
|
||||
fprintf(fp, "\n");
|
||||
fprintf(fp, " assert(idx < %u);\n", nir_precomp_nr_variants(func));
|
||||
|
||||
fprintf(fp, " return ");
|
||||
nir_precomp_print_enum_variant_value(fp, func, 0);
|
||||
fprintf(fp, " + idx;\n");
|
||||
fprintf(fp, "}\n\n");
|
||||
}
|
||||
}
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_layout_struct(FILE *fp, const nir_function *func)
|
||||
{
|
||||
struct nir_precomp_layout layout = nir_precomp_derive_layout(func);
|
||||
|
||||
/* Generate a C struct matching the data layout we chose. This is how
|
||||
* the CPU will pack arguments.
|
||||
*/
|
||||
unsigned offset_B = 0;
|
||||
|
||||
fprintf(fp, "struct %s_args {\n", func->name);
|
||||
nir_precomp_foreach_arg(func, a) {
|
||||
nir_parameter param = func->params[a];
|
||||
assert(param.name != NULL && "kernel args must be named");
|
||||
|
||||
assert(layout.offset_B[a] >= offset_B);
|
||||
unsigned pad = layout.offset_B[a] - offset_B;
|
||||
assert((pad > 0) == layout.prepadded[a]);
|
||||
|
||||
if (pad > 0) {
|
||||
fprintf(fp, " uint8_t _pad%u[%u];\n", a, pad);
|
||||
offset_B += pad;
|
||||
}
|
||||
|
||||
/* After padding, the layout will match. */
|
||||
assert(layout.offset_B[a] == offset_B);
|
||||
|
||||
fprintf(fp, " uint%u_t %s", param.bit_size, param.name);
|
||||
if (param.num_components > 1) {
|
||||
fprintf(fp, "[%u]", param.num_components);
|
||||
}
|
||||
fprintf(fp, ";\n");
|
||||
|
||||
offset_B += param.num_components * (param.bit_size / 8);
|
||||
}
|
||||
fprintf(fp, "} PACKED;\n\n");
|
||||
|
||||
/* Assert that the layout on the CPU matches the layout on the GPU. Because
|
||||
* of the asserts above, these are mostly just sanity checking the compiler.
|
||||
* But better err on the side of defensive because alignment bugs are REALLY
|
||||
* painful to track down and we don't pay by the static assert.
|
||||
*/
|
||||
nir_precomp_foreach_arg(func, a) {
|
||||
nir_parameter param = func->params[a];
|
||||
|
||||
fprintf(fp, "static_assert(offsetof(struct %s_args, %s) == %u, \"\");\n",
|
||||
func->name, param.name, layout.offset_B[a]);
|
||||
}
|
||||
fprintf(fp, "static_assert(sizeof(struct %s_args) == %u, \"\");\n",
|
||||
func->name, layout.size_B);
|
||||
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_dispatch_macros(FILE *fp, const nir_shader *nir)
|
||||
{
|
||||
nir_foreach_entrypoint(func, nir) {
|
||||
struct nir_precomp_layout layout = nir_precomp_derive_layout(func);
|
||||
|
||||
for (unsigned i = 0; i < 2; ++i) {
|
||||
bool is_struct = i == 0;
|
||||
|
||||
fprintf(fp, "#define %s%s(_context, _grid%s", func->name,
|
||||
is_struct ? "_struct" : "", is_struct ? ", _data" : "");
|
||||
|
||||
/* Add the arguments, including variant parameters. For struct macros,
|
||||
* we include only the variant parameters; the kernel arguments are
|
||||
* taken from the struct.
|
||||
*/
|
||||
for (unsigned p = 0; p < func->num_params; ++p) {
|
||||
if (!is_struct || nir_precomp_is_variant_param(func, p))
|
||||
fprintf(fp, ", %s", func->params[p].name);
|
||||
}
|
||||
|
||||
fprintf(fp, ") do { \\\n");
|
||||
|
||||
fprintf(fp, " struct %s_args _args = ", func->name);
|
||||
|
||||
if (is_struct) {
|
||||
fprintf(fp, "_data");
|
||||
} else {
|
||||
fprintf(fp, "{");
|
||||
|
||||
nir_precomp_foreach_arg(func, a) {
|
||||
/* We need to zero out the padding between members. We cannot use
|
||||
* a designated initializer without prefixing the macro
|
||||
* arguments, which would add noise to the macro signature
|
||||
* reported in IDEs (which should ideally match the actual
|
||||
* signature as close as possible).
|
||||
*/
|
||||
if (layout.prepadded[a]) {
|
||||
assert(a > 0 && "first argument is never prepadded");
|
||||
fprintf(fp, ", {0}");
|
||||
}
|
||||
|
||||
fprintf(fp, "%s%s", a == 0 ? "" : ", ", func->params[a].name);
|
||||
}
|
||||
|
||||
fprintf(fp, "}");
|
||||
}
|
||||
|
||||
fprintf(fp, ";\\\n");
|
||||
|
||||
/* Dispatch via MESA_DISPATCH_PRECOMP, which the driver must #define
|
||||
* suitably before #include-ing this file.
|
||||
*/
|
||||
fprintf(fp, " MESA_DISPATCH_PRECOMP(_context, _grid, ");
|
||||
nir_precomp_print_enum_value(fp, func);
|
||||
nir_precomp_print_variant_params(fp, func, false);
|
||||
fprintf(fp, ", &_args, sizeof(_args)); \\\n");
|
||||
fprintf(fp, "} while(0);\n\n");
|
||||
}
|
||||
}
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_extern_binary_map(FILE *fp,
|
||||
const char *prefix, const char *target)
|
||||
{
|
||||
fprintf(fp, "extern const uint32_t *%s_%s[", prefix, target);
|
||||
nir_print_uppercase(fp, prefix);
|
||||
fprintf(fp, "_NUM_PROGRAMS];\n");
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_binary_map(FILE *fp, const nir_shader *nir,
|
||||
const char *prefix, const char *target,
|
||||
const char *(*map)(nir_function *func,
|
||||
unsigned variant,
|
||||
const char *target))
|
||||
{
|
||||
fprintf(fp, "const uint32_t *%s_%s[", prefix, target);
|
||||
nir_print_uppercase(fp, prefix);
|
||||
fprintf(fp, "_NUM_PROGRAMS] = {\n");
|
||||
|
||||
nir_foreach_entrypoint(func, nir) {
|
||||
for (unsigned v = 0; v < nir_precomp_nr_variants(func); ++v) {
|
||||
fprintf(fp, " [");
|
||||
nir_precomp_print_enum_variant_value(fp, func, v);
|
||||
fprintf(fp, "] = %s_%u_%s,\n", func->name, v,
|
||||
map ? map(func, v, target) : target);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(fp, "};\n\n");
|
||||
}
|
||||
|
||||
static inline nir_shader *
|
||||
nir_precompiled_build_variant(const nir_function *libfunc, unsigned variant,
|
||||
const nir_shader_compiler_options *opts,
|
||||
nir_def *(*load_arg)(nir_builder *b,
|
||||
unsigned num_components,
|
||||
unsigned bit_size,
|
||||
unsigned offset_B))
|
||||
{
|
||||
struct nir_precomp_layout layout = nir_precomp_derive_layout(libfunc);
|
||||
bool has_variants = nir_precomp_has_variants(libfunc);
|
||||
|
||||
nir_builder b;
|
||||
if (has_variants) {
|
||||
b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, opts,
|
||||
"%s variant %u", libfunc->name,
|
||||
variant);
|
||||
} else {
|
||||
b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, opts, "%s",
|
||||
libfunc->name);
|
||||
}
|
||||
|
||||
assert(libfunc->workgroup_size[0] != 0 && "must set workgroup size");
|
||||
|
||||
b.shader->info.workgroup_size[0] = libfunc->workgroup_size[0];
|
||||
b.shader->info.workgroup_size[1] = libfunc->workgroup_size[1];
|
||||
b.shader->info.workgroup_size[2] = libfunc->workgroup_size[2];
|
||||
|
||||
nir_function *func = nir_function_clone(b.shader, libfunc);
|
||||
func->is_entrypoint = false;
|
||||
|
||||
nir_def *args[NIR_PRECOMP_MAX_ARGS] = { NULL };
|
||||
|
||||
/* Some parameters are variant indices and others are kernel arguments */
|
||||
for (unsigned a = 0; a < libfunc->num_params; ++a) {
|
||||
nir_parameter p = func->params[a];
|
||||
|
||||
if (nir_precomp_is_variant_param(libfunc, a)) {
|
||||
unsigned idx = nir_precomp_decode_variant_index(libfunc, variant, a);
|
||||
args[a] = nir_imm_intN_t(&b, idx, p.bit_size);
|
||||
} else {
|
||||
args[a] = load_arg(&b, p.num_components, p.bit_size, layout.offset_B[a]);
|
||||
}
|
||||
}
|
||||
|
||||
nir_build_call(&b, func, func->num_params, args);
|
||||
return b.shader;
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_blob(FILE *fp, const char *arr_name, const char *suffix,
|
||||
uint32_t variant, const uint32_t *data, size_t len)
|
||||
{
|
||||
fprintf(fp, "const uint32_t %s_%u_%s[%zu] = {", arr_name, variant, suffix,
|
||||
DIV_ROUND_UP(len, 4));
|
||||
for (unsigned i = 0; i < (len / 4); i++) {
|
||||
if (i % 4 == 0)
|
||||
fprintf(fp, "\n ");
|
||||
|
||||
fprintf(fp, " 0x%08" PRIx32 ",", data[i]);
|
||||
}
|
||||
|
||||
if (len % 4) {
|
||||
const uint8_t *data_u8 = (const uint8_t *)data;
|
||||
uint32_t last = 0;
|
||||
unsigned last_offs = ROUND_DOWN_TO(len, 4);
|
||||
for (unsigned i = 0; i < len % 4; ++i) {
|
||||
last |= (uint32_t)data_u8[last_offs + i] << (i * 8);
|
||||
}
|
||||
|
||||
fprintf(fp, " 0x%08" PRIx32 ",", last);
|
||||
}
|
||||
|
||||
fprintf(fp, "\n};\n");
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_nir(FILE *fp_c, FILE *fp_h, const nir_shader *nir,
|
||||
const char *name, const char *suffix)
|
||||
{
|
||||
struct blob blob;
|
||||
blob_init(&blob);
|
||||
nir_serialize(&blob, nir, true /* strip */);
|
||||
|
||||
nir_precomp_print_blob(fp_c, name, suffix, 0, (const uint32_t *)blob.data,
|
||||
blob.size);
|
||||
|
||||
fprintf(fp_h, "extern const uint32_t %s_0_%s[%zu];\n", name, suffix,
|
||||
DIV_ROUND_UP(blob.size, 4));
|
||||
|
||||
blob_finish(&blob);
|
||||
}
|
||||
|
||||
static inline void
|
||||
nir_precomp_print_header(FILE *fp_c, FILE *fp_h, const char *copyright,
|
||||
const char *h_name)
|
||||
{
|
||||
for (unsigned i = 0; i < 2; ++i) {
|
||||
FILE *fp = i ? fp_c : fp_h;
|
||||
fprintf(fp, "/*\n");
|
||||
fprintf(fp, " * Copyright %s\n", copyright);
|
||||
fprintf(fp, " * SPDX-License-Identifier: MIT\n");
|
||||
fprintf(fp, " *\n");
|
||||
fprintf(fp, " * Autogenerated file, do not edit\n");
|
||||
fprintf(fp, " */\n\n");
|
||||
|
||||
/* uint32_t types are used throughout */
|
||||
fprintf(fp, "#include <stdint.h>\n\n");
|
||||
}
|
||||
|
||||
/* The generated C code depends on the header we will generate */
|
||||
fprintf(fp_c, "#include \"%s\"\n", h_name);
|
||||
|
||||
/* Include guard the header. This relies on a grown up compiler. If you're
|
||||
* doing precompiled, you have one.
|
||||
*/
|
||||
fprintf(fp_h, "#pragma once\n");
|
||||
|
||||
/* The generated header uses unprefixed static_assert which needs an #include
|
||||
* seemingly.
|
||||
*/
|
||||
fprintf(fp_h, "#include \"util/macros.h\"\n\n");
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue