mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-21 06:48:09 +02:00
libclc seems to have piles of bugs where it relies on precise floating point behaviours to meet CL precision requirements but doesn't actually disable fast math in its own spir-v. I am tired of playing this whack-a-mole game. Let's just assume that the math in CLC is right and should not be optimized in unsafe ways, and force the exact bit across libclc. This works around a large class of libclc bugs that keep cropping up from innocuous NIR changes. This does not force the exact bit for application shaders using libclc, just for the calculations inside of libclc itself. This seems like the right tradeoff all considered, anything "fast" bypasses libclc anyway. Fixes generated_tests/cl/builtin/math/builtin-float-pow-1.0.generated.cl on drivers using nir_opt_reassociate, and probably other stuff. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Reviewed-by: Jesse Natalie <jenatali@microsoft.com> Reviewed-by: Karol Herbst <kherbst@redhat.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36527>
441 lines
13 KiB
C
441 lines
13 KiB
C
/*
|
|
* Copyright © 2020 Intel Corporation
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*/
|
|
|
|
#include "nir.h"
|
|
#include "nir_builder.h"
|
|
#include "nir_clc_helpers.h"
|
|
#include "nir_serialize.h"
|
|
#include "nir_spirv.h"
|
|
#include "util/mesa-sha1.h"
|
|
|
|
#ifdef DYNAMIC_LIBCLC_PATH
|
|
#include <fcntl.h>
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/mman.h>
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
#ifdef HAVE_STATIC_LIBCLC_ZSTD
|
|
#include <zstd.h>
|
|
#endif
|
|
|
|
#ifdef HAVE_STATIC_LIBCLC_SPIRV
|
|
#include "spirv-mesa3d-.spv.h"
|
|
#endif
|
|
|
|
#ifdef HAVE_STATIC_LIBCLC_SPIRV64
|
|
#include "spirv64-mesa3d-.spv.h"
|
|
#endif
|
|
|
|
/* Describes one libclc SPIR-V library variant and where it can be loaded
 * from.
 */
struct clc_file {
   /* Pointer width, in bits, that this variant is built for. */
   unsigned bit_size;

   /* Library image compiled into the binary and its size in bytes.  Left
    * NULL / 0 when no static copy is built in.
    */
   const char *static_data;
   size_t static_data_size;

   /* Path of the library on disk.  Left NULL when dynamic loading is not
    * configured.
    */
   const char *sys_path;
};
|
|
|
|
/* Table of the libclc variants known at build time.
 *
 * get_libclc_file() indexes this table directly from the pointer width, so
 * the 32-bit entry has to stay in slot 0 and the 64-bit entry in slot 1.
 */
static const struct clc_file libclc_files[] = {
   [0] = {
      .bit_size = 32,
#ifdef HAVE_STATIC_LIBCLC_SPIRV
      .static_data = libclc_spirv_mesa3d_spv,
      .static_data_size = sizeof(libclc_spirv_mesa3d_spv),
#endif
#ifdef DYNAMIC_LIBCLC_PATH
      .sys_path = DYNAMIC_LIBCLC_PATH "spirv-mesa3d-.spv",
#endif
   },
   [1] = {
      .bit_size = 64,
#ifdef HAVE_STATIC_LIBCLC_SPIRV64
      .static_data = libclc_spirv64_mesa3d_spv,
      .static_data_size = sizeof(libclc_spirv64_mesa3d_spv),
#endif
#ifdef DYNAMIC_LIBCLC_PATH
      .sys_path = DYNAMIC_LIBCLC_PATH "spirv64-mesa3d-.spv",
#endif
   },
};
|
|
|
|
static const struct clc_file *
|
|
get_libclc_file(unsigned ptr_bit_size)
|
|
{
|
|
assert(ptr_bit_size == 32 || ptr_bit_size == 64);
|
|
return &libclc_files[ptr_bit_size / 64];
|
|
}
|
|
|
|
/* State for one opened libclc library; set up by open_clc_data(), filled in
 * by map_clc_data() and torn down by close_clc_data().
 */
struct clc_data {
   /* Table entry this state was opened for. */
   const struct clc_file *file;

   /* Key identifying this library in the disk cache. */
   unsigned char cache_key[20];

   /* Descriptor of the on-disk library, or -1 when no file is open. */
   int fd;

   /* Library contents and their length in bytes once mapped. */
   const void *data;
   size_t size;
};
|
|
|
|
static bool
|
|
open_clc_data(struct clc_data *clc, unsigned ptr_bit_size)
|
|
{
|
|
memset(clc, 0, sizeof(*clc));
|
|
clc->file = get_libclc_file(ptr_bit_size);
|
|
clc->fd = -1;
|
|
|
|
if (clc->file->static_data) {
|
|
snprintf((char *)clc->cache_key, sizeof(clc->cache_key),
|
|
"libclc-spirv%d", ptr_bit_size);
|
|
return true;
|
|
}
|
|
|
|
#ifdef DYNAMIC_LIBCLC_PATH
|
|
if (clc->file->sys_path != NULL) {
|
|
int fd = open(clc->file->sys_path, O_RDONLY);
|
|
if (fd < 0)
|
|
return false;
|
|
|
|
struct stat stat;
|
|
int ret = fstat(fd, &stat);
|
|
if (ret < 0) {
|
|
fprintf(stderr, "fstat failed on %s: %m\n", clc->file->sys_path);
|
|
close(fd);
|
|
return false;
|
|
}
|
|
|
|
struct mesa_sha1 ctx;
|
|
_mesa_sha1_init(&ctx);
|
|
_mesa_sha1_update(&ctx, clc->file->sys_path, strlen(clc->file->sys_path));
|
|
#if defined(__APPLE__) || defined(__MACOSX)
|
|
_mesa_sha1_update(&ctx, &stat.st_mtime, sizeof(stat.st_mtime));
|
|
#else
|
|
_mesa_sha1_update(&ctx, &stat.st_mtim, sizeof(stat.st_mtim));
|
|
#endif
|
|
_mesa_sha1_final(&ctx, clc->cache_key);
|
|
|
|
clc->fd = fd;
|
|
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
return false;
|
|
}
|
|
|
|
#define SPIRV_WORD_SIZE 4
|
|
|
|
static bool
|
|
map_clc_data(struct clc_data *clc)
|
|
{
|
|
if (clc->file->static_data) {
|
|
#ifdef HAVE_STATIC_LIBCLC_ZSTD
|
|
unsigned long long cmp_size =
|
|
ZSTD_getFrameContentSize(clc->file->static_data,
|
|
clc->file->static_data_size);
|
|
if (cmp_size == ZSTD_CONTENTSIZE_UNKNOWN ||
|
|
cmp_size == ZSTD_CONTENTSIZE_ERROR) {
|
|
fprintf(stderr, "Could not determine the decompressed size of the "
|
|
"libclc SPIR-V\n");
|
|
return false;
|
|
}
|
|
|
|
size_t frame_size =
|
|
ZSTD_findFrameCompressedSize(clc->file->static_data,
|
|
clc->file->static_data_size);
|
|
if (ZSTD_isError(frame_size)) {
|
|
fprintf(stderr, "Could not determine the size of the first ZSTD frame "
|
|
"when decompressing libclc SPIR-V: %s\n",
|
|
ZSTD_getErrorName(frame_size));
|
|
return false;
|
|
}
|
|
|
|
void *dest = malloc(cmp_size + 1);
|
|
size_t size = ZSTD_decompress(dest, cmp_size, clc->file->static_data,
|
|
frame_size);
|
|
if (ZSTD_isError(size)) {
|
|
free(dest);
|
|
fprintf(stderr, "Error decompressing libclc SPIR-V: %s\n",
|
|
ZSTD_getErrorName(size));
|
|
return false;
|
|
}
|
|
|
|
clc->data = dest;
|
|
clc->size = size;
|
|
#else
|
|
clc->data = clc->file->static_data;
|
|
clc->size = clc->file->static_data_size;
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
#ifdef DYNAMIC_LIBCLC_PATH
|
|
if (clc->file->sys_path != NULL) {
|
|
off_t len = lseek(clc->fd, 0, SEEK_END);
|
|
if (len % SPIRV_WORD_SIZE != 0) {
|
|
fprintf(stderr, "File length isn't a multiple of the word size\n");
|
|
return false;
|
|
}
|
|
clc->size = len;
|
|
|
|
clc->data = mmap(NULL, len, PROT_READ, MAP_PRIVATE, clc->fd, 0);
|
|
if (clc->data == MAP_FAILED) {
|
|
fprintf(stderr, "Failed to mmap libclc SPIR-V: %m\n");
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
static void
|
|
close_clc_data(struct clc_data *clc)
|
|
{
|
|
if (clc->file->static_data) {
|
|
#ifdef HAVE_STATIC_LIBCLC_ZSTD
|
|
free((void *)clc->data);
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
#ifdef DYNAMIC_LIBCLC_PATH
|
|
if (clc->file->sys_path != NULL) {
|
|
if (clc->data)
|
|
munmap((void *)clc->data, clc->size);
|
|
close(clc->fd);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/** Returns true if libclc is found
|
|
*
|
|
* If libclc is compiled in statically, this always returns true. If we
|
|
* depend on a dynamic libclc, this opens and tries to stat the file.
|
|
*/
|
|
bool
|
|
nir_can_find_libclc(unsigned ptr_bit_size)
|
|
{
|
|
struct clc_data clc;
|
|
if (open_clc_data(&clc, ptr_bit_size)) {
|
|
close_clc_data(&clc);
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/** Adds generic pointer variants of libclc functions
|
|
*
|
|
* Libclc currently doesn't contain generic variants for a bunch of functions
|
|
* like `frexp` but the OpenCL spec with generic pointers requires them. We
|
|
* really should fix libclc but, in the mean time, we can easily duplicate
|
|
* every function that works on global memory and make it also work on generic
|
|
* memory.
|
|
*/
|
|
static bool
|
|
libclc_add_generic_variants(nir_shader *shader)
|
|
{
|
|
bool progress = false;
|
|
|
|
nir_foreach_function(func, shader) {
|
|
/* These don't need generic variants */
|
|
if (strstr(func->name, "async_work_group_strided_copy"))
|
|
continue;
|
|
|
|
char *U3AS1 = strstr(func->name, "U3AS1");
|
|
if (U3AS1 == NULL)
|
|
continue;
|
|
|
|
ptrdiff_t offset_1 = U3AS1 - func->name + 4;
|
|
assert(offset_1 < strlen(func->name) && func->name[offset_1] == '1');
|
|
|
|
char *generic_name = ralloc_strdup(shader, func->name);
|
|
assert(generic_name[offset_1] == '1');
|
|
generic_name[offset_1] = '4';
|
|
|
|
if (nir_shader_get_function_for_name(shader, generic_name))
|
|
continue;
|
|
|
|
nir_function *gfunc = nir_function_create(shader, generic_name);
|
|
gfunc->num_params = func->num_params;
|
|
gfunc->params = ralloc_array(shader, nir_parameter, gfunc->num_params);
|
|
for (unsigned i = 0; i < gfunc->num_params; i++)
|
|
gfunc->params[i] = func->params[i];
|
|
|
|
nir_function_set_impl(gfunc, nir_function_impl_clone(shader, func->impl));
|
|
|
|
/* Rewrite any global pointers to generic */
|
|
nir_foreach_block(block, gfunc->impl) {
|
|
nir_foreach_instr(instr, block) {
|
|
if (instr->type != nir_instr_type_deref)
|
|
continue;
|
|
|
|
nir_deref_instr *deref = nir_instr_as_deref(instr);
|
|
if (!nir_deref_mode_may_be(deref, nir_var_mem_global))
|
|
continue;
|
|
|
|
assert(deref->type != nir_deref_type_var);
|
|
assert(nir_deref_mode_is(deref, nir_var_mem_global));
|
|
|
|
deref->modes = nir_var_mem_generic;
|
|
}
|
|
}
|
|
|
|
progress = true;
|
|
nir_progress(true, func->impl, nir_metadata_none);
|
|
}
|
|
|
|
if (progress) {
|
|
nir_foreach_function_impl(impl, shader) {
|
|
if (impl->valid_metadata & nir_metadata_not_properly_reset) {
|
|
/* Preserve all metadata for functions that we didn't modify. */
|
|
nir_no_progress(impl);
|
|
}
|
|
}
|
|
}
|
|
|
|
return progress;
|
|
}
|
|
|
|
static bool
|
|
mark_exact(nir_builder *b, nir_alu_instr *alu, UNUSED void *_)
|
|
{
|
|
alu->exact = true;
|
|
return true;
|
|
}
|
|
|
|
nir_shader *
|
|
nir_load_libclc_shader(unsigned ptr_bit_size,
|
|
struct disk_cache *disk_cache,
|
|
const struct spirv_to_nir_options *spirv_options,
|
|
const nir_shader_compiler_options *nir_options,
|
|
bool optimize)
|
|
{
|
|
assert(ptr_bit_size ==
|
|
nir_address_format_bit_size(spirv_options->global_addr_format));
|
|
|
|
struct clc_data clc;
|
|
if (!open_clc_data(&clc, ptr_bit_size))
|
|
return NULL;
|
|
|
|
#ifdef ENABLE_SHADER_CACHE
|
|
cache_key cache_key;
|
|
if (disk_cache) {
|
|
disk_cache_compute_key(disk_cache, clc.cache_key,
|
|
sizeof(clc.cache_key), cache_key);
|
|
|
|
size_t buffer_size;
|
|
uint8_t *buffer = disk_cache_get(disk_cache, cache_key, &buffer_size);
|
|
if (buffer) {
|
|
struct blob_reader blob;
|
|
blob_reader_init(&blob, buffer, buffer_size);
|
|
nir_shader *nir = nir_deserialize(NULL, nir_options, &blob);
|
|
free(buffer);
|
|
close_clc_data(&clc);
|
|
return nir;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (!map_clc_data(&clc)) {
|
|
close_clc_data(&clc);
|
|
return NULL;
|
|
}
|
|
|
|
struct spirv_to_nir_options spirv_lib_options = *spirv_options;
|
|
spirv_lib_options.create_library = true;
|
|
|
|
assert(clc.size % SPIRV_WORD_SIZE == 0);
|
|
nir_shader *nir = spirv_to_nir(clc.data, clc.size / SPIRV_WORD_SIZE,
|
|
NULL, 0, MESA_SHADER_KERNEL, NULL,
|
|
&spirv_lib_options, nir_options);
|
|
nir_validate_shader(nir, "after nir_load_clc_shader");
|
|
|
|
/* nir_inline_libclc will assume that the functions in this shader are
|
|
* already ready to lower. This means we need to inline any function_temp
|
|
* initializers and lower any early returns.
|
|
*/
|
|
nir->info.internal = true;
|
|
NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_function_temp);
|
|
NIR_PASS(_, nir, nir_lower_returns);
|
|
|
|
NIR_PASS(_, nir, libclc_add_generic_variants);
|
|
|
|
/* libclc relies on precise floating point behaviour to meet CL precision
|
|
* requirements, but the SPIR-V does not disable contractions etc. Forcing
|
|
* the exact bit across libclc effectively compiles libclc without fast-math,
|
|
* which works around a large class of (current and future) libclc bugs.
|
|
*
|
|
* Kernels using CL are unaffected, this only affects the high-precision
|
|
* floating point routines inside libclc. Fast variants bypass libclc anyway.
|
|
*/
|
|
NIR_PASS(_, nir, nir_shader_alu_pass, mark_exact, nir_metadata_all, NULL);
|
|
|
|
/* Run some optimization passes. Those used here should be considered safe
|
|
* for all use cases and drivers.
|
|
*/
|
|
if (optimize) {
|
|
NIR_PASS(_, nir, nir_split_var_copies);
|
|
|
|
bool progress;
|
|
do {
|
|
progress = false;
|
|
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
|
|
NIR_PASS(progress, nir, nir_lower_var_copies);
|
|
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
|
|
NIR_PASS(progress, nir, nir_copy_prop);
|
|
NIR_PASS(progress, nir, nir_opt_remove_phis);
|
|
NIR_PASS(progress, nir, nir_opt_dce);
|
|
NIR_PASS(progress, nir, nir_opt_if, false);
|
|
NIR_PASS(progress, nir, nir_opt_dead_cf);
|
|
NIR_PASS(progress, nir, nir_opt_cse);
|
|
/* drivers run this pass, so don't be too aggressive. More aggressive
|
|
* values only increase effectiveness by <5%
|
|
*/
|
|
nir_opt_peephole_select_options peephole_select_options = {
|
|
.limit = 0,
|
|
};
|
|
NIR_PASS(progress, nir, nir_opt_peephole_select, &peephole_select_options);
|
|
NIR_PASS(progress, nir, nir_opt_algebraic);
|
|
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
|
NIR_PASS(progress, nir, nir_opt_undef);
|
|
NIR_PASS(progress, nir, nir_opt_deref);
|
|
} while(progress);
|
|
|
|
nir_sweep(nir);
|
|
}
|
|
|
|
#ifdef ENABLE_SHADER_CACHE
|
|
if (disk_cache) {
|
|
struct blob blob;
|
|
blob_init(&blob);
|
|
nir_serialize(&blob, nir, false);
|
|
disk_cache_put(disk_cache, cache_key, blob.data, blob.size, NULL);
|
|
blob_finish(&blob);
|
|
}
|
|
#endif
|
|
|
|
close_clc_data(&clc);
|
|
return nir;
|
|
}
|