From b7447a94c832dfba5886d1c86c65299ee9147f95 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Fri, 17 Jan 2025 10:50:17 -0500
Subject: [PATCH] vtn: add vtn_bindgen2 tool

This is a rewrite of vtn_bindgen. For now the two tools live in parallel, to
give Intel time to migrate off v1.

For a refresher, the classic vtn_bindgen reads a SPIR-V and generates a .h
containing nir_builder stubs for each exported function. The stub inserts an
unimplemented nir_function with the proper signature into the shader, and adds a
"call" to that function. The driver is responsible for linking with the library
later, which is annoying.

vtn_bindgen2 instead generates a .c/.h pair. The header are just prototypes with
identical signatures to what we have now. The .c implementations, however, are
very different. Instead of generating unimplemented nir_function, the
implementations contain the actual code (as serialized NIR, deserialized
on-the-fly). There is no linking step, nor a library nir_shader that the driver
has to keep around.

The programming model here is that this is "just" nir_builder ... just a
massively more competent way of using nir_builder.

Additionally, the whole SPIR-V -> optimized lowered serialized NIR step is now
all common code. There's no longer anything target-specific, and it's
disentangled from the nir_precomp infrastructure.

That means drivers can use CL with zero integration, except a few meson.build
rules. This gives a very gentle on-ramp to CL for drivers. (Note: that applies
only for library-style CL. For precompiled kernel-style CL, that still requires
significant driver integration. I do have plans there, though. Also,
printf/abort support requires a minimal amount of driver code.)

Furthermore, this unblocks the use of CL library functions in common code. That
makes this an important step towards common code geom/tess or maybe saner
raytracing.

For drivers already using classic vtn_bindgen, porting to vtn_bindgen2 should
just be deleting all your linking/deserializing code. The .cl's are unchanged,
as are the function prototypes exposed.

Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33099>
---
 src/compiler/spirv/meson.build    |  13 +
 src/compiler/spirv/vtn_bindgen2.c | 457 ++++++++++++++++++++++++++++++
 2 files changed, 470 insertions(+)
 create mode 100644 src/compiler/spirv/vtn_bindgen2.c

diff --git a/src/compiler/spirv/meson.build b/src/compiler/spirv/meson.build
index 179e1ad5636..525650508a9 100644
--- a/src/compiler/spirv/meson.build
+++ b/src/compiler/spirv/meson.build
@@ -91,6 +91,19 @@ if get_option('mesa-clc') != 'system' and (with_driver_using_cl or \
      native : not meson.can_run_host_binaries(),
      install : get_option('install-mesa-clc'),
    )
+
+   prog_vtn_bindgen2 = executable(
+     'vtn_bindgen2',
+     ['vtn_bindgen2.c'],
+     include_directories : [inc_include, inc_src],
+     c_args : [c_msvc_compat_args, no_override_init_args],
+     dependencies : [idep_vtn, idep_mesautil, idep_nir],
+     # If we can run host binaries directly, just build vtn_bindgen for the host.
+     # Most commonly this happens when doing a cross compile from an x86_64 build
+     # machine to an x86 host
+     native : not meson.can_run_host_binaries(),
+     install : get_option('install-mesa-clc'),
+   )
 endif
 
 if with_tests
diff --git a/src/compiler/spirv/vtn_bindgen2.c b/src/compiler/spirv/vtn_bindgen2.c
new file mode 100644
index 00000000000..746ae14e003
--- /dev/null
+++ b/src/compiler/spirv/vtn_bindgen2.c
@@ -0,0 +1,457 @@
+/*
+ * Copyright 2024 Valve Corporation
+ * Copyright 2023 Alyssa Rosenzweig
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/spirv/nir_spirv.h"
+#include "util/u_printf.h"
+#include "glsl_types.h"
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_builder_opcodes.h"
+#include "nir_precompiled.h"
+#include "nir_serialize.h"
+
+static const struct spirv_to_nir_options spirv_options = {
+   .environment = NIR_SPIRV_OPENCL,
+   .shared_addr_format = nir_address_format_62bit_generic,
+   .global_addr_format = nir_address_format_62bit_generic,
+   .temp_addr_format = nir_address_format_62bit_generic,
+   .constant_addr_format = nir_address_format_64bit_global,
+   .create_library = true,
+   .printf = true,
+};
+
+struct nir_shader_compiler_options generic_opts = {
+   /* TODO: Do we want to set has_*? Will drivers be able to lower
+    * appropriately?
+    */
+   .fuse_ffma16 = true,
+   .fuse_ffma32 = true,
+   .fuse_ffma64 = true,
+
+   .max_unroll_iterations = 32,
+   .max_unroll_iterations_fp64 = 32,
+};
+
+static bool
+rewrite_return(nir_builder *b, nir_intrinsic_instr *intr, void *return_deref)
+{
+   if (intr->intrinsic != nir_intrinsic_load_param)
+      return false;
+
+   unsigned idx = nir_intrinsic_param_idx(intr);
+   if (idx == 0)
+      nir_def_replace(&intr->def, return_deref);
+   else
+      nir_intrinsic_set_param_idx(intr, idx - 1);
+
+   return true;
+}
+
+static void
+lower_to_bindgen_return(nir_shader *nir)
+{
+   nir_foreach_function(libfunc, nir) {
+      bool returns = libfunc->params[0].is_return;
+      libfunc->pass_flags = returns;
+      if (!returns)
+         continue;
+
+      nir_variable *ret = nir_local_variable_create(
+         libfunc->impl, libfunc->params[0].type, "return");
+
+      nir_builder b = nir_builder_at(nir_before_impl(libfunc->impl));
+      nir_deref_instr *deref = nir_build_deref_var(&b, ret);
+
+      nir_function_intrinsics_pass(libfunc->impl, rewrite_return,
+                                   nir_metadata_control_flow, &deref->def);
+
+      b.cursor = nir_after_impl(libfunc->impl);
+      nir_bindgen_return(&b, nir_load_var(&b, ret));
+
+      /* Remove the first parameter (the return deref), leaving only the true
+       * parameters.
+       */
+      libfunc->num_params--;
+      memcpy(libfunc->params, libfunc->params + 1,
+             sizeof(libfunc->params[0]) * libfunc->num_params);
+   }
+}
+
+/* Standard optimization loop */
+static void
+optimize(nir_shader *nir)
+{
+   bool progress;
+   do {
+      progress = false;
+
+      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+
+      NIR_PASS(progress, nir, nir_copy_prop);
+      NIR_PASS(progress, nir, nir_opt_remove_phis);
+      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true);
+      NIR_PASS(progress, nir, nir_opt_dce);
+      NIR_PASS(progress, nir, nir_opt_dead_cf);
+      NIR_PASS(progress, nir, nir_opt_cse);
+      NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
+      NIR_PASS(progress, nir, nir_opt_phi_precision);
+      NIR_PASS(progress, nir, nir_opt_algebraic);
+      NIR_PASS(progress, nir, nir_opt_constant_folding);
+
+      NIR_PASS(progress, nir, nir_opt_deref);
+      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+      NIR_PASS(progress, nir, nir_opt_undef);
+
+      NIR_PASS(progress, nir, nir_opt_loop_unroll);
+      NIR_PASS(progress, nir, nir_opt_loop);
+   } while (progress);
+
+   NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
+}
+
+static nir_shader *
+compile(void *memctx, const uint32_t *spirv, size_t spirv_size)
+{
+   const nir_shader_compiler_options *nir_options = &generic_opts;
+
+   assert(spirv_size % 4 == 0);
+   nir_shader *nir =
+      spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
+                   "library", &spirv_options, nir_options);
+   nir_validate_shader(nir, "after spirv_to_nir");
+   ralloc_steal(memctx, nir);
+
+   nir_fixup_is_exported(nir);
+
+   /* At the moment, entrypoints will be compiled to binaries by a different
+    * tool, remove them as we are only interested in library functions for
+    * bindgen.
+    *
+    * A future version of vtn_bindgen will handle the entrypoints too.
+    */
+   nir_remove_entrypoints(nir);
+
+   NIR_PASS(_, nir, nir_lower_system_values);
+   NIR_PASS(_, nir, nir_lower_calls_to_builtins);
+
+   nir_lower_compute_system_values_options cs = {.global_id_is_32bit = true};
+   NIR_PASS(_, nir, nir_lower_compute_system_values, &cs);
+
+   NIR_PASS(_, nir, nir_lower_printf,
+            &(const struct nir_lower_printf_options){
+               .hash_format_strings = true,
+            });
+
+   /* We have to lower away local constant initializers right before we
+    * inline functions.  That way they get properly initialized at the top
+    * of the function and not at the top of its caller.
+    */
+   NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_function_temp);
+   NIR_PASS(_, nir, nir_lower_returns);
+   NIR_PASS(_, nir, nir_inline_functions);
+   nir_remove_non_exported(nir);
+   NIR_PASS(_, nir, nir_copy_prop);
+   NIR_PASS(_, nir, nir_opt_deref);
+
+   /* We can't deal with constant data, get rid of it */
+   nir_lower_constant_to_temp(nir);
+
+   /* We can go ahead and lower the rest of the constant initializers.  We do
+    * this here so that nir_remove_dead_variables and split_per_member_structs
+    * below see the corresponding stores.
+    */
+   NIR_PASS(_, nir, nir_lower_variable_initializers, ~0);
+
+   /* LLVM loves take advantage of the fact that vec3s in OpenCL are 16B
+    * aligned and so it can just read/write them as vec4s.  This results in a
+    * LOT of vec4->vec3 casts on loads and stores.  One solution to this
+    * problem is to get rid of all vec3 variables.
+    */
+   NIR_PASS(_, nir, nir_lower_vec3_to_vec4,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+               nir_var_mem_global | nir_var_mem_constant);
+
+   /* Bit more lowering... this doesn't seem to be load-bearing though.. */
+   NIR_PASS(_, nir, nir_split_var_copies);
+   NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp);
+   NIR_PASS(_, nir, nir_lower_var_copies);
+
+   /* We assign explicit types early so that the optimizer can take advantage
+    * of that information and hopefully get rid of some of our memcpys.
+    */
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+            nir_var_uniform | nir_var_shader_temp | nir_var_function_temp |
+               nir_var_mem_shared | nir_var_mem_global,
+            glsl_get_cl_type_size_align);
+
+   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
+
+   /* Lower again, this time after dead-variables to get more compact variable
+    * layouts.
+    */
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+               nir_var_mem_global | nir_var_mem_constant,
+            glsl_get_cl_type_size_align);
+   assert(nir->constant_data_size == 0);
+
+   NIR_PASS(_, nir, nir_lower_memcpy);
+
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_constant,
+            nir_address_format_64bit_global);
+
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_uniform,
+            nir_address_format_32bit_offset_as_64bit);
+
+   lower_to_bindgen_return(nir);
+
+   NIR_PASS(_, nir, nir_opt_deref);
+   NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);
+   NIR_PASS(_, nir, nir_opt_if, 0);
+
+   optimize(nir);
+
+   /* Now lower returns so we can get rid of derefs */
+   NIR_PASS(_, nir, nir_lower_vars_to_ssa);
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+               nir_var_mem_global | nir_var_mem_constant,
+            glsl_get_cl_type_size_align);
+
+   NIR_PASS(_, nir, nir_lower_explicit_io,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+               nir_var_mem_global,
+            nir_address_format_62bit_generic);
+
+   /* Try to optimize scratch access, since LLVM loooves its scratch. If this
+    * makes progress, we need to lower the results.
+    */
+   bool scratch_lowered = false;
+   NIR_PASS(scratch_lowered, nir, nir_lower_scratch_to_var);
+   if (scratch_lowered) {
+      NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
+   }
+
+   /* Prune derefs/variables late, since scratch lowering leaves dead
+    * derefs/variables and there's no point rerunning these passes.
+    */
+   NIR_PASS(_, nir, nir_remove_dead_derefs);
+   NIR_PASS(_, nir, nir_remove_dead_variables,
+            nir_var_function_temp | nir_var_shader_temp, NULL);
+
+   /* Do a last round of clean up after the extra lowering */
+   NIR_PASS(_, nir, nir_copy_prop);
+   NIR_PASS(_, nir, nir_opt_constant_folding);
+   NIR_PASS(_, nir, nir_opt_algebraic);
+   NIR_PASS(_, nir, nir_opt_cse);
+   NIR_PASS(_, nir, nir_opt_dce);
+
+   /* Re-index SSA defs at the very end to make the NIR more legible. This
+    * doesn't matter for correctness, but it's polite.
+    */
+   nir_foreach_function_impl(it, nir) {
+      nir_index_ssa_defs(it);
+   }
+
+   return nir;
+}
+
+static void
+print_signature(FILE *fp, nir_function *f)
+{
+   bool returns = f->pass_flags;
+   fprintf(fp, "%s\n", returns ? "nir_def *" : "void");
+   fprintf(fp, "%s(nir_builder *b", f->name);
+
+   for (unsigned i = 0; i < f->num_params; ++i) {
+      fprintf(fp, ", nir_def *%s", f->params[i].name);
+   }
+
+   fprintf(fp, ")");
+}
+
+int
+main(int argc, char **argv)
+{
+   if (argc != 4) {
+      fprintf(stderr, "Usage: %s [input spir-v] [output .c] [output .h]\n",
+              argv[0]);
+      return 1;
+   }
+
+   const char *infile = argv[1];
+   const char *outcfile = argv[2];
+   const char *outhfile = argv[3];
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   FILE *fin = fopen(infile, "rb");
+   if (!fin) {
+      fprintf(stderr, "Failed to open %s\n", infile);
+      return 1;
+   }
+
+   fseek(fin, 0L, SEEK_END);
+   size_t len = ftell(fin);
+   rewind(fin);
+
+   uint32_t *map = malloc(ALIGN_POT(len, 4));
+   if (!map) {
+      fprintf(stderr, "Failed to allocate");
+      fclose(fin);
+      return 1;
+   }
+
+   fread(map, 1, len, fin);
+   fclose(fin);
+
+   FILE *fp_c = fopen(outcfile, "w");
+   if (!fp_c) {
+      fprintf(stderr, "Failed to open %s\n", outcfile);
+      free(map);
+      return 1;
+   }
+
+   FILE *fp_h = fopen(outhfile, "w");
+   if (!fp_h) {
+      fprintf(stderr, "Failed to open %s\n", outhfile);
+      free(map);
+      fclose(fp_c);
+      return 1;
+   }
+
+   glsl_type_singleton_init_or_ref();
+
+   for (unsigned i = 0; i < 2; ++i) {
+      FILE *fp = i ? fp_c : fp_h;
+
+      fprintf(fp, "/*\n");
+      fprintf(fp, " * Copyright Mesa3D Contributors\n");
+      fprintf(fp, " * SPDX-License-Identifier: MIT\n");
+      fprintf(fp, " *\n");
+      fprintf(fp, " * Autogenerated file, do not edit\n");
+      fprintf(fp, " */\n\n");
+
+      if (fp == fp_h) {
+         fprintf(fp, "#pragma once\n\n");
+      }
+
+      fprintf(fp, "#include \"compiler/nir/nir.h\"\n");
+      fprintf(fp, "#include \"compiler/nir/nir_builder.h\"\n\n");
+
+      fprintf(fp, "#ifdef __cplusplus\n");
+      fprintf(fp, "extern \"C\" {\n");
+      fprintf(fp, "#endif\n");
+   }
+
+   nir_shader *nir = compile(mem_ctx, map, len);
+
+   nir_foreach_function(libfunc, nir) {
+      bool returns = libfunc->pass_flags;
+
+      /* Declare the function in the generated header */
+      print_signature(fp_h, libfunc);
+      fprintf(fp_h, ";\n\n");
+
+      /* We don't know where the header will end up on the file system, so we
+       * manually declare the signatures.
+       */
+      print_signature(fp_c, libfunc);
+      fprintf(fp_c, ";\n\n");
+
+      print_signature(fp_c, libfunc);
+      fprintf(fp_c, "\n{\n");
+
+      struct blob blob;
+      blob_init(&blob);
+      nir_serialize_function(&blob, libfunc);
+      fprintf(fp_c, "   /*\n");
+      nir_print_function_body(libfunc->impl, fp_c);
+      fprintf(fp_c, "   */\n");
+      fprintf(fp_c, "   ");
+      nir_precomp_print_blob(fp_c, "impl", "nir", 0,
+                             (const uint32_t *)blob.data, blob.size, true);
+      blob_finish(&blob);
+
+      if (libfunc->num_params > 0) {
+         fprintf(fp_c, "   nir_def *args[%u] = { ", libfunc->num_params);
+         for (unsigned a = 0; a < libfunc->num_params; ++a) {
+            fprintf(fp_c, "%s%s", a ? ", " : "", libfunc->params[a].name);
+         }
+         fprintf(fp_c, " };\n");
+      }
+
+      fprintf(fp_c, "   ");
+      if (returns)
+         fprintf(fp_c, "return ");
+
+      fprintf(fp_c,
+              "nir_call_serialized(b, impl_0_nir, sizeof(impl_0_nir), %s);",
+              libfunc->num_params > 0 ? "args" : "NULL");
+
+      fprintf(fp_c, "\n}\n\n");
+   }
+
+   for (unsigned i = 0; i < 2; ++i) {
+      FILE *fp = i ? fp_c : fp_h;
+
+      fprintf(fp, "#ifdef __cplusplus\n");
+      fprintf(fp, "} /* extern C */\n");
+      fprintf(fp, "#endif\n");
+   }
+
+   fprintf(fp_c, "struct vtn_bindgen_dummy {\n");
+   fprintf(fp_c, "   vtn_bindgen_dummy() {\n");
+   fprintf(fp_c, "      /* Format strings:\n");
+   fprintf(fp_c, "       *\n");
+   for (unsigned i = 0; i < nir->printf_info_count; ++i) {
+      u_printf_info *info = &nir->printf_info[i];
+      const char *str = info->strings;
+      fprintf(fp_c, "       * ");
+
+      for (unsigned j = 0; j < strlen(str); ++j) {
+         char c = str[j];
+         if (c == '\n')
+            fprintf(fp_c, "\\n");
+         else if (c == '/' && j && str[j - 1] == '*')
+            fprintf(fp_c, "\\/");
+         else
+            fprintf(fp_c, "%c", c);
+      }
+
+      fprintf(fp_c, "\n");
+   }
+   fprintf(fp_c, "       */\n");
+
+   /* Stuff printf info into Mesa's singleton */
+   struct blob blob;
+   blob_init(&blob);
+   u_printf_serialize_info(&blob, nir->printf_info, nir->printf_info_count);
+   nir_precomp_print_blob(fp_c, "printf", "blob", 0,
+                          (const uint32_t *)blob.data, blob.size, true);
+   blob_finish(&blob);
+
+   fprintf(fp_c, "      u_printf_singleton_init_or_ref();\n");
+   fprintf(
+      fp_c,
+      "      u_printf_singleton_add_serialized((const void*)printf_0_blob, sizeof(printf_0_blob));\n");
+
+   fprintf(fp_c, "   }\n");
+   fprintf(fp_c, "\n");
+   fprintf(fp_c, "   ~vtn_bindgen_dummy() {\n");
+   fprintf(fp_c, "      u_printf_singleton_decref();\n");
+   fprintf(fp_c, "   }\n");
+   fprintf(fp_c, "};\n");
+   fprintf(fp_c, "\n");
+   fprintf(fp_c, "static vtn_bindgen_dummy vtn_bindgen_dummy_instance;\n");
+
+   glsl_type_singleton_decref();
+   fclose(fp_c);
+   fclose(fp_h);
+   free(map);
+   ralloc_free(mem_ctx);
+   return 0;
+}