intel: remove GRL/intel-clc

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35227>
Lionel Landwerlin 2025-05-29 12:35:40 +03:00 committed by Marge Bot
parent 44bff7eb05
commit f0e18c475b
93 changed files with 10 additions and 40555 deletions

View file

@@ -307,29 +307,12 @@ with_any_broadcom = [
 with_intel_vk_rt = get_option('intel-rt') \
 .disable_auto_if(not with_intel_vk) \
-.disable_if(get_option('intel-bvh-grl') and \
-host_machine.cpu_family() != 'x86_64', \
-error_message : 'Intel Ray Tracing is only supported on x86_64') \
 .allowed()
-with_intel_bvh_grl = get_option('intel-bvh-grl')
-if get_option('intel-clc') != 'system' and \
-get_option('precomp-compiler') != 'system' and \
-with_intel_bvh_grl
-# Require intel-clc with Anv & Iris (for internal shaders)
-with_intel_clc = get_option('intel-clc') == 'enabled' or \
-get_option('precomp-compiler') == 'enabled' or \
-with_intel_bvh_grl
-else
-with_intel_clc = false
-endif
 with_any_intel = [
 with_gallium_crocus,
 with_gallium_i915,
 with_gallium_iris,
-with_intel_clc,
 with_intel_hasvk,
 with_intel_tools,
 with_intel_vk,

View file

@@ -693,13 +693,6 @@ option(
 description : 'Build the intel-clc compiler or use a system version.'
 )
-option(
-'intel-bvh-grl',
-type : 'boolean',
-value : false,
-description : 'Build the BVH structure using GRL.'
-)
 option(
 'install-intel-clc',
 type : 'boolean',

View file

@@ -87,7 +87,6 @@ with_nir_headers_only = (
 with_gallium_rusticl,
 with_microsoft_clc,
 with_spirv_to_dxil,
-with_intel_clc,
 with_clc,
 with_drivers_clc,
 get_option('intel-elk'),

View file

@@ -1,632 +0,0 @@
/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_compiler.h"
#include "brw_kernel.h"
#include "brw_nir.h"
#include "elk/elk_nir.h"
#include "compiler/brw_disasm.h"
#include "compiler/clc/clc.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_serialize.h"
#include "compiler/spirv/spirv_info.h"
#include "dev/intel_debug.h"
#include "util/build_id.h"
#include "util/disk_cache.h"
#include "util/macros.h"
#include "util/mesa-sha1.h"
#include "util/u_dynarray.h"
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
/* Shader functions */
#define SPIR_V_MAGIC_NUMBER 0x07230203
static struct disk_cache *
get_disk_cache(struct brw_compiler *compiler)
{
#ifdef ENABLE_SHADER_CACHE
char renderer[14];
ASSERTED int len = snprintf(renderer, sizeof(renderer), "brw_clc_%04x",
compiler->devinfo->pci_device_id);
assert(len == sizeof(renderer) - 2);
const struct build_id_note *note =
build_id_find_nhdr_for_addr(get_disk_cache);
if (note == NULL) {
fprintf(stderr, "Failed to find build-id\n");
abort();
}
unsigned build_id_len = build_id_length(note);
if (build_id_len < 20) {
fprintf(stderr, "build-id too short. It needs to be a SHA\n");
abort();
}
struct mesa_sha1 sha1_ctx;
uint8_t sha1[20];
_mesa_sha1_init(&sha1_ctx);
_mesa_sha1_update(&sha1_ctx, build_id_data(note), build_id_len);
_mesa_sha1_final(&sha1_ctx, sha1);
char timestamp[41];
_mesa_sha1_format(timestamp, sha1);
const uint64_t driver_flags = brw_get_compiler_config_value(compiler);
return disk_cache_create(renderer, timestamp, driver_flags);
#endif
return NULL;
}
static void
compiler_log(void *data, unsigned *id, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
if (INTEL_DEBUG(DEBUG_CS))
vfprintf(stderr, fmt, args);
va_end(args);
}
static void
msg_callback(void *priv, const char *msg)
{
(void)priv;
fprintf(stderr, "%s", msg);
}
static void
print_u32_data(FILE *fp, const char *prefix, const char *arr_name,
const uint32_t *data, size_t len)
{
assert(len % 4 == 0);
fprintf(fp, "static const uint32_t %s_%s[] = {", prefix, arr_name);
for (unsigned i = 0; i < (len / 4); i++) {
if (i % 4 == 0)
fprintf(fp,"\n ");
fprintf(fp, " 0x%08" PRIx32 ",", data[i]);
}
fprintf(fp, "\n};\n");
}
static void
print_u8_data(FILE *fp, const char *prefix, const char *arr_name,
const uint8_t *data, size_t len)
{
fprintf(fp, "static const uint8_t %s_%s[] = {", prefix, arr_name);
for (unsigned i = 0; i < len; i++) {
if (i % 16 == 0)
fprintf(fp,"\n ");
fprintf(fp, " 0x%02" PRIx8 ",", data[i]);
}
fprintf(fp, "\n};\n");
}
static const char *
reloc_type_str(enum brw_shader_reloc_type type)
{
switch (type) {
#define CASE(e) case e: return #e;
CASE(BRW_SHADER_RELOC_TYPE_U32)
CASE(BRW_SHADER_RELOC_TYPE_MOV_IMM)
#undef CASE
default:
unreachable("Unknown relocation type");
}
}
static void
print_cs_prog_data_fields(FILE *fp, const char *prefix, const char *pad,
const struct brw_cs_prog_data *cs_prog_data)
{
#define PROG_DATA_FIELD(fmt, field) \
fprintf(fp, "%s." #field " = " fmt ",\n", pad, cs_prog_data->field)
#define PROG_DATA_BOOL_FIELD(field) \
fprintf(fp, "%s." #field " = %s,\n", pad, \
cs_prog_data->field ? "true" : "false")
PROG_DATA_FIELD("%u", base.nr_params);
assert(cs_prog_data->base.stage == MESA_SHADER_COMPUTE);
fprintf(fp, "%s.base.stage = MESA_SHADER_COMPUTE,\n", pad);
assert(cs_prog_data->base.zero_push_reg == 0);
assert(cs_prog_data->base.push_reg_mask_param == 0);
PROG_DATA_FIELD("%u", base.curb_read_length);
PROG_DATA_FIELD("%u", base.total_scratch);
PROG_DATA_FIELD("%u", base.total_shared);
PROG_DATA_FIELD("%u", base.program_size);
PROG_DATA_FIELD("%u", base.const_data_size);
PROG_DATA_FIELD("%u", base.const_data_offset);
PROG_DATA_FIELD("%u", base.num_relocs);
fprintf(fp, "%s.base.relocs = %s_relocs,\n", pad, prefix);
PROG_DATA_FIELD("%u", base.grf_used);
PROG_DATA_FIELD("%u", base.printf_info_count);
fprintf(fp, "%s.base.printf_info = (u_printf_info *)%s_printfs,\n", pad, prefix);
assert(!cs_prog_data->base.has_ubo_pull);
assert(cs_prog_data->base.dispatch_grf_start_reg == 0);
assert(!cs_prog_data->base.use_alt_mode);
assert(cs_prog_data->base.param == 0);
PROG_DATA_BOOL_FIELD(base.uses_atomic_load_store);
fprintf(fp, "%s.local_size = { %u, %u, %u },\n", pad,
cs_prog_data->local_size[0],
cs_prog_data->local_size[1],
cs_prog_data->local_size[2]);
fprintf(fp, "%s.prog_offset = { %u, %u, %u },\n", pad,
cs_prog_data->prog_offset[0],
cs_prog_data->prog_offset[1],
cs_prog_data->prog_offset[2]);
PROG_DATA_FIELD("%u", prog_mask);
PROG_DATA_FIELD("%u", prog_spilled);
PROG_DATA_BOOL_FIELD(uses_barrier);
PROG_DATA_BOOL_FIELD(uses_num_work_groups);
assert(!cs_prog_data->uses_inline_data);
assert(!cs_prog_data->uses_btd_stack_ids);
PROG_DATA_FIELD("%u", push.per_thread.dwords);
PROG_DATA_FIELD("%u", push.per_thread.regs);
PROG_DATA_FIELD("%u", push.per_thread.size);
PROG_DATA_FIELD("%u", push.cross_thread.dwords);
PROG_DATA_FIELD("%u", push.cross_thread.regs);
PROG_DATA_FIELD("%u", push.cross_thread.size);
#undef PROG_DATA_FIELD
#undef PROG_DATA_BOOL_FIELD
}
static void
print_kernel(FILE *fp, const char *prefix,
const struct brw_kernel *kernel,
const struct brw_isa_info *isa)
{
struct mesa_sha1 sha1_ctx;
_mesa_sha1_init(&sha1_ctx);
#define SHA1_UPDATE_VALUE(val) \
_mesa_sha1_update(&sha1_ctx, &val, sizeof(val))
fprintf(fp, "#include \"intel/compiler/brw_kernel.h\"\n");
fprintf(fp, "\n");
fprintf(fp, "static const struct brw_shader_reloc %s_relocs[] = {\n",
prefix);
for (unsigned i = 0; i < kernel->prog_data.base.num_relocs; i++) {
const struct brw_shader_reloc *reloc = &kernel->prog_data.base.relocs[i];
fprintf(fp, " { %"PRIu32", %s, %"PRIu32", %"PRIu32" },\n",
reloc->id, reloc_type_str(reloc->type),
reloc->offset, reloc->delta);
}
fprintf(fp, "};\n");
_mesa_sha1_update(&sha1_ctx, kernel->prog_data.base.relocs,
kernel->prog_data.base.num_relocs *
sizeof(kernel->prog_data.base.relocs[0]));
fprintf(fp, "static const u_printf_info %s_printfs[] = {\n",
prefix);
for (unsigned i = 0; i < kernel->prog_data.base.printf_info_count; i++) {
const u_printf_info *printf_info = &kernel->prog_data.base.printf_info[i];
fprintf(fp, " {\n");
fprintf(fp, " .num_args = %"PRIu32",\n", printf_info->num_args);
fprintf(fp, " .arg_sizes = (unsigned []) {\n");
for (unsigned a = 0; a < printf_info->num_args; a++)
fprintf(fp, " %"PRIu32",\n", printf_info->arg_sizes[a]);
fprintf(fp, " },\n");
fprintf(fp, " .string_size = %"PRIu32",\n", printf_info->string_size);
fprintf(fp, " .strings = (char []) {");
for (unsigned c = 0; c < printf_info->string_size; c++) {
if (c % 8 == 0 )
fprintf(fp, "\n ");
fprintf(fp, "0x%02hhx, ", printf_info->strings[c]);
}
fprintf(fp, "\n },\n");
fprintf(fp, " },\n");
}
fprintf(fp, "};\n");
/* Get rid of the pointers before we hash */
struct brw_cs_prog_data cs_prog_data = kernel->prog_data;
cs_prog_data.base.relocs = NULL;
assert(cs_prog_data.base.param == NULL);
_mesa_sha1_update(&sha1_ctx, &cs_prog_data, sizeof(cs_prog_data));
SHA1_UPDATE_VALUE(kernel->args_size);
SHA1_UPDATE_VALUE(kernel->arg_count);
_mesa_sha1_update(&sha1_ctx, kernel->args,
kernel->arg_count * sizeof(kernel->args[0]));
fprintf(fp, "static const struct brw_kernel_arg_desc %s_args[] = {\n",
prefix);
for (unsigned i = 0; i < kernel->arg_count; i++) {
fprintf(fp, " { %d, %d },\n",
kernel->args[i].offset, kernel->args[i].size);
}
fprintf(fp, "};\n\n");
_mesa_sha1_update(&sha1_ctx, kernel->code,
kernel->prog_data.base.program_size);
fprintf(fp, "#if 0 /* BEGIN KERNEL ASSEMBLY */\n");
fprintf(fp, "\n");
brw_disassemble_with_errors(isa, kernel->code, 0, NULL, fp);
fprintf(fp, "\n");
fprintf(fp, "#endif /* END KERNEL ASSEMBLY */\n");
print_u32_data(fp, prefix, "code", kernel->code,
kernel->prog_data.base.program_size);
fprintf(fp, "static const struct brw_kernel %s = {\n", prefix);
fprintf(fp, " .prog_data = {\n");
print_cs_prog_data_fields(fp, prefix, " ", &kernel->prog_data);
fprintf(fp, " },\n");
fprintf(fp, " .args_size = %d,\n", (int)kernel->args_size);
fprintf(fp, " .arg_count = %d,\n", (int)kernel->arg_count);
fprintf(fp, " .args = %s_args,\n", prefix);
fprintf(fp, " .code = %s_code,\n", prefix);
fprintf(fp, "};\n");
unsigned char sha1[20];
_mesa_sha1_final(&sha1_ctx, sha1);
char sha1_str[41];
_mesa_sha1_format(sha1_str, sha1);
fprintf(fp, "const char *%s_sha1 = \"%s\";\n", prefix, sha1_str);
}
static void
print_usage(char *exec_name, FILE *f)
{
fprintf(f,
"Usage: %s [options] -- [clang args]\n"
"Options:\n"
" -h --help Print this help.\n"
" -e, --entrypoint <name> Specify the entry-point name.\n"
" -L, --llvm17-wa Enable LLVM 17 workarounds for opaque pointers"
" -p, --platform <name> Specify the target platform name.\n"
" --prefix <prefix> Prefix for variable names in generated C code.\n"
" -o, --out <filename> Specify the output filename.\n"
" -i, --in <filename> Specify one input filename. Accepted multiple times.\n"
" -s, --spv <filename> Specify the output filename for spirv.\n"
" -n, --nir Specify whether to output serialized NIR instead of ISA.\n"
" -g, --gfx-version <ver> Specify the Gfx version used for NIR output.\n"
" -t, --text <filename> Specify the output filename for the parsed text\n"
" -v, --verbose Print more information during compilation.\n"
" -M, --llvm-version Print LLVM version.\n"
, exec_name);
}
#define OPT_PREFIX 1000
struct intel_clc_params {
char *entry_point;
char *platform;
char *outfile;
char *spv_outfile;
char *txt_outfile;
char *prefix;
unsigned gfx_version;
bool output_nir;
bool print_info;
bool llvm17_wa;
void *mem_ctx;
struct intel_device_info devinfo;
};
#include "compiler/spirv/nir_spirv.h"
static int
output_isa(const struct intel_clc_params *params, struct clc_binary *binary)
{
struct brw_kernel kernel = {};
char *error_str;
int ret = 0;
struct brw_isa_info _isa, *isa = &_isa;
brw_init_isa_info(isa, &params->devinfo);
struct brw_compiler *compiler = brw_compiler_create(params->mem_ctx,
&params->devinfo);
compiler->spilling_rate = 11;
compiler->shader_debug_log = compiler_log;
compiler->shader_perf_log = compiler_log;
struct disk_cache *disk_cache = get_disk_cache(compiler);
if (!brw_kernel_from_spirv(compiler, disk_cache, &kernel, NULL, params->mem_ctx,
binary->data, binary->size,
params->entry_point, &error_str)) {
fprintf(stderr, "Compile failed: %s\n", error_str);
ret = -1;
goto exit;
}
if (params->print_info) {
fprintf(stdout, "kernel info:\n");
fprintf(stdout, " uses_barrier : %u\n", kernel.prog_data.uses_barrier);
fprintf(stdout, " uses_num_work_groups : %u\n", kernel.prog_data.uses_num_work_groups);
fprintf(stdout, " uses_inline_data : %u\n", kernel.prog_data.uses_inline_data);
fprintf(stdout, " local_size : %ux%ux%u\n",
kernel.prog_data.local_size[0],
kernel.prog_data.local_size[1],
kernel.prog_data.local_size[2]);
fprintf(stdout, " curb_read_length : %u\n", kernel.prog_data.base.curb_read_length);
fprintf(stdout, " total_scratch : %u\n", kernel.prog_data.base.total_scratch);
fprintf(stdout, " total_shared : %u\n", kernel.prog_data.base.total_shared);
fprintf(stdout, " program_size : %u\n", kernel.prog_data.base.program_size);
fprintf(stdout, " const_data_size : %u\n", kernel.prog_data.base.const_data_size);
fprintf(stdout, " uses_atomic_load_store : %u\n", kernel.prog_data.base.uses_atomic_load_store);
fprintf(stdout, " dispatch_grf_start_reg : %u\n", kernel.prog_data.base.dispatch_grf_start_reg);
}
char *prefix = params->prefix;
char prefix_tmp[256];
if (prefix == NULL) {
bool is_pt_5 = (params->devinfo.verx10 % 10) == 5;
snprintf(prefix_tmp, sizeof(prefix_tmp), "gfx%d%s_clc_%s",
params->devinfo.ver, is_pt_5 ? "5" : "", params->entry_point);
prefix = prefix_tmp;
}
if (params->outfile != NULL) {
FILE *fp = fopen(params->outfile, "w");
print_kernel(fp, prefix, &kernel, isa);
fclose(fp);
} else {
print_kernel(stdout, prefix, &kernel, isa);
}
exit:
disk_cache_destroy(disk_cache);
return ret;
}
static void
print_llvm_version(FILE *out)
{
fprintf(out, "%s\n", MESA_LLVM_VERSION_STRING);
}
int main(int argc, char **argv)
{
int exit_code = 0;
process_intel_debug_variable();
static struct option long_options[] ={
{"help", no_argument, 0, 'h'},
{"entrypoint", required_argument, 0, 'e'},
{"platform", required_argument, 0, 'p'},
{"prefix", required_argument, 0, OPT_PREFIX},
{"in", required_argument, 0, 'i'},
{"out", required_argument, 0, 'o'},
{"spv", required_argument, 0, 's'},
{"text", required_argument, 0, 't'},
{"llvm-version", no_argument, 0, 'M'},
{"verbose", no_argument, 0, 'v'},
{0, 0, 0, 0}
};
struct intel_clc_params params = {};
struct util_dynarray clang_args;
struct util_dynarray input_files;
struct clc_binary spirv_obj = {0};
struct clc_parsed_spirv parsed_spirv_data = {0};
struct disk_cache *disk_cache = NULL;
params.mem_ctx = ralloc_context(NULL);
util_dynarray_init(&clang_args, params.mem_ctx);
util_dynarray_init(&input_files, params.mem_ctx);
int ch;
while ((ch = getopt_long(argc, argv, "he:p:s:t:i:o:Mv", long_options, NULL)) != -1)
{
switch (ch)
{
case 'h':
print_usage(argv[0], stdout);
goto end;
case 'e':
params.entry_point = optarg;
break;
case 'p':
params.platform = optarg;
break;
case 'o':
params.outfile = optarg;
break;
case 'i':
util_dynarray_append(&input_files, char *, optarg);
break;
case 's':
params.spv_outfile = optarg;
break;
case 't':
params.txt_outfile = optarg;
break;
case 'v':
params.print_info = true;
break;
case 'M':
print_llvm_version(stdout);
return EXIT_SUCCESS;
case OPT_PREFIX:
params.prefix = optarg;
break;
default:
fprintf(stderr, "Unrecognized option \"%s\".\n", optarg);
print_usage(argv[0], stderr);
goto fail;
}
}
for (int i = optind; i < argc; i++) {
util_dynarray_append(&clang_args, char *, argv[i]);
}
if (util_dynarray_num_elements(&input_files, char *) == 0) {
fprintf(stderr, "No input file(s).\n");
print_usage(argv[0], stderr);
goto fail;
}
struct clc_logger logger = {
.error = msg_callback,
.warning = msg_callback,
};
size_t total_size = 0;
char *all_inputs = NULL;
util_dynarray_foreach(&input_files, char *, infile) {
int fd = open(*infile, O_RDONLY);
if (fd < 0) {
fprintf(stderr, "Failed to open %s\n", *infile);
goto fail;
}
off_t len = lseek(fd, 0, SEEK_END);
size_t new_size = total_size + len;
all_inputs = reralloc_size(params.mem_ctx, all_inputs, new_size + 1);
if (!all_inputs) {
fprintf(stderr, "Failed to allocate memory\n");
goto fail;
}
lseek(fd, 0, SEEK_SET);
read(fd, all_inputs + total_size, len);
close(fd);
total_size = new_size;
all_inputs[total_size] = '\0';
}
if (params.txt_outfile) {
FILE *fp = fopen(params.txt_outfile, "w");
fwrite(all_inputs, total_size, 1, fp);
fclose(fp);
}
const char *allowed_spirv_extensions[] = {
"SPV_EXT_shader_atomic_float_add",
"SPV_EXT_shader_atomic_float_min_max",
"SPV_KHR_float_controls",
"SPV_INTEL_subgroups",
NULL,
};
struct clc_compile_args clc_args = {
.source = {
.name = "intel_clc_files",
.value = all_inputs,
},
.features = {
.fp16 = true,
.intel_subgroups = true,
.subgroups = true,
.subgroups_ifp = true,
},
.args = util_dynarray_begin(&clang_args),
.num_args = util_dynarray_num_elements(&clang_args, char *),
.allowed_spirv_extensions = allowed_spirv_extensions,
};
if (!clc_compile_c_to_spirv(&clc_args, &logger, &spirv_obj, NULL)) {
goto fail;
}
if (params.spv_outfile) {
FILE *fp = fopen(params.spv_outfile, "w");
fwrite(spirv_obj.data, spirv_obj.size, 1, fp);
fclose(fp);
}
glsl_type_singleton_init_or_ref();
if (params.platform == NULL) {
fprintf(stderr, "No target platform name specified.\n");
print_usage(argv[0], stderr);
goto fail;
}
int pci_id = intel_device_name_to_pci_device_id(params.platform);
if (pci_id < 0) {
fprintf(stderr, "Invalid target platform name: %s\n", params.platform);
goto fail;
}
if (!intel_get_device_info_for_build(pci_id, &params.devinfo)) {
fprintf(stderr, "Failed to get device information.\n");
goto fail;
}
if (params.devinfo.verx10 < 125) {
fprintf(stderr, "Platform currently not supported.\n");
goto fail;
}
if (params.entry_point == NULL) {
fprintf(stderr, "No entry-point name specified.\n");
print_usage(argv[0], stderr);
goto fail;
}
if (!clc_parse_spirv(&spirv_obj, &logger, &parsed_spirv_data))
goto fail;
const struct clc_kernel_info *kernel_info = NULL;
for (unsigned i = 0; i < parsed_spirv_data.num_kernels; i++) {
if (strcmp(parsed_spirv_data.kernels[i].name, params.entry_point) == 0) {
kernel_info = &parsed_spirv_data.kernels[i];
break;
}
}
if (kernel_info == NULL) {
fprintf(stderr, "Kernel entrypoint %s not found\n", params.entry_point);
goto fail;
}
exit_code = output_isa(&params, &spirv_obj);
glsl_type_singleton_decref();
goto end;
fail:
exit_code = 1;
end:
disk_cache_destroy(disk_cache);
clc_free_parsed_spirv(&parsed_spirv_data);
clc_free_spirv(&spirv_obj);
ralloc_free(params.mem_ctx);
return exit_code;
}

View file

@@ -169,42 +169,6 @@ idep_intel_compiler_brw = declare_dependency(
 ],
 )
-# For now this tool is only going to be used by Anv
-if with_intel_bvh_grl
-if get_option('intel-clc') == 'system' or get_option('precomp-compiler') == 'system'
-prog_intel_clc = find_program('intel_clc', native : true)
-dep_prog_intel_clc = []
-elif with_intel_clc
-prog_intel_clc = executable(
-'intel_clc',
-[
-'intel_clc.c',
-'brw_kernel.c',
-# Use just the nir_options part of ELK instead of fully linking.
-'elk/elk_nir_options.h',
-'elk/elk_nir_options.c',
-'elk/elk_spirv.c',
-],
-link_with : [libisl],
-include_directories : [inc_include, inc_src, inc_intel],
-c_args : [pre_args, no_override_init_args],
-cpp_args : ['-Werror=vla'],
-link_args : [ld_args_build_id],
-dependencies : [idep_nir, idep_mesaclc, idep_mesautil, idep_intel_dev,
-idep_intel_compiler_brw],
-# If we can run host binaries directly, just build intel_clc for the host.
-# Most commonly this happens when doing a cross compile from an x86_64 build
-# machine to an x86 host
-native : not meson.can_run_host_binaries(),
-install : get_option('install-intel-clc') or get_option('install-precomp-compiler'),
-)
-dep_prog_intel_clc = [prog_intel_clc]
-endif
-else
-dep_prog_intel_clc = []
-endif
 if with_tests
 test(
 'intel_compiler_brw_tests',

View file

@@ -1016,29 +1016,8 @@ get_buffer_format_features2(const struct intel_device_info *devinfo,
 flags |= VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT;
 if (devinfo->has_ray_tracing) {
-#if ANV_SUPPORT_RT_GRL
-switch (vk_format) {
-case VK_FORMAT_R32G32_SFLOAT:
-case VK_FORMAT_R32G32B32_SFLOAT:
-case VK_FORMAT_R16G16_SFLOAT:
-case VK_FORMAT_R16G16B16A16_SFLOAT:
-case VK_FORMAT_R16G16_SNORM:
-case VK_FORMAT_R16G16B16A16_SNORM:
-case VK_FORMAT_R16G16B16A16_UNORM:
-case VK_FORMAT_R16G16_UNORM:
-case VK_FORMAT_R8G8B8A8_UNORM:
-case VK_FORMAT_R8G8_UNORM:
-case VK_FORMAT_R8G8B8A8_SNORM:
-case VK_FORMAT_R8G8_SNORM:
-flags |= VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
-break;
-default:
-break;
-}
-#else
 if (vk_acceleration_struct_vtx_format_supported(vk_format))
 flags |= VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
-#endif
 }
 }
 }

View file

@@ -2415,7 +2415,7 @@ anv_queue_post_submit(struct anv_queue *queue, VkResult submit_result)
 result = vk_queue_set_lost(&queue->vk, "sync wait failed");
 }
-#if ANV_SUPPORT_RT && !ANV_SUPPORT_RT_GRL
+#if ANV_SUPPORT_RT
 /* The recorded bvh is dumped to files upon command buffer completion */
 if (INTEL_DEBUG_BVH_ANY)
 anv_dump_bvh_to_files(queue->device);

File diff suppressed because it is too large.

View file

@@ -31,10 +31,6 @@
 #include "vk_standard_sample_locations.h"
-#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT_GRL
-#include "grl/genX_grl.h"
-#endif
 #include "genX_mi_builder.h"
 #include "vk_util.h"
@@ -895,13 +891,8 @@ genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
 assert(pdevice->info.verx10 == GFX_VERx10);
 #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
-#if ANV_SUPPORT_RT_GRL
-genX(grl_load_rt_uuid)(pdevice->rt_uuid);
-pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
-#else
 STATIC_ASSERT(sizeof(ANV_RT_UUID_MACRO) == VK_UUID_SIZE);
 memcpy(pdevice->rt_uuid, ANV_RT_UUID_MACRO, VK_UUID_SIZE);
-#endif
 #endif
 pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);

View file

@@ -2040,12 +2040,7 @@ void genX(CmdCopyQueryPoolResults)(
 #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
-#if ANV_SUPPORT_RT_GRL
-#include "grl/include/GRLRTASCommon.h"
-#include "grl/grl_metakernel_postbuild_info.h"
-#else
 #include "bvh/anv_bvh.h"
-#endif
 void
 genX(CmdWriteAccelerationStructuresPropertiesKHR)(
@@ -2064,66 +2059,19 @@ genX(CmdWriteAccelerationStructuresPropertiesKHR)(
 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
-#if !ANV_SUPPORT_RT_GRL
 anv_add_pending_pipe_bits(cmd_buffer,
 ANV_PIPE_END_OF_PIPE_SYNC_BIT |
 ANV_PIPE_DATA_CACHE_FLUSH_BIT,
 "read BVH data using CS");
-#endif
 if (append_query_clear_flush(
 cmd_buffer, pool,
-"CmdWriteAccelerationStructuresPropertiesKHR flush query clears") ||
-!ANV_SUPPORT_RT_GRL)
+"CmdWriteAccelerationStructuresPropertiesKHR flush query clears"))
 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 struct mi_builder b;
 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
-#if ANV_SUPPORT_RT_GRL
-for (uint32_t i = 0; i < accelerationStructureCount; i++) {
-ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]);
-struct anv_address query_addr =
-anv_address_add(anv_query_address(pool, firstQuery + i), 8);
-switch (queryType) {
-case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
-genX(grl_postbuild_info_compacted_size)(cmd_buffer,
-vk_acceleration_structure_get_va(accel),
-anv_address_physical(query_addr));
-break;
-case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
-genX(grl_postbuild_info_current_size)(cmd_buffer,
-vk_acceleration_structure_get_va(accel),
-anv_address_physical(query_addr));
-break;
-case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
-case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
-genX(grl_postbuild_info_serialized_size)(cmd_buffer,
-vk_acceleration_structure_get_va(accel),
-anv_address_physical(query_addr));
-break;
-default:
-unreachable("unhandled query type");
-}
-}
-/* TODO: Figure out why MTL needs ANV_PIPE_DATA_CACHE_FLUSH_BIT in order
-* to not lose the availability bit.
-*/
-anv_add_pending_pipe_bits(cmd_buffer,
-ANV_PIPE_END_OF_PIPE_SYNC_BIT |
-ANV_PIPE_DATA_CACHE_FLUSH_BIT,
-"after write acceleration struct props");
-genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-for (uint32_t i = 0; i < accelerationStructureCount; i++)
-emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true);
-#else
 for (uint32_t i = 0; i < accelerationStructureCount; i++) {
 ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]);
 struct anv_address query_addr =
@@ -2163,6 +2111,5 @@ genX(CmdWriteAccelerationStructuresPropertiesKHR)(
 mi_builder_set_write_check(&b1, (i == (accelerationStructureCount - 1)));
 emit_query_mi_availability(&b1, anv_query_address(pool, firstQuery + i), true);
 }
-#endif /* ANV_SUPPORT_RT_GRL */
 }
 #endif /* GFX_VERx10 >= 125 && ANV_SUPPORT_RT */

View file

@@ -1 +0,0 @@
parsetab.py

View file

@@ -1,54 +0,0 @@
/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef ANV_GRL_H
#define ANV_GRL_H
#include "grl/grl_cl_kernel.h"
#include "genxml/gen_macros.h"
#ifdef __cplusplus
extern "C" {
#endif
struct anv_cmd_buffer;
struct anv_kernel_arg;
void
genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
enum grl_cl_kernel kernel,
const uint32_t *global_size,
uint32_t arg_count,
const struct anv_kernel_arg *args);
void
genX(grl_load_rt_uuid)(uint8_t *out_uuid);
uint32_t
genX(grl_max_scratch_size)(void);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* ANV_GRL_H */

View file

@@ -1,113 +0,0 @@
/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "anv_private.h"
#include "genX_grl.h"
static struct anv_shader_bin *
get_shader_bin(struct anv_device *device,
enum grl_cl_kernel kernel)
{
const char *key = genX(grl_get_cl_kernel_sha1)(kernel);
int key_len = strlen(key);
bool cache_hit = false;
struct anv_shader_bin *bin =
anv_device_search_for_kernel(device, device->internal_cache,
key, key_len, &cache_hit);
if (bin != NULL)
return bin;
uint32_t dummy_param[32];
struct brw_kernel kernel_data;
genX(grl_get_cl_kernel)(&kernel_data, kernel);
assert(kernel_data.prog_data.base.nr_params <= ARRAY_SIZE(dummy_param));
kernel_data.prog_data.base.param = dummy_param;
struct anv_push_descriptor_info empty_push_desc_info = {};
struct anv_pipeline_bind_map bind_map = {
.kernel_args_size = kernel_data.args_size,
.kernel_arg_count = kernel_data.arg_count,
.kernel_args = (struct brw_kernel_arg_desc *)kernel_data.args,
};
struct anv_shader_upload_params upload_params = {
.stage = MESA_SHADER_KERNEL,
.key_data = key,
.key_size = key_len,
.kernel_data = kernel_data.code,
.kernel_size = kernel_data.prog_data.base.program_size,
.prog_data = &kernel_data.prog_data.base,
.prog_data_size = sizeof(kernel_data.prog_data),
.bind_map = &bind_map,
.push_desc_info = &empty_push_desc_info,
};
bin = anv_device_upload_kernel(device, device->internal_cache,
&upload_params);
/* The cache already has a reference and it's not going anywhere so there
* is no need to hold a second reference.
*/
anv_shader_bin_unref(device, bin);
return bin;
}
void
genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
enum grl_cl_kernel kernel,
const uint32_t *global_size,
uint32_t arg_count,
const struct anv_kernel_arg *args)
{
struct anv_device *device = cmd_buffer->device;
const struct intel_l3_weights w =
intel_get_default_l3_weights(device->info, true, true);
struct anv_kernel ak = {
.bin = get_shader_bin(device, kernel),
.l3_config = intel_get_l3_config(device->info, w),
};
genX(cmd_buffer_dispatch_kernel)(cmd_buffer, &ak, global_size,
arg_count, args);
}
uint32_t
genX(grl_max_scratch_size)(void)
{
uint32_t scratch_size = 0;
for (uint32_t i = 0; i < GRL_CL_KERNEL_MAX; i++) {
struct brw_kernel kernel_data;
genX(grl_get_cl_kernel)(&kernel_data, i);
scratch_size = MAX2(kernel_data.prog_data.base.total_scratch,
scratch_size);
}
return scratch_size;
}

View file

@@ -1,40 +0,0 @@
/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <assert.h>
#include <string.h>
#include "genX_grl.h"
#include "include/GRLGen12.h"
#include "vulkan/vulkan_core.h"
extern "C" void
genX(grl_load_rt_uuid)(uint8_t *out_uuid);
extern "C" void
genX(grl_load_rt_uuid)(uint8_t *out_uuid)
{
assert(sizeof(GRL::RTAS::GEN12::BVH_MAGIC) == VK_UUID_SIZE);
memcpy(out_uuid, GRL::RTAS::GEN12::BVH_MAGIC, VK_UUID_SIZE);
}

View file

@@ -1,450 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "shared.h"
#include "intrinsics.h"
#ifndef __OPENCL_VERSION__
#include "stdio.h"
#endif
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
/* ====== QUAD ENCODING config ====== */
#define QUAD_GEOMID_BITS 27 // dxr limit is 2^24 geos... we have headroom
#define QUAD_PRIMID_DIFF_BITS (32 - QUAD_GEOMID_BITS)
#define QUAD_GEOMID_MASK ((1<<QUAD_GEOMID_BITS)-1)
#define QUAD_PRIMID_BITS 29 // dxr limit is 2^29 prims total within one blas
#define QUAD_PRIMID_MASK ((1<<QUAD_PRIMID_BITS)-1)
#define INSTANCE_ID_BITS 24
#define INSTANCE_ID_MASK ((1<<INSTANCE_ID_BITS)-1)
// JDB TODO: Make this a separate, dedicated structure.. Aliasing a float4 AABB as a primref is needlessly obfuscated
typedef struct AABB PrimRef;
GRL_INLINE void AABB_init(struct AABB *aabb)
{
aabb->lower = (float4)(INFINITY, INFINITY, INFINITY, 0);
aabb->upper = -(float4)(INFINITY, INFINITY, INFINITY, 0);
}
GRL_INLINE uint PRIMREF_geomID( PrimRef* aabb)
{
const uint v = as_uint(aabb->lower.w);
return v & QUAD_GEOMID_MASK;
}
GRL_INLINE uint PRIMREF_primID0( PrimRef* aabb)
{
return as_uint( aabb->upper.w ) & QUAD_PRIMID_MASK;
}
GRL_INLINE uint PRIMREF_primID1( PrimRef* aabb)
{
const uint v = as_uint(aabb->lower.w);
const uint primID0 = as_uint(aabb->upper.w) & QUAD_PRIMID_MASK;
const uint deltaID = v >> QUAD_GEOMID_BITS;
const uint primID1 = primID0 + deltaID;
return primID1;
}
GRL_INLINE uint PRIMREF_geomFlags( PrimRef* aabb )
{
const uint v = as_uint( aabb->upper.w );
return (v >> QUAD_PRIMID_BITS) ;
}
GRL_INLINE uint PRIMREF_instanceIndex( PrimRef* aabb )
{
return as_uint(aabb->lower.w) & INSTANCE_ID_MASK;
}
GRL_INLINE uchar PRIMREF_instanceMask( PrimRef* aabb )
{
return as_uint(aabb->lower.w) >> INSTANCE_ID_BITS;
}
GRL_INLINE void PRIMREF_setProceduralMetaData( PrimRef* primref, uint geomID, uint primID, uint geomFlags )
{
/* encode geomID, primID */
uint flags = (geomFlags << QUAD_PRIMID_BITS);
primref->lower.w = as_float( geomID );
primref->upper.w = as_float( primID | flags );
}
GRL_INLINE void PRIMREF_setQuadMetaData( PrimRef* primref, uint primID0, uint primID1, uint geomID, uint geomFlags )
{
const uint primID_diff = primID1 - primID0;
uint flags = geomFlags << QUAD_PRIMID_BITS;
primref->lower.w = as_float( geomID | (primID_diff << QUAD_GEOMID_BITS) );
primref->upper.w = as_float( (primID0 | flags) );
}
GRL_INLINE void PRIMREF_setAABB( PrimRef* primref, float3 lower, float3 upper )
{
primref->lower.xyz = lower.xyz;
primref->upper.xyz = upper.xyz;
}
GRL_INLINE PrimRef PRIMREF_set_instance( float3 lower, float3 upper, uint instanceIndex, uint instanceMask, uint rootOffset, bool is_procedural )
{
PrimRef new_ref;
new_ref.lower.xyz = lower;
new_ref.lower.w = as_float(instanceIndex | (instanceMask << 24));
new_ref.upper.xyz = upper;
new_ref.upper.w = as_float(rootOffset + (is_procedural? 0x80000000 : 0));
return new_ref;
}
GRL_INLINE bool PRIMREF_isProceduralInstance( PrimRef* primref )
{
return (as_uint(primref->upper.w) & 0x80000000) != 0;
}
GRL_INLINE uint PRIMREF_instanceRootNodeOffset(PrimRef* primref)
{
return (as_uint(primref->upper.w) & 0x7fffffff);
}
GRL_INLINE float3 PRIMREF_lower( PrimRef* primref )
{
return primref->lower.xyz;
}
GRL_INLINE float3 PRIMREF_upper( PrimRef* primref )
{
return primref->upper.xyz;
}
GRL_INLINE void AABB_extend(struct AABB *aabb, struct AABB *v)
{
aabb->lower = min(aabb->lower, v->lower);
aabb->upper = max(aabb->upper, v->upper);
}
GRL_INLINE void AABB_extend_point(struct AABB *aabb, const float4 p)
{
aabb->lower = min(aabb->lower, p);
aabb->upper = max(aabb->upper, p);
}
GRL_INLINE void AABB_extendlu(struct AABB *aabb, const float4 lower, const float4 upper)
{
aabb->lower = min(aabb->lower, lower);
aabb->upper = max(aabb->upper, upper);
}
GRL_INLINE struct AABB AABB_enlarge(struct AABB *aabb, const float v)
{
struct AABB box;
box.lower = aabb->lower - (float4)v;
box.upper = aabb->upper + (float4)v;
return box;
}
GRL_INLINE void AABB_intersect(struct AABB *aabb, struct AABB *v)
{
aabb->lower = max(aabb->lower, v->lower);
aabb->upper = min(aabb->upper, v->upper);
}
GRL_INLINE float4 AABB_size(struct AABB *aabb)
{
return aabb->upper - aabb->lower;
}
GRL_INLINE float4 AABB_centroid2(struct AABB *aabb)
{
return aabb->lower + aabb->upper;
}
GRL_INLINE float AABB_halfArea(struct AABB *aabb)
{
const float4 d = AABB_size(aabb);
return halfarea(d.xyz);
}
GRL_INLINE float AABB_intersecion_size(struct AABB* aabb, struct AABB* v)
{
struct AABB temp = *aabb;
AABB_intersect(&temp, v);
float4 len = AABB_size(&temp);
float ret = 0.0f;
if (len.x >= 0.0f && len.y >= 0.0f && len.z >= 0.0f) {
float3 v = { len.x, len.y, len.z };
ret = halfarea(v);
}
return ret;
}
GRL_INLINE bool AABB_subset(struct AABB* small, struct AABB* big)
{
const int4 b0 = small->lower >= big->lower;
const int4 b1 = small->upper <= big->upper;
const int4 b = b0 & b1;
return b.x & b.y & b.z;
}
GRL_INLINE struct AABB AABBfromAABB3f(const struct AABB3f box)
{
struct AABB box4d = {
{box.lower[0], box.lower[1], box.lower[2], 0.0f},
{box.upper[0], box.upper[1], box.upper[2], 0.0f}
};
return box4d;
}
GRL_INLINE struct AABB3f AABB3fFromAABB(const struct AABB box)
{
struct AABB3f box3d = {
{box.lower[0], box.lower[1], box.lower[2]},
{box.upper[0], box.upper[1], box.upper[2]}
};
return box3d;
}
GRL_INLINE bool AABB_verify(struct AABB* aabb)
{
bool error = false;
if (aabb->lower.x > aabb->upper.x)
error = true;
if (aabb->lower.y > aabb->upper.y)
error = true;
if (aabb->lower.z > aabb->upper.z)
error = true;
if (!isfinite(aabb->lower.x))
error = true;
if (!isfinite(aabb->lower.y))
error = true;
if (!isfinite(aabb->lower.z))
error = true;
if (!isfinite(aabb->upper.x))
error = true;
if (!isfinite(aabb->upper.y))
error = true;
if (!isfinite(aabb->upper.z))
error = true;
return error;
}
GRL_INLINE void AABB_print(struct AABB* aabb)
{
printf("AABB {\n area = %f\n lower = %f\n upper = %f\n geomID = %i primID0 = %i primID1 = %i\n aabb->lower.w = %x aabb->upper.w = %x }\n",
AABB_halfArea(aabb),
aabb->lower.xyz,
aabb->upper.xyz,
PRIMREF_geomID(aabb),
PRIMREF_primID0(aabb),
PRIMREF_primID1(aabb),
as_uint(aabb->lower.w),
as_uint(aabb->upper.w));
}
#ifdef __OPENCL_VERSION__
GRL_INLINE PrimRef PrimRef_sub_group_shuffle(PrimRef* primRef, const uint slotID)
{
PrimRef shuffledPrimref;
shuffledPrimref.lower.x = intel_sub_group_shuffle(primRef->lower.x, slotID);
shuffledPrimref.lower.y = intel_sub_group_shuffle(primRef->lower.y, slotID);
shuffledPrimref.lower.z = intel_sub_group_shuffle(primRef->lower.z, slotID);
shuffledPrimref.lower.w = intel_sub_group_shuffle(primRef->lower.w, slotID);
shuffledPrimref.upper.x = intel_sub_group_shuffle(primRef->upper.x, slotID);
shuffledPrimref.upper.y = intel_sub_group_shuffle(primRef->upper.y, slotID);
shuffledPrimref.upper.z = intel_sub_group_shuffle(primRef->upper.z, slotID);
shuffledPrimref.upper.w = intel_sub_group_shuffle(primRef->upper.w, slotID);
return shuffledPrimref;
}
GRL_INLINE struct AABB AABB_sub_group_broadcast(struct AABB *aabb, const uint slotID)
{
struct AABB bounds;
bounds.lower.x = sub_group_broadcast(aabb->lower.x, slotID);
bounds.lower.y = sub_group_broadcast(aabb->lower.y, slotID);
bounds.lower.z = sub_group_broadcast(aabb->lower.z, slotID);
bounds.lower.w = 0;
bounds.upper.x = sub_group_broadcast(aabb->upper.x, slotID);
bounds.upper.y = sub_group_broadcast(aabb->upper.y, slotID);
bounds.upper.z = sub_group_broadcast(aabb->upper.z, slotID);
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE struct AABB AABB_sub_group_shuffle(struct AABB* aabb, const uint slotID)
{
struct AABB bounds;
bounds.lower.x = intel_sub_group_shuffle(aabb->lower.x, slotID);
bounds.lower.y = intel_sub_group_shuffle(aabb->lower.y, slotID);
bounds.lower.z = intel_sub_group_shuffle(aabb->lower.z, slotID);
bounds.lower.w = 0;
bounds.upper.x = intel_sub_group_shuffle(aabb->upper.x, slotID);
bounds.upper.y = intel_sub_group_shuffle(aabb->upper.y, slotID);
bounds.upper.z = intel_sub_group_shuffle(aabb->upper.z, slotID);
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE uint AABB_sub_group_shuffle_coordPerLane(struct AABB* aabb, const uint slotID)
{
float coordData[8] = {
sub_group_broadcast(aabb->lower.x, slotID),
sub_group_broadcast(aabb->lower.y, slotID),
sub_group_broadcast(aabb->lower.z, slotID),
sub_group_broadcast(aabb->lower.w, slotID),
sub_group_broadcast(aabb->upper.x, slotID),
sub_group_broadcast(aabb->upper.y, slotID),
sub_group_broadcast(aabb->upper.z, slotID),
sub_group_broadcast(aabb->upper.w, slotID) };
uint coordDataFiltered;
const uint lane = get_sub_group_local_id();
if (lane < 8) coordDataFiltered = as_uint(coordData[lane]);
return coordDataFiltered;
}
GRL_INLINE struct AABB AABB_sub_group_reduce(struct AABB *aabb)
{
struct AABB bounds;
bounds.lower.x = sub_group_reduce_min(aabb->lower.x);
bounds.lower.y = sub_group_reduce_min(aabb->lower.y);
bounds.lower.z = sub_group_reduce_min(aabb->lower.z);
bounds.lower.w = 0;
bounds.upper.x = sub_group_reduce_max(aabb->upper.x);
bounds.upper.y = sub_group_reduce_max(aabb->upper.y);
bounds.upper.z = sub_group_reduce_max(aabb->upper.z);
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE struct AABB AABB_sub_group_reduce_N6( struct AABB* aabb )
{
float3 l = aabb->lower.xyz;
float3 u = aabb->upper.xyz;
l = min( l, intel_sub_group_shuffle_down( l, l, 4 ) );
l = min( l, intel_sub_group_shuffle_down( l, l, 2 ) );
l = min( l, intel_sub_group_shuffle_down( l, l, 1 ) );
u = max( u, intel_sub_group_shuffle_down( u, u, 4 ) );
u = max( u, intel_sub_group_shuffle_down( u, u, 2 ) );
u = max( u, intel_sub_group_shuffle_down( u, u, 1 ) );
struct AABB bounds;
bounds.lower.x = l.x;
bounds.lower.y = l.y;
bounds.lower.z = l.z;
bounds.lower.w = 0;
bounds.upper.x = u.x;
bounds.upper.y = u.y;
bounds.upper.z = u.z;
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE struct AABB AABB_work_group_reduce(struct AABB *aabb)
{
struct AABB bounds;
bounds.lower.x = work_group_reduce_min(aabb->lower.x);
bounds.lower.y = work_group_reduce_min(aabb->lower.y);
bounds.lower.z = work_group_reduce_min(aabb->lower.z);
bounds.upper.x = work_group_reduce_max(aabb->upper.x);
bounds.upper.y = work_group_reduce_max(aabb->upper.y);
bounds.upper.z = work_group_reduce_max(aabb->upper.z);
return bounds;
}
GRL_INLINE struct AABB AABB_sub_group_scan_exclusive_min_max(struct AABB *aabb)
{
struct AABB bounds;
bounds.lower.x = sub_group_scan_exclusive_min(aabb->lower.x);
bounds.lower.y = sub_group_scan_exclusive_min(aabb->lower.y);
bounds.lower.z = sub_group_scan_exclusive_min(aabb->lower.z);
bounds.lower.w = 0;
bounds.upper.x = sub_group_scan_exclusive_max(aabb->upper.x);
bounds.upper.y = sub_group_scan_exclusive_max(aabb->upper.y);
bounds.upper.z = sub_group_scan_exclusive_max(aabb->upper.z);
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE struct AABB AABB_sub_group_scan_inclusive_min_max(struct AABB *aabb)
{
struct AABB bounds;
bounds.lower.x = sub_group_scan_inclusive_min(aabb->lower.x);
bounds.lower.y = sub_group_scan_inclusive_min(aabb->lower.y);
bounds.lower.z = sub_group_scan_inclusive_min(aabb->lower.z);
bounds.lower.w = 0;
bounds.upper.x = sub_group_scan_inclusive_max(aabb->upper.x);
bounds.upper.y = sub_group_scan_inclusive_max(aabb->upper.y);
bounds.upper.z = sub_group_scan_inclusive_max(aabb->upper.z);
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE void AABB_global_atomic_merge(global struct AABB *global_aabb, struct AABB *aabb)
{
atomic_min((volatile __global float *)&global_aabb->lower + 0, aabb->lower.x);
atomic_min((volatile __global float *)&global_aabb->lower + 1, aabb->lower.y);
atomic_min((volatile __global float *)&global_aabb->lower + 2, aabb->lower.z);
atomic_max((volatile __global float *)&global_aabb->upper + 0, aabb->upper.x);
atomic_max((volatile __global float *)&global_aabb->upper + 1, aabb->upper.y);
atomic_max((volatile __global float *)&global_aabb->upper + 2, aabb->upper.z);
}
GRL_INLINE void AABB_global_atomic_merge_lu(global struct AABB* global_aabb, float3 lower, float3 upper )
{
atomic_min((volatile __global float*) & global_aabb->lower + 0, lower.x);
atomic_min((volatile __global float*) & global_aabb->lower + 1, lower.y);
atomic_min((volatile __global float*) & global_aabb->lower + 2, lower.z);
atomic_max((volatile __global float*) & global_aabb->upper + 0, upper.x);
atomic_max((volatile __global float*) & global_aabb->upper + 1, upper.y);
atomic_max((volatile __global float*) & global_aabb->upper + 2, upper.z);
}
GRL_INLINE void AABB_global_atomic_merge_sub_group_lu(uniform global struct AABB* aabb, float3 lower, float3 upper)
{
uint lane = get_sub_group_local_id();
float l[3];
l[0] = sub_group_reduce_min(lower.x);
l[1] = sub_group_reduce_min(lower.y);
l[2] = sub_group_reduce_min(lower.z);
float u[3];
u[0] = sub_group_reduce_max(upper.x);
u[1] = sub_group_reduce_max(upper.y);
u[2] = sub_group_reduce_max(upper.z);
if (lane < 3)
{
atomic_min((global float*)&aabb->lower + lane, l[lane]);
atomic_max((global float*)&aabb->upper + lane, u[lane]);
}
}
GRL_INLINE void AABB_local_atomic_merge(local struct AABB *aabb, const float4 lower, const float4 upper)
{
if (lower.x < aabb->lower.x)
atomic_min((local float *)&aabb->lower + 0, lower.x);
if (lower.y < aabb->lower.y)
atomic_min((local float *)&aabb->lower + 1, lower.y);
if (lower.z < aabb->lower.z)
atomic_min((local float *)&aabb->lower + 2, lower.z);
if (upper.x > aabb->upper.x)
atomic_max((local float *)&aabb->upper + 0, upper.x);
if (upper.y > aabb->upper.y)
atomic_max((local float *)&aabb->upper + 1, upper.y);
if (upper.z > aabb->upper.z)
atomic_max((local float *)&aabb->upper + 2, upper.z);
}
#endif
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)

View file

@@ -1,840 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "GRLStructs.h"
#include "shared.h"
#include "libs/lsc_intrinsics.h"
typedef struct Geo GRL_RAYTRACING_GEOMETRY_DESC;
typedef struct GRL_RAYTRACING_AABB
{
float MinX;
float MinY;
float MinZ;
float MaxX;
float MaxY;
float MaxZ;
} GRL_RAYTRACING_AABB;
GRL_INLINE void GLR_set_raytracing_aabb(GRL_RAYTRACING_AABB* dest, struct AABB* source)
{
dest->MinX = source->lower.x;
dest->MinY = source->lower.y;
dest->MinZ = source->lower.z;
dest->MaxX = source->upper.x;
dest->MaxY = source->upper.y;
dest->MaxZ = source->upper.z;
}
GRL_INLINE uint3 GRL_load_triangle(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint triID)
{
global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer;
uint index_format = geomDesc->Desc.Triangles.IndexFormat;
if (index_format == INDEX_FORMAT_R32_UINT)
{
const uint* data = (const uint*)(indices + triID * 3 * 4);
return (uint3)(data[0], data[1], data[2]);
}
else if (index_format == INDEX_FORMAT_NONE)
{
return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2);
}
else
{
const ushort* data = (const ushort*)(indices + triID * 3 * 2);
return (uint3)(data[0], data[1], data[2]);
}
}
GRL_INLINE uint3 GRL_load_indices_from_buffer(global char* indices, const uint index_format, const uint triID)
{
if (index_format == INDEX_FORMAT_R32_UINT)
{
return load_uint3_L1C_L3C((global uint3*)(indices + triID * 3 * 4), 0);
}
else if (index_format == INDEX_FORMAT_NONE)
{
return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2);
}
else
{
const ushort* data = (const ushort*)(indices + triID * 3 * 2);
return (uint3)(data[0], data[1], data[2]);
}
}
// Load all 3 indices from one triangle, and a single index from another
GRL_INLINE uint4 GRL_load_quad_indices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint triID, uint triID_1, ushort fourth_vert)
{
global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer;
uint index_format = geomDesc->Desc.Triangles.IndexFormat;
if (index_format == INDEX_FORMAT_R32_UINT)
{
const uint* data0 = (const uint*)(indices + triID * 3 * 4);
const uint* data1 = (const uint*)(indices + triID_1 * 3 * 4);
return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]);
}
else if (index_format == INDEX_FORMAT_NONE)
{
return (uint4)(triID * 3, triID * 3 + 1, triID * 3 + 2, triID_1 * 3 + fourth_vert);
}
else
{
const ushort* data0 = (const ushort*)(indices + triID * 3 * 2);
const ushort* data1 = (const ushort*)(indices + triID_1 * 3 * 2);
return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]);
}
}
GRL_INLINE void GRL_set_Type(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, GeometryType type)
{
geomDesc->Type = type;
}
GRL_INLINE GeometryType GRL_get_Type(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Type;
}
GRL_INLINE void GRL_set_Flags(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint8_t flags)
{
geomDesc->Flags = flags;
}
GRL_INLINE uint8_t GRL_get_Flags(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Flags;
}
GRL_INLINE void GRL_set_triangles_Transform(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t transform)
{
geomDesc->Desc.Triangles.pTransformBuffer = transform;
}
GRL_INLINE gpuva_t GRL_get_triangles_Transform(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Triangles.pTransformBuffer;
}
GRL_INLINE void GRL_set_triangles_IndexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, IndexFormat format)
{
geomDesc->Desc.Triangles.IndexFormat = format;
}
GRL_INLINE IndexFormat GRL_get_triangles_IndexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Triangles.IndexFormat;
}
GRL_INLINE void GRL_set_triangles_VertexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, VertexFormat format)
{
geomDesc->Desc.Triangles.VertexFormat = format;
}
GRL_INLINE VertexFormat GRL_get_triangles_VertexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Triangles.VertexFormat;
}
GRL_INLINE void GRL_set_triangles_IndexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
{
geomDesc->Desc.Triangles.IndexCount = count;
}
GRL_INLINE dword GRL_get_triangles_IndexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Triangles.IndexCount;
}
GRL_INLINE void GRL_set_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
{
geomDesc->Desc.Triangles.VertexCount = count;
}
GRL_INLINE dword GRL_get_triangles_VertexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Triangles.VertexCount;
}
GRL_INLINE void GRL_set_triangles_IndexBuffer(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t buffer)
{
geomDesc->Desc.Triangles.pIndexBuffer = buffer;
}
GRL_INLINE gpuva_t GRL_get_triangles_IndexBuffer(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Triangles.pIndexBuffer;
}
GRL_INLINE void GRL_set_triangles_VertexBuffer_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address)
{
geomDesc->Desc.Triangles.pVertexBuffer = address;
}
GRL_INLINE gpuva_t GRL_get_triangles_VertexBuffer_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Triangles.pVertexBuffer;
}
GRL_INLINE void GRL_set_triangles_VertexBuffer_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, unsigned long stride)
{
geomDesc->Desc.Triangles.VertexBufferByteStride = stride;
}
GRL_INLINE unsigned long GRL_get_triangles_VertexBuffer_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Triangles.VertexBufferByteStride;
}
GRL_INLINE unsigned long GRL_get_triangles_IndexFormatSizeInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return (unsigned long)(geomDesc->Desc.Triangles.IndexFormat);
}
GRL_INLINE void GRL_set_procedurals_AABBCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
{
geomDesc->Desc.Procedural.AABBCount = count;
}
GRL_INLINE dword GRL_get_procedurals_AABBCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Procedural.AABBCount;
}
GRL_INLINE void GRL_set_procedurals_AABBs_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address)
{
geomDesc->Desc.Procedural.pAABBs_GPUVA = address;
}
GRL_INLINE gpuva_t GRL_get_procedurals_AABBs_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Procedural.pAABBs_GPUVA;
}
GRL_INLINE void GRL_set_procedurals_AABBs_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, qword stride)
{
geomDesc->Desc.Procedural.AABBByteStride = stride;
}
GRL_INLINE qword GRL_get_procedurals_AABBs_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
{
return geomDesc->Desc.Procedural.AABBByteStride;
}
GRL_INLINE uint GRL_is_procedural(GRL_RAYTRACING_GEOMETRY_DESC* desc)
{
return desc->Type == (unsigned char)GEOMETRY_TYPE_PROCEDURAL;
}
GRL_INLINE uint GRL_is_triangle(GRL_RAYTRACING_GEOMETRY_DESC* desc)
{
return desc->Type != (unsigned char)GEOMETRY_TYPE_PROCEDURAL;
}
GRL_INLINE unsigned int GRL_get_ShaderIndex_Mask(GRL_RAYTRACING_GEOMETRY_DESC* desc)
{
return 0x00FFFFFF;
}
GRL_INLINE dword GRL_atomic_add_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* desc, dword value)
{
return atomic_add((global uint*) & desc->Desc.Triangles.VertexCount, value);
}
GRL_INLINE unsigned int GRL_get_primitive_count(GRL_RAYTRACING_GEOMETRY_DESC* desc)
{
if (GRL_is_triangle(desc))
{
if (desc->Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE)
{
return desc->Desc.Triangles.VertexCount / 3;
}
else
{
return desc->Desc.Triangles.IndexCount / 3;
}
}
else
{
return desc->Desc.Procedural.AABBCount;
}
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable // to leaf half values
GRL_INLINE float snorm_to_float(short v)
{
return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 32767.0f))); // FIXME: do we have intrinsic for this?
}
GRL_INLINE float snorm8_to_float(signed char v)
{
return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 127.0f))); // FIXME: do we have intrinsic for this?
}
GRL_INLINE float unorm_to_float(unsigned short v)
{
return min(1.0f, max(0.0f, ((float)v) * (1.0f / 65535.0f))); // FIXME: do we have intrinsic for this?
}
//only lower 10 bits of v are used
GRL_INLINE float unorm10_to_float(unsigned v)
{
const unsigned short mask = (unsigned short)((1u << 10u) - 1u);
const unsigned short v10 = (unsigned short)v & mask;
return min(1.0f, max(0.0f, ((float)v10) * (1.0f / 1023.0f))); // FIXME: do we have intrinsic for this?
}
GRL_INLINE float unorm8_to_float(unsigned char v)
{
return min(1.0f, max(0.0f, ((float)v) * (1.0f / 255.0f))); // FIXME: do we have intrinsic for this?
}
GRL_INLINE float4 GRL_load_vertex(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint vtxID)
{
float4 v = (float4)(0, 0, 0, 0);
global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer;
uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride;
uint vertex_format = geomDesc->Desc.Triangles.VertexFormat;
if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
{
const float* data = (const float*)(vertices + vtxID * vertex_stride);
v = (float4)(data[0], data[1], data[2], 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
{
const float* data = (const float*)(vertices + vtxID * vertex_stride);
v = (float4)(data[0], data[1], 0.0f, 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
{
const half* data = (const half*)(vertices + vtxID * vertex_stride);
v = (float4)(data[0], data[1], data[2], 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
{
const half* data = (const half*)(vertices + vtxID * vertex_stride);
v = (float4)(data[0], data[1], 0.0f, 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
{
const short* data = (const short*)(vertices + vtxID * vertex_stride);
v = (float4)(snorm_to_float(data[0]),
snorm_to_float(data[1]),
snorm_to_float(data[2]),
0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
{
const short* data = (const short*)(vertices + vtxID * vertex_stride);
v = (float4)(snorm_to_float(data[0]),
snorm_to_float(data[1]),
0.0f,
0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
{
const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride);
v = (float4)(unorm_to_float(data[0]),
unorm_to_float(data[1]),
unorm_to_float(data[2]),
0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
{
const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride);
v = (float4)(unorm_to_float(data[0]),
unorm_to_float(data[1]),
0.0f,
0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
{
const unsigned data = *(const unsigned*)(vertices + vtxID * vertex_stride);
v = (float4)(unorm10_to_float(data),
unorm10_to_float((data >> 10)),
unorm10_to_float((data >> 20)),
0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
{
const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
v = (float4)(unorm8_to_float(data[0]),
unorm8_to_float(data[1]),
unorm8_to_float(data[2]),
0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
{
const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
v = (float4)(unorm8_to_float(data[0]),
unorm8_to_float(data[1]),
0.0f,
0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
{
const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
v = (float4)(snorm8_to_float(data[0]),
snorm8_to_float(data[1]),
snorm8_to_float(data[2]),
0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
{
const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
v = (float4)(snorm8_to_float(data[0]),
snorm8_to_float(data[1]),
0.0f,
0.0f);
}
/* perform vertex transformation */
if (geomDesc->Desc.Triangles.pTransformBuffer)
{
global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer;
const float x = xfm[0] * v.x + xfm[1] * v.y + xfm[2] * v.z + xfm[3];
const float y = xfm[4] * v.x + xfm[5] * v.y + xfm[6] * v.z + xfm[7];
const float z = xfm[8] * v.x + xfm[9] * v.y + xfm[10] * v.z + xfm[11];
v = (float4)(x, y, z, 0.0f);
}
return v;
}
GRL_INLINE void GRL_load_triangle_vertices(global char* vertices, const uint vertex_format, const uint vertex_stride, global float* transform_buffer, const uint vtx0ID, const uint vtx1ID, const uint vtx2ID, float4* out)
{
if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
{
const float3 data0 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx0ID * vertex_stride), 0));
const float3 data1 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx1ID * vertex_stride), 0));
const float3 data2 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx2ID * vertex_stride), 0));
out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f);
out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f);
out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
{
const float* data0 = (const float*)(vertices + vtx0ID * vertex_stride);
const float* data1 = (const float*)(vertices + vtx1ID * vertex_stride);
const float* data2 = (const float*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f);
out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f);
out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
{
const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride);
const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride);
const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f);
out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f);
out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
{
const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride);
const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride);
const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f);
out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f);
out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
{
const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride);
const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride);
const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]), 0.0f);
out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]), 0.0f);
out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]), 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
{
const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride);
const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride);
const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f, 0.0f);
out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f, 0.0f);
out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f, 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
{
const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride);
const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride);
const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]), 0.0f);
out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]), 0.0f);
out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]), 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
{
const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride);
const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride);
const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f, 0.0f);
out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f, 0.0f);
out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f, 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
{
const unsigned data0 = *(const unsigned*)(vertices + vtx0ID * vertex_stride);
const unsigned data1 = *(const unsigned*)(vertices + vtx1ID * vertex_stride);
const unsigned data2 = *(const unsigned*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(unorm10_to_float(data0), unorm10_to_float(data0 >> 10), unorm10_to_float(data0 >> 20), 0.0f);
out[1] = (float4)(unorm10_to_float(data1), unorm10_to_float(data1 >> 10), unorm10_to_float(data1 >> 20), 0.0f);
out[2] = (float4)(unorm10_to_float(data2), unorm10_to_float(data2 >> 10), unorm10_to_float(data2 >> 20), 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
{
const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]), 0.0f);
out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]), 0.0f);
out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]), 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
{
const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f, 0.0f);
out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f, 0.0f);
out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f, 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
{
const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]), 0.0f);
out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]), 0.0f);
out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]), 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
{
const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f, 0.0f);
out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f, 0.0f);
out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f, 0.0f);
}
/* perform vertex transformation */
if (transform_buffer)
{
global float* xfm = (global float*)transform_buffer;
for (uint i = 0; i < 3; ++i)
{
const float x = xfm[0] * out[i].x + xfm[1] * out[i].y + xfm[2] * out[i].z + xfm[3];
const float y = xfm[4] * out[i].x + xfm[5] * out[i].y + xfm[6] * out[i].z + xfm[7];
const float z = xfm[8] * out[i].x + xfm[9] * out[i].y + xfm[10] * out[i].z + xfm[11];
out[i] = (float4)(x, y, z, 0.0f);
}
}
}
GRL_INLINE void GRL_load_quad_vertices_no_stride(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
float3* out0, float3* out1, float3* out2, float3* out3,
const uint4 vtxID, const uint vertex_format, global char* vertices)
{
float3 v0, v1, v2, v3;
if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
{
const float* data0 = (const float*)(vertices + vtxID.x);
const float* data1 = (const float*)(vertices + vtxID.y);
const float* data2 = (const float*)(vertices + vtxID.z);
const float* data3 = (const float*)(vertices + vtxID.w);
v0 = (float3)(data0[0], data0[1], data0[2]);
v1 = (float3)(data1[0], data1[1], data1[2]);
v2 = (float3)(data2[0], data2[1], data2[2]);
v3 = (float3)(data3[0], data3[1], data3[2]);
}
else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
{
const float* data0 = (const float*)(vertices + vtxID.x);
const float* data1 = (const float*)(vertices + vtxID.y);
const float* data2 = (const float*)(vertices + vtxID.z);
const float* data3 = (const float*)(vertices + vtxID.w);
v0 = (float3)(data0[0], data0[1], 0.0f);
v1 = (float3)(data1[0], data1[1], 0.0f);
v2 = (float3)(data2[0], data2[1], 0.0f);
v3 = (float3)(data3[0], data3[1], 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
{
const half* data0 = (const half*)(vertices + vtxID.x);
const half* data1 = (const half*)(vertices + vtxID.y);
const half* data2 = (const half*)(vertices + vtxID.z);
const half* data3 = (const half*)(vertices + vtxID.w);
v0 = (float3)(data0[0], data0[1], data0[2]);
v1 = (float3)(data1[0], data1[1], data1[2]);
v2 = (float3)(data2[0], data2[1], data2[2]);
v3 = (float3)(data3[0], data3[1], data3[2]);
}
else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
{
const half* data0 = (const half*)(vertices + vtxID.x);
const half* data1 = (const half*)(vertices + vtxID.y);
const half* data2 = (const half*)(vertices + vtxID.z);
const half* data3 = (const half*)(vertices + vtxID.w);
v0 = (float3)(data0[0], data0[1], 0.0f);
v1 = (float3)(data1[0], data1[1], 0.0f);
v2 = (float3)(data2[0], data2[1], 0.0f);
v3 = (float3)(data3[0], data3[1], 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
{
const short* data0 = (const short*)(vertices + vtxID.x);
const short* data1 = (const short*)(vertices + vtxID.y);
const short* data2 = (const short*)(vertices + vtxID.z);
const short* data3 = (const short*)(vertices + vtxID.w);
v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]));
v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]));
v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]));
v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), snorm_to_float(data3[2]));
}
else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
{
const short* data0 = (const short*)(vertices + vtxID.x);
const short* data1 = (const short*)(vertices + vtxID.y);
const short* data2 = (const short*)(vertices + vtxID.z);
const short* data3 = (const short*)(vertices + vtxID.w);
v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f);
v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f);
v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f);
v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
{
const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x);
const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y);
const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z);
const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w);
v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]));
v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]));
v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]));
v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), unorm_to_float(data3[2]));
}
else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
{
const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x);
const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y);
const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z);
const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w);
v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f);
v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f);
v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f);
v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
{
const unsigned data0 = *(const unsigned*)(vertices + vtxID.x);
const unsigned data1 = *(const unsigned*)(vertices + vtxID.y);
const unsigned data2 = *(const unsigned*)(vertices + vtxID.z);
const unsigned data3 = *(const unsigned*)(vertices + vtxID.w);
v0 = (float3)(unorm10_to_float(data0), unorm10_to_float((data0 >> 10)), unorm10_to_float((data0 >> 20)));
v1 = (float3)(unorm10_to_float(data1), unorm10_to_float((data1 >> 10)), unorm10_to_float((data1 >> 20)));
v2 = (float3)(unorm10_to_float(data2), unorm10_to_float((data2 >> 10)), unorm10_to_float((data2 >> 20)));
v3 = (float3)(unorm10_to_float(data3), unorm10_to_float((data3 >> 10)), unorm10_to_float((data3 >> 20)));
}
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
{
const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x);
const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y);
const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z);
const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w);
v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]));
v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]));
v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]));
v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), unorm8_to_float(data3[2]));
}
else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
{
const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x);
const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y);
const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z);
const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w);
v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f);
v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f);
v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f);
v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), 0.0f);
}
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
{
const signed char* data0 = (const signed char*)(vertices + vtxID.x);
const signed char* data1 = (const signed char*)(vertices + vtxID.y);
const signed char* data2 = (const signed char*)(vertices + vtxID.z);
const signed char* data3 = (const signed char*)(vertices + vtxID.w);
v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]));
v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]));
v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]));
v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), snorm8_to_float(data3[2]));
}
else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
{
const signed char* data0 = (const signed char*)(vertices + vtxID.x);
const signed char* data1 = (const signed char*)(vertices + vtxID.y);
const signed char* data2 = (const signed char*)(vertices + vtxID.z);
const signed char* data3 = (const signed char*)(vertices + vtxID.w);
v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f);
v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f);
v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f);
v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), 0.0f);
}
else
{
v0 = (float3)(0.0f, 0.0f, 0.0f);
v1 = (float3)(0.0f, 0.0f, 0.0f);
v2 = (float3)(0.0f, 0.0f, 0.0f);
v3 = (float3)(0.0f, 0.0f, 0.0f);
}
/* perform vertex transformation */
if (geomDesc->Desc.Triangles.pTransformBuffer)
{
global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer;
v0.xyz = (float3)(
xfm[0] * v0.x + xfm[1] * v0.y + xfm[2] * v0.z + xfm[3],
xfm[4] * v0.x + xfm[5] * v0.y + xfm[6] * v0.z + xfm[7],
xfm[8] * v0.x + xfm[9] * v0.y + xfm[10] * v0.z + xfm[11]
);
v1.xyz = (float3)(
xfm[0] * v1.x + xfm[1] * v1.y + xfm[2] * v1.z + xfm[3],
xfm[4] * v1.x + xfm[5] * v1.y + xfm[6] * v1.z + xfm[7],
xfm[8] * v1.x + xfm[9] * v1.y + xfm[10] * v1.z + xfm[11]
);
v2.xyz = (float3)(
xfm[0] * v2.x + xfm[1] * v2.y + xfm[2] * v2.z + xfm[3],
xfm[4] * v2.x + xfm[5] * v2.y + xfm[6] * v2.z + xfm[7],
xfm[8] * v2.x + xfm[9] * v2.y + xfm[10] * v2.z + xfm[11]
);
v3.xyz = (float3)(
xfm[0] * v3.x + xfm[1] * v3.y + xfm[2] * v3.z + xfm[3],
xfm[4] * v3.x + xfm[5] * v3.y + xfm[6] * v3.z + xfm[7],
xfm[8] * v3.x + xfm[9] * v3.y + xfm[10] * v3.z + xfm[11]
);
}
*out0 = v0;
*out1 = v1;
*out2 = v2;
*out3 = v3;
}
GRL_INLINE void GRL_load_quad_vertices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
float3* out0, float3* out1, float3* out2, float3* out3,
uint4 vtxID)
{
global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer;
uint vertex_format = geomDesc->Desc.Triangles.VertexFormat;
uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride;
vtxID *= vertex_stride;
GRL_load_quad_vertices_no_stride(geomDesc, out0, out1, out2, out3,
vtxID, vertex_format, vertices);
}
GRL_INLINE GRL_RAYTRACING_AABB GRL_load_aabb(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint primID)
{
global char* aabb0 = (global char*)geomDesc->Desc.Procedural.pAABBs_GPUVA;
global char* aabb = aabb0 + (primID * geomDesc->Desc.Procedural.AABBByteStride);
return *(global GRL_RAYTRACING_AABB*)aabb;
}
// same as for d3d12
typedef struct GRL_RAYTRACING_INSTANCE_DESC
{
float Transform[12];
// unsigned int InstanceID : 24;
// unsigned int InstanceMask : 8;
uint32_t DW0;
// unsigned int InstanceContributionToHitGroupIndex : 24;
// unsigned int Flags : 8;
uint32_t DW1;
global char* AccelerationStructure;
} GRL_RAYTRACING_INSTANCE_DESC;
GRL_INLINE float GRL_get_transform(const GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column)
{
return d->Transform[row * 4 + column];
}
GRL_INLINE uint32_t GRL_get_instanceID(const GRL_RAYTRACING_INSTANCE_DESC* d)
{
return d->DW0 & ((1 << 24) - 1);
}
GRL_INLINE uint32_t GRL_get_InstanceMask(const GRL_RAYTRACING_INSTANCE_DESC* d)
{
return d->DW0 >> 24;
}
GRL_INLINE uint32_t GRL_get_InstanceContributionToHitGroupIndex(const GRL_RAYTRACING_INSTANCE_DESC* d)
{
return d->DW1 & ((1 << 24) - 1);
}
GRL_INLINE uint32_t GRL_get_InstanceFlags(const GRL_RAYTRACING_INSTANCE_DESC* d)
{
return d->DW1 >> 24;
}
GRL_INLINE gpuva_t GRL_get_AccelerationStructure(const GRL_RAYTRACING_INSTANCE_DESC* d)
{
return (gpuva_t)d->AccelerationStructure;
}
GRL_INLINE void GRL_set_transform(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column, float value)
{
d->Transform[row * 4 + column] = value;
}
GRL_INLINE void GRL_set_instanceID(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t id)
{
d->DW0 &= 255 << 24;
d->DW0 |= id & ((1 << 24) - 1);
}
GRL_INLINE void GRL_set_InstanceMask(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t mask)
{
d->DW0 &= ((1 << 24) - 1);
d->DW0 |= mask << 24;
}
GRL_INLINE void GRL_set_InstanceContributionToHitGroupIndex(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t contribution)
{
d->DW1 &= 255 << 24;
d->DW1 |= contribution & ((1 << 24) - 1);
}
GRL_INLINE void GRL_set_InstanceFlags(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t flags)
{
d->DW1 &= ((1 << 24) - 1);
d->DW1 |= flags << 24;
}
GRL_INLINE void GRL_set_AccelerationStructure(GRL_RAYTRACING_INSTANCE_DESC* d, gpuva_t address)
{
d->AccelerationStructure = (global char*)address;
}
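
The accessors above pack two 32-bit words: DW0 carries the 24-bit instance ID in its low bits and the 8-bit instance mask in its top byte, and DW1 does the same for the hit-group contribution and the instance flags. A minimal host-side C sketch of that packing (not part of the original header; main() and the test constants are illustrative):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t DW0 = 0;

    /* GRL_set_instanceID: keep the mask byte, replace the low 24 bits */
    DW0 = (DW0 & (255u << 24)) | (0x123456u & ((1u << 24) - 1));
    /* GRL_set_InstanceMask: keep the low 24 bits, replace the top byte */
    DW0 = (DW0 & ((1u << 24) - 1)) | (0xABu << 24);

    assert((DW0 & ((1u << 24) - 1)) == 0x123456u); /* GRL_get_instanceID   */
    assert((DW0 >> 24) == 0xABu);                  /* GRL_get_InstanceMask */
    return 0;
}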

File diff suppressed because it is too large Load diff

View file

@ -1,198 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module atomic_update;
kernel_module atomic_update ("atomic_update.cl")
{
links lsc_intrinsics;
kernel init_refit_scratch < kernelFunction = "init_refit_scratch" >;
kernel traverse_aabbs_quad < kernelFunction = "traverse_aabbs_quad" >;
kernel write_inner_nodes < kernelFunction = "write_inner_nodes" >;
kernel build_fatleaf_table < kernelFunction = "build_fatleaf_table" >;
kernel build_innernode_table < kernelFunction = "build_innernode_table" >;
kernel update_single_group_quads < kernelFunction = "update_single_group_quads" >;
kernel build_fatleaf_table_new_update < kernelFunction = "build_fatleaf_table_new_update" >;
kernel fixup_quad_table < kernelFunction = "fixup_quad_table" >;
kernel traverse_aabbs_new_update < kernelFunction = "traverse_aabbs_new_update" >;
kernel traverse_aabbs_new_update_single_geo < kernelFunction = "traverse_aabbs_new_update_single_geo" >;
}
import struct MKBuilderState "structs.grl";
// this metakernel only initializes registers for use in a batching loop by "init_refit_scratch"
metakernel init_refit_scratch_metakernel_registers()
{
REG0.hi = 0;
REG1 = 3;
REG2 = 63;
REG3 = 4;
REG4 = 2;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
}
metakernel init_refit_scratch( qword bvh_base, qword scratch)//, dword max_inner_nodes )
{
REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
define C_3 REG1;
define C_63 REG2;
define C_4 REG3;
define C_2 REG4;
REG0 = REG0 - C_3; // nodedataCurr - fixed offset
REG0 = REG0 + C_63; // + 63
REG0 = REG0 >> C_4; // >> 4
REG0 = REG0 >> C_2; // >> 2 == >> 6 == /64
DISPATCHDIM_X = REG0.lo;
dispatch_indirect init_refit_scratch//( (max_inner_nodes+63)/64, 1, 1 )
args(bvh_base,scratch);
}
metakernel build_node_tables( qword bvh_base )
{
REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
REG1 = 2;
REG2 = 63;
REG3 = 4;
REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
REG0 = REG0 - REG4; // nodedataCurr - fixed offset
REG0 = REG0 + REG2; // + 63
REG0 = REG0 >> REG3; // >> 4
REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
DISPATCHDIM_X = REG0.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect build_fatleaf_table//( (max_inner_nodes+63)/64, 1, 1 )
args(bvh_base);
dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
args(bvh_base);
}
metakernel build_node_tables_new_update( MKBuilderState state, qword bvh_base )
{
REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
REG1 = 2;
REG2 = 63;
REG3 = 4;
REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
REG0 = REG0 - REG4; // nodedataCurr - fixed offset
REG0 = REG0 + REG2; // + 63
REG0 = REG0 >> REG3; // >> 4
REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
DISPATCHDIM_X = REG0.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect build_fatleaf_table_new_update//( (max_inner_nodes+63)/64, 1, 1 )
args(state.build_globals, bvh_base);
dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
args(bvh_base);
}
metakernel fixup_quad_table( qword bvh_base )
{
dispatch fixup_quad_table(2,1,1)
args(bvh_base);
}
// this metakernel only initializes registers for use in a batching loop by "traverse_aabbs_quad" and "write_inner_nodes"
metakernel init_traverse_aabbs_quad_and_write_inner_nodes()
{
REG0.hi = 0;
REG1 = 1;
REG2 = 31;
REG3 = 4;
REG4 = 2;
REG5 = 7;
REG6 = 255;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
}
metakernel traverse_aabbs_quad( qword bvh_base, qword scratch, qword geos)//, dword max_inner_nodes )
{
REG0.lo = load_dword( bvh_base + 64 ); // TODO: Don't hardcode!
define C_1 REG1;
define C_31 REG2;
define C_4 REG3;
REG0 = REG0 + C_31; // + 31
REG0 = REG0 >> C_4; // >> 4
REG0 = REG0 >> C_1; // >> 1 == >> 5 == /32
DISPATCHDIM_X = REG0.lo;
dispatch_indirect traverse_aabbs_quad//( (max_inner_nodes+32)/32, 1, 1 )
args(bvh_base,scratch,geos);
}
metakernel write_inner_nodes( qword bvh_base, qword scratch )//, dword max_inner_nodes )
{
REG0.lo = load_dword( bvh_base + 68 ); // TODO: Don't hardcode!
define C_1 REG1;
define C_2 REG4;
define C_7 REG5;
REG0 = REG0 + C_7; // + 7
REG0 = REG0 >> C_2; // >> 2
REG0 = REG0 >> C_1; // >> 1 ==> >> 3 (/8)
DISPATCHDIM_X = REG0.lo;
dispatch_indirect write_inner_nodes//( (max_inner_nodes+7)/8, 1, 1 )
args(bvh_base,scratch);
}
metakernel update_single_group_quads( qword bvh_base, qword geos, qword aabbs )
{
dispatch update_single_group_quads(1,1,1) //( (max_inner_nodes+1)/2, 1, 1 )
args(bvh_base,geos,aabbs);
}
metakernel traverse_aabbs_new_update( qword bvh_base, qword geos, qword scratch )
{
REG0.lo = load_dword( bvh_base + 84 ); // TODO: Don't hardcode!
define C_255 REG6;
define C_4 REG3;
REG0 = REG0 + C_255; // + 255
REG0 = REG0 >> C_4; // >> 4
REG0 = REG0 >> C_4; // >> 4, total >> 8 == /256
DISPATCHDIM_X = REG0.lo;
dispatch_indirect traverse_aabbs_new_update//( (max_inner_nodes+255)/256, 1, 1 )
args(bvh_base, geos, scratch);
}
metakernel traverse_aabbs_new_update_single_geo( qword bvh_base, qword vertices, qword geos, qword scratch, dword vertex_format )
{
REG0.lo = load_dword( bvh_base + 84 ); // TODO: Don't hardcode!
define C_255 REG6;
define C_4 REG3;
REG0 = REG0 + C_255; // + 255
REG0 = REG0 >> C_4; // >> 4
REG0 = REG0 >> C_4; // >> 4, total >> 8 == /256
DISPATCHDIM_X = REG0.lo;
dispatch_indirect traverse_aabbs_new_update_single_geo//( (max_inner_nodes+255)/256, 1, 1 )
args(bvh_base, vertices, geos, scratch, vertex_format);
}
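
The register arithmetic in these metakernels implements a ceiling division by a power of two using only adds and power-of-two shifts, split into two shifts where needed (e.g. >> 4 then >> 2 for /64), since the command streamer has no divide. A plain-C sketch of the /64 case used by init_refit_scratch and build_node_tables (the function name and test values are illustrative, not from GRL):

#include <assert.h>
#include <stdint.h>

static uint32_t groups_of_64(uint32_t node_count)
{
    uint32_t n = node_count + 63; /* + 63                                 */
    n >>= 4;                      /* >> 4                                 */
    n >>= 2;                      /* >> 2, total >> 6 == ceil(count / 64) */
    return n;
}

int main(void)
{
    assert(groups_of_64(1)  == 1);
    assert(groups_of_64(64) == 1);
    assert(groups_of_64(65) == 2);
    return 0;
}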

View file

@ -1,265 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
//
// This file contains structure definitions shared by GRL OCL kernels and host code
//
#include "GRLGen12.h"
#pragma once
#define BFS_NUM_BINS 16
#define BFS_NUM_VCONTEXTS 256
#define BFS_MAX_DEPTH 32
#define TRIVIAL_BUILD_THRESHOLD 6
#define SINGLE_WG_BUILD_THRESHOLD 256
#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
typedef uchar vcontext_id_t;
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
GRL_NAMESPACE_BEGIN(GPUBVHBuilder)
struct BFS_Split
{
float sah;
int dim;
int pos;
};
struct BFS_BinInfo
{
float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
// The 6 are lower(xyz) and -upper(xyz)
// bins use negated-max so that we can use vectorized mins instead of min/max pairs
uint counts[3 * BFS_NUM_BINS];
};
enum_uint8(SAHBuildFlags)
{
SAH_FLAG_NEED_BACKPOINTERS = 1, // identifies a mixed internal node where each child can have a different type
SAH_FLAG_NEED_MASKS = 2
};
struct SAHBuildGlobals
{
qword p_primref_index_buffers;
qword p_primrefs_buffer;
qword p_bvh2;
qword p_globals; // TODO: deprecate this
qword p_bvh_base;
gpuva_t p_qnode_root_buffer;
dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
dword num_primrefs;
dword leaf_size;
dword leaf_type;
dword root_buffer_num_produced;
dword root_buffer_num_produced_hi;
dword root_buffer_num_consumed;
dword root_buffer_num_consumed_hi;
dword root_buffer_num_to_consume;
dword root_buffer_num_to_consume_hi;
};
struct SAHBuildBuffersInfo
{
gpuva_t p_globals;
gpuva_t p_primref_index_buffers;
gpuva_t p_primrefs_buffer;
gpuva_t p_bvh2;
gpuva_t p_bvh_base;
gpuva_t p_qnode_root_buffer;
dword sah_globals_flags;
dword _pad;
gpuva_t _pad2;
};
typedef union LRBounds
{
struct
{
struct AABB3f left_centroid_bounds;
struct AABB3f left_geom_bounds;
struct AABB3f right_centroid_bounds;
struct AABB3f right_geom_bounds;
} boxes;
struct
{
float Array[24];
} scalars;
} LRBounds;
struct VContext
{
uint dispatch_primref_begin; // range of primrefs for this task
uint dispatch_primref_end;
uint bvh2_root; // BVH2 root node for this task
uint tree_depth; // depth of this node in the tree
uint num_left; // primref counts
uint num_right;
uint lr_mask; // lower 8b : left mask. upper 8b : right mask
uint batch_index;
// pass1 global working state and output
struct BFS_Split split;
struct BFS_BinInfo global_bin_info;
// pass2 global working state and output
LRBounds lr_bounds;
};
struct BFSDispatchRecord
{
ushort batch_index;
ushort context_id;
};
struct BFSDispatchQueue
{
uint num_dispatches;
uint wg_count[BFS_NUM_VCONTEXTS];
struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
};
struct BFS1SpillStackEntry
{
uint primref_begin;
uint primref_end;
uint bvh2_root;
ushort tree_depth;
ushort batch_index;
};
struct BFS1SpillStack
{
uint size;
struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
};
struct QNodeGlobalRootBufferEntry
{
uint bvh2_node;
uint qnode;
uint build_idx;
uint _pad;
};
struct QNodeGlobalRootBuffer
{
uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
};
struct DFSDispatchRecord
{
uint primref_base;
uint bvh2_base;
uint batch_index;
ushort num_primrefs;
ushort tree_depth;
};
struct DFSDispatchQueue
{
struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
};
#define VCONTEXT_STATE_EXECUTING 0
#define VCONTEXT_STATE_UNALLOCATED 1
union SchedulerUnion
{
struct VContextScheduler
{
/////////////////////////////////////////////////////////////
// State data used for communication with command streamer
// NOTE: This part must match definition in 'new_sah_builder.grl'
/////////////////////////////////////////////////////////////
dword num_bfs_wgs;
dword num_dfs_wgs;
dword scheduler_postsync;
dword _pad1;
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
dword batched_build_loop_mask; // value is 0 if #builds <= #contexts, else 1. The command streamer uses this as a loop condition
/////////////////////////////////////////////////////////////
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
dword vcontext_state[BFS_NUM_VCONTEXTS];
struct BFSDispatchQueue bfs_queue;
struct DFSDispatchQueue dfs_queue;
struct VContext contexts[BFS_NUM_VCONTEXTS];
struct BFS1SpillStack bfs2_spill_stack;
} vContextScheduler;
struct QnodeScheduler
{
dword num_qnode_grb_curr_entries;
dword num_qnode_grb_new_entries;
dword scheduler_postsync;
dword _pad1;
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
dword batched_builds_to_process;
dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
/////////////////////////////////////////////////////////////
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
struct QNodeGlobalRootBuffer qnode_global_root_buffer;
} qnodeScheduler;
};
struct BVH2Node
{
struct AABB3f box;
uint meta_u; // leaf: primref start. inner: offset from node to its first child
uint meta_ss;
//ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
//uchar is_inner; // 1 if inner, 0 if leaf
//uchar mask;
};
struct BVH2
{
uint num_nodes;
uint _pad[7]; // align to 32B
};
GRL_NAMESPACE_END(GPUBVHBuilder)
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)
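
For a sense of scale, the per-context binning scratch above is sizable: each BFS_BinInfo holds 18 floats and 3 counters per bin. A small host-side sketch that works out the footprint (the program itself is illustrative; the constants mirror the header above):

#include <stdio.h>

#define BFS_NUM_BINS      16
#define BFS_NUM_VCONTEXTS 256

int main(void)
{
    size_t min_max_bytes = sizeof(float)    * 18 * BFS_NUM_BINS; /* 1152 */
    size_t counts_bytes  = sizeof(unsigned) *  3 * BFS_NUM_BINS; /*  192 */
    size_t per_context   = min_max_bytes + counts_bytes;         /* 1344 */

    printf("BFS_BinInfo per context: %zu bytes\n", per_context);
    printf("all %d contexts:         %zu bytes\n",
           BFS_NUM_VCONTEXTS, per_context * BFS_NUM_VCONTEXTS);
    return 0;
}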

View file

@ -1,206 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module leaf_builder;
kernel_module leaf_kernels ("bvh_build_leaf.cl")
{
links lsc_intrinsics;
kernel opencl_kernel_primref_to_quads < kernelFunction="primref_to_quads" >;
kernel opencl_kernel_primref_to_procedurals < kernelFunction="primref_to_procedurals" >;
kernel opencl_kernel_create_HW_instance_nodes < kernelFunction="create_HW_instance_nodes" >;
kernel opencl_kernel_create_HW_instance_nodes_pointers < kernelFunction="create_HW_instance_nodes_pointers" >;
}
import struct MKBuilderState "structs.grl";
import struct MKSizeEstimate "structs.grl";
const Instances_GROUPSIZE = 16;
metakernel buildLeafDXR_instances(
MKBuilderState state,
qword build_primref_index_buffers,
qword srcInstanceDescrArray,
dword stride,
dword offset,
dword numPrims)
{
define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE;
dispatch opencl_kernel_create_HW_instance_nodes(num_groups,1,1) args(
state.build_globals,
build_primref_index_buffers,
state.build_primref_buffer,
state.bvh_buffer,
srcInstanceDescrArray,
stride,
offset);
}
metakernel buildLeafDXR_instances_indirect(
MKBuilderState state,
qword build_primref_index_buffers,
qword srcInstanceDescrArray,
qword indirectBuildRangeInfo,
dword stride,
dword offset)
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // Instances_GROUPSIZE - 1
C_4 = 4; // log_2(Instances_GROUPSIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_create_HW_instance_nodes args(
state.build_globals,
build_primref_index_buffers,
state.build_primref_buffer,
state.bvh_buffer,
srcInstanceDescrArray,
stride,
offset);
}
metakernel buildLeafDXR_instances_pointers(
MKBuilderState state,
qword build_primref_index_buffers,
qword srcInstanceDescrArrayPtr,
dword stride,
dword offset,
dword numPrims)
{
define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE;
dispatch opencl_kernel_create_HW_instance_nodes_pointers(num_groups,1,1) args(
state.build_globals,
build_primref_index_buffers,
state.build_primref_buffer,
state.bvh_buffer,
srcInstanceDescrArrayPtr,
stride,
offset);
}
metakernel buildLeafDXR_instances_pointers_indirect(
MKBuilderState state,
qword build_primref_index_buffers,
qword srcInstanceDescrArrayPtr,
qword indirectBuildRangeInfo,
dword stride,
dword offset)
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // Instances_GROUPSIZE - 1
C_4 = 4; // log_2(Instances_GROUPSIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_create_HW_instance_nodes_pointers args(
state.build_globals,
build_primref_index_buffers,
state.build_primref_buffer,
state.bvh_buffer,
srcInstanceDescrArrayPtr,
stride,
offset);
}
metakernel buildLeafDXR_procedurals(
MKBuilderState state,
qword build_primref_index_buffers,
dword stride,
dword offset,
qword p_numPrimitives)
{
define C_1 REG0;
define REG_PRIMS_PER_WG REG1;
define REG_PRIMS_PER_WG_SHR REG2;
C_1 = 1;
REG_PRIMS_PER_WG = 16;
REG_PRIMS_PER_WG_SHR = 4; // We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements)
define reg_numPrimitives REG3;
define reg_num_wgs REG4;
reg_numPrimitives = load_dword(p_numPrimitives);
reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG;
reg_num_wgs = reg_num_wgs - C_1;
reg_num_wgs = reg_num_wgs >> REG_PRIMS_PER_WG_SHR;
DISPATCHDIM_X = reg_num_wgs;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_primref_to_procedurals args(
state.build_globals,
state.build_primref_buffer,
build_primref_index_buffers,
state.bvh_buffer,
state.geomDesc_buffer,
stride,
offset);
}
metakernel buildLeafDXR_quads(
MKBuilderState state,
qword build_primref_index_buffers,
dword stride,
dword offset,
qword p_numPrimitives,
dword allow_update)
{
define C_1 REG0;
define REG_PRIMS_PER_WG REG1;
define SHIFT REG2;
C_1 = 1;
REG_PRIMS_PER_WG = 32;
SHIFT = 4; // We cannot use div, so we use shift right instead (shift by 4 then by 1 = div by 32 elements)
define reg_numPrimitives REG3;
define reg_num_wgs REG4;
reg_numPrimitives = load_dword(p_numPrimitives);
reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG;
reg_num_wgs = reg_num_wgs - C_1;
reg_num_wgs = reg_num_wgs >> SHIFT;
reg_num_wgs = reg_num_wgs >> C_1;
DISPATCHDIM_X = reg_num_wgs;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_primref_to_quads args(
state.build_globals,
state.build_primref_buffer,
build_primref_index_buffers,
state.bvh_buffer,
state.geomDesc_buffer,
stride,
offset,
allow_update);
}

View file

@ -1,229 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module build_primref;
kernel_module primref_kernels ("bvh_build_primref.cl")
{
links lsc_intrinsics;
kernel opencl_kernel_primrefs_from_DXR_instances < kernelFunction="primrefs_from_DXR_instances" >;
kernel opencl_kernel_primrefs_from_DXR_instances_indirect < kernelFunction="primrefs_from_DXR_instances_indirect" >;
kernel opencl_kernel_primrefs_from_DXR_instances_pointers < kernelFunction="primrefs_from_DXR_instances_pointers" >;
kernel opencl_kernel_primrefs_from_DXR_instances_pointers_indirect < kernelFunction="primrefs_from_DXR_instances_pointers_indirect" >;
kernel opencl_kernel_triangles_to_primrefs < kernelFunction="triangles_to_primrefs" >;
kernel opencl_kernel_triangles_to_primrefs_indirect < kernelFunction="triangles_to_primrefs_indirect" >;
kernel opencl_kernel_procedurals_to_primrefs < kernelFunction="procedurals_to_primrefs" >;
kernel opencl_kernel_procedurals_to_primrefs_indirect < kernelFunction="procedurals_to_primrefs_indirect" >;
}
import struct MKBuilderState "structs.grl";
import struct MKSizeEstimate "structs.grl";
const PrimirefsFromInstances_GROUPSIZE = 16;
metakernel buildPrimirefsFromInstances(
qword instanceDescBuff,
MKSizeEstimate estimate,
MKBuilderState build_state,
dword allowUpdate)
{
define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE);
dispatch opencl_kernel_primrefs_from_DXR_instances(num_groups,1,1) args(
build_state.build_globals,
build_state.bvh_buffer,
instanceDescBuff,
estimate.numPrimitives,
build_state.build_primref_buffer,
allowUpdate);
}
metakernel buildPrimirefsFromInstancesIndirect(
qword instanceDescBuff,
qword indirectBuildRangeInfo,
MKBuilderState build_state,
dword allowUpdate)
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_indirect args(
build_state.build_globals,
build_state.bvh_buffer,
instanceDescBuff,
indirectBuildRangeInfo,
build_state.build_primref_buffer,
allowUpdate);
}
metakernel buildPrimirefsFromInstancesArrOfPtrs(
qword instanceDescPtrArrayBuff,
MKSizeEstimate estimate,
MKBuilderState build_state,
dword allowUpdate)
{
define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE);
dispatch opencl_kernel_primrefs_from_DXR_instances_pointers(num_groups,1,1) args(
build_state.build_globals,
build_state.bvh_buffer,
instanceDescPtrArrayBuff,
estimate.numPrimitives,
build_state.build_primref_buffer,
allowUpdate);
}
metakernel buildPrimirefsFromInstancesArrOfPtrsIndirect(
qword instanceDescPtrArrayBuff,
qword indirectBuildRangeInfo,
MKSizeEstimate estimate,
MKBuilderState build_state,
dword allowUpdate)
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_pointers_indirect args(
build_state.build_globals,
build_state.bvh_buffer,
instanceDescPtrArrayBuff,
build_state.build_primref_buffer,
indirectBuildRangeInfo,
allowUpdate);
}
metakernel primrefs_from_tris(
MKBuilderState build_state,
MKSizeEstimate estimate,
qword geo_ptr,
dword geom_id,
dword geom_flags,
dword num_prims)
{
define num_threads ((num_prims+15)/16);
dispatch opencl_kernel_triangles_to_primrefs(num_threads,1,1) args(
build_state.build_globals,
build_state.bvh_buffer,
build_state.build_primref_buffer,
geo_ptr,
(geom_id & 0x00ffffff) + (geom_flags<<24),
num_prims);
}
metakernel primrefs_from_tris_indirect(
MKBuilderState build_state,
MKSizeEstimate estimate,
qword geo_ptr,
qword indirectBuildRangeInfo,
dword geom_id,
dword geom_flags)
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_triangles_to_primrefs_indirect args(
build_state.build_globals,
build_state.bvh_buffer,
build_state.build_primref_buffer,
geo_ptr,
indirectBuildRangeInfo,
(geom_id & 0x00ffffff) + (geom_flags << 24));
}
metakernel primrefs_from_proc(
MKBuilderState build_state,
MKSizeEstimate estimate,
qword geo_ptr,
dword geom_id,
dword geom_flags,
dword num_prims)
{
define num_threads ((num_prims+15)/16);
dispatch opencl_kernel_procedurals_to_primrefs(num_threads,1,1) args(
build_state.build_globals,
build_state.bvh_buffer,
build_state.build_primref_buffer,
geo_ptr,
(geom_id & 0x00ffffff) + (geom_flags<<24),
num_prims);
}
metakernel primrefs_from_proc_indirect(
MKBuilderState build_state,
MKSizeEstimate estimate,
qword geo_ptr,
qword indirectBuildRangeInfo,
dword geom_id,
dword geom_flags)
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_procedurals_to_primrefs_indirect args(
build_state.build_globals,
build_state.bvh_buffer,
build_state.build_primref_buffer,
geo_ptr,
indirectBuildRangeInfo,
(geom_id & 0x00ffffff) + (geom_flags<<24));
}
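
These primref metakernels hand each kernel a single dword combining the geometry index and its flags: the low 24 bits carry geom_id and the top 8 bits carry geom_flags. A plain-C sketch of the packing and its inverse (the unpacking side is an assumption about how the consuming kernel reads the word back; the test values are illustrative):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t geom_id    = 0x00012345u;
    uint32_t geom_flags = 0x3u;

    /* as passed in the args(...) lists above */
    uint32_t packed = (geom_id & 0x00ffffffu) + (geom_flags << 24);

    assert((packed & 0x00ffffffu) == geom_id);    /* assumed unpack of the ID    */
    assert((packed >> 24)         == geom_flags); /* assumed unpack of the flags */
    return 0;
}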

View file

@ -1,324 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module build_refit;
kernel_module morton_kernels ("bvh_build_refit.cl")
{
links lsc_intrinsics;
kernel update_instance_leaves < kernelFunction="update_instance_leaves" >;
kernel refit_indirect_sg < kernelFunction="Refit_indirect_sg" >;
kernel update_instance_leaves_indirect < kernelFunction="update_instance_leaves_indirect" >;
}
const INSTANCE_LEAF_GROUP_SIZE = 16;
const REFIT_GROUP_SIZE = 8;
metakernel update_instance_leaves(
qword bvh,
qword dxrInstancesArray,
qword dxrInstancesPtrArray,
qword instance_leaf_aabbs,
dword num_instances )
{
define num_groups (num_instances + INSTANCE_LEAF_GROUP_SIZE - 1) / INSTANCE_LEAF_GROUP_SIZE;
dispatch update_instance_leaves(num_groups, 1, 1) args(
bvh,
dxrInstancesArray,
dxrInstancesPtrArray,
instance_leaf_aabbs);
}
metakernel update_instance_leaves_indirect(
qword bvh,
qword dxrInstancesArray,
qword dxrInstancesPtrArray,
qword instance_leaf_aabbs,
qword indirectBuildRangeInfo)
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // INSTANCE_LEAF_GROUP_SIZE - 1
C_4 = 4; // log_2(INSTANCE_LEAF_GROUP_SIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / INSTANCE_LEAF_GROUP_SIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect update_instance_leaves_indirect args(
bvh,
dxrInstancesArray,
dxrInstancesPtrArray,
instance_leaf_aabbs,
indirectBuildRangeInfo);
}
/*
metakernel refit(
qword bvh,
qword geomDesc,
qword instance_aabbs,
dword dispatchSize )
{
define num_groups (dispatchSize + REFIT_GROUP_SIZE - 1) / REFIT_GROUP_SIZE;
dispatch refit(num_groups, 1, 1) args(
bvh,
geomDesc,
instance_aabbs);
}
const REFIT_SIMD_SIZE = 8;
const REFIT_SIMD_SIZE_SHIFT = 3;
metakernel refit_indirect(
qword bvh,
qword bvh_inner_nodes_start_value,
qword bvh_inner_nodes_end,
qword geomDesc,
qword instance_aabbs )
{
define cRoundingSIMD REG4;
define TWO REG3;
define ONE REG5;
cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
TWO = 2;
ONE = 1;
REG0 = bvh_inner_nodes_start_value;
REG1 = load_dword(bvh_inner_nodes_end);
REG1.hi = 0;
REG2 = REG1 - REG0;
REG2 = REG2 + cRoundingSIMD;
REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
DISPATCHDIM_X = REG2.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect refit_indirect args(
bvh,
geomDesc,
instance_aabbs);
}
*/
metakernel refit_indirect_sg(
qword bvh,
qword bvh_inner_nodes_start_value,
qword bvh_inner_nodes_end,
qword geomDesc,
qword instance_aabbs )
{
REG0 = bvh_inner_nodes_start_value;
REG1.lo = load_dword(bvh_inner_nodes_end);
REG1.hi = 0;
REG2 = REG1 - REG0;
DISPATCHDIM_X = REG2.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect refit_indirect_sg args(
bvh,
geomDesc,
instance_aabbs);
}
/*
////////////////////////////////////////////////////////////////
// constructing treelets
// phase 1: mark nodes that will be roots of bottom treelets
// also for each node leave a number of startpoints that are under it and max depth of the path from the node
metakernel find_refit_treelets(
qword bvh,
qword treelet_node_data,
qword scratch_startpoints,
qword startpointAlloc,
qword bvh_inner_nodes_start_value,
qword bvh_inner_nodes_end )
{
define cRoundingSIMD REG4;
define TWO REG3;
define ONE REG5;
cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
TWO = 2;
ONE = 1;
REG0 = bvh_inner_nodes_start_value;
REG1.lo = load_dword(bvh_inner_nodes_end);
REG1.hi = 0;
REG2 = REG1 - REG0;
REG2 = REG2 + cRoundingSIMD;
REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
DISPATCHDIM_X = REG2.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect find_refit_treelets args(
bvh,
treelet_node_data,
scratch_startpoints,
startpointAlloc);
}
////////////////////////////////////////////////////////////////
// constructing treelets
// phase 2 totally parallel, run threads up to assign startpoints to given treelet
//
metakernel assign_refit_startpoints_to_treelets(
qword bvh,
qword treelet_node_data,
qword scratch_startpoints,
qword bvh_inner_nodes_start_value,
qword bvh_inner_nodes_end )
{
define cRoundingSIMD REG4;
define TWO REG3;
define ONE REG5;
cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
TWO = 2;
ONE = 1;
REG0 = bvh_inner_nodes_start_value;
REG1.lo = load_dword(bvh_inner_nodes_end);
REG1.hi = 0;
REG2 = REG1 - REG0;
REG2 = REG2 + cRoundingSIMD;
REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
DISPATCHDIM_X = REG2.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect assign_refit_startpoints_to_treelets args(
bvh,
treelet_node_data,
scratch_startpoints);
}
////////////////////////////////////////////////////////////////
// constructing treelets
// phase 3 local work: one group per treelet, sort the startpoints in the treelet by length of the path
metakernel finalize_treelets_in_groups(
qword bvh,
qword scratch_startpoints,
qword ptrNumTreelets )
{
REG0 = load_qword(ptrNumTreelets);
DISPATCHDIM_X = REG0.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect finalize_treelets_in_groups args(
bvh,
scratch_startpoints);
}
////////////////////////////////////////////////////////////////
// Updating treelets
// phase 1 update vertex and generate boxes for vertices
//
const PER_GROUP_ELEMENTS_ROUNDING = 15;
const PER_GROUP_ELEMENTS_SHIFT = 4;
metakernel init_treelets_refit(qword pSquashGroupsCountToReset)
{
REG1 = 0;
store_qword(pSquashGroupsCountToReset, REG1);
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
//REG4 = PER_GROUP_ELEMENTS_SHIFT;
//REG5.hi = PER_GROUP_ELEMENTS_ROUNDING;
//REG5.lo = 0;
}
metakernel update_quads(
qword scratch_box,
qword bvh,
qword input,
dword numPrimsDividedBy32,
qword bigSquashInput)
{
//REG0 = load_qword(quads_nodes_begin_end_pair);
//REG1.hi = REG0.lo; // this holds inner nodes begin
//REG2 = REG0 - REG1;
//REG2 = REG2 + REG5;
//REG2 = REG2 >> REG4;
//DISPATCHDIM_X = REG2.hi;
dispatch refit_quads(numPrimsDividedBy32, 1, 1) args(
bvh,
input,
scratch_box,
numPrimsDividedBy32,
bigSquashInput );
}
//
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
//
// phase 1 or 2 - update primitives as well as bottom up refit internal nodes
// in single dispatch (in single group per tree)
metakernel refit_tree_by_group_including_quads(
qword squashed_inputs,
dword numBuilds
)
{
dispatch refit_tree_per_group(numBuilds, 1, 1) args(
squashed_inputs);
}
//
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
//
// phase 2 bottom up refit internal nodes
//
metakernel refit_treelet_per_group(
qword bigSquashInput,
qword ptrNumTreelets)
{
DISPATCHDIM_X = load_dword(ptrNumTreelets);
dispatch_indirect refit_treelet_per_group args(
bigSquashInput);
}
//
////////////////////////////////////////////////////////////////
#endif
*/
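
Two dispatch sizings appear in this file: the commented-out refit_indirect path packs REFIT_SIMD_SIZE (8) inner nodes per workgroup, rounding up with a pair of shifts, while the active refit_indirect_sg path launches one workgroup per inner node in the start..end range. A plain-C sketch of both computations (function names and test values are illustrative, not from GRL):

#include <assert.h>
#include <stdint.h>

static uint32_t refit_indirect_groups(uint32_t first_node, uint32_t end_node)
{
    uint32_t n = end_node - first_node;   /* number of inner nodes   */
    n += 8 - 1;                           /* + (REFIT_SIMD_SIZE - 1) */
    n >>= 2;                              /* >> 2, then              */
    n >>= 1;                              /* >> 1, total >> 3 == /8  */
    return n;
}

static uint32_t refit_indirect_sg_groups(uint32_t first_node, uint32_t end_node)
{
    return end_node - first_node;         /* one workgroup per node  */
}

int main(void)
{
    assert(refit_indirect_groups(0, 17)    == 3);
    assert(refit_indirect_sg_groups(0, 17) == 17);
    return 0;
}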

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,357 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "api_interface.h"
#include "common.h"
#include "instance.h"
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(32, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel
primref_to_quads(global struct Globals *globals,
global struct AABB *primref,
global char *primref_index,
global char *bvh_mem,
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
const uint stride,
const uint offset,
const uint allow_update)
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );
uint quadIndicesStart = bvh->quadIndicesDataStart;
const uint numPrimitives = globals->numPrimitives;
uint i = get_group_id( 0 ) * get_local_size( 0 ) + get_local_id(0);
if (i < numPrimitives)
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
const uint primrefID = *(uint *)(primref_index + i * stride + offset);
const uint geomID = PRIMREF_geomID(&primref[primrefID]);
const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);
const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);
const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);
uint vertex_stride = geomDesc[geomID].Desc.Triangles.VertexBufferByteStride;
const uint4 indices = q.a;
const uint mask = 0xff; // FIXME: hardcoded mask
float3 vtx0, vtx1, vtx2, vtx3;
GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);
uint j0 = q.lb.x;
uint j1 = q.lb.y;
uint j2 = q.lb.z;
uint shaderIndex = (mask << 24) | geomID;
uint geomIndex = geomID | (geomFlags << 30);
uint primIndex0 = primID0;
const uint delta = primID1 - primID0;
const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4));
uint primIndex1Delta = delta | (j << 16) | (1 << 22);
uint4 pack0 = (uint4)(shaderIndex, geomIndex, primIndex0, primIndex1Delta);
float4 pack1 = (float4)(vtx0.x, vtx0.y, vtx0.z, vtx1.x);
float4 pack2 = (float4)(vtx1.y, vtx1.z, vtx2.x, vtx2.y);
float4 pack3 = (float4)(vtx2.z, vtx3.x, vtx3.y, vtx3.z);
global uint4* dst = (global uint4*)&quads[i];
store_uint4_L1WB_L3WB(dst, 0, pack0);
store_uint4_L1WB_L3WB(dst, 1, as_uint4(pack1));
store_uint4_L1WB_L3WB(dst, 2, as_uint4(pack2));
store_uint4_L1WB_L3WB(dst, 3, as_uint4(pack3));
if(allow_update)
{
global uint4* vertex_indice_ptr = (global uint4*)(((char*)bvh) + (64u * quadIndicesStart + 32 * i));
uint4 pack_indices = (uint4) ( indices.x , indices.y, indices.z, indices.w );
store_uint4_L1WB_L3WB( vertex_indice_ptr, 0, pack0 );
store_uint4_L1WB_L3WB( vertex_indice_ptr, 1, pack_indices * vertex_stride);
}
if (i == 0)
bvh->quadLeafCur += numPrimitives ;
}
#if 0
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );
const uint numPrimitives = globals->numPrimitives;
const uint startID = get_group_id( 0 ) * get_local_size( 0 );
const uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
{
const uint primrefID = *(uint *)(primref_index + i * stride + offset);
const uint geomID = PRIMREF_geomID(&primref[primrefID]);
const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);
const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);
const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);
const uint4 indices = q.a;
const uint mask = 0xff; // FIXME: hardcoded mask
float3 vtx0, vtx1, vtx2, vtx3;
GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);
setQuad(&quads[i], (float4)(vtx0,0), (float4)(vtx1,0), (float4)(vtx2,0), (float4)(vtx3,0), q.lb.x, q.lb.y, q.lb.z, geomID, primID0, primID1, mask, geomFlags );
}
if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0)
bvh->quadLeafCur += numPrimitives ;
#endif
}
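/*
 * Hedged host-side sketch (plain C, not part of the original source): it
 * only mirrors the primIndex1Delta packing visible in primref_to_quads
 * above: primID delta in bits 0..15, the packed j0/j1/j2 field in bits
 * 16..21, and the 1 << 22 flag set for a paired quad. Any meaning beyond
 * what the packing code shows is an assumption; names are illustrative.
 */
#include <stdint.h>
#include <stdio.h>
static uint32_t pack_prim1_delta(uint32_t primID0, uint32_t primID1,
                                 uint32_t j0, uint32_t j1, uint32_t j2)
{
    const uint32_t delta = primID1 - primID0;              /* bits 0..15   */
    const uint32_t j = (j0 << 0) | (j1 << 2) | (j2 << 4);  /* bits 16..21  */
    return delta | (j << 16) | (1u << 22);                 /* bit 22: pair */
}
static void unpack_prim1_delta(uint32_t packed, uint32_t primID0,
                               uint32_t *primID1,
                               uint32_t *j0, uint32_t *j1, uint32_t *j2)
{
    *primID1 = primID0 + (packed & 0xFFFFu);
    *j0 = (packed >> 16) & 0x3u;
    *j1 = (packed >> 18) & 0x3u;
    *j2 = (packed >> 20) & 0x3u;
}
int main(void)
{
    uint32_t p1, j0, j1, j2;
    const uint32_t packed = pack_prim1_delta(10, 13, 1, 2, 0);
    unpack_prim1_delta(packed, 10, &p1, &j0, &j1, &j2);
    printf("primID1=%u j=(%u,%u,%u)\n", p1, j0, j1, j2);   /* 13 (1,2,0) */
    return 0;
}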
GRL_INLINE void create_procedural_leaf(global struct Globals *globals,
global struct AABB *primref,
local uint *primrefids,
uint numProcedurals,
struct QBVHNodeN *qnode,
global char *bvh_mem,
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
if (get_local_id(0) >= 8)
return;
global struct BVHBase* bvh_base = (global struct BVHBase*)bvh_mem;
/* first read geomID of all primitives */
uint primrefID = -1;
uint geomID = -1;
uint geomFlags = 0;
if (get_local_id(0) < numProcedurals)
{
primrefID = primrefids[get_local_id(0)];
geomID = PRIMREF_geomID(&primref[primrefID]);
geomFlags = PRIMREF_geomFlags( &primref[primrefID] );
}
// cannot sort by geomID as bounds in parent node are then wrong
//ulong geomID_primrefID = (((ulong)geomID) << 32) | ((ulong)primrefID);
//geomID_primrefID = sort8_ascending_ulong(geomID_primrefID);
//geomID = geomID_primrefID >> 32;
//primrefID = geomID_primrefID;
/* We have to split at geomID boundaries into multiple leaves. This
* block calculates the lane where a leaf starts and ends. */
const uint geomIDprev = intel_sub_group_shuffle_up(0xFFFFFFFFu, geomID, 1u);
const uint geomIDnext = intel_sub_group_shuffle_down(geomID, 0xFFFFFFFFu, 1u);
const uint leaf_start = geomIDprev != geomID;
const uint leaf_end = geomIDnext != geomID;
const uint leaf_start_next = intel_sub_group_shuffle_down(leaf_start, 0u, 1u);
/* This computes which leaf a lane processes. E.g. from geomID =
* [3,3,4,4,4,0] we get leaf_id = [0,0,1,1,1,2] */
//const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); // FIXME: exclusive?
/* This computes the n'th primitive a lane processes inside its
* leaf. For the example above we compute leaf_prim =
* [0,1,0,1,2,0]. */
const uint leaf_prim = get_local_id(0) - sub_group_scan_inclusive_max(leaf_start ? get_local_id(0) : 0);
/* from here on we allocate data and write to memory, thus only
* lanes that process a primitive should continue. */
if (get_local_id(0) >= numProcedurals)
return;
/* Here we allocate a single memory block for each required
* ProceduralLeaf node. We do this from a single lane to ensure
* the allocation is contiguous. */
uint leaf_base_offset = 0;
uint n_leafs = sub_group_reduce_add(leaf_start);
if (get_local_id(0) == 0)
leaf_base_offset = allocate_procedural_leaves( bvh_base, n_leafs );
leaf_base_offset = sub_group_broadcast(leaf_base_offset, 0);
/* Compute the leaf offset for each lane. */
uint leaf_offset = leaf_base_offset + sub_group_scan_inclusive_add(leaf_start) - 1;
struct ProceduralLeaf *pleaf = ((global struct ProceduralLeaf *)(bvh_mem)) + leaf_offset;
/* write the procedural leaf headers */
if (leaf_end)
{
pleaf->leafDesc.shaderIndex_geomMask = 0xFF000000 | (geomID & 0x00FFFFFF); // FIXME: use accessor function. Future extensions may have shaderIndex != geomID
pleaf->leafDesc.geomIndex_flags = geomID | (geomFlags<<30); // FIXME: Use setter function
pleaf->DW1 = 0xFFFFFFF0 | (leaf_prim + 1); // !!!
}
/* write the procedural leaf primIDs */
pleaf->_primIndex[leaf_prim] = PRIMREF_primID0(&primref[primrefID]);
/* update leaf node offset inside parent node */
if (get_local_id(0) == 0)
{
QBVH6Node_set_offset(qnode, pleaf);
QBVH6Node_set_type(qnode, NODE_TYPE_PROCEDURAL);
}
/* Let parent node children point to proper procedural leaf block
* and primitive. */
qnode->childData[get_local_id(0)] = leaf_start_next | (leaf_prim << 2);
}
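/*
 * Hedged scalar sketch (plain C): emulates the per-lane leaf_start /
 * leaf_prim computation of create_procedural_leaf above, with the
 * sub-group shuffles and scans replaced by plain loops. It reproduces the
 * worked example in the comments: geomID = {3,3,4,4,4,0} gives
 * leaf_prim = {0,1,0,1,2,0}.
 */
#include <stdint.h>
#include <stdio.h>
static void leaf_layout(const uint32_t *geomID, int n,
                        int *leaf_start, int *leaf_prim)
{
    for (int lane = 0; lane < n; lane++) {
        leaf_start[lane] = (lane == 0) || (geomID[lane] != geomID[lane - 1]);
        /* position of the most recent leaf start at or before this lane */
        int start_lane = 0;
        for (int k = 0; k <= lane; k++)
            if (leaf_start[k])
                start_lane = k;
        leaf_prim[lane] = lane - start_lane;
    }
}
int main(void)
{
    const uint32_t geomID[6] = { 3, 3, 4, 4, 4, 0 };
    int leaf_start[6], leaf_prim[6];
    leaf_layout(geomID, 6, leaf_start, leaf_prim);
    for (int i = 0; i < 6; i++)
        printf("%d ", leaf_prim[i]);   /* prints: 0 1 0 1 2 0 */
    printf("\n");
    return 0;
}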
GRL_ANNOTATE_IGC_DO_NOT_SPILL
GRL_ANNOTATE_BIG_REG_REQ
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
primref_to_procedurals(global struct Globals *globals,
global struct AABB *primref,
global char *primref_index,
global char *bvh_mem,
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
const uint stride,
const uint offset)
{
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
const uint numPrimitives = globals->numPrimitives;
uint startID = get_group_id( 0 ) * get_local_size( 0 );
uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);
uint offset1 = stride * globals->numPrimitives;
if (stride == 8)
offset1 = 4;
uint prev_start_back_pointer = startID == 0 ? -1 : *(uint *)(primref_index + (startID-1) * stride + offset1);
/* start at leaf start */
while (startID < numPrimitives)
{
const uint back_pointer = *(uint *)(primref_index + startID * stride + offset1);
if (back_pointer != prev_start_back_pointer)
break;
startID++;
}
uint prev_end_back_pointer = *(uint *)(primref_index + (endID-1) * stride + offset1);
/* end at next leaf start */
while (endID < numPrimitives)
{
const uint back_pointer = *(uint *)(primref_index + endID * stride + offset1);
if (back_pointer != prev_end_back_pointer)
break;
endID++;
}
local uint procedurals[16];
for (uint lid = startID + get_local_id(0); lid < endID + get_local_id(0);)
{
/* load leaf start points and back_pointer */
const uint primrefID = *(uint *)(primref_index + lid * stride + offset);
uint back_pointer = *(uint *)(primref_index + lid * stride + offset1);
uint prev_back_pointer = get_local_id(0) == 0 ? -1 : *(uint *)(primref_index + (lid-1) * stride + offset1);
const uint leaf_start = back_pointer != prev_back_pointer;
uint leaf_start_back_pointer = sub_group_broadcast(back_pointer, 0);
/* compute number of primitives inside the leaf starting at lid */
const uint leaf_id = sub_group_scan_inclusive_add(leaf_start);
uint numPrimitives = 0;
if (back_pointer == leaf_start_back_pointer && lid < endID)
numPrimitives = sub_group_reduce_add(1);
numPrimitives = sub_group_broadcast(numPrimitives, 0);
procedurals[get_local_id(0)] = primrefID;
struct QBVHNodeN *qnode = (struct QBVHNodeN *)bvh_mem + back_pointer;
create_procedural_leaf(globals, primref, procedurals, numPrimitives, qnode, bvh_mem, geomDesc);
lid += numPrimitives;
}
}
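/*
 * Hedged sketch (plain C) of the range snapping at the top of
 * primref_to_procedurals above: a group's [startID, endID) range is
 * advanced until both ends fall on leaf boundaries, i.e. on positions
 * where the stored back pointer changes. back_ptr[] stands in for the
 * back pointer read from primref_index; endID is assumed to be >= 1.
 */
#include <stdio.h>
static void snap_to_leaf_boundaries(const unsigned *back_ptr, unsigned numPrims,
                                    unsigned *startID, unsigned *endID)
{
    const unsigned prev_start = (*startID == 0) ? ~0u : back_ptr[*startID - 1];
    while (*startID < numPrims && back_ptr[*startID] == prev_start)
        (*startID)++;
    const unsigned prev_end = back_ptr[*endID - 1];
    while (*endID < numPrims && back_ptr[*endID] == prev_end)
        (*endID)++;
}
int main(void)
{
    /* leaves: {0,1,2} share back pointer 5, {3,4} share 9, {5} has 12 */
    const unsigned back_ptr[6] = { 5, 5, 5, 9, 9, 12 };
    unsigned startID = 1, endID = 4;
    snap_to_leaf_boundaries(back_ptr, 6, &startID, &endID);
    printf("start %u end %u\n", startID, endID);   /* start 3 end 5 */
    return 0;
}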
GRL_INLINE void create_HW_instance_leaf(
global struct BVHBase* bvh,
global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc,
uint dstLeafId,
uint instanceIndex,
uint rootNodeByteOffset,
uint instanceMask)
{
/* convert DXR instance to instance leaf node */
global struct HwInstanceLeaf* leaves = (__global struct HwInstanceLeaf*)BVHBase_quadLeaves(bvh);
HwInstanceLeaf_Constructor(&leaves[dstLeafId], instDesc, instanceIndex, rootNodeByteOffset, instanceMask);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel create_HW_instance_nodes(
global const struct Globals *globals,
global char *primref_index,
global struct AABB *primref,
global struct BVHBase *bvh,
global struct GRL_RAYTRACING_INSTANCE_DESC *src_instances,
uint32_t stride,
uint32_t offset)
{
uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
uint num_prims = globals->numPrimitives;
if (dstLeafId >= num_prims)
return;
if( dstLeafId == 0 )
bvh->instanceLeafEnd += 2*num_prims;
/* get instance ID */
const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
create_HW_instance_leaf(bvh, &src_instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel create_HW_instance_nodes_pointers(
global const struct Globals *globals,
global char *primref_index,
global struct AABB *primref,
global struct BVHBase *bvh,
global void *instances_in,
uint32_t stride,
uint32_t offset)
{
uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
uint num_prims = globals->numPrimitives;
if (dstLeafId >= num_prims)
return;
if (dstLeafId == 0)
bvh->instanceLeafEnd += 2 * num_prims;
global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
(global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
/* get instance ID */
const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
create_HW_instance_leaf(bvh, instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
}


@@ -1,556 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "api_interface.h"
#include "common.h"
#define GRID_SIZE 1024
/*
A presplit item stores, for each primitive, a splitting priority (used
to derive the number of splits to perform) and the primref index.
*/
struct PresplitItem
{
unsigned int index;
float priority;
};
/*
This function splits a line v0->v1 at position pos in dimension dim
and merges the bounds for the left and right line segments into
lbounds and rbounds.
*/
GRL_INLINE void splitLine(const uint dim,
const float pos,
const float4 v0,
const float4 v1,
struct AABB *lbounds,
struct AABB *rbounds)
{
const float v0d = v0[dim];
const float v1d = v1[dim];
/* this point is on left side */
if (v0d <= pos)
AABB_extend_point(lbounds, v0);
/* this point is on right side */
if (v0d >= pos)
AABB_extend_point(rbounds, v0);
/* the edge crosses the splitting location */
if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d))
{
const float f = (pos - v0d) / (v1d - v0d);
const float4 c = f * (v1 - v0) + v0;
AABB_extend_point(lbounds, c);
AABB_extend_point(rbounds, c);
}
}
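/*
 * Hedged host-side sketch (plain C): a scalar version of splitLine above,
 * restricted to what the kernel computes, i.e. clipping one edge v0->v1
 * against the axis-aligned plane x[dim] = pos and extending the left and
 * right AABBs. The small Aabb type is local to the sketch.
 */
#include <math.h>
#include <stdio.h>
typedef struct { float lo[3], hi[3]; } Aabb;
static void aabb_init(Aabb *b)
{
    for (int i = 0; i < 3; i++) { b->lo[i] = INFINITY; b->hi[i] = -INFINITY; }
}
static void aabb_extend(Aabb *b, const float p[3])
{
    for (int i = 0; i < 3; i++) {
        if (p[i] < b->lo[i]) b->lo[i] = p[i];
        if (p[i] > b->hi[i]) b->hi[i] = p[i];
    }
}
static void split_line(int dim, float pos, const float v0[3], const float v1[3],
                       Aabb *lbounds, Aabb *rbounds)
{
    const float v0d = v0[dim], v1d = v1[dim];
    if (v0d <= pos) aabb_extend(lbounds, v0);      /* v0 on the left side  */
    if (v0d >= pos) aabb_extend(rbounds, v0);      /* v0 on the right side */
    if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) {
        const float f = (pos - v0d) / (v1d - v0d); /* edge crosses the plane */
        float c[3];
        for (int i = 0; i < 3; i++) c[i] = f * (v1[i] - v0[i]) + v0[i];
        aabb_extend(lbounds, c);
        aabb_extend(rbounds, c);
    }
}
int main(void)
{
    const float a[3] = { 0, 0, 0 }, b[3] = { 4, 2, 0 };
    Aabb l, r;
    aabb_init(&l); aabb_init(&r);
    split_line(0, 1.0f, a, b, &l, &r);             /* split at x = 1 */
    printf("left x:[%g,%g] right x:[%g,%g]\n", l.lo[0], l.hi[0], r.lo[0], r.hi[0]);
    return 0;                                      /* left x:[0,1] right x:[1,1] */
}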
/*
This function splits a clipped triangle v0,v1,v2 with bounds prim at
position pos in dimension dim and merges the bounds for the left and
right clipped triangle fragments into lbounds and rbounds.
*/
GRL_INLINE void splitTriangle(struct AABB *prim,
const uint dim,
const float pos,
const float4 v0,
const float4 v1,
const float4 v2,
struct AABB *lbounds,
struct AABB *rbounds)
{
/* clip each triangle edge */
splitLine(dim, pos, v0, v1, lbounds, rbounds);
splitLine(dim, pos, v1, v2, lbounds, rbounds);
splitLine(dim, pos, v2, v0, lbounds, rbounds);
/* the triangle itself was clipped already, thus clip against triangle bounds */
AABB_intersect(lbounds, prim);
AABB_intersect(rbounds, prim);
}
float calculate_priority(struct AABB *prim, global GRL_RAYTRACING_GEOMETRY_DESC *geom)
{
/* calculate projected area of the first triangle */
const uint primID0 = PRIMREF_primID0(prim);
const uint3 tri0 = GRL_load_triangle(geom, primID0);
const float4 av0 = GRL_load_vertex(geom, tri0.x);
const float4 av1 = GRL_load_vertex(geom, tri0.y);
const float4 av2 = GRL_load_vertex(geom, tri0.z);
const float area_tri0 = areaProjectedTriangle(av0, av1, av2);
/* calculate projected area of second triangle */
const uint primID1 = PRIMREF_primID1(prim);
const uint3 tri1 = GRL_load_triangle(geom, primID1);
const float4 bv0 = GRL_load_vertex(geom, tri1.x);
const float4 bv1 = GRL_load_vertex(geom, tri1.y);
const float4 bv2 = GRL_load_vertex(geom, tri1.z);
const float area_tri1 = areaProjectedTriangle(bv0, bv1, bv2);
/* as priority we use the AABB area */
const float area_aabb = AABB_halfArea(prim);
float priority = area_aabb;
/* prefer triangles with a large potential SAH gain. */
const float area_tris = area_tri0 + area_tri1;
const float area_ratio = min(4.0f, area_aabb / max(1E-12f, area_tris));
priority *= area_ratio;
/* ignore too small primitives */
//const float4 size = AABB_size(prim);
//const float max_size = max(size.x,max(size.y,size.z));
//if (max_size < 0.5f*max_scene_size/GRID_SIZE)
// priority = 0.0f;
return priority;
}
/*
This kernel calculates for each primitive an estimated splitting priority.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel compute_num_presplits(global struct Globals *globals,
global struct BVHBase* bvh_base,
global struct AABB *primref,
global struct PresplitItem *presplit,
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
//assert(sizeof(PresplitItem) == sizeof_PresplitItem);
/* calculate the range of primitives each work group should process */
const uint numPrimitives = globals->numPrimitives;
const uint startID = (get_group_id(0) + 0) * numPrimitives / get_num_groups(0);
const uint endID = (get_group_id(0) + 1) * numPrimitives / get_num_groups(0);
/* get scene bounding box size */
const float3 scene_size = AABB3f_size(&bvh_base->Meta.bounds);
const float max_scene_size = max(scene_size.x, max(scene_size.y, scene_size.z));
/* each work group iterates over its range of primitives */
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
{
const uint geomID = PRIMREF_geomID(&primref[i]);
/* splitting heuristic for triangles */
if (GRL_is_triangle(&geomDesc[geomID]))
{
presplit[i].index = i;
presplit[i].priority = calculate_priority(&primref[i], &geomDesc[geomID]);
}
/* splitting of procedurals is not supported */
else if (GRL_is_procedural(&geomDesc[geomID]))
{
presplit[i].index = i;
presplit[i].priority = 0.0f;
}
else
{
//assert(false);
}
}
if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0)
globals->numOriginalPrimitives = globals->numPrimitives;
}
/*
This kernel computes the sum of all priorities.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
priority_sum(global struct Globals *globals,
global struct PresplitItem *presplit,
uint numPrimitivesToSplit)
{
const uint N = globals->numPrimitives;
const uint j = get_local_id(0);
const uint J = get_local_size(0);
const uint BLOCKSIZE = (N + J - 1) / J;
const uint start = min((j + 0) * BLOCKSIZE, N);
const uint end = min((j + 1) * BLOCKSIZE, N);
float prioritySum = 0;
for (uint i = start; i < end; i++)
prioritySum += presplit[i].priority;
prioritySum = work_group_reduce_add(prioritySum);
globals->presplitPrioritySum = prioritySum;
#if 0
work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
float scale = 1.0f;
for (uint i = 0; i < 10; i++)
{
//if (j == 0)
//printf("prioritySum = %f\n",scale*prioritySum);
uint numSplits = 0;
for (uint i = start; i < end; i++)
numSplits += presplit[i].priority / (scale*prioritySum)*numPrimitivesToSplit;
numSplits = work_group_reduce_add(numSplits);
if (numSplits > numPrimitivesToSplit)
break;
//if (j == 0)
// printf("numSplits = %i (%i)\n",numSplits,numPrimitivesToSplit);
globals->presplitPrioritySum = scale * prioritySum;
scale -= 0.05f;
}
#endif
}
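/*
 * Hedged sketch (plain C) of the blocking scheme in priority_sum above:
 * each of J work items sums a contiguous block of ceil(N/J) priorities,
 * and work_group_reduce_add is emulated by adding the partial sums.
 */
#include <stdio.h>
static float priority_sum_host(const float *priority, unsigned N, unsigned J)
{
    const unsigned block = (N + J - 1) / J;
    float total = 0.0f;
    for (unsigned j = 0; j < J; j++) {
        const unsigned start = (j * block < N) ? j * block : N;
        const unsigned end   = ((j + 1) * block < N) ? (j + 1) * block : N;
        float partial = 0.0f;                 /* one work item's share */
        for (unsigned i = start; i < end; i++)
            partial += priority[i];
        total += partial;                     /* emulated reduction */
    }
    return total;
}
int main(void)
{
    const float prio[5] = { 1.0f, 2.0f, 0.5f, 4.0f, 0.0f };
    printf("%g\n", priority_sum_host(prio, 5, 3));   /* 7.5 */
    return 0;
}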
GRL_INLINE void heapify_down(struct AABB *array, uint size)
{
/* we start at the root */
uint cur_node_id = 0;
struct AABB *cur_node = array;
while (true)
{
int larger_node_id = cur_node_id;
struct AABB *larger_node = cur_node;
/* check if left child is largest */
const int left_node_id = 2 * cur_node_id + 1;
struct AABB *left_node = &array[left_node_id];
if (left_node_id < size && AABB_halfArea(left_node) > AABB_halfArea(larger_node))
{
larger_node_id = left_node_id;
larger_node = left_node;
}
/* check if right child is largest */
const int right_node_id = 2 * cur_node_id + 2;
struct AABB *right_node = &array[right_node_id];
if (right_node_id < size && AABB_halfArea(right_node) > AABB_halfArea(larger_node))
{
larger_node_id = right_node_id;
larger_node = right_node;
}
/* if the current node is the largest, the heap property is fulfilled and we are done */
if (larger_node_id == cur_node_id)
break;
/* otherwise we swap cur and largest */
struct AABB tmp = *cur_node;
*cur_node = *larger_node;
*larger_node = tmp;
/* we continue downwards with the largest node */
cur_node_id = larger_node_id;
cur_node = larger_node;
}
}
GRL_INLINE void heapify_up(struct AABB *array, uint cur_node_id)
{
/* stop if we start at the root */
if (cur_node_id == 0)
return;
struct AABB *cur_node = &array[cur_node_id];
/* we loop until we reach the root node */
while (cur_node_id)
{
/* get parent node */
uint parent_node_id = (cur_node_id - 1) / 2;
struct AABB *parent_node = &array[parent_node_id];
/* if the parent is larger than the current node we fulfill the heap property and can terminate */
if (AABB_halfArea(parent_node) > AABB_halfArea(cur_node))
break;
/* otherwise we swap cur and parent */
struct AABB tmp = *cur_node;
*cur_node = *parent_node;
*parent_node = tmp;
/* and continue upwards */
cur_node_id = parent_node_id;
cur_node = parent_node;
}
}
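/*
 * Hedged sketch (plain C): the heapify helpers above maintain a binary
 * max-heap keyed on AABB_halfArea. This float-keyed analogue shows the
 * same push (append + heapify_up) and replace-root (overwrite +
 * heapify_down) discipline that perform_presplits uses, outside the kernel.
 */
#include <stdio.h>
static void swapf(float *a, float *b) { float t = *a; *a = *b; *b = t; }
static void heapify_down_f(float *heap, unsigned size)
{
    unsigned cur = 0;
    for (;;) {
        unsigned largest = cur;
        const unsigned left = 2 * cur + 1, right = 2 * cur + 2;
        if (left < size && heap[left] > heap[largest])   largest = left;
        if (right < size && heap[right] > heap[largest]) largest = right;
        if (largest == cur) break;
        swapf(&heap[cur], &heap[largest]);
        cur = largest;
    }
}
static void heapify_up_f(float *heap, unsigned cur)
{
    while (cur) {
        const unsigned parent = (cur - 1) / 2;
        if (heap[parent] > heap[cur]) break;
        swapf(&heap[parent], &heap[cur]);
        cur = parent;
    }
}
int main(void)
{
    float heap[8] = { 0 };
    unsigned size = 0;
    const float areas[5] = { 2.0f, 7.0f, 1.0f, 9.0f, 4.0f };
    for (unsigned i = 0; i < 5; i++) {            /* push */
        heap[size] = areas[i];
        heapify_up_f(heap, size++);
    }
    printf("largest = %g\n", heap[0]);            /* 9 */
    heap[0] = heap[--size];                       /* pop root */
    heapify_down_f(heap, size);
    printf("next    = %g\n", heap[0]);            /* 7 */
    return 0;
}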
/* splits a quad primref */
GRL_INLINE void splitQuadPrimRef(global GRL_RAYTRACING_GEOMETRY_DESC *geom,
struct AABB *cur, uint dim, float fsplit,
struct AABB *left, struct AABB *right)
{
/* left and right bounds to compute */
AABB_init(left);
AABB_init(right);
/* load first triangle and split it */
const uint primID0 = PRIMREF_primID0(cur);
const uint3 tri0 = GRL_load_triangle(geom, primID0);
const float4 av0 = GRL_load_vertex(geom, tri0.x);
const float4 av1 = GRL_load_vertex(geom, tri0.y);
const float4 av2 = GRL_load_vertex(geom, tri0.z);
splitTriangle(cur, dim, fsplit, av0, av1, av2, left, right);
/* load second triangle and split it */
const uint primID1 = PRIMREF_primID1(cur);
const uint3 tri1 = GRL_load_triangle(geom, primID1);
const float4 bv0 = GRL_load_vertex(geom, tri1.x);
const float4 bv1 = GRL_load_vertex(geom, tri1.y);
const float4 bv2 = GRL_load_vertex(geom, tri1.z);
splitTriangle(cur, dim, fsplit, bv0, bv1, bv2, left, right);
/* copy the PrimRef payload into left and right */
left->lower.w = cur->lower.w;
left->upper.w = cur->upper.w;
right->lower.w = cur->lower.w;
right->upper.w = cur->upper.w;
}
/*
This kernel performs the actual pre-splitting. It selects split
locations based on an implicit octree over the scene.
*/
#define USE_HEAP 0
#define HEAP_SIZE 32u
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
//__attribute__((intel_reqd_sub_group_size(16)))
void kernel
perform_presplits(global struct Globals *globals,
global struct BVHBase* bvh_base,
global struct AABB *primref,
global struct PresplitItem *presplit,
global char *bvh_mem,
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
uint numPrimitivesToSplit)
{
/* calculate the range of primitives each work group should process */
const uint numPrimitives = globals->numPrimitives;
int pstart = globals->numOriginalPrimitives - numPrimitivesToSplit;
pstart = max(0, pstart);
const uint numPrimitivesToProcess = globals->numPrimitives - pstart;
const uint startID = (get_group_id(0) + 0) * numPrimitivesToProcess / get_num_groups(0);
const uint endID = (get_group_id(0) + 1) * numPrimitivesToProcess / get_num_groups(0);
/* calculates the 3D grid */
float4 grid_base;
grid_base.xyz = AABB3f_load_lower( &bvh_base->Meta.bounds );
grid_base.w = 0;
float4 grid_extend;
grid_extend.xyz = AABB3f_size(&bvh_base->Meta.bounds);
grid_extend.w=0;
grid_extend = max(grid_extend.x, max(grid_extend.y, grid_extend.z));
const float4 grid_scale = select(GRID_SIZE / grid_extend, 0.0f, grid_extend == 0.0f);
const float inv_grid_size = 1.0f / GRID_SIZE;
/* we have to update centroid bounds */
struct AABB centroidBounds;
AABB_init(&centroidBounds);
/* initialize heap */
struct AABB heap[HEAP_SIZE];
uint heap_size = 0;
/* each work group iterates over its range of primitives */
for (uint j = startID + get_local_id(0); j < endID; j += get_local_size(0))
{
/* array is in ascending order */
//const uint ID = numPrimitives-1-j;
const uint ID = pstart + j;
const float prob = presplit[ID].priority;
const uint i = presplit[ID].index;
const uint geomID = PRIMREF_geomID(&primref[i]);
/* do not split primitives with low splitting priority */
if (prob <= 0.0f)
continue;
/* we support splitting only for triangles */
if (!GRL_is_triangle(&geomDesc[geomID]))
continue;
/* compute number of split primitives to produce */
uint numSplitPrims = prob / globals->presplitPrioritySum * numPrimitivesToSplit;
numSplitPrims = min(HEAP_SIZE, numSplitPrims);
/* stop if no splits need to be performed */
if (numSplitPrims <= 1)
continue;
/* add primref to heap */
heap[0] = primref[i];
heap_size = 1;
uint heap_pos = 0;
/* iterate until all splits are done */
uint prims = 1;
uint last_heap_size = heap_size;
while (prims < numSplitPrims)
{
/* map the primitive bounds to the grid */
const float4 lower = heap[heap_pos].lower;
const float4 upper = heap[heap_pos].upper;
const float4 glower = (lower - grid_base) * grid_scale + 0.2f;
const float4 gupper = (upper - grid_base) * grid_scale - 0.2f;
uint4 ilower = convert_uint4_rtz(glower);
uint4 iupper = convert_uint4_rtz(gupper);
/* this ignores dimensions that are empty */
if (glower.x >= gupper.x)
iupper.x = ilower.x;
if (glower.y >= gupper.y)
iupper.y = ilower.y;
if (glower.z >= gupper.z)
iupper.z = ilower.z;
/* Now we compute a morton code for the lower and upper grid
* coordinates. */
const uint lower_code = bitInterleave3D(ilower);
const uint upper_code = bitInterleave3D(iupper);
/* if all bits are equal then we cannot split */
if (lower_code == upper_code)
{
#if !USE_HEAP
prims++; // !!!!!!!
heap_pos++;
if (heap_pos == last_heap_size)
{
heap_pos = 0;
last_heap_size = heap_size;
}
continue;
#else
if (heap_size == 1)
break;
const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1);
primref[offset] = heap[heap_pos];
presplit[offset].index = offset;
presplit[offset].priority = calculate_priority(&heap[heap_pos], &geomDesc[geomID]);
heap[0] = heap[--heap_size];
heapify_down(heap, heap_size);
continue;
#endif
}
/* We find the bit position of the first differing bit from the
* top down. This bit indicates a split position inside an
* implicit octree. */
const uint diff = 31 - clz(lower_code ^ upper_code);
/* compute octree level and dimension to perform the split in */
const uint level = diff / 3;
const uint dim = diff % 3;
/* now we compute the grid position of the split */
const uint isplit = iupper[dim] & ~((1 << level) - 1);
/* compute world space position of split */
const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend[dim];
/* split primref into left and right part */
struct AABB left, right;
splitQuadPrimRef(&geomDesc[geomID], &heap[heap_pos], dim, fsplit, &left, &right);
prims++;
/* update centroid bounds */
AABB_extend_point(&centroidBounds, AABB_centroid2(&left));
AABB_extend_point(&centroidBounds, AABB_centroid2(&right));
#if !USE_HEAP
heap[heap_pos] = left;
heap[heap_size] = right;
heap_size++;
heap_pos++;
if (heap_pos == last_heap_size)
{
heap_pos = 0;
last_heap_size = heap_size;
}
#else
/* insert left element into heap */
heap[0] = left;
heapify_down(heap, heap_size);
/* insert right element into heap */
heap[heap_size] = right;
heapify_up(heap, heap_size);
heap_size++;
#endif
}
/* copy primitives to the primref array */
primref[i] = heap[0];
presplit[ID].index = i;
presplit[ID].priority = calculate_priority(&heap[0], &geomDesc[geomID]);
for (uint k = 1; k < heap_size; k++)
{
const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1);
primref[offset] = heap[k];
presplit[offset].index = offset;
presplit[offset].priority = calculate_priority(&heap[k], &geomDesc[geomID]);
}
}
/* merge centroid bounds into global bounds */
centroidBounds = AABB_sub_group_reduce(&centroidBounds);
if (get_sub_group_local_id() == 0)
AABB_global_atomic_merge(&globals->centroidBounds, &centroidBounds);
work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
/* update number of primitives on finish */
if (Globals_OnFinish(globals))
{
globals->numPrimitives = globals->numPrimitives + globals->numSplittedPrimitives;
globals->numSplittedPrimitives = 0;
/* update first build record */ // FIXME: should be done in builder itself
global struct BuildRecord *record = (global struct BuildRecord *)(bvh_mem + bvh_base->quadLeafStart*64);
record->end = globals->numPrimitives;
}
}
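/*
 * Hedged sketch (plain C) of the implicit-octree split selection used by
 * perform_presplits above. bitInterleave3D is not defined in this file;
 * the 3 x 10-bit Morton interleave below is an assumed, conventional
 * implementation, and __builtin_clz (GCC/Clang) stands in for clz().
 * The level/dim/isplit derivation mirrors the kernel.
 */
#include <stdint.h>
#include <stdio.h>
static uint32_t part1by2(uint32_t x)          /* spread 10 bits to every 3rd bit */
{
    x &= 0x000003ff;
    x = (x ^ (x << 16)) & 0xff0000ff;
    x = (x ^ (x <<  8)) & 0x0300f00f;
    x = (x ^ (x <<  4)) & 0x030c30c3;
    x = (x ^ (x <<  2)) & 0x09249249;
    return x;
}
static uint32_t morton3d(uint32_t x, uint32_t y, uint32_t z)
{
    return part1by2(x) | (part1by2(y) << 1) | (part1by2(z) << 2);
}
int main(void)
{
    /* grid coordinates of the primitive's lower and upper corners */
    const uint32_t il[3] = { 100, 40, 7 }, iu[3] = { 130, 41, 7 };
    const uint32_t lower_code = morton3d(il[0], il[1], il[2]);
    const uint32_t upper_code = morton3d(iu[0], iu[1], iu[2]);
    if (lower_code == upper_code)
        return 0;                              /* cannot split this box */
    const uint32_t diff  = 31 - __builtin_clz(lower_code ^ upper_code);
    const uint32_t level = diff / 3;           /* octree level of the split */
    const uint32_t dim   = diff % 3;           /* axis the split is made in */
    const uint32_t isplit = iu[dim] & ~((1u << level) - 1);
    printf("split axis %u, level %u, grid position %u\n", dim, level, isplit);
    return 0;                                  /* axis 0, level 7, position 128 */
}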


@@ -1,674 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "api_interface.h"
#include "common.h"
#include "instance.h"
#include "bvh_build_primref.h"
//#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable
//int sub_group_non_uniform_any(int predicate);
#define WINDOW_SIZE 16
/* Representation of two merged triangles. */
struct QuadIndices
{
uint primID0, primID1;
uint v0, v1, v2, v3;
};
/*
This function calculates a PrimRef from a merged quad and writes
this PrimRef to memory.
*/
GRL_INLINE void create_prim_ref(const uint geomID,
const struct QuadIndices quad,
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
struct AABB *geometryBounds,
struct AABB *centroidBounds,
global uint *numPrimitives,
global struct AABB *primref)
{
/* load quad vertices */
const float4 vtx0 = GRL_load_vertex(geomDesc, quad.v0); // FIXME: these multiple load_vertex calls should get merged
const float4 vtx1 = GRL_load_vertex(geomDesc, quad.v1);
const float4 vtx2 = GRL_load_vertex(geomDesc, quad.v2);
const float4 vtx3 = GRL_load_vertex(geomDesc, quad.v3);
/* calculate bounds for quad */
float4 lower = min(min(vtx0, vtx1), min(vtx2, vtx3));
float4 upper = max(max(vtx0, vtx1), max(vtx2, vtx3));
/* extend geometry and centroid bounds */
const float4 centroid2 = lower + upper;
AABB_extendlu(geometryBounds, lower, upper);
AABB_extendlu(centroidBounds, centroid2, centroid2);
PrimRef ref;
PRIMREF_setAABB( &ref, lower.xyz, upper.xyz );
PRIMREF_setQuadMetaData( &ref, quad.primID0, quad.primID1, geomID, GRL_get_Flags( geomDesc ) );
/* store primref to memory */
const uint offset = atomic_add_global(numPrimitives, 1);
primref[offset] = ref;
}
/*
This function calculates a PrimRef from a procedural and writes
this PrimRef to memory.
*/
GRL_INLINE void create_prim_ref_procedural(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
const uint geomID,
const uint primID,
struct AABB *geometryBounds,
struct AABB *centroidBounds,
global uint *numPrimitives,
global struct AABB *primref)
{
/* load aabb from memory */
struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
/* extend geometry and centroid bounds */
float4 lower = (float4)(aabb.MinX, aabb.MinY, aabb.MinZ, 0.0f);
float4 upper = (float4)(aabb.MaxX, aabb.MaxY, aabb.MaxZ, 0.0f);
const float4 centroid2 = lower + upper;
AABB_extendlu(geometryBounds, lower, upper);
AABB_extendlu(centroidBounds, centroid2, centroid2);
/* encode geomID, primID */
uint geomFlags = GRL_get_Flags(&geomDesc[geomID]);
PrimRef ref;
PRIMREF_setAABB( &ref, lower.xyz, upper.xyz );
PRIMREF_setProceduralMetaData( &ref, geomID, primID, geomFlags );
/* store primref to memory */
const uint offset = atomic_add_global(numPrimitives, 1);
primref[offset] = ref;
}
/*
This function performs a binary search to calculate the geomID and
primID of the i'th primitive of the scene. The search uses a
prefix_sum array that stores, at each location j, the total number of
primitives of all meshes k with k<j.
*/
struct GeomPrimID
{
uint geomID, primID;
};
struct GeomPrimID binary_search_geomID_primID(global uint *prefix_sum, const uint prefix_sum_size, const uint i)
{
uint l = 0;
uint r = prefix_sum_size;
uint k = 0;
while (r - l > 1)
{
const uint m = (l + r) / 2;
k = prefix_sum[m];
if (k <= i)
{
l = m;
}
else if (i < k)
{
r = m;
}
}
struct GeomPrimID id;
id.geomID = l;
id.primID = i - prefix_sum[l];
return id;
}
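/*
 * Hedged usage sketch (plain C) for the prefix-sum binary search above.
 * prefix_sum[j] holds the number of primitives of all meshes k < j, so
 * three meshes with 4, 2 and 5 primitives give {0, 4, 6}. The loop is the
 * same as in binary_search_geomID_primID, written host-side.
 */
#include <stdio.h>
static void lookup(const unsigned *prefix_sum, unsigned size, unsigned i,
                   unsigned *geomID, unsigned *primID)
{
    unsigned l = 0, r = size;
    while (r - l > 1) {
        const unsigned m = (l + r) / 2;
        if (prefix_sum[m] <= i)
            l = m;
        else
            r = m;
    }
    *geomID = l;
    *primID = i - prefix_sum[l];
}
int main(void)
{
    const unsigned prefix_sum[3] = { 0, 4, 6 };    /* meshes with 4, 2, 5 prims */
    for (unsigned i = 0; i < 11; i++) {
        unsigned g, p;
        lookup(prefix_sum, 3, i, &g, &p);
        printf("prim %2u -> geomID %u primID %u\n", i, g, p);
    }
    return 0;
}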
/*
Checks if a vertex contains only finite floating point numbers.
*/
GRL_INLINE bool isfinite_vertex(float4 vtx)
{
return isfinite(vtx.x) && isfinite(vtx.y) && isfinite(vtx.z);
}
/*
Create primrefs from array of instance descriptors.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
primrefs_from_DXR_instances(global struct Globals *globals,
global struct BVHBase* bvh,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
uint numInstances,
global struct AABB *primrefs,
uint allowUpdate)
{
const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (instanceIndex < numInstances)
{
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
primrefs_from_instances(
globals,
bvh,
instance,
instanceIndex,
primrefs,
0,
allowUpdate);
}
}
/*
Create primrefs from array of instance descriptors.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel
primrefs_from_DXR_instances_indirect(global struct Globals *globals,
global struct BVHBase* bvh,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
global struct IndirectBuildRangeInfo* indirect_data,
global struct AABB *primrefs,
uint allowUpdate)
{
// TODO: On DG2, we have 8 dwords of 'inline data' which can be pushed
// directly to the kernel. THe rest of the kernel args are pulled using
// loads from memory. It may be more efficient to put 'numInstances' and
// 'allowUpdate' into 'globals'
const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (instanceIndex < indirect_data->primitiveCount)
{
instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
(((global char*)instances) + indirect_data->primitiveOffset);
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
primrefs_from_instances(
globals,
bvh,
instance,
instanceIndex,
primrefs,
0,
allowUpdate);
}
}
/*
Create primrefs from array of pointers to instance descriptors.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
primrefs_from_DXR_instances_pointers(global struct Globals *globals,
global struct BVHBase* bvh,
global void *instances_in,
uint numInstances,
global struct AABB *primrefs,
uint allowUpdate)
{
global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
(global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (instanceIndex < numInstances)
{
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
primrefs_from_instances(
globals,
bvh,
instance,
instanceIndex,
primrefs,
0,
allowUpdate);
}
}
/*
Create primrefs from array of pointers to instance descriptors.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel
primrefs_from_DXR_instances_pointers_indirect(global struct Globals *globals,
global struct BVHBase* bvh,
global void *instances_in,
global struct AABB *primrefs,
global struct IndirectBuildRangeInfo* indirect_data,
uint allowUpdate)
{
global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
(global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (instanceIndex < indirect_data->primitiveCount)
{
instances = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)
(((global char*)instances) + indirect_data->primitiveOffset);
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
primrefs_from_instances(
globals,
bvh,
instance,
instanceIndex,
primrefs,
0,
allowUpdate);
}
}
///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
bool can_pair( uint3 a, uint3 b )
{
bool match0 = any( a.xxx == b.xyz ) ? 1 : 0;
bool match1 = any( a.yyy == b.xyz ) ? 1 : 0;
bool match2 = any( a.zzz == b.xyz ) ? 1 : 0;
return (match0 + match1 + match2) >= 2;
}
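/*
 * Hedged scalar sketch (plain C) of can_pair above: two triangles are
 * pairable into a quad when their index triplets share at least two
 * vertices.
 */
#include <stdbool.h>
#include <stdio.h>
static bool can_pair_scalar(const unsigned a[3], const unsigned b[3])
{
    int shared = 0;
    for (int i = 0; i < 3; i++) {
        if (a[i] == b[0] || a[i] == b[1] || a[i] == b[2])
            shared++;
    }
    return shared >= 2;
}
int main(void)
{
    const unsigned t0[3] = { 0, 1, 2 };
    const unsigned t1[3] = { 2, 1, 3 };   /* shares edge (1,2) with t0 */
    const unsigned t2[3] = { 4, 5, 2 };   /* shares only vertex 2      */
    printf("%d %d\n", can_pair_scalar(t0, t1), can_pair_scalar(t0, t2)); /* 1 0 */
    return 0;
}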
void reduce_bounds(
float3 lower,
float3 upper,
global struct Globals* globals,
global struct BVHBase* bvh )
{
// reduce centroid bounds... make sure to exclude lanes with invalid AABBs
float3 cent = lower + upper;
float3 cent_lower = select( (float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper);
float3 cent_upper = select(-(float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper);
// reduce geo bounds
AABB3f_atomic_merge_global_sub_group_lu( &bvh->Meta.bounds, lower, upper );
AABB_global_atomic_merge_sub_group_lu(&globals->centroidBounds, cent_lower, cent_upper );
}
struct TriState
{
bool valid;
uint prim_index;
uint pairing;
uint3 indices;
float3 lower;
float3 upper;
};
#define NOT_PAIRED 0xffffffff
void load_triangle_data(uniform global char* index_buffer,
uniform const uint index_format,
uniform global char* vertex_buffer,
uniform const uint vertex_format,
uniform const uint vertex_stride,
uniform global float* transform_buffer,
uniform uint total_vert_count,
struct TriState* state,
float4* v)
{
state->indices = GRL_load_indices_from_buffer(index_buffer, index_format, state->prim_index );
const uint last_vertex = total_vert_count - 1;
const uint x = min(state->indices.x, last_vertex);
const uint y = min(state->indices.y, last_vertex);
const uint z = min(state->indices.z, last_vertex);
GRL_load_triangle_vertices(vertex_buffer, vertex_format, vertex_stride, transform_buffer, x, y, z, v);
}
struct TriState load_triangle( uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
uniform uint base,
uniform uint num_prims,
uniform uint total_vert_count )
{
struct TriState state;
state.pairing = NOT_PAIRED;
state.valid = false;
state.prim_index = base + get_sub_group_local_id();
state.lower = (float3)(INFINITY, INFINITY, INFINITY);
state.upper = -(float3)(INFINITY, INFINITY, INFINITY);
if (state.prim_index < num_prims)
{
state.valid = true;
float4 v[3];
load_triangle_data((global char*)geomDesc->Desc.Triangles.pIndexBuffer,
geomDesc->Desc.Triangles.IndexFormat,
(global char*)geomDesc->Desc.Triangles.pVertexBuffer,
geomDesc->Desc.Triangles.VertexFormat,
geomDesc->Desc.Triangles.VertexBufferByteStride,
(global float*)geomDesc->Desc.Triangles.pTransformBuffer,
total_vert_count,
&state,
v);
if (state.indices.x >= total_vert_count || state.indices.y >= total_vert_count || state.indices.z >= total_vert_count ||
!isfinite_vertex(v[0]) || !isfinite_vertex(v[1]) || !isfinite_vertex(v[2]) ||
state.indices.x == state.indices.y || state.indices.x == state.indices.z || state.indices.y == state.indices.z)
{
state.valid = false;
}
else
{
state.lower.xyz = min(v[2].xyz, min(v[1].xyz, v[0].xyz));
state.upper.xyz = max(v[2].xyz, max(v[1].xyz, v[0].xyz));
}
}
return state;
}
void broadcast_triangles_local( struct TriState* state )
{
varying uint my_prim = state->prim_index;
varying uint my_pairing = state->pairing;
varying float3 my_lower = state->lower;
varying float3 my_upper = state->upper;
varying bool valid = state->valid;
varying uint3 indices = state->indices;
for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++)
{
// don't broadcast invalid prims
if ( !sub_group_broadcast( valid, broadcast_lane ) )
continue;
uint broadcast_pairing = sub_group_broadcast(my_pairing, broadcast_lane);
uint broadcast_prim = sub_group_broadcast(my_prim, broadcast_lane);
if (broadcast_pairing == NOT_PAIRED)
{
// if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it
bool pairable = false;
uint3 other_indices = sub_group_broadcast_uint3( indices, broadcast_lane );
if (broadcast_prim != my_prim && my_pairing == NOT_PAIRED && valid )
{
pairable = can_pair( indices, other_indices );
}
uint pairable_lane = ctz(intel_sub_group_ballot(pairable));
if (valid && pairable_lane < get_sub_group_size())
{
// pair the broadcast primitive with the first lane that can accept it
float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane);
float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane);
if (get_sub_group_local_id() == pairable_lane)
{
my_pairing = broadcast_prim;
my_lower.xyz = min(my_lower.xyz, broadcast_lower);
my_upper.xyz = max(my_upper.xyz, broadcast_upper);
}
// also record the pairing on the broadcast lane (the prim that was just paired to it)
uint pairable_prim = sub_group_broadcast(my_pairing, pairable_lane);
if (get_sub_group_local_id() == broadcast_lane)
{
my_pairing = pairable_prim;
}
}
}
else
{
//
// if this lane was already paired with the broadcasting tri
// in an earlier loop iteration, then record the pairing in this lane's registers
float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane);
float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane);
if (broadcast_pairing == my_prim)
{
my_pairing = broadcast_prim;
my_lower.xyz = min(my_lower.xyz, broadcast_lower);
my_upper.xyz = max(my_upper.xyz, broadcast_upper);
}
}
}
state->pairing = my_pairing;
state->lower = my_lower;
state->upper = my_upper;
}
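/*
 * Hedged scalar sketch (plain C) of the greedy pairing performed by
 * broadcast_triangles_local above: every still-unpaired triangle is paired
 * with the first later unpaired triangle that shares an edge with it. The
 * SIMD broadcast/ballot machinery is replaced by plain loops; the kernel
 * additionally stores prim indices and merges the two AABBs on the paired
 * lanes, which the sketch omits.
 */
#include <stdio.h>
#define NOT_PAIRED 0xffffffffu
struct Tri { unsigned index[3]; unsigned pairing; };
static int shares_edge(const unsigned a[3], const unsigned b[3])
{
    int shared = 0;
    for (int i = 0; i < 3; i++)
        if (a[i] == b[0] || a[i] == b[1] || a[i] == b[2])
            shared++;
    return shared >= 2;
}
static void pair_window(struct Tri *tri, int n)
{
    for (int i = 0; i < n; i++) {
        if (tri[i].pairing != NOT_PAIRED)
            continue;
        for (int j = i + 1; j < n; j++) {
            if (tri[j].pairing == NOT_PAIRED && shares_edge(tri[i].index, tri[j].index)) {
                tri[i].pairing = (unsigned)j;   /* sketch stores window slots, */
                tri[j].pairing = (unsigned)i;   /* the kernel stores prim IDs  */
                break;
            }
        }
    }
}
int main(void)
{
    struct Tri tri[4] = {
        { { 0, 1, 2 }, NOT_PAIRED },
        { { 5, 6, 7 }, NOT_PAIRED },
        { { 2, 1, 3 }, NOT_PAIRED },   /* pairs with triangle 0 */
        { { 7, 6, 8 }, NOT_PAIRED },   /* pairs with triangle 1 */
    };
    pair_window(tri, 4);
    for (int i = 0; i < 4; i++)
        printf("tri %d -> %d\n", i, (int)tri[i].pairing);
    return 0;
}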
void broadcast_triangles_nonlocal(struct TriState* state, const struct TriState* other )
{
varying uint my_prim = state->prim_index;
varying uint my_pairing = state->pairing;
varying float3 my_lower = state->lower;
varying float3 my_upper = state->upper;
varying bool valid = state->valid;
varying uint3 indices = state->indices;
for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++)
{
// don't broadcast invalid prims
if (!sub_group_broadcast(other->valid, broadcast_lane))
continue;
uint broadcast_pairing = sub_group_broadcast(other->pairing, broadcast_lane);
uint broadcast_prim = sub_group_broadcast(other->prim_index, broadcast_lane);
if (broadcast_pairing == NOT_PAIRED)
{
// if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it
bool pairable = false;
if ( my_pairing == NOT_PAIRED && valid )
{
uint3 other_indices = sub_group_broadcast_uint3(other->indices, broadcast_lane);
pairable = can_pair(indices, other_indices);
}
// pair the broadcast primitive with the first lane that can accept it
uint pairable_mask = intel_sub_group_ballot(pairable);
if (valid && (ctz(pairable_mask) == get_sub_group_local_id()))
{
my_pairing = broadcast_prim;
my_lower.xyz = min(my_lower.xyz, sub_group_broadcast_float3(other->lower.xyz, broadcast_lane));
my_upper.xyz = max(my_upper.xyz, sub_group_broadcast_float3(other->upper.xyz, broadcast_lane));
}
}
}
state->pairing = my_pairing;
state->lower = my_lower;
state->upper = my_upper;
}
GRL_INLINE void do_triangles_to_primrefs(
global struct Globals* globals,
global struct BVHBase* bvh,
global struct AABB* primref,
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
uint geomID_and_flags,
const uint num_prims)
{
uint geomID = geomID_and_flags & 0x00ffffff;
uint geom_flags = geomID_and_flags >> 24;
uint prim_base = get_group_id(0) * get_local_size(0);
uint total_vert_count = GRL_get_triangles_VertexCount(geomDesc);
struct TriState tri = load_triangle( geomDesc, prim_base, num_prims, total_vert_count );
broadcast_triangles_local( &tri );
// a lane emits a primref if its triangle stays unpaired (tri.pairing == NOT_PAIRED)
// or if it holds the lower primitive index of a pair; both cases satisfy tri.pairing > tri.prim_index
bool will_write = (tri.pairing > tri.prim_index) && tri.valid;
uint write_mask = intel_sub_group_ballot(will_write);
uint write_offs = subgroup_bit_prefix_exclusive( write_mask );
uint write_count = popcount(write_mask);
// allocate space in primref buffer
uint write_base;
if( get_sub_group_local_id() == 0 )
write_base = atomic_add_global( &globals->numPrimitives, write_count );
write_offs += sub_group_broadcast( write_base, 0 );
uint primID0 = tri.prim_index;
uint primID1 = (tri.pairing != NOT_PAIRED) ? tri.pairing : tri.prim_index;
if (will_write)
{
PrimRef ref;
PRIMREF_setAABB(&ref, tri.lower.xyz, tri.upper.xyz);
PRIMREF_setQuadMetaData(&ref, primID0, primID1, geomID, geom_flags);
uint8 val = (uint8)(
as_uint(ref.lower.x), as_uint(ref.lower.y), as_uint(ref.lower.z), as_uint(ref.lower.w),
as_uint(ref.upper.x), as_uint(ref.upper.y), as_uint(ref.upper.z), as_uint(ref.upper.w));
store_uint8_L1WB_L3WB((global uint8*)(primref + write_offs), 0, val);
}
reduce_bounds( tri.lower, tri.upper, globals, bvh );
}
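/*
 * Hedged scalar sketch (plain C) of the compaction scheme in
 * do_triangles_to_primrefs above: lanes that will emit a primref form a
 * ballot mask, one atomic_add of popcount(mask) reserves space, and each
 * writer lands at base + (number of set mask bits below its lane). The
 * atomic is emulated by a plain counter; __builtin_popcount is the
 * GCC/Clang stand-in for popcount().
 */
#include <stdio.h>
static unsigned popcount_below(unsigned mask, unsigned lane)
{
    return (unsigned)__builtin_popcount(mask & ((1u << lane) - 1u));
}
int main(void)
{
    const unsigned sub_group_size = 8;
    const int will_write[8] = { 1, 0, 1, 1, 0, 0, 1, 0 };
    unsigned mask = 0;
    for (unsigned lane = 0; lane < sub_group_size; lane++)
        if (will_write[lane])
            mask |= 1u << lane;
    unsigned num_primitives = 17;                 /* globals->numPrimitives */
    const unsigned base = num_primitives;         /* emulated atomic_add    */
    num_primitives += (unsigned)__builtin_popcount(mask);
    for (unsigned lane = 0; lane < sub_group_size; lane++)
        if (will_write[lane])
            printf("lane %u writes primref[%u]\n", lane, base + popcount_below(mask, lane));
    return 0;                                     /* slots 17, 18, 19, 20 */
}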
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
triangles_to_primrefs(
global struct Globals* globals,
global struct BVHBase* bvh,
global struct AABB* primref,
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
uint geomID_and_flags,
uint num_prims
)
{
do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel
triangles_to_primrefs_indirect(
global struct Globals* globals,
global struct BVHBase* bvh,
global struct AABB* primref,
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
global struct IndirectBuildRangeInfo* indirect_data,
uint geomID_and_flags)
{
const uint num_prims = indirect_data->primitiveCount;
do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
}
GRL_INLINE void do_procedurals_to_primrefs(
global struct Globals* globals,
global struct BVHBase* bvh,
global struct AABB* primref,
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
uint geomID_and_flags,
const uint num_prims)
{
uint geomID = geomID_and_flags & 0x00ffffff;
uint geomFlags = geomID_and_flags >> 24;
uint primID = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();
bool create_primref = false;
float3 lower = (float3)(INFINITY, INFINITY, INFINITY);
float3 upper = -(float3)(INFINITY, INFINITY, INFINITY);
if (primID < num_prims)
{
/* check if procedural is valid */
struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(geomDesc, primID);
const bool valid_min = isfinite(aabb.MinX) && isfinite(aabb.MinY) && isfinite(aabb.MinZ);
const bool valid_max = isfinite(aabb.MaxX) && isfinite(aabb.MaxY) && isfinite(aabb.MaxZ);
if (valid_min & valid_max)
{
/* load aabb from memory */
float3 l = (float3)(aabb.MinX, aabb.MinY, aabb.MinZ);
float3 u = (float3)(aabb.MaxX, aabb.MaxY, aabb.MaxZ);
// sort the bounds component-wise so that lower <= upper even for flipped input boxes
lower = min( l, u );
upper = max( l, u );
create_primref = true;
}
}
uint write_mask = intel_sub_group_ballot(create_primref);
uint write_offs = subgroup_bit_prefix_exclusive(write_mask);
uint write_count = popcount(write_mask);
// allocate space in primref buffer
uint write_base;
if (get_sub_group_local_id() == 0)
write_base = atomic_add_global(&globals->numPrimitives, write_count);
write_offs += sub_group_broadcast(write_base, 0);
// write the primref
if (create_primref)
{
PrimRef ref;
PRIMREF_setAABB(&ref, lower.xyz, upper.xyz);
PRIMREF_setProceduralMetaData(&ref, geomID, primID, geomFlags);
primref[write_offs] = ref;
}
reduce_bounds(lower, upper, globals, bvh);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
procedurals_to_primrefs(
global struct Globals* globals,
global struct BVHBase* bvh,
global struct AABB* primref,
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
uint geomID_and_flags,
uint num_prims
)
{
do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel
procedurals_to_primrefs_indirect(
global struct Globals* globals,
global struct BVHBase* bvh,
global struct AABB* primref,
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
global const struct IndirectBuildRangeInfo* indirect_data,
uint geomID_and_flags
)
{
const uint num_prims = indirect_data->primitiveCount;
do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
}


@@ -1,246 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#if 0
/*
Create primrefs from array of instance descriptors.
*/
void store_instance_primref(
global struct BVHBase* top_bvh,
global struct Globals* globals,
global PrimRef* primrefs,
bool alloc_primref,
PrimRef new_primref )
{
uint allocatePrimref = alloc_primref ? 1 : 0;
uint index = 0;
uint numAllocations = sub_group_reduce_add(allocatePrimref);
if (get_sub_group_local_id() == 0)
{
index = atomic_add_global(&globals->numPrimitives, numAllocations);
}
index = sub_group_broadcast(index, 0);
index = index + sub_group_scan_exclusive_add(allocatePrimref);
if (allocatePrimref)
{
primrefs[index] = new_primref;
}
struct AABB centroidBounds;
centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref);
struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(&centroidBounds);
if (get_sub_group_local_id() == 0)
{
AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz);
AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds);
}
}
// Compute transformed blas AABB. Returns false if instance is degenerate
bool create_instance_primref(
PrimRef* ref_out,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
global struct BVHBase* bvh,
uint instanceMask,
uint instanceIndex
)
{
struct AABB3f bbox;
bool alloc_primref = false;
uint rootNodeOffset = NO_NODE_OFFSET;
if (bvh != 0)
{
alloc_primref = true;
AABB3f AS_bounds = BVHBase_GetRootAABB(bvh);
const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]);
const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]);
if (!valid_min || !valid_max || instanceMask == 0)
{
// degenerate instance case
// TODO this should be under if ( allocate backpointers )
{
// we have to allocate the primref because this instance can be updated to become non-degenerate
// take the origin of the instance as a bounding box.
bbox.lower[0] = instance->Transform[3];
bbox.lower[1] = instance->Transform[7];
bbox.lower[2] = instance->Transform[11];
bbox.upper[0] = instance->Transform[3];
bbox.upper[1] = instance->Transform[7];
bbox.upper[2] = instance->Transform[11];
instanceMask = 0;
}
}
else
{
rootNodeOffset = BVH_ROOT_NODE_OFFSET;
float transformOverhead = 0.0f;
bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead);
}
}
*ref_out = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, 0);
return alloc_primref;
}
GRL_INLINE void primrefs_from_instances(
global struct Globals* globals,
global struct BVHBase* top_bvh,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
uint instanceIndex,
global struct AABB* primrefs)
{
bool alloc_primref = false;
PrimRef new_primref;
AABB_init(&new_primref);
if (instance)
{
uint mask = GRL_get_InstanceMask(instance);
global struct BVHBase* bvh = (global struct BVHBase*)instance->AccelerationStructure;
alloc_primref = create_instance_primref(&new_primref, instance, bvh, mask, instanceIndex);
}
store_instance_primref(top_bvh, globals, primrefs, alloc_primref, new_primref);
}
#endif
#if 1
GRL_INLINE void primrefs_from_instances(
global struct Globals* globals,
global struct BVHBase* top_bvh,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
uint instanceIndex,
global struct AABB* primrefs,
global GRL_RAYTRACING_AABB* procedural_aabb,
uint allowUpdate
)
{
struct AABB3f bbox;
uint allocatePrimref = 0;
uint rootNodeOffset = NO_NODE_OFFSET;
uint instanceMask = 0;
bool is_procedural = (procedural_aabb != 0);
if( instance )
{
instanceMask = GRL_get_InstanceMask(instance) ;
if ( is_procedural )
{
// procedural instance primref
allocatePrimref = 1;
float3 lower = (float3)(procedural_aabb->MinX, procedural_aabb->MinY, procedural_aabb->MinZ);
float3 upper = (float3)(procedural_aabb->MaxX, procedural_aabb->MaxY, procedural_aabb->MaxZ);
if (instanceMask == 0 || any(lower > upper))
{
bbox.lower[0] = instance->Transform[3];
bbox.lower[1] = instance->Transform[7];
bbox.lower[2] = instance->Transform[11];
bbox.upper[0] = instance->Transform[3];
bbox.upper[1] = instance->Transform[7];
bbox.upper[2] = instance->Transform[11];
instanceMask = 0;
}
else
{
bbox = transform_aabb(lower, upper, instance->Transform);
}
}
else
{
// HW-instance primref
global struct BVHBase* bvh = instance ?
(global struct BVHBase*)instance->AccelerationStructure :
0;
if (bvh != 0)
{
AABB3f AS_bounds = BVHBase_GetRootAABB(bvh);
const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]);
const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]);
if (valid_min && valid_max && instanceMask != 0)
{
allocatePrimref = 1;
rootNodeOffset = BVH_ROOT_NODE_OFFSET;
float transformOverhead = 0.0f;
bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead);
}
else if (allowUpdate)
{
// degenerate instance case
// we have to allocate the primref because this instance can be updated to become non-degenerate
// take the origin of the instance as a bounding box.
allocatePrimref = 1;
bbox.lower[0] = instance->Transform[3];
bbox.lower[1] = instance->Transform[7];
bbox.lower[2] = instance->Transform[11];
bbox.upper[0] = instance->Transform[3];
bbox.upper[1] = instance->Transform[7];
bbox.upper[2] = instance->Transform[11];
instanceMask = 0;
}
}
}
}
uint index = 0;
uint numAllocations = sub_group_reduce_add(allocatePrimref);
if (get_sub_group_local_id() == 0)
{
index = atomic_add_global(&globals->numPrimitives, numAllocations);
}
index = sub_group_broadcast(index, 0);
index = index + sub_group_scan_exclusive_add(allocatePrimref);
struct AABB new_primref;
struct AABB centroidBounds;
if (allocatePrimref)
{
new_primref = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, is_procedural);
primrefs[index] = new_primref;
centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref);
}
else
{
AABB_init(&new_primref);
AABB_init(&centroidBounds);
}
struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(&centroidBounds);
if (get_sub_group_local_id() == 0)
{
AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz);
AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds);
}
}
#endif
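/*
 * Hedged sketch (plain C): transform_aabb used above is defined elsewhere
 * in GRL; this is an assumed, conventional implementation that transforms
 * the 8 box corners by the 3x4 row-major instance transform (translation
 * in elements 3, 7 and 11, matching the degenerate-instance fallback
 * above) and fits a new AABB around them.
 */
#include <stdio.h>
typedef struct { float lo[3], hi[3]; } Box3;
static Box3 transform_aabb_corners(const float lo[3], const float hi[3],
                                   const float m[12])
{
    Box3 out = { {  1e30f,  1e30f,  1e30f },
                 { -1e30f, -1e30f, -1e30f } };
    for (int c = 0; c < 8; c++) {
        const float p[3] = { (c & 1) ? hi[0] : lo[0],
                             (c & 2) ? hi[1] : lo[1],
                             (c & 4) ? hi[2] : lo[2] };
        for (int row = 0; row < 3; row++) {
            const float v = m[4 * row + 0] * p[0] + m[4 * row + 1] * p[1] +
                            m[4 * row + 2] * p[2] + m[4 * row + 3];
            if (v < out.lo[row]) out.lo[row] = v;
            if (v > out.hi[row]) out.hi[row] = v;
        }
    }
    return out;
}
int main(void)
{
    const float m[12] = { 1, 0, 0, 10,    /* identity rotation,      */
                          0, 1, 0, 20,    /* translate by (10,20,30) */
                          0, 0, 1, 30 };
    const float lo[3] = { -1, -1, -1 }, hi[3] = { 1, 2, 3 };
    Box3 b = transform_aabb_corners(lo, hi, m);
    printf("[%g %g %g] .. [%g %g %g]\n",
           b.lo[0], b.lo[1], b.lo[2], b.hi[0], b.hi[1], b.hi[2]);
    return 0;   /* [9 19 29] .. [11 22 33] */
}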


@@ -1,491 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "bvh_build_refit.h"
#include "api_interface.h"
#include "common.h"
#if 0
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__( (reqd_work_group_size( 16, 1, 1 )) )
void kernel
update_instance_leaves( global struct BVHBase* bvh,
uint64_t dxrInstancesArray,
uint64_t dxrInstancesPtr,
global struct AABB3f* instance_aabb_scratch
)
{
uint num_leaves = BVHBase_GetNumHWInstanceLeaves( bvh );
uint id = get_local_id( 0 ) + get_local_size( 0 ) * get_group_id( 0 );
if ( id >= num_leaves )
return;
global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
(global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
(global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;
global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
/* iterate over all children of the instance node and get their bounds */
uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex( &leafs[id] );
global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
if ( dxrInstancesArray != NULL )
instance = &instancesArray[instanceIdx];
else
instance = instancesPtrArray[instanceIdx];
struct AffineSpace3f xfm = AffineSpace3f_load_row_major( instance->Transform );
global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
struct AABB3f newSubtreeBounds = instanceBvh->Meta.bounds;
struct AABB3f bbox = AABB3f_transform( xfm, newSubtreeBounds ); // JDB TODO: Use faster abs-matrix method
const bool valid_min = isfinite( bbox.lower[0] ) && isfinite( bbox.lower[1] ) && isfinite( bbox.lower[2] );
const bool valid_max = isfinite( bbox.upper[0] ) && isfinite( bbox.upper[1] ) && isfinite( bbox.upper[2] );
uint mask = GRL_get_InstanceMask(instance);
uint offset = instanceBvh->rootNodeOffset;
if ( !valid_min || !valid_max )
{
bbox.lower[0] = xfm.p.x;
bbox.lower[1] = xfm.p.y;
bbox.lower[2] = xfm.p.z;
bbox.upper[0] = xfm.p.x;
bbox.upper[1] = xfm.p.y;
bbox.upper[2] = xfm.p.z;
offset = NO_NODE_OFFSET;
mask = 0;
}
instance_aabb_scratch[id] = bbox;
HwInstanceLeaf_Constructor( &leafs[id], instance, instanceIdx, offset, mask ); // TODO: No instance opening for refittable BVH
}
#endif
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel
update_instance_leaves(global struct BVHBase* bvh,
uint64_t dxrInstancesArray,
uint64_t dxrInstancesPtr,
global struct AABB3f* instance_aabb_scratch
)
{
uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
if (id >= num_leaves)
return;
DO_update_instance_leaves(
bvh,
dxrInstancesArray,
dxrInstancesPtr,
instance_aabb_scratch,
id,
0 );
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel
update_instance_leaves_indirect(global struct BVHBase* bvh,
uint64_t dxrInstancesArray,
uint64_t dxrInstancesPtr,
global struct AABB3f* instance_aabb_scratch,
global struct IndirectBuildRangeInfo* indirect_data)
{
uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
if (id >= num_leaves)
return;
DO_update_instance_leaves(
bvh,
dxrInstancesArray + indirect_data->primitiveOffset,
dxrInstancesPtr,
instance_aabb_scratch,
id,
0 );
}
#if 0
/*
This kernel refits a BVH. The algorithm iterates over all BVH nodes
to find the leaf nodes, which is where refitting starts. For these
leaf nodes the bounds get recalculated and then propagated up the tree.
One kernel instance considers a range of inner nodes as startpoints.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(8, 1, 1))) void kernel refit(
global struct BVHBase *bvh,
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
global struct AABB3f* instance_leaf_aabbs )
{
/* here we temporarily store the bounds for the children of a node */
struct AABB childrenAABB[BVH_NODE_N6];
/* get pointer to inner nodes and back pointers */
global struct QBVHNodeN *inner_nodes = BVHBase_rootNode(bvh);
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
/* construct range of nodes that each work group will process */
const uint numInnerNodes = BVHBase_numNodes(bvh);
const uint startID = (get_group_id(0) + 0) * numInnerNodes / get_num_groups(0);
const uint endID = (get_group_id(0) + 1) * numInnerNodes / get_num_groups(0);
/* each workgroup iterates over its range of nodes */
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
{
global struct QBVHNodeN* curNode = &inner_nodes[i];
uint numChildren = refit_bottom(bvh, geosArray,
instance_leaf_aabbs,
curNode,
childrenAABB,
*InnerNode_GetBackPointer(backPointers, i));
if (numChildren != 0)
{
/* update bounds of node */
QBVHNodeN_setBounds(curNode, childrenAABB, numChildren);
/* refit upper parts of the BVH */
// TODO: this is not going to work for mixed nodes
refit_bottom_up(curNode, bvh, childrenAABB, numChildren);
}
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(8, 1, 1)))
void kernel Find_refit_treelets(
global struct BVHBase* bvh,
global TreeletNodeData* treelets,
global uint* scratchStartpoints,
global uint* startpointAlloc)
{
find_refit_treelets(bvh,
treelets,
scratchStartpoints,
startpointAlloc);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel Assign_refit_startpoints_to_treelets(
global struct BVHBase* bvh,
global TreeletNodeData* treelets,
global uint* scratchStartpoints)
{
assign_refit_startpoints_to_treelets(bvh, treelets, scratchStartpoints);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(128, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel Finalize_treelets_in_groups(
global struct BVHBase* bvh,
global uint* scratchStartpoints )
{
local uint depths[FINALIZE_TREELETS_SLM_DEPTHS_SPACE];
finalize_treelets_in_groups(bvh, scratchStartpoints, depths);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel Refit_quads_tree_per_group(global SquashedInput* psqinputs)
{
uint group_id = get_group_id(0);
SquashedInput sqinput = psqinputs[group_id];
global struct BVHBase* bvh = sqinput.pBvh;
uint numLeaves = BVHBase_GetNumQuads(bvh);
global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
global void* input = sqinput.pInput;
global struct AABB* bbox_scratch = sqinput.bbox_scratch;
uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
uint id = get_local_id(0);
for (uint leaf_id = id; leaf_id < numLeaves; leaf_id += get_local_size(0))
{
struct AABB theAABB;
refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB);
theAABB.lower.w = as_float(0xABBADEFFu);
bbox_scratch[leafsIndexOffset + leaf_id] = theAABB;
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(32, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel Refit_quads(
global struct BVHBase* bvh,
global void* input,
global struct AABB* bbox_scratch,
uint numGroupsExecuted,
global SquashedInputGroupDesc* sqinput)
{
uint numLeafs = BVHBase_GetNumQuads(bvh);
if (numLeafs == 0) return;
global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
uint numLeafsPerGr = (numLeafs + (numGroupsExecuted - 1)) / numGroupsExecuted;
uint id_start = get_group_id(0) * numLeafsPerGr + get_local_id(0);
uint id_end = min(id_start + numLeafsPerGr, numLeafs);
for (uint id = id_start; id < id_end; id+= get_local_size(0))
{
struct AABB theAABB;
refit_bottom_child_quad(leafs + id, geosArray, &theAABB);
theAABB.lower.w = as_float(0xABBADEFFu);
bbox_scratch[leafsIndexOffset + id] = theAABB;
}
if (get_group_id(0) == 0 && get_local_id(0) < 16)
{
uint groupnr;
uint treeletCnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
if (get_sub_group_local_id() == 0) {
groupnr = atomic_add_global(&sqinput->totalNumGroups, treeletCnt);
}
groupnr = sub_group_broadcast(groupnr, 0);
for (uint subtree = get_sub_group_local_id(); subtree < treeletCnt; subtree += get_sub_group_size())
{
uint gr = groupnr + subtree;
//printf("tree %llx, treelet %d/%d, grId %d, numStartpoints %d\n", bvh, subtree,treeletCnt, gr, BVHBase_GetRefitTreeletDescs(bvh)[subtree].numStartpoints);
sqinput[gr].bvh = (qword)bvh;
sqinput[gr].scratch = (qword)bbox_scratch;
sqinput[gr].groupInTree = subtree;
}
//if (get_local_id(0)==0 && treeletCnt > 1)
//{
// printf("tree %llx, tip treelet %d/%d = numStartpoints %d depth %d\n", bvh, treeletCnt, treeletCnt, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].numStartpoints, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].maxDepth);
//}
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel
Refit_tree_per_group_quad(
global SquashedInput* psqinputs)
{
uint group_id = get_group_id(0);
SquashedInput sqinput = psqinputs[group_id];
global struct BVHBase* bvh = sqinput.pBvh;
global struct AABB* bbox_scratch = sqinput.bbox_scratch;
global void* pInput = sqinput.pInput;
local Treelet_by_single_group_locals loc;
if (*BVHBase_GetRefitTreeletCntPtr(bvh) == 0)
return;
#if REFIT_DEBUG_CHECKS
uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
if (bottoms_cnt != 1) {
if (get_local_id(0) == 0)
{
printf("Error: this tree has more than 1 treelets!\n");
}
return;
}
#endif
/* get pointer to inner nodes and back pointers */
uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
// uniform per group
uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh);
uint numLeafs = bvh->quadLeafCur - bvh->quadLeafStart;
if (numLeafs == 0) { return; }
uint numLeafsByOneThread = (numLeafs + (get_local_size(0) - 1)) / get_local_size(0);
update_quads(bvh, pInput, bbox_scratch, get_local_id(0), numLeafsByOneThread);
mem_fence_workgroup_default(); work_group_barrier(0);
RefitTreelet trltDsc = *pTrltDsc;
refit_treelet_by_single_group(
bbox_scratch,
&loc,
bvh,
trltDsc,
false,
true);
if (trltDsc.maxDepth > 0)
{
mem_fence_workgroup_default(); work_group_barrier(0);
post_refit_encode_qnode_tree_per_group(bbox_scratch,bvh);
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel
Refit_treelet_per_group(
global SquashedInputGroupDesc* sqinput)
{
uint group_id = get_group_id(0);
global struct AABB* bbox_scratch = (global struct AABB* )sqinput[group_id].scratch;
global struct BVHBase* bvh = (global struct BVHBase* )sqinput[group_id].bvh;
group_id = sqinput[group_id].groupInTree;
/* get pointer to inner nodes and back pointers */
uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
// uniform per group
uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh);
bool should_we_process_treetip = true;
local Treelet_by_single_group_locals loc;
local bool* l_should_we_process_treetip = (local bool*)&loc;
#if REFIT_VERBOSE_LOG
if (group_id != 0) return;
#endif
if (bottoms_cnt > 1)
{
#if REFIT_VERBOSE_LOG
for (; group_id < bottoms_cnt; group_id++)
{
if (get_local_id(0) == 0) { printf("\n ====== treelet %d ====== \n", group_id); }
work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, memory_scope_device);
#endif
bool rootProcThread = refit_treelet_by_single_group(
bbox_scratch,
&loc,
bvh,
pTrltDsc[group_id],
true,
false);
// we have to make last group that finishes go up and process the treetip
if (rootProcThread)
{
mem_fence_gpu_invalidate();
uint finished_cnt = atomic_inc_global((global uint*) & bvh->refitTreeletCnt2);
should_we_process_treetip = finished_cnt + 1 == bottoms_cnt;
* l_should_we_process_treetip = should_we_process_treetip;
if (should_we_process_treetip) mem_fence_gpu_invalidate();
}
#if REFIT_VERBOSE_LOG
}
#endif
work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
should_we_process_treetip = *l_should_we_process_treetip;
}
if (should_we_process_treetip)
{
//this group will process treetip
if (get_local_id(0) == 0) { bvh->refitTreeletCnt2 = 0; }
if (bottoms_cnt == 1) { bottoms_cnt = 0; }
refit_treelet_by_single_group(
bbox_scratch,
&loc,
bvh,
pTrltDsc[bottoms_cnt],
true,
true);
}
}
/*
This kernel refits a BVH. The algorithm iterates over all BVH nodes
to find the leaf nodes, which is where refitting starts. For these
leaf nodes the bounds are recalculated and then propagated up the tree.
One kernel instance considers exactly one inner node as a startpoint,
not a range of inner nodes.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(8, 1, 1))) void kernel
Refit_per_one_startpoint(
global struct BVHBase* bvh,
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
global struct AABB3f* instance_leaf_aabbs )
{
/* here we temporarily store the bounds for the children of a node */
struct AABB childrenAABB[BVH_NODE_N6];
/* get pointer to inner nodes and back pointers */
global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
/* get the inner node that we will consider as a bottom startpoint */
const uint numInnerNodes = BVHBase_numNodes(bvh);
const uint innerNodeIdx = (get_group_id(0) + 0) * get_local_size(0) + get_local_id(0);
if (innerNodeIdx >= numInnerNodes) return;
global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
uint numChildren = refit_bottom(
bvh,
geosArray,
instance_leaf_aabbs,
curNode,
childrenAABB,
*InnerNode_GetBackPointer(backPointers, innerNodeIdx));
if (numChildren != 0)
{
/* update bounds of node */
QBVHNodeN_setBounds(curNode, childrenAABB, numChildren);
/* refit upper parts of the BVH */
/* TODO: this is not going to work for mixed nodes */
refit_bottom_up(curNode, bvh, childrenAABB, numChildren);
}
}
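/* Illustrative note (not part of the original source): here each work item maps
 * to exactly one inner node via group_id * local_size + local_id; with the
 * declared work-group size of 8, group 3 / lane 5 handles inner node 29. */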
#endif
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel
Refit_indirect_sg(
global struct BVHBase* bvh,
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
global struct AABB3f* instance_leaf_aabbs)
{
DO_Refit_per_one_startpoint_sg(bvh, geosArray, instance_leaf_aabbs, 0);
}

View file

@@ -1,546 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "common.h"
#include "api_interface.h"
#include "instance.h"
#include "GRLGen12.h"
#include "libs/lsc_intrinsics.h"
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel
DO_update_instance_leaves(global struct BVHBase* bvh,
uint64_t dxrInstancesArray,
uint64_t dxrInstancesPtr,
global struct AABB3f* instance_aabb_scratch,
uint id ,
global struct GRL_RAYTRACING_AABB* procedural_box
)
{
global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
(global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
(global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;
global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);
/* iterate over all children of the instance node and get their bounds */
uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex(&leafs[id]);
global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
if (dxrInstancesArray != NULL)
instance = &instancesArray[instanceIdx];
else
instance = instancesPtrArray[instanceIdx];
uint mask = GRL_get_InstanceMask(instance);
uint offset = NO_NODE_OFFSET;
struct AffineSpace3f xfm = AffineSpace3f_load_row_major(instance->Transform);
struct AABB3f bbox;
if (procedural_box != 0)
{
bbox.lower[0] = procedural_box->MinX;
bbox.lower[1] = procedural_box->MinY;
bbox.lower[2] = procedural_box->MinZ;
bbox.upper[0] = procedural_box->MaxX;
bbox.upper[1] = procedural_box->MaxY;
bbox.upper[2] = procedural_box->MaxZ;
}
else
{
global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
bbox = instanceBvh->Meta.bounds;
offset = BVH_ROOT_NODE_OFFSET;
}
const bool valid_min = isfinite(bbox.lower[0]) && isfinite(bbox.lower[1]) && isfinite(bbox.lower[2]);
const bool valid_max = isfinite(bbox.upper[0]) && isfinite(bbox.upper[1]) && isfinite(bbox.upper[2]);
if (!valid_min || !valid_max )
{
bbox.lower[0] = xfm.p.x;
bbox.lower[1] = xfm.p.y;
bbox.lower[2] = xfm.p.z;
bbox.upper[0] = xfm.p.x;
bbox.upper[1] = xfm.p.y;
bbox.upper[2] = xfm.p.z;
offset = NO_NODE_OFFSET;
mask = 0;
}
else
{
bbox = AABB3f_transform(xfm, bbox); // JDB TODO: Use faster abs-matrix method
}
instance_aabb_scratch[id] = bbox;
HwInstanceLeaf_Constructor(&leafs[id], instance, instanceIdx, offset, mask); // TODO: No instance opening for refittable BVH
}
/*
This function starts at some BVH node and refits all nodes upwards
to the root. At each node the algorithm only proceeds upwards once
all children of that node have been processed. This is tracked by
incrementing an atomic counter every time a node is reached; the
counter eventually reaches the node's child count, and only the
thread that completes it continues upwards.
*/
GRL_INLINE void refit_bottom_up(global struct QBVHNodeN *qnode_start, // start node to refit (already processed)
global struct BVHBase *bvh, // pointer to BVH
struct AABB *childrenAABB, // temporary data to use
uint numChildrenTotal)
{
global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
/* compute the index of the start node */
uint curNodeIndex = qnode_start - nodeData;
/* the start node got already processed, thus go to its parent node */
curNodeIndex = *InnerNode_GetBackPointer(backPointers,curNodeIndex) >> 6;
/* end at root node */
while (curNodeIndex != 0x03FFFFFF)
{
/* increment refit counter that counts refitted children of current node */
const uint parentPointer = 1 + atomic_inc_global( (__global uint *) InnerNode_GetBackPointer(backPointers, curNodeIndex));
/* if all children got refitted, then continue */
const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
numChildrenTotal = (parentPointer >> 3) & 0x7;
if (numChildrenRefitted != numChildrenTotal)
return;
/* reset refit counter for next refit */
*InnerNode_GetBackPointer(backPointers, curNodeIndex) &= 0xfffffff8;
/* get bounds of all children from child nodes directly */
global struct QBVHNodeN *qnode = nodeData + curNodeIndex;
global struct QBVHNodeN *qnode_child = (global struct QBVHNodeN *)QBVHNodeN_childrenPointer(qnode);
for (uint k = 0; k < numChildrenTotal; k++)
childrenAABB[k] = getAABB_QBVHNodeN(qnode_child + k);
/* update node bounds of all children */
QBVHNodeN_setBounds(qnode, childrenAABB, numChildrenTotal);
write_mem_fence(CLK_GLOBAL_MEM_FENCE);
/* make parent node the current node */
curNodeIndex = parentPointer >> 6;
}
/* update QBVH6 bounds */
struct AABB bounds;
AABB_init(&bounds);
for (uint i = 0; i < numChildrenTotal; i++)
AABB_extend(&bounds, &childrenAABB[i]);
setBVHBaseBounds(bvh, &bounds);
}
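/* Illustrative sketch (not part of the original source): the backpointer word
 * used above packs three fields: bits [2:0] count the children refitted so far,
 * bits [5:3] hold the total child count, and bits [31:6] hold the parent node
 * index, with 0x03FFFFFF marking the root. Hypothetical decode helpers: */
GRL_INLINE uint example_backpointer_num_refitted(uint bp) { return bp & 0x7; }
GRL_INLINE uint example_backpointer_num_children(uint bp) { return (bp >> 3) & 0x7; }
GRL_INLINE uint example_backpointer_parent_index(uint bp) { return bp >> 6; }
/* e.g. bp == (42u << 6) | (6u << 3) | 5u describes a node whose parent is node 42
 * and which has 6 children, 5 of them already refitted. */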
GRL_INLINE void SUBGROUP_refit_bottom_up(
uniform global struct QBVHNodeN* qnode_start, // start node to refit (already processed)
uniform global struct BVHBase* bvh, // pointer to BVH
varying struct AABB reduce_bounds,
uniform uint numChildrenTotal,
varying ushort lane,
varying ushort head_lane)
{
uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
/* compute the index of the start node */
uniform uint curNodeIndex = qnode_start - nodeData;
/* the start node got already processed, thus go to its parent node */
uniform curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6;
varying struct AABB childrenAABB;
/* end at root node */
while ( curNodeIndex != 0x03FFFFFF )
{
mem_fence_gpu_invalidate();
/* increment refit counter that counts refitted children of current node */
uniform uint parentPointer = 1;
if (lane == 0)
{
// acquire fence ensures that all previous writes complete before the atomic starts
parentPointer += atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, curNodeIndex));
}
parentPointer = intel_sub_group_shuffle( parentPointer, head_lane );
/* if all children got refitted, then continue */
uniform uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
numChildrenTotal = (parentPointer >> 3) & 0x7;
if ( numChildrenRefitted != numChildrenTotal )
return;
/* reset refit counter for next refit */
if (lane == 0)
{
*InnerNode_GetBackPointer(backPointers, curNodeIndex) = (parentPointer & 0xfffffff8);
}
/* get bounds of all children from child nodes directly */
global struct QBVHNodeN* qnode = nodeData + curNodeIndex;
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
varying ushort child_idx = (lane < numChildrenTotal) ? lane : 0;
childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
/* update node bounds of all children */
reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, head_lane );
subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildrenTotal, lane);
/* update node mask */
uchar childrenMask = qnode_child[child_idx].instMask;
qnode->instMask = sub_group_reduce_or_N6(childrenMask);
/* make parent node the current node */
curNodeIndex = parentPointer >> 6;
}
/* update QBVH6 bounds */
if( lane == 0 )
setBVHBaseBounds( bvh, &reduce_bounds );
}
GRL_INLINE void quadCopyVertices(
const struct QuadLeaf* pQuad,
struct QuadLeaf* newQuad)
{
const uint4* s = (const uint4*) & (pQuad->v[0][0]);
uint4* d = (uint4*) & (newQuad->v[0][0]);
const uint8* s2 = (const uint8*)(s+1);
uint8* d2 = (uint8*)(d+1);
*d = *s;
*d2 = *s2;
}
GRL_INLINE void get_updated_quad(
global const struct QuadLeaf* pQuad,
global GRL_RAYTRACING_GEOMETRY_DESC* geomDescs,
struct QuadLeaf* newQuad)
{
struct QuadLeaf tempQuad;
// fetch non vtx data;
{
uint4* tempQuad4U = (uint4*)&tempQuad;
global const uint4* pQuad4U = (global const uint4*)pQuad;
*tempQuad4U = *pQuad4U;
}
/* get the geomID and primID0/1 for both quad triangles */
const uint geomID = PrimLeaf_GetGeoIndex(&tempQuad.leafDesc);
const uint primID0 = tempQuad.primIndex0;
const uint primID1 = tempQuad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&tempQuad);
ushort fourth_vert = 0;
if (primID1 != primID0)
{
ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(&tempQuad);
fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
}
global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDescs + geomID;
uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);
// read the indices of the 4 verts we want
float3 vtx0, vtx1, vtx2, vtx3;
GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);
QuadLeaf_SetVertices(&tempQuad, vtx0, vtx1, vtx2, vtx3);
*newQuad = tempQuad;
}
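/* Illustrative sketch (not part of the original source): the packed_indices word
 * read above appears to store the second triangle's three vertex slots as 2-bit
 * fields (v0 in bits [1:0], v1 in [3:2], v2 in [5:4]); a field value of 3 selects
 * the quad's fourth vertex, which is what the two mask tests recover: */
GRL_INLINE ushort example_second_tri_fourth_vert(ushort packed_indices)
{
    if ((packed_indices & 0x0C) == 0x0C) return 1; /* v1 references the fourth vertex */
    if ((packed_indices & 0x30) == 0x30) return 2; /* v2 references the fourth vertex */
    return 0;                                      /* otherwise v0, matching the default above */
}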
// This calculates the child bounding boxes for an inner node whose children are *all* leaves.
// Mixed nodes are updated by the bottom-up pass instead.
GRL_INLINE uint refit_bottom( global struct BVHBase* bvh,
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
global struct AABB3f* instance_leaf_aabbs,
global struct QBVHNodeN* curNode,
struct AABB *childrenAABB,
uint backPointer)
{
uint numChildren = 0;
/* we start refit at leaf nodes, this case is for quad nodes */
if (curNode->type == BVH_QUAD_NODE)
{
global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);
/* iterate over all quads of the quad node and get their bounds */
numChildren = (backPointer >> 3) & 0x7;
for (uint k = 0; k < numChildren; k++)
{
struct QuadLeaf Q;
get_updated_quad(&quads[k], geomDesc, &Q);
quadCopyVertices(&Q, &quads[k]);
childrenAABB[k] = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad
}
}
/* we start refit at leaf nodes, this case is for procedural nodes */
else if (curNode->type == BVH_PROCEDURAL_NODE)
{
global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);
/* iterate over all children of the procedural node and get their bounds */
numChildren = (backPointer >> 3) & 0x7;
for (uint k = 0; k < numChildren; k++)
{
/* extract geomID and primID from leaf */
const uint startPrim = QBVHNodeN_startPrim(curNode, k);
const uint geomID = ProceduralLeaf_geomIndex(leaf);
const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
/* read bounds from geometry descriptor */
struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
childrenAABB[k].lower.x = aabb.MinX;
childrenAABB[k].lower.y = aabb.MinY;
childrenAABB[k].lower.z = aabb.MinZ;
childrenAABB[k].upper.x = aabb.MaxX;
childrenAABB[k].upper.y = aabb.MaxY;
childrenAABB[k].upper.z = aabb.MaxZ;
/* advance leaf pointer to next child */
leaf += QBVHNodeN_blockIncr(curNode, k);
}
}
/* we start refit at leaf nodes, this case is for instance nodes */
else if (curNode->type == BVH_INSTANCE_NODE)
{
global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
/* iterate over all children of the instance node and get their bounds */
numChildren = (backPointer >> 3) & 0x7;
for (uint k = 0; k < numChildren; k++)
{
uint leafindex = (instancesLeaves + k) - leafBase;
childrenAABB[k].lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
childrenAABB[k].upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
}
}
return numChildren;
}
// This calculates the child bounding boxes for an inner node whose children are *all* leaves.
// Mixed nodes are updated by the bottom-up pass instead.
GRL_INLINE uint SUBGROUP_refit_bottom(
uniform global struct BVHBase* bvh,
uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
uniform global struct AABB3f* instance_leaf_aabbs,
uniform global struct QBVHNodeN* curNode,
uniform uint backPointer,
varying struct AABB* childrenAABB,
varying uchar* childrenMask,
varying ushort lane,
global uchar* is_procedural_instance
)
{
uniform uint numChildren = 0;
bool enable_procedural_instance = (is_procedural_instance != 0);
/* we start refit at leaf nodes, this case is for quad nodes */
if (curNode->type == BVH_QUAD_NODE)
{
/* iterate over all quads of the quad node and get their bounds */
numChildren = (backPointer >> 3) & 0x7;
uniform global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);
struct QuadLeaf Q;
if (lane < numChildren)
{
get_updated_quad(&quads[lane], geomDesc, &Q);
*childrenAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad
quadCopyVertices(&Q, &quads[lane]);
*childrenMask = 0xff;
}
// FIXME: support leaves with more than one quad
}
/* we start refit at leaf nodes, this case is for procedural nodes */
else if (curNode->type == BVH_PROCEDURAL_NODE)
{
uniform global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);
/* iterate over all children of the procedural node and get their bounds */
numChildren = (backPointer >> 3) & 0x7;
varying uint incr = (lane < numChildren) ? InternalNode_GetChildBlockIncr((struct InternalNode*)curNode, lane) : 0;
incr = sub_group_scan_exclusive_add(incr);
if( lane < numChildren )
{
/* extract geomID and primID from leaf */
varying uint start_prim = InternalNode_GetChildStartPrim((struct InternalNode*)curNode, lane );
varying global struct ProceduralLeaf* my_leaf = leaf + incr;
const uint geomID = ProceduralLeaf_geomIndex(my_leaf);
const uint primID = ProceduralLeaf_primIndex(my_leaf, start_prim);
/* read bounds from geometry descriptor */
struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
childrenAABB->lower.x = aabb.MinX;
childrenAABB->lower.y = aabb.MinY;
childrenAABB->lower.z = aabb.MinZ;
childrenAABB->upper.x = aabb.MaxX;
childrenAABB->upper.y = aabb.MaxY;
childrenAABB->upper.z = aabb.MaxZ;
*childrenMask = 0xff;
}
}
/* we start refit at leaf nodes, this case is for instance nodes */
else if ( !enable_procedural_instance && curNode->type == BVH_INSTANCE_NODE)
{
uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);
/* iterate over all children of the instance node and get their bounds and masks */
numChildren = (backPointer >> 3) & 0x7;
if( lane < numChildren )
{
uint leafindex = (instancesLeaves + lane) - leafBase;
childrenAABB->lower.xyz = AABB3f_load_lower(&instance_leaf_aabbs[leafindex]);
childrenAABB->upper.xyz = AABB3f_load_upper(&instance_leaf_aabbs[leafindex]);
*childrenMask = HwInstanceLeaf_GetInstanceMask(&leafBase[leafindex]);
}
}
else if (enable_procedural_instance && curNode->type == BVH_INTERNAL_NODE)
{
// Handle procedural-instance leaves
// TODO: Generalize this! Should re-write the kernel to work with arbitrary mixed-mode leaves
numChildren = (backPointer >> 3) & 0x7;
uint childType = BVH_INTERNAL_NODE;
if ( lane < numChildren )
{
childType = InternalNode_GetChildType( (struct InternalNode*)curNode, lane );
if (childType != BVH_INTERNAL_NODE)
{
uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer( curNode );
uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
uint leafindex = (instancesLeaves + lane) - leafBase;
childrenAABB->lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
childrenAABB->upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
*childrenMask = HwInstanceLeaf_GetInstanceMask( &leafBase[leafindex] );
// see if the child has flipped from procedural to non-procedural and update the child type field as needed
uint instanceIndex = HwInstanceLeaf_GetInstanceIndex( &leafBase[leafindex] );
uint newChildType = is_procedural_instance[instanceIndex] ? BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
if (newChildType != childType)
{
InternalNode_SetChildType( (struct InternalNode*)curNode, lane, newChildType );
}
}
}
// don't ascend the tree for a true internal node
if (sub_group_all(childType == BVH_INTERNAL_NODE))
numChildren = 0;
}
return numChildren;
}
#define SG_REFIT_WG_SIZE 8
void DO_Refit_per_one_startpoint_sg(
global struct BVHBase* bvh,
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
global struct AABB3f* instance_leaf_aabbs,
global uchar* is_procedural_instance )
{
/* get pointer to inner nodes and back pointers */
global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
/* get the inner node that we will consider as a bottom startpoint */
const uint numInnerNodes = BVHBase_numNodes(bvh);
const uint innerNodeIdx = get_sub_group_global_id();
varying ushort lane = get_sub_group_local_id();
if (innerNodeIdx >= numInnerNodes) return;
varying struct AABB childrenAABB; // one child AABB per lane
AABB_init(&childrenAABB);
varying uchar childrenMask = 0; // one child mask per lane
global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
uint numChildren = SUBGROUP_refit_bottom(
bvh,
geosArray,
instance_leaf_aabbs,
curNode,
backPointer,
&childrenAABB,
&childrenMask,
lane,
is_procedural_instance
);
if (numChildren != 0)
{
/* update bounds of node */
struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&childrenAABB);
reduce_bounds = AABB_sub_group_shuffle(&reduce_bounds, 0);
subgroup_QBVHNodeN_setBounds(curNode, reduce_bounds, childrenAABB, numChildren, lane);
/* update mask of node */
uchar mask = sub_group_reduce_or_N6(childrenMask);
curNode->instMask = mask;
/* Leave this fence for now for all threads, if WG size is increased (tried 128) and fence is done
only by the first thread (similar to morton phase1) the machine hangs. */
mem_fence_gpu_invalidate();
/* refit upper parts of the BVH */
/* TODO: this is not going to work for mixed nodes */
SUBGROUP_refit_bottom_up(curNode, bvh, reduce_bounds, numChildren, lane, 0);
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -1,763 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "api_interface.h"
#include "d3d12.h"
#include "common.h"
#include "mem_utils.h"
#include "misc_shared.h"
#define offsetof(TYPE, ELEMENT) ((size_t)&(((TYPE *)0)->ELEMENT))
GRL_INLINE
uint GroupCountForCopySize(uint size)
{
return (size >> 8) + 4;
}
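/* Illustrative note (not part of the original source): this sizes the copy at
 * roughly one work group per 256 bytes of allocation plus 4 extra groups,
 * e.g. a 64 KiB BVH yields (65536 >> 8) + 4 = 260 groups. */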
GRL_INLINE
uint GroupCountForCopy(BVHBase* base)
{
return GroupCountForCopySize(base->Meta.allocationSize);
}
GRL_INLINE void copyInstanceDescs(InstanceDesc* instances, D3D12_RAYTRACING_INSTANCE_DESC* descs, uint64_t numInstances)
{
for (uint64_t instanceIndex = get_local_id(0); instanceIndex < numInstances; instanceIndex += get_local_size(0))
{
for (uint row = 0; row < 3; row++)
{
for (uint column = 0; column < 4; column++)
{
D3D12_set_transform(&descs[instanceIndex], row, column, InstanceDesc_get_transform(&instances[instanceIndex], row, column));
}
}
D3D12_set_instanceID(&descs[instanceIndex], InstanceDesc_get_instanceID(&instances[instanceIndex]));
D3D12_set_InstanceMask(&descs[instanceIndex], InstanceDesc_get_InstanceMask(&instances[instanceIndex]));
D3D12_set_InstanceContributionToHitGroupIndex(&descs[instanceIndex], InstanceDesc_get_InstanceContributionToHitGroupIndex(&instances[instanceIndex]));
D3D12_set_InstanceFlags(&descs[instanceIndex], InstanceDesc_get_InstanceFlags(&instances[instanceIndex]));
D3D12_set_AccelerationStructure(&descs[instanceIndex], InstanceDesc_get_AccelerationStructure(&instances[instanceIndex]));
}
}
GRL_INLINE void createGeoDescs(GeoMetaData* geoMetaData, D3D12_RAYTRACING_GEOMETRY_DESC* descs, uint64_t numGeos, const uint64_t dataBufferStart)
{
if (get_local_id(0) == 0)
{
uint64_t previousGeoDataBufferEnd = dataBufferStart;
for (uint64_t geoIndex = 0; geoIndex < numGeos; geoIndex += 1)
{
D3D12_set_Type(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Type));
D3D12_set_Flags(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Flags));
if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES)
{
// Every triangle is stored separately
uint64_t vertexBufferSize = 9 * sizeof(float) * geoMetaData[geoIndex].PrimitiveCount;
D3D12_set_triangles_Transform(&descs[geoIndex], 0);
D3D12_set_triangles_IndexFormat(&descs[geoIndex], INDEX_FORMAT_NONE);
D3D12_set_triangles_VertexFormat(&descs[geoIndex], VERTEX_FORMAT_R32G32B32_FLOAT);
D3D12_set_triangles_IndexCount(&descs[geoIndex], 0);
D3D12_set_triangles_VertexCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount * 3);
D3D12_set_triangles_IndexBuffer(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
D3D12_set_triangles_VertexBuffer_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
D3D12_set_triangles_VertexBuffer_StrideInBytes(&descs[geoIndex], 3 * sizeof(float));
previousGeoDataBufferEnd += vertexBufferSize;
}
else
{
D3D12_set_procedurals_AABBCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount);
D3D12_set_procedurals_AABBs_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
D3D12_set_procedurals_AABBs_StrideInBytes(&descs[geoIndex], sizeof(D3D12_RAYTRACING_AABB));
previousGeoDataBufferEnd += sizeof(D3D12_RAYTRACING_AABB) * geoMetaData[geoIndex].PrimitiveCount;
}
}
}
}
GRL_INLINE void copyIndiciesAndVerticies(D3D12_RAYTRACING_GEOMETRY_DESC* desc, QuadLeaf* quad)
{
float* vertices = (float*)D3D12_get_triangles_VertexBuffer_StartAddress(desc);
uint64_t firstTriangleIndex = quad->primIndex0;
uint64_t numTriangles = QuadLeaf_IsSingleTriangle(quad) ? 1 : 2;
vertices[firstTriangleIndex * 9] = quad->v[0][0];
vertices[firstTriangleIndex * 9 + 1] = quad->v[0][1];
vertices[firstTriangleIndex * 9 + 2] = quad->v[0][2];
vertices[firstTriangleIndex * 9 + 3] = quad->v[1][0];
vertices[firstTriangleIndex * 9 + 4] = quad->v[1][1];
vertices[firstTriangleIndex * 9 + 5] = quad->v[1][2];
vertices[firstTriangleIndex * 9 + 6] = quad->v[2][0];
vertices[firstTriangleIndex * 9 + 7] = quad->v[2][1];
vertices[firstTriangleIndex * 9 + 8] = quad->v[2][2];
if (numTriangles == 2)
{
uint64_t secondTriangleIndex = firstTriangleIndex + QuadLeaf_GetPrimIndexDelta(quad);
uint32_t packed_indices = QuadLeaf_GetSecondTriangleIndices(quad);
for( size_t i=0; i<3; i++ )
{
uint32_t idx = packed_indices & 3 ; packed_indices >>= 2;
for( size_t j=0; j<3; j++ )
vertices[secondTriangleIndex * 9 + i * 3 + j] = quad->v[idx][j];
}
}
}
GRL_INLINE
void storeProceduralDesc(
struct AABB procAABB,
uint32_t primId,
D3D12_RAYTRACING_GEOMETRY_DESC* geoDesc)
{
D3D12_RAYTRACING_AABB* proceduralDescs = (D3D12_RAYTRACING_AABB*)D3D12_get_procedurals_AABBs_StartAddress(geoDesc);
D3D12_set_raytracing_aabb(&proceduralDescs[primId], &procAABB);
}
GRL_INLINE
void copyDataFromLProcedurals(
BVHBase* base,
D3D12_RAYTRACING_GEOMETRY_DESC* descs)
{
unsigned numProcedurals = BVHBase_GetNumProcedurals(base);
InternalNode* innerNodes = BVHBase_GetInternalNodes(base);
unsigned numInnerNodes = BVHBase_GetNumInternalNodes(base);
if (BVHBase_GetNumProcedurals(base) > 0) //< there's no point entering here if there are no procedurals
{
// iterate on all inner nodes to identify those with procedural children, we have to take aabbs from them
for (uint32_t nodeI = get_local_id(0); nodeI < numInnerNodes; nodeI += get_local_size(0))
{
InternalNode* innerNode = innerNodes + nodeI;
if (innerNode->nodeType == NODE_TYPE_PROCEDURAL)
{
float* origin = innerNode->lower;
global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer((struct QBVHNodeN*)innerNode);
for (uint k = 0; k < 6; k++)
{
if (InternalNode_IsChildValid(innerNode, k))
{
struct AABB3f qbounds = {
(float)(innerNode->lower_x[k]), (float)(innerNode->lower_y[k]), (float)(innerNode->lower_z[k]),
(float)(innerNode->upper_x[k]), (float)(innerNode->upper_y[k]), (float)(innerNode->upper_z[k]) };
struct AABB dequantizedAABB;
dequantizedAABB.lower[0] = origin[0] + bitShiftLdexp(qbounds.lower[0], innerNode->exp_x - 8);
dequantizedAABB.lower[1] = origin[1] + bitShiftLdexp(qbounds.lower[1], innerNode->exp_y - 8);
dequantizedAABB.lower[2] = origin[2] + bitShiftLdexp(qbounds.lower[2], innerNode->exp_z - 8);
dequantizedAABB.upper[0] = origin[0] + bitShiftLdexp(qbounds.upper[0], innerNode->exp_x - 8);
dequantizedAABB.upper[1] = origin[1] + bitShiftLdexp(qbounds.upper[1], innerNode->exp_y - 8);
dequantizedAABB.upper[2] = origin[2] + bitShiftLdexp(qbounds.upper[2], innerNode->exp_z - 8);
dequantizedAABB = conservativeAABB(&dequantizedAABB);
/* extract geomID and primID from leaf */
const uint startPrim = QBVHNodeN_startPrim((struct QBVHNodeN*) innerNode, k);
const uint geomID = ProceduralLeaf_geomIndex(leaf);
const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
storeProceduralDesc(dequantizedAABB, primID, descs + geomID);
}
/* advance leaf pointer to next child */
leaf += QBVHNodeN_blockIncr((struct QBVHNodeN*)innerNode, k);
}
}
else if (innerNode->nodeType == NODE_TYPE_MIXED) { ERROR(); }
else {/* do nothing for other internal node types, they can't have procedural child (directly)*/; }
}
}
}
GRL_INLINE
void copyDataFromQuadLeaves(BVHBase* base,
D3D12_RAYTRACING_GEOMETRY_DESC* descs)
{
QuadLeaf* quads = BVHBase_GetQuadLeaves(base);
uint64_t numQuads = BVHBase_GetNumQuads(base);
for (uint64_t quadIdx = get_local_id(0); quadIdx < numQuads; quadIdx += get_local_size(0))
{
uint64_t descIdx = PrimLeaf_GetGeoIndex(&quads[quadIdx].leafDesc);
copyIndiciesAndVerticies(&descs[descIdx], &quads[quadIdx]);
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel clone_indirect(global char* dest,
global char* src)
{
BVHBase* base = (BVHBase*)src;
uint64_t bvhSize = base->Meta.allocationSize;
uint numGroups = GroupCountForCopy(base);
CopyMemory(dest, src, bvhSize, numGroups);
}
GRL_INLINE void compactT(global char* dest, global char* src, uint64_t compactedSize, uint skipCopy, uint groupCnt)
{
global BVHBase* baseSrc = (global BVHBase*)src;
global BVHBase* baseDest = (global BVHBase*)dest;
uint32_t offset = sizeof(BVHBase);
uint32_t numNodes = BVHBase_GetNumInternalNodes(baseSrc);
uint32_t nodeSize = numNodes * sizeof(InternalNode);
offset += nodeSize;
int quadChildFix = baseSrc->quadLeafStart;
int procChildFix = baseSrc->proceduralDataStart;
int instChildFix = baseSrc->instanceLeafStart;
// serialization already copies part of bvh base so skip this part
CopyMemory(dest + skipCopy, src + skipCopy, sizeof(BVHBase) - skipCopy, groupCnt);
baseDest->Meta.allocationSize = compactedSize;
if (baseSrc->Meta.instanceCount)
{
const uint32_t instLeafsSize = BVHBase_GetNumHWInstanceLeaves(baseSrc) * sizeof(HwInstanceLeaf);
CopyMemory(dest + offset, (global char*)BVHBase_GetHWInstanceLeaves(baseSrc), instLeafsSize, groupCnt);
const uint instanceLeafStart = (uint)(offset / 64);
baseDest->instanceLeafStart = instanceLeafStart;
instChildFix -= instanceLeafStart;
offset += instLeafsSize;
baseDest->instanceLeafEnd = (uint)(offset / 64);
}
if (baseSrc->Meta.geoCount)
{
const uint quadLeafsSize = BVHBase_GetNumQuads(baseSrc) * sizeof(QuadLeaf);
if (quadLeafsSize)
{
CopyMemory(dest + offset, (global char*)BVHBase_GetQuadLeaves(baseSrc), quadLeafsSize, groupCnt);
const uint quadLeafStart = (uint)(offset / 64);
baseDest->quadLeafStart = quadLeafStart;
quadChildFix -= quadLeafStart;
offset += quadLeafsSize;
baseDest->quadLeafCur = (uint)(offset / 64);
}
const uint procLeafsSize = BVHBase_GetNumProcedurals(baseSrc) * sizeof(ProceduralLeaf);
if (procLeafsSize)
{
CopyMemory(dest + offset, (global char*)BVHBase_GetProceduralLeaves(baseSrc), procLeafsSize, groupCnt);
const uint proceduralDataStart = (uint)(offset / 64);
baseDest->proceduralDataStart = proceduralDataStart;
procChildFix -= proceduralDataStart;
offset += procLeafsSize;
baseDest->proceduralDataCur = (uint)(offset / 64);
}
}
// copy nodes with fixed child offsets
global uint* nodeDest = (global uint*)(dest + sizeof(BVHBase));
global InternalNode* nodeSrc = (global InternalNode*)BVHBase_GetInternalNodes(baseSrc);
// used in mixed case
char* instanceLeavesBegin = (char*)BVHBase_GetHWInstanceLeaves(baseSrc);
char* instanceLeavesEnd = (char*)BVHBase_GetHWInstanceLeaves_End(baseSrc);
uint localId = get_sub_group_local_id();
for (uint i = get_group_id(0); i < numNodes; i += groupCnt)
{
uint nodePart = CacheLineSubgroupRead((const global char*)&nodeSrc[i]);
char nodeType = as_char4(sub_group_broadcast(nodePart, offsetof(InternalNode, nodeType) / 4))[0];
if (localId * 4 == offsetof(InternalNode, childOffset))
{
int childOffset = as_int(nodePart);
if (nodeType == NODE_TYPE_MIXED)
{
char* childPtr = (char*)&nodeSrc[i] + 64 * childOffset;
if (childPtr > instanceLeavesBegin && childPtr < instanceLeavesEnd)
nodePart = as_int(childOffset - instChildFix);
}
else if (nodeType == NODE_TYPE_INSTANCE)
nodePart = as_int(childOffset - instChildFix);
else if (nodeType == NODE_TYPE_QUAD)
nodePart = as_int(childOffset - quadChildFix);
else if (nodeType == NODE_TYPE_PROCEDURAL)
nodePart = as_int(childOffset - procChildFix);
}
nodeDest[i * 16 + localId] = nodePart;
}
if (baseSrc->Meta.instanceCount)
{
const uint32_t instanceDescSize = baseSrc->Meta.instanceCount * sizeof(InstanceDesc);
CopyMemory(dest + offset, src + baseSrc->Meta.instanceDescsStart, instanceDescSize, groupCnt);
baseDest->Meta.instanceDescsStart = offset;
offset += instanceDescSize;
}
if (baseSrc->Meta.geoCount)
{
const uint32_t geoMetaSize = baseSrc->Meta.geoCount * sizeof(GeoMetaData);
CopyMemory(dest + offset, src + baseSrc->Meta.geoDescsStart, geoMetaSize, groupCnt);
baseDest->Meta.geoDescsStart = offset;
offset += (geoMetaSize + 63) & ~63; // align to 64
}
uint backPointerDataStart = offset / 64;
uint refitTreeletsDataStart = backPointerDataStart;
uint refitStartPointDataStart = backPointerDataStart;
uint dataEnd = backPointerDataStart;
uint fatLeafTableStart = dataEnd;
uint fatLeafCount = baseSrc->fatLeafCount;
uint innerTableStart = dataEnd;
uint innerCount = baseSrc->innerCount;
uint quadLeftoversCountNewAtomicUpdate = baseSrc->quadLeftoversCountNewAtomicUpdate;
uint quadTableSizeNewAtomicUpdate = baseSrc->quadTableSizeNewAtomicUpdate;
uint quadIndicesDataStart = dataEnd;
if (BVHBase_HasBackPointers(baseSrc))
{
#if 0 //
const uint oldbackpontersDataStart = baseSrc->backPointerDataStart;
const uint shift = oldbackpontersDataStart - backPointerDataStart;
const uint refitStructsSize = ((BVHBase_GetRefitStructsDataSize(baseSrc)) + 63) & ~63;
CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), refitStructsSize, groupCnt);
refitTreeletsDataStart = baseSrc->refitTreeletsDataStart - shift;
refitStartPointDataStart = baseSrc->refitStartPointDataStart - shift;
dataEnd = baseSrc->BVHDataEnd - shift;
#else // compacting version
const uint backpointersSize = ((numNodes*sizeof(uint)) + 63) & ~63;
CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), backpointersSize, groupCnt);
offset += backpointersSize;
refitTreeletsDataStart = offset / 64;
refitStartPointDataStart = offset / 64;
// TODO: remove treelets from .... everywhere
const uint treeletExecutedCnt = *BVHBase_GetRefitTreeletCntPtr(baseSrc);
if (treeletExecutedCnt)
{
const uint treeletCnt = treeletExecutedCnt > 1 ? treeletExecutedCnt + 1 : 1;
refitTreeletsDataStart = offset / 64;
const uint treeletsSize = ((treeletCnt * sizeof(RefitTreelet)) + 63) & ~63;
RefitTreelet* destTreelets = (RefitTreelet*)(dest + offset);
RefitTreelet* srcTreelets = BVHBase_GetRefitTreeletDescs(baseSrc);
uint numThreads = groupCnt * get_local_size(0);
uint globalID = (get_group_id(0) * get_local_size(0)) + get_local_id(0);
for (uint i = globalID; i < treeletCnt; i += numThreads)
{
RefitTreelet dsc = srcTreelets[i];
RefitTreeletTrivial* trivial_dsc = (RefitTreeletTrivial*)&dsc;
if (trivial_dsc->numStartpoints == 1 && trivial_dsc->childrenOffsetOfTheNode > numNodes) {
trivial_dsc->childrenOffsetOfTheNode -= quadChildFix;
}
destTreelets[i] = dsc;
}
offset += treeletsSize;
refitStartPointDataStart = offset / 64;
const uint startPointsSize = (BVHBase_GetRefitStartPointsSize(baseSrc) + 63) & ~63;
CopyMemory(dest + offset, (global char*)BVHBase_GetRefitStartPoints(baseSrc), startPointsSize, groupCnt);
offset += startPointsSize;
dataEnd = offset / 64;
}
uint fatleafEntriesSize = ((fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63);
fatLeafTableStart = offset / 64;
if (fatleafEntriesSize) {
CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), fatleafEntriesSize, groupCnt);
}
offset += fatleafEntriesSize;
// New atomic update
if(baseSrc->quadIndicesDataStart > baseSrc->backPointerDataStart)
{
uint numQuads = BVHBase_GetNumQuads(baseSrc);
uint quadTableMainBufferSize = (numQuads + 255) & ~255;
uint quadLeftoversSize = (quadLeftoversCountNewAtomicUpdate + 255) & ~255;
uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63);
if (quadTableEntriesSize) {
CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), quadTableEntriesSize, groupCnt);
}
offset += quadTableEntriesSize;
uint quadIndicesDataSize = ((numQuads * sizeof(QuadDataIndices) + 63) & ~63);
quadIndicesDataStart = offset / 64;
if (quadIndicesDataSize) {
CopyMemory(dest + offset, (global char*)BVHBase_GetQuadDataIndicesTable(baseSrc), quadIndicesDataSize, groupCnt);
}
offset += quadIndicesDataSize;
}
uint innerEntriesSize = ((innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63);
innerTableStart = offset / 64;
if (innerEntriesSize) {
CopyMemory(dest + offset, (global char*)BVHBase_GetInnerNodeTable(baseSrc), innerEntriesSize, groupCnt);
}
offset += innerEntriesSize;
dataEnd = offset / 64;
#endif
}
baseDest->backPointerDataStart = backPointerDataStart;
baseDest->refitTreeletsDataStart = refitTreeletsDataStart;
baseDest->refitStartPointDataStart = refitStartPointDataStart;
baseDest->fatLeafTableStart = fatLeafTableStart ;
baseDest->fatLeafCount = fatLeafCount;
baseDest->innerTableStart = innerTableStart;
baseDest->innerCount = innerCount;
baseDest->quadLeftoversCountNewAtomicUpdate = quadLeftoversCountNewAtomicUpdate;
baseDest->quadTableSizeNewAtomicUpdate = quadTableSizeNewAtomicUpdate;
baseDest->quadIndicesDataStart = quadIndicesDataStart;
baseDest->BVHDataEnd = dataEnd;
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel compact(global char* dest,
global char* src,
uint groupCnt)
{
uint64_t compactedSize = compute_compacted_size((BVHBase*)src);
compactT(dest, src, compactedSize, 0, groupCnt);
}
// set up the serialization header across all lanes: each lane gets one dword of the header, with the trailing 64 bits taken from 'reminder'
GRL_INLINE
unsigned prepare_header(
uint64_t headerSize,
uint64_t instancePtrSize,
uint64_t numInstances,
uint64_t bvhSize,
uint8_t* driverID,
uint64_t reminder)
{
unsigned loc_id = get_sub_group_local_id();
uint64_t SerializedSizeInBytesIncludingHeader = headerSize + instancePtrSize * numInstances + bvhSize;
uint64_t DeserializedSizeInBytes = bvhSize;
uint64_t InstanceHandleCount = numInstances;
char bvh_magic_str[] = BVH_MAGIC_MACRO;
uint* bvh_magic_uint = (uint*)bvh_magic_str;
unsigned headerTempLanePiece;
if (loc_id < 4) { headerTempLanePiece = *((unsigned*)&driverID[4*loc_id]); }
else if (loc_id == 4) { headerTempLanePiece = bvh_magic_uint[0]; }
else if (loc_id == 5) { headerTempLanePiece = bvh_magic_uint[1]; }
else if (loc_id == 6) { headerTempLanePiece = bvh_magic_uint[2]; }
else if (loc_id == 7) { headerTempLanePiece = bvh_magic_uint[3]; }
else if (loc_id == 8) { headerTempLanePiece = (uint)SerializedSizeInBytesIncludingHeader; }
else if (loc_id == 9) { headerTempLanePiece = (uint)(SerializedSizeInBytesIncludingHeader >> 32ul); }
else if (loc_id == 10) { headerTempLanePiece = (uint)DeserializedSizeInBytes; }
else if (loc_id == 11) { headerTempLanePiece = (uint)(DeserializedSizeInBytes >> 32ul); }
else if (loc_id == 12) { headerTempLanePiece = (uint)InstanceHandleCount; }
else if (loc_id == 13) { headerTempLanePiece = (uint)(InstanceHandleCount >> 32ul); }
else if (loc_id == 14) { headerTempLanePiece = (uint)reminder; }
else if (loc_id == 15) { headerTempLanePiece = (uint)(reminder >> 32ul); }
return headerTempLanePiece;
}
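/* Illustrative layout note (not part of the original source): each of the 16
 * lanes contributes one dword, so the values returned above assemble one
 * 64-byte header cache line:
 *   lanes  0-3  : 16-byte driver identifier
 *   lanes  4-7  : BVH_MAGIC_MACRO string
 *   lanes  8-9  : SerializedSizeInBytesIncludingHeader (64-bit)
 *   lanes 10-11 : DeserializedSizeInBytes (64-bit)
 *   lanes 12-13 : InstanceHandleCount (64-bit)
 *   lanes 14-15 : the caller-supplied 64-bit 'reminder' that pads the 56-byte
 *                 header out to a full cache line (see TRICK A/B below). */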
GRL_INLINE
void serializeT(
global byte_align64B* dest,
global byte_align64B* src,
global uint8_t* driverID,
uint groups_count)
{
SerializationHeader* header = (SerializationHeader*)dest;
BVHBase* base = (BVHBase*)src;
const uint headerSize = sizeof(SerializationHeader);
const uint numInstances = base->Meta.instanceCount;
const uint instancePtrSize = sizeof(gpuva_t);
const uint compactedSize = compute_compacted_size(base);
uint local_id = get_sub_group_local_id();
// this is not 64byte aligned :(
const uint offsetToBvh = headerSize + instancePtrSize * numInstances;
global InstanceDesc* src_instances = 0;
if (numInstances) {
src_instances = (global InstanceDesc*)((uint64_t)base + base->Meta.instanceDescsStart);
}
// effectively this part should end up as one 64B aligned 64B write
if (get_group_id(0) == groups_count - 1)
{
Block64B headerPlus;
// we patch the missing piece with the first instance's GPUVA or the bvh beginning (TRICK A and B)
// we assume header is 56B.
global uint64_t* srcPiece = (numInstances != 0) ? &src_instances[0].AccelerationStructureGPUVA : (global uint64_t*)src;
unsigned headerTemp;
headerTemp = prepare_header(
headerSize,
instancePtrSize,
numInstances,
compactedSize,
driverID,
*srcPiece);
CacheLineSubgroupWrite((global byte_align64B*)dest, headerTemp);
}
if (numInstances > 0)
{
uint instancesOffset = headerSize;
uint aligned_instance_ptrs_offset = ((instancesOffset + 63) >> 6) << 6;
uint unaligned_prefixing_instance_cnt = (aligned_instance_ptrs_offset - instancesOffset) >> 3;
unaligned_prefixing_instance_cnt = min(unaligned_prefixing_instance_cnt, numInstances);
global uint64_t* dst_instances = (global uint64_t*)(dest + instancesOffset);
// we've already copied the first instance pointer as part of the header write (see TRICK A)
// so the remaining instance pointers start at aligned memory
uint numAlignedInstances = numInstances - unaligned_prefixing_instance_cnt;
dst_instances += unaligned_prefixing_instance_cnt;
src_instances += unaligned_prefixing_instance_cnt;
if (numAlignedInstances)
{
// each 8 instances form a cacheline
uint numCachelines = numAlignedInstances >> 3; //qwords -> 64Bs
// qwords beyond the last multiple of 8
uint startReminder = numAlignedInstances & ~((1 << 3) - 1);
uint numreminder = numAlignedInstances & ((1 << 3) - 1);
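/* Illustrative note (not part of the original source): e.g. with
 * numAlignedInstances == 21 this gives numCachelines == 2, startReminder == 16
 * and numreminder == 5, so two full 64-byte lines are written by the loop below
 * and the last 5 pointers go through the remainder path. */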
uint task_id = get_group_id(0);
while (task_id < numCachelines)
{
uint src_id = task_id * 8 + (local_id >> 1);
uint* src_uncorected = (uint*)& src_instances[src_id].AccelerationStructureGPUVA;
uint* src = ((local_id & 1) != 0) ? src_uncorected + 1 : src_uncorected;
uint data = *src;
global char* dst = (global byte_align64B*)(dst_instances + (8 * task_id));
CacheLineSubgroupWrite(dst, data);
task_id += groups_count;
}
if (task_id == numCachelines && local_id < 8 && numreminder > 0)
{
// this should write full cacheline
uint index = startReminder + local_id;
// data will be taken from instances for lanes (local_id < numreminder)
// copy srcbvh beginning as uint64_t for remaining lanes (TRICK B)
global uint64_t* srcData = (local_id < numreminder) ?
&src_instances[index].AccelerationStructureGPUVA :
((global uint64_t*)src) + (local_id - numreminder);
dst_instances[index] = *srcData;
}
}
}
// the code above already copied the unaligned beginning of the destination bvh (see TRICK B)
uint32_t unalignedPartCopiedElsewhere = (64u - (offsetToBvh & (64u - 1u)))&(64u - 1u);
compactT(dest + offsetToBvh, src, compactedSize, unalignedPartCopiedElsewhere, groups_count);
}
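/* Illustrative note (not part of the original source): the serialized blob
 * produced above is laid out as
 *   [SerializationHeader][numInstances x 8-byte instance pointers][compacted BVH],
 * so the BVH starts at offsetToBvh = headerSize + 8 * numInstances, which is
 * generally not 64-byte aligned; TRICK A/B fold the first unaligned pieces into
 * the header and instance-pointer writes so compactT can skip them. */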
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel serialize_indirect(
global char* dest,
global char* src,
global uint8_t* driverID)
{
BVHBase* base = (BVHBase*)src;
uint groups_count = GroupCountForCopy(base);
serializeT(dest, src, driverID, groups_count);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel serialize_for_input_dump_indirect(
global struct OutputBatchPtrs* batchPtrs,
global dword* dstOffset,
global char* src,
global uint8_t* driverID)
{
BVHBase* base = (BVHBase*)src;
uint groups_count = GroupCountForCopy(base);
global char* dest = (global char*)(batchPtrs->dataStart + *dstOffset);
dest += (sizeof(OutputData) + 127) & ~127;
serializeT(dest, src, driverID, groups_count);
}
GRL_INLINE
void deserializeT(
global char* dest,
global char* src,
unsigned groupCnt)
{
SerializationHeader* header = (SerializationHeader*)src;
const uint64_t headerSize = sizeof(struct SerializationHeader);
const uint64_t instancePtrSize = sizeof(gpuva_t);
const uint64_t numInstances = header->InstanceHandleCount;
const uint64_t offsetToBvh = headerSize + instancePtrSize * numInstances;
const uint64_t bvhSize = header->DeserializedSizeInBytes;
if (numInstances)
{
const bool instances_mixed_with_inner_nodes = false;
if (instances_mixed_with_inner_nodes)
{
// not implemented !
// copy each node with 64byte granularity if node is instance, patch it mid-copy
}
else
{
BVHBase* srcBvhBase = (BVHBase*)(src + offsetToBvh);
// numHWInstances can be bigger (because of rebraiding) or smaller (because of inactive instances) than
// numInstances (count of pointers and descriptors).
uint offsetToHwInstances = srcBvhBase->instanceLeafStart << 6;
uint numHwInstances = (srcBvhBase->instanceLeafEnd - srcBvhBase->instanceLeafStart) >> 1;
//
// instances are in separate memory intervals
// copy all the other data the simple way
//
uint nodesEnd = srcBvhBase->Meta.instanceDescsStart;
// copy before instance leafs
CopyMemory(dest, (global char*)(src + offsetToBvh), offsetToHwInstances, groupCnt);
uint offsetPostInstances = srcBvhBase->instanceLeafEnd << 6;
uint instanceDescStart = srcBvhBase->Meta.instanceDescsStart;
uint sizePostInstances = instanceDescStart - offsetPostInstances;
// copy after instance leafs before instance desc
CopyMemory(dest + offsetPostInstances, (global char*)(src + offsetToBvh + offsetPostInstances), sizePostInstances, groupCnt);
uint instanceDescEnd = instanceDescStart + numInstances * sizeof(InstanceDesc);
uint sizePostInstanceDescs = bvhSize - instanceDescEnd;
// copy after instance desc
CopyMemory(dest + instanceDescEnd, (global char*)(src + offsetToBvh + instanceDescEnd), sizePostInstanceDescs, groupCnt);
global gpuva_t* newInstancePtrs = (global gpuva_t*)(src + headerSize);
global InstanceDesc* dstDesc = (global InstanceDesc*)(dest + instanceDescStart);
global InstanceDesc* srcDesc = (global InstanceDesc*)(src + offsetToBvh + instanceDescStart);
// copy and patch instance descriptors
for (uint64_t instanceIndex = get_group_id(0); instanceIndex < numInstances; instanceIndex += groupCnt)
{
InstanceDesc desc = srcDesc[instanceIndex];
uint64_t newInstancePtr = newInstancePtrs[instanceIndex];
desc.AccelerationStructureGPUVA = newInstancePtr; // patch it with new ptr;
dstDesc[instanceIndex] = desc;
}
// copy and patch hw instance leafs
global HwInstanceLeaf* dstInstleafs = (global HwInstanceLeaf*)(dest + offsetToHwInstances);
global HwInstanceLeaf* srcInstleafs = (global HwInstanceLeaf*)(src + offsetToBvh + offsetToHwInstances);
for (uint hwLeafIndex = get_group_id(0); hwLeafIndex < numHwInstances; hwLeafIndex += groupCnt)
{
// pull the instance from srcBVH
HwInstanceLeaf tmpInstleaf = srcInstleafs[hwLeafIndex];
uint swInstanceIndex = HwInstanceLeaf_GetInstanceIndex(&tmpInstleaf);
uint64_t childBvhPtr = (uint64_t)newInstancePtrs[swInstanceIndex];
uint64_t originalBvhPtr = (uint64_t)HwInstanceLeaf_GetBVH(&tmpInstleaf);
HwInstanceLeaf_SetBVH(&tmpInstleaf, childBvhPtr);
uint64_t startNode = HwInstanceLeaf_GetStartNode(&tmpInstleaf);
if (startNode != 0) {
uint64_t rootNodeOffset = startNode - originalBvhPtr;
HwInstanceLeaf_SetStartNode(&tmpInstleaf, childBvhPtr + rootNodeOffset);
}
dstInstleafs[hwLeafIndex] = tmpInstleaf;
}
}
}
else
{
CopyMemory(dest, (global char*)(src + offsetToBvh), bvhSize, groupCnt);
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel deserialize_indirect(
global char* dest,
global char* src)
{
SerializationHeader* header = (SerializationHeader*)src;
const uint64_t bvhSize = header->DeserializedSizeInBytes;
unsigned groupCnt = GroupCountForCopySize(bvhSize);
deserializeT(dest, src, groupCnt);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel dxr_decode(global char* dest,
global char* src)
{
DecodeHeader* header = (DecodeHeader*)dest;
BVHBase* base = (BVHBase*)src;
uint32_t numGeos = base->Meta.geoCount;
uint32_t numInstances = base->Meta.instanceCount;
if (numInstances > 0)
{
header->Type = TOP_LEVEL;
header->NumDesc = numInstances;
D3D12_RAYTRACING_INSTANCE_DESC* instanceDesc = (D3D12_RAYTRACING_INSTANCE_DESC*)(dest + sizeof(DecodeHeader));
copyInstanceDescs((InstanceDesc*)((uint64_t)base + (uint64_t)base->Meta.instanceDescsStart),
instanceDesc,
numInstances);
}
else if (numGeos > 0)
{
header->Type = BOTTOM_LEVEL;
header->NumDesc = numGeos;
D3D12_RAYTRACING_GEOMETRY_DESC* geomDescs = (D3D12_RAYTRACING_GEOMETRY_DESC*)(dest + sizeof(DecodeHeader));
uint64_t data = (uint64_t)geomDescs + sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) * numGeos;
createGeoDescs((GeoMetaData*)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart),
geomDescs,
numGeos,
data);
work_group_barrier(CLK_GLOBAL_MEM_FENCE);
copyDataFromQuadLeaves(base,
geomDescs);
copyDataFromLProcedurals(base,
geomDescs);
}
else
{
header->Type = BOTTOM_LEVEL;
header->NumDesc = 0;
}
}

View file

@@ -1,208 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
// @file bvh_debug.cl
//
// @brief routines to do basic integrity checks
//
// Notes:
//
#include "GRLGen12.h"
#include "intrinsics.h"
#include "libs/lsc_intrinsics.h"
#include "GRLGen12IntegrityChecks.h"
#include "api_interface.h"
#define ERROR_PRINTF 0
GRL_INLINE bool commit_err(
global uint* some_null,
global BVHBase* bvh,
global ERROR_INFO* err_info_slot,
ERROR_INFO err)
{
if (err.type != error_t_no_error) {
uint expected = error_t_no_error;
atomic_compare_exchange_global(&err_info_slot->type, &expected, err.type);
if (expected == error_t_no_error)
{
err_info_slot->offset_in_BVH = err.offset_in_BVH;
err_info_slot->when = err.when;
err_info_slot->reserved = 0xAAACCAAA;
mem_fence_evict_to_memory();
#if ERROR_PRINTF
printf("bvh = 0x%llX, err.type = %X, err.offset_in_BVH = %d\n", bvh, err.type, err.offset_in_BVH);
#else
// This is to trigger a page fault. Note we have to write directly to memory.
// If the write stayed in L3 it would not fault until it got evicted to memory.
store_uint_L1UC_L3UC(some_null, 0, 0x0EEE0000 + err.type);
#endif
return true;
}
}
return false;
}
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel check_tree_topology(
global uint* some_null,
global BVHBase* bvh,
global ERROR_INFO* err,
uint phase)
{
uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
if (err->type != error_t_no_error) return;
uint dummy1, dummy2, dummy3;
ERROR_INFO reterr = check_tree_topology_helper(bvh, globalID, &dummy1, &dummy2, &dummy3, false);
if (reterr.type == error_t_no_error)
{
reterr = check_backpointers(bvh, globalID);
}
if (reterr.type == error_t_no_error)
{
reterr = validate_atomic_update_structs(bvh, globalID);
}
reterr.when = phase;
commit_err(some_null, bvh, err, reterr);
}
GRL_INLINE bool IsValid48bPtr(qword ptr)
{
qword CANONIZED_BITS = 0xFFFFul << 48ul;
qword canonized_part = ptr & CANONIZED_BITS;
bool isIt = ptr != 0 && (
canonized_part == 0 || canonized_part == CANONIZED_BITS);
return isIt;
}
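IsValid48bPtr accepts a pointer only when bits 63:48 are all zero or all one, i.e. a canonically sign-extended 48-bit address, and rejects null. A minimal plain-C sketch of the same predicate with a few illustrative values follows; the helper name and the test addresses are hypothetical, for illustration only.

#include <assert.h>
#include <stdint.h>

/* Illustration only: same rule as IsValid48bPtr above, in plain C. */
static int is_valid_48b_ptr(uint64_t ptr)
{
    const uint64_t canon_bits = 0xFFFFull << 48;
    uint64_t high = ptr & canon_bits;
    return ptr != 0 && (high == 0 || high == canon_bits);
}

int main(void)
{
    assert(!is_valid_48b_ptr(0));                     /* null is rejected      */
    assert(is_valid_48b_ptr(0x00007fffdeadbeefull));  /* upper 16 bits all 0   */
    assert(is_valid_48b_ptr(0xffff800000001000ull));  /* upper 16 bits all 1   */
    assert(!is_valid_48b_ptr(0x1234000000001000ull)); /* non-canonical address */
    return 0;
}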
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel check_geos_before_quad_update(
global BVHBase* bvh, //dest bvh
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
global uint* some_null,
global ERROR_INFO* err,
uint phase,
uint numGeos,
uint numThreads)
{
uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
if (err->type != error_t_no_error) return;
// first check sanity of geos
ERROR_INFO geo_insanity_error = { error_t_input_geo_insane, 0 };
for (uint ID = globalID; ID < numGeos; ID += numThreads * get_sub_group_size())
{
bool IsSane = IsValid48bPtr((qword)geomDesc);
if (IsSane) {
GRL_RAYTRACING_GEOMETRY_DESC geo = geomDesc[globalID];
IsSane = geo.Type < NUM_GEOMETRY_TYPES;
if (IsSane) {
if (geo.Type == GEOMETRY_TYPE_TRIANGLES) {
if (geo.Desc.Triangles.IndexFormat >= INDEX_FORMAT_END) {
IsSane = false;
}
else
{
if (geo.Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE && geo.Desc.Triangles.IndexCount > 2)
{
IsSane = (geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END) &&
IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) &&
IsValid48bPtr((qword)geo.Desc.Triangles.pIndexBuffer);
}
else if (geo.Desc.Triangles.VertexCount > 2)
{
IsSane =
geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END &&
IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) != 0;
}
}
}
}
}
geo_insanity_error.offset_in_BVH = ID;
geo_insanity_error.when = phase;
if (!IsSane) {
commit_err(some_null, bvh, err, geo_insanity_error);
}
return;
}
}
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel check_geos_vs_quads(
global BVHBase* bvh,
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
global uint* some_null,
global ERROR_INFO* err,
uint phase,
uint numGeos,
uint numThreads)
{
uint numQuads = BVHBase_GetNumQuads(bvh);
QuadLeaf* quads = BVHBase_GetQuadLeaves(bvh);
uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
uint qoffset = bvh->quadLeafStart;
if (err->type != error_t_no_error) return;
ERROR_INFO theErr = { error_t_no_error, 0 };
for (uint ID = globalID; ID < numQuads; ID += numThreads * get_sub_group_size())
{
ERROR_INFO quadErr = { error_t_quad_leaf_broken, qoffset + ID, phase };
QuadLeaf quad = quads[ID];
uint geoIdx = PrimLeaf_GetGeoIndex(&quad.leafDesc);
if (geoIdx > numGeos) { commit_err(some_null, bvh, err, quadErr); return; }
uint numPrimsInGeo = geomDesc[geoIdx].Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE ?
geomDesc[geoIdx].Desc.Triangles.IndexCount / 3 :
geomDesc[geoIdx].Desc.Triangles.VertexCount / 3;
if(quad.primIndex0 >= numPrimsInGeo) {
commit_err(some_null, bvh, err, quadErr);
return;
}
if(!QuadLeaf_IsSingleTriangle(&quad) &&
(quad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&quad) >= numPrimsInGeo))
{
commit_err(some_null, bvh, err, quadErr);
return;
}
}
}
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel check_instances_linked_bvhs(
global uint* some_null,
global BVHBase* bvh,
global ERROR_INFO* err,
uint phase)
{
if (err->type != error_t_no_error) return;
uint instanceLeafStart = bvh->instanceLeafStart;
uint instanceLeafEnd = bvh->instanceLeafEnd;
uint numInstances = (instanceLeafEnd - instanceLeafStart) / 2;
uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
ERROR_INFO reterr = check_instances_linked_bvhs_helper(bvh, globalID, /*touchBlas*/true);
reterr.when = phase;
commit_err(some_null, bvh, err, reterr);
}

View file

@ -1,107 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module bvh_on_gpu_checks;
kernel_module debug_kernels ("bvh_debug.cl")
{
links lsc_intrinsics;
kernel opencl_check_tree_topology < kernelFunction="check_tree_topology">;
kernel opencl_check_instances_linked_bvhs < kernelFunction="check_instances_linked_bvhs">;
kernel opencl_check_geos_before_quad_update < kernelFunction="check_geos_before_quad_update">;
kernel opencl_check_geos_vs_quads < kernelFunction="check_geos_vs_quads">;
}
metakernel debug_checks_prepare_const_regs()
{
define cRoundingSIMD REG4;
define cInit0 REG5;
define cShiftForSIMD REG3;
cRoundingSIMD = (16-1);
cShiftForSIMD = 4;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
}
metakernel debug_checks_bvh_topology(
qword some_null_ptr,
qword bvh,
qword bvh_inner_nodes_end,
qword error_struct,
dword when,
dword bvh_inner_nodes_start_value )
{
define cRoundingSIMD REG4;
define cShiftForSIMD REG3;
REG1.lo = load_dword(bvh_inner_nodes_end);
REG0 = bvh_inner_nodes_start_value;
REG1.hi = 0;
REG2 = REG1 - REG0;
REG2 = REG2 + cRoundingSIMD;
REG2 = REG2 >> cShiftForSIMD;
DISPATCHDIM_X = REG2.lo;
dispatch_indirect opencl_check_tree_topology args(
some_null_ptr,
bvh,
error_struct,
when);
}
metakernel debug_check_instances_linked_bvhs(
qword some_null_ptr,
qword bvh,
qword error_struct,
dword numHWThreads,
dword when)
{
dispatch opencl_check_instances_linked_bvhs(numHWThreads,1,1) args(
some_null_ptr,
bvh,
error_struct,
when);
}
metakernel debug_check_geos_before_quad_update(
qword bvh,
qword geos,
qword some_null_ptr,
qword error_struct,
dword when,
dword numGeos,
dword numHWThreads )
{
dispatch opencl_check_geos_before_quad_update(numHWThreads,1,1) args(
bvh,
geos,
some_null_ptr,
error_struct,
when,
numGeos,
numHWThreads );
}
metakernel debug_check_geos_vs_quads(
qword bvh,
qword geos,
qword some_null_ptr,
qword error_struct,
dword when,
dword numGeos,
dword numHWThreads )
{
dispatch opencl_check_geos_vs_quads(numHWThreads,1,1) args(
bvh,
geos,
some_null_ptr,
error_struct,
when,
numGeos,
numHWThreads );
}

View file

@ -1,97 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "api_interface.h"
#include "d3d12.h"
#include "common.h"
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel compacted_size(global char *bvh_mem,
global char *postbuild_info)
{
BVHBase *base = (BVHBase *)bvh_mem;
PostbuildInfoCompactedSize *postbuildInfoCompacted = (PostbuildInfoCompactedSize *)postbuild_info;
postbuildInfoCompacted->CompactedSizeInBytes = compute_compacted_size(base);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel current_size(global char *bvh_mem,
global char *postbuild_info)
{
BVHBase *base = (BVHBase *)bvh_mem;
PostbuildInfoCurrentSize *postbuildInfoCurrent = (PostbuildInfoCurrentSize *)postbuild_info;
postbuildInfoCurrent->CurrentSizeInBytes = base->Meta.allocationSize;
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel serialized_size(global char *bvh_mem,
global char *postbuild_info)
{
BVHBase *base = (BVHBase *)bvh_mem;
PostbuildInfoSerializationDesc *postbuildInfoSerialization = (PostbuildInfoSerializationDesc *)postbuild_info;
uint64_t headerSize = sizeof(SerializationHeader);
uint64_t numInstances = base->Meta.instanceCount;
postbuildInfoSerialization->SerializedSizeInBytes = sizeof(SerializationHeader) +
numInstances * sizeof(gpuva_t) +
compute_compacted_size(base);
//base->Meta.allocationSize;
postbuildInfoSerialization->NumBottomLevelAccelerationStructurePointers = numInstances;
}
void countTrianglesAndProcedurals(GeoMetaData *geoMetaData,
uint64_t numGeos,
uint64_t *numTriangles,
uint64_t *numProcedurals)
{
uint64_t numTrianglesLoc = 0;
uint64_t numProceduralsLoc = 0;
for (uint64_t geoIndex = get_local_id(0); geoIndex < numGeos; geoIndex += get_local_size(0))
{
if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES)
{
*numTriangles += geoMetaData[geoIndex].PrimitiveCount;
}
else
{
*numProcedurals += geoMetaData[geoIndex].PrimitiveCount;
}
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel decoded_size(global char *bvh_mem,
global char *postbuild_info)
{
BVHBase *base = (BVHBase *)bvh_mem;
PostbuildInfoToolsVisualizationDesc *postbuildInfoDecoded = (PostbuildInfoToolsVisualizationDesc *)postbuild_info;
uint64_t numTriangles = 0;
uint64_t numProcedurals = 0;
countTrianglesAndProcedurals((GeoMetaData *)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart),
base->Meta.geoCount,
&numTriangles,
&numProcedurals);
uint64_t numInstances = base->Meta.instanceCount;
uint64_t numDescs = base->Meta.geoCount;
uint64_t headerSize = sizeof(DecodeHeader);
uint64_t descsSize = numDescs * sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) +
numInstances * sizeof(D3D12_RAYTRACING_INSTANCE_DESC);
// Each triangle is stored separately - 3 vertices (9 floats) per triangle
uint64_t triangleDataSize = 9 * sizeof(float);
uint64_t proceduralDataSize = sizeof(D3D12_RAYTRACING_AABB);
uint64_t geoDataSize = numTriangles * triangleDataSize + numProcedurals * proceduralDataSize;
postbuildInfoDecoded->DecodedSizeInBytes = headerSize + descsSize + geoDataSize;
}

File diff suppressed because it is too large

View file

@ -1,429 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "shared.h"
#include "intrinsics.h"
#include "AABB.h"
#include "AABB3f.h"
#include "qbvh6.h"
/* ====== BVH_BUILDER config ====== */
__constant const float cfg_intCost = 4.0f;
__constant const float cfg_travCost = 1.0f;
__constant const uint cfg_minLeafSize = BVH_LEAF_N_MIN;
__constant const uint cfg_maxLeafSize = BVH_LEAF_N_MAX;
__constant const uint cfg_maxDepth = BUILDRECORD_STACK_SIZE;
#define ENABLE_CONVERSION_CHECKS 0
#ifdef ENABLE_BIG_REG_ANNOTATION
#define GRL_ANNOTATE_BIG_REG_REQ __attribute__((annotate("num-thread-per-eu 4")))
#else
#define GRL_ANNOTATE_BIG_REG_REQ
#endif
#ifdef ENABLE_IGC_DO_NOT_SPILL
#define GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((annotate("igc-do-not-spill")))
#else
#define GRL_ANNOTATE_IGC_DO_NOT_SPILL
#endif
#define ERROR()
/* =================================================================================================================================================== */
/* =================================================================================================================================================== */
/* =================================================================================================================================================== */
/* =================================================================================================================================================== */
GRL_INLINE unsigned int getNumLeafPrims(unsigned int offset)
{
return (offset & 0x7) - 3;
}
GRL_INLINE unsigned int getLeafOffset(unsigned int offset)
{
return offset & (~0x7);
}
GRL_INLINE float4 triangleNormal(const float4 v0, const float4 v1, const float4 v2)
{
const float4 a = v1 - v0;
const float4 b = v2 - v0;
return cross(a, b);
}
GRL_INLINE float areaTriangle(const float4 v0, const float4 v1, const float4 v2)
{
const float4 normal = triangleNormal(v0, v1, v2);
return length((float3)(normal.x, normal.y, normal.z)) * 0.5f;
}
GRL_INLINE float det2(const float2 a, const float2 b)
{
return a.x * b.y - a.y * b.x;
}
GRL_INLINE float areaProjectedTriangle(const float4 v0, const float4 v1, const float4 v2)
{
const float xy = 0.5f * fabs(det2(v1.xy - v0.xy, v2.xy - v0.xy));
const float yz = 0.5f * fabs(det2(v1.yz - v0.yz, v2.yz - v0.yz));
const float zx = 0.5f * fabs(det2(v1.zx - v0.zx, v2.zx - v0.zx));
return xy + yz + zx;
}
typedef struct Block64B {
char data[64];
} Block64B __attribute__((aligned(64)));
typedef char byte_align64B __attribute__((aligned(64)));
/* ====================================================================== */
/* ============================== GLOBALS =============================== */
/* ====================================================================== */
GRL_INLINE bool Globals_OnFinish(global struct Globals *globals)
{
/* last active HW thread ? */
if (get_local_id(0) == 0)
{
const uint sync = atomic_add(&globals->sync, 1);
if (sync + 1 == get_num_groups(0))
{
globals->sync = 0;
return true;
}
}
return false;
}
GRL_INLINE uint BlockAllocator_BytesUsed(struct BlockAllocator *p)
{
return p->cur - p->start;
};
GRL_INLINE uint BlockAllocator_Alloc(__global struct BlockAllocator *p, const uint size)
{
return atomic_add(&p->cur, size);
}
GRL_INLINE uint BlockAllocator_Alloc_Single(__global struct BlockAllocator *p, const uint size)
{
uint offset = 0;
if (get_sub_group_local_id() == 0)
offset = atomic_add(&p->cur, size);
return sub_group_broadcast(offset, 0);
}
// node allocation returns an offset from beginning of BVH to allocated node
// in multiples of 64B
GRL_INLINE uint allocate_inner_nodes(global struct BVHBase* base, uint num_nodes )
{
return atomic_add_global( &base->nodeDataCur, num_nodes );
}
GRL_INLINE uint allocate_procedural_leaves(global struct BVHBase* base, uint num_nodes)
{
return atomic_add_global(&base->proceduralDataCur, num_nodes);
}
GRL_INLINE uint allocate_quad_leaves(global struct BVHBase* base, uint num_nodes)
{
return atomic_add_global(&base->quadLeafCur, num_nodes);
}
#if 0
GRL_INLINE uint alloc_node_mem(global struct Globals *globals, const uint size)
{
const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
return BlockAllocator_Alloc(&globals->node_mem_allocator, aligned_size);
}
GRL_INLINE uint alloc_single_node_mem(global struct Globals *globals, const uint size)
{
const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
return BlockAllocator_Alloc_Single(&globals->node_mem_allocator, aligned_size);
}
GRL_INLINE uint alloc_quad_leaf_mem(global struct Globals *globals, const uint size)
{
const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
return BlockAllocator_Alloc(&globals->quad_mem_allocator, aligned_size);
}
GRL_INLINE uint alloc_procedural_leaf_mem(global struct Globals *globals, const uint size)
{
const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
return BlockAllocator_Alloc(&globals->procedural_mem_allocator, aligned_size);
}
#endif
GRL_INLINE global struct BuildRecord *getBuildRecords(char *bvh_mem, struct Globals *globals)
{
return (global struct BuildRecord *)(bvh_mem + globals->build_record_start);
}
/* ======================================================================= */
/* ============================== TRIANGLE =============================== */
/* ======================================================================= */
/*GRL_INLINE void printTriangle(struct Triangle *t)
{
printf("vtx[0] %d vtx[1] %d vtx[2] %d primID %d geomID %d \n",t->vtx[0],t->vtx[1],t->vtx[2],t->primID,t->geomID);
}*/
/* ==================================================================== */
/* ============================== SPLIT =============================== */
/* ==================================================================== */
GRL_INLINE void printSplit(struct Split *split)
{
printf("split sah %f dim %d pos %d \n", split->sah, split->dim, split->pos);
}
/* ========================================================================== */
/* ============================== BUILDRECORD =============================== */
/* ========================================================================== */
GRL_INLINE void initBuildRecord(struct BuildRecord *buildRecord, uint start, uint end)
{
AABB_init(&buildRecord->centroidBounds);
buildRecord->start = start;
buildRecord->end = end;
}
GRL_INLINE void extendBuildRecord(struct BuildRecord *buildRecord, struct AABB *primref)
{
AABB_extend_point(&buildRecord->centroidBounds, AABB_centroid2(primref));
}
GRL_INLINE uint getBuildRecursionDepth(struct BuildRecord *buildRecord)
{
return as_uint(buildRecord->centroidBounds.upper.w);
}
GRL_INLINE void setBuildRecursionDepth(struct BuildRecord *buildRecord, uint depth)
{
buildRecord->centroidBounds.upper.w = as_float(depth);
}
GRL_INLINE uint getNumPrimsBuildRecord(struct BuildRecord *buildRecord)
{
return buildRecord->end - buildRecord->start;
}
/* ========================================================================== */
/* =================== BinaryMortonCodeHierarchy ============================= */
/* ========================================================================== */
GRL_INLINE void BinaryMortonCodeHierarchy_init(struct BinaryMortonCodeHierarchy *record, uint start, uint end)
{
record->range.start = start;
record->range.end = end;
record->leftChild = -1;
record->rightChild = -1;
// record->flag = 0;
}
GRL_INLINE uint BinaryMortonCodeHierarchy_getNumPrimitives(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID)
{
/* leaf case */
if (nodeID & (uint)(1 << 31))
return 1;
/* inner node case*/
else
return nodes[nodeID].range.end - nodes[nodeID].range.start + 1;
}
GRL_INLINE struct BinaryMortonCodeHierarchy BinaryMortonCodeHierarchy_getEntry(global struct BinaryMortonCodeHierarchy* nodes, uint nodeID)
{
struct BinaryMortonCodeHierarchy entry;
if (nodeID & (uint)(1 << 31)) {
/* leaf case */
uint rangeStart = nodeID ^ (uint)(1 << 31);
BinaryMortonCodeHierarchy_init(&entry, rangeStart, rangeStart);
}
else {
/* inner node case*/
entry = nodes[nodeID];
}
return entry;
}
GRL_INLINE uint BinaryMortonCodeHierarchy_getRangeStart(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID)
{
/* leaf case */
if (nodeID & (uint)(1 << 31))
return nodeID ^ (uint)(1 << 31);
/* inner node case*/
else
return nodes[nodeID].range.start;
}
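The helpers above all rely on one convention: a nodeID with bit 31 set denotes a leaf and its remaining bits carry the primitive range start, while any other value indexes the inner-node array. A small plain-C sketch of that encoding, assuming nothing beyond the bit layout used above (the helper names are hypothetical):

#include <assert.h>
#include <stdint.h>

#define MC_LEAF_BIT (1u << 31)

/* Hypothetical helpers mirroring the nodeID convention above. */
static uint32_t mc_encode_leaf(uint32_t range_start)  { return range_start | MC_LEAF_BIT; }
static int      mc_is_leaf(uint32_t node_id)          { return (node_id & MC_LEAF_BIT) != 0; }
static uint32_t mc_leaf_range_start(uint32_t node_id) { return node_id ^ MC_LEAF_BIT; }

int main(void)
{
    uint32_t id = mc_encode_leaf(42);
    assert(mc_is_leaf(id));
    assert(mc_leaf_range_start(id) == 42); /* a leaf spans exactly one primitive  */
    assert(!mc_is_leaf(7));                /* small values are inner-node indices */
    return 0;
}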
/* ==================================================================== */
/* ============================== RANGE =============================== */
/* ==================================================================== */
GRL_INLINE void printRange(struct Range *range)
{
printf("start %d end %d \n", range->start, range->end);
}
GRL_INLINE bool equalRange(struct Range *range0, struct Range *range1)
{
if (range0->start == range1->start &&
range0->end == range1->end)
return true;
return false;
}
GRL_INLINE uint getSizeRange(struct Range *range)
{
return range->end - range->start;
}
/* ==================================================================== */
/* ========================= ProceduralLeaf =========================== */
/* ==================================================================== */
#if 0
struct ProceduralLeaf
{
uint shaderIndex_geomMask;
uint geomIndex_flags;
uint N_last;
uint primIndex[13];
};
#endif
GRL_INLINE uint ProceduralLeaf_geomIndex(global struct ProceduralLeaf *This)
{
return This->leafDesc.geomIndex_flags & 0x1FFFFFFF;
}
GRL_INLINE uint ProceduralLeaf_primIndex(global struct ProceduralLeaf *This, uint i)
{
//assert(i < N);
return This->_primIndex[i];
}
/* ==================================================================== */
/* =========================== TrianglePair =========================== */
/* ==================================================================== */
struct TrianglePair
{
uint4 a; // indices of the 4 verts to store in the quad
uint3 lb; // index of the second triangle's verts in 'a'
};
GRL_INLINE struct TrianglePair TrianglePair_Constructor(uint3 tri0, uint primID0, uint3 tri1, uint primID1)
{
struct TrianglePair q;
q.a.x = tri0.x;
q.a.y = tri0.y;
q.a.z = tri0.z;
q.a.w = tri0.z;
uint3 b;
b.x = tri1.x;
b.y = tri1.y;
b.z = tri1.z;
q.lb = (uint3)(3);
q.lb.x = (b.x == q.a.x) ? 0 : q.lb.x;
q.lb.y = (b.y == q.a.x) ? 0 : q.lb.y;
q.lb.z = (b.z == q.a.x) ? 0 : q.lb.z;
q.lb.x = (b.x == q.a.y) ? 1 : q.lb.x;
q.lb.y = (b.y == q.a.y) ? 1 : q.lb.y;
q.lb.z = (b.z == q.a.y) ? 1 : q.lb.z;
q.lb.x = (b.x == q.a.z) ? 2 : q.lb.x;
q.lb.y = (b.y == q.a.z) ? 2 : q.lb.y;
q.lb.z = (b.z == q.a.z) ? 2 : q.lb.z;
q.lb.x = (primID0 != primID1) ? q.lb.x : 0;
q.lb.y = (primID0 != primID1) ? q.lb.y : 0;
q.lb.z = (primID0 != primID1) ? q.lb.z : 0;
q.a.w = (q.lb.x == 3) ? b.x : q.a.w;
q.a.w = (q.lb.y == 3) ? b.y : q.a.w;
q.a.w = (q.lb.z == 3) ? b.z : q.a.w;
return q;
}
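As a worked example of the packing above: tri0 = (0,1,2) and tri1 = (2,1,3) share the edge (1,2), so the pair collapses to a = (0,1,2,3) with lb = (2,1,3), i.e. tri1's vertices sit at a[2], a[1] and a[3]. The plain-C mirror below is for illustration only; the struct and function names are hypothetical stand-ins for the OpenCL vector types used here.

#include <assert.h>
#include <stdio.h>

typedef struct { unsigned x, y, z, w; } uint4_t;
typedef struct { unsigned x, y, z; }    uint3_t;

/* Illustration only: plain-C restatement of the packing rule above. */
static void pack_pair(uint3_t t0, unsigned id0, uint3_t t1, unsigned id1,
                      uint4_t *a, uint3_t *lb)
{
    *a  = (uint4_t){ t0.x, t0.y, t0.z, t0.z };
    *lb = (uint3_t){ 3, 3, 3 };
    const unsigned b[3]  = { t1.x, t1.y, t1.z };
    const unsigned av[3] = { t0.x, t0.y, t0.z };
    unsigned *l[3] = { &lb->x, &lb->y, &lb->z };
    for (int i = 0; i < 3; i++)              /* locate tri1's verts inside 'a'      */
        for (unsigned j = 0; j < 3; j++)
            if (b[i] == av[j]) *l[i] = j;
    if (id0 == id1)                          /* same primitive: single triangle     */
        lb->x = lb->y = lb->z = 0;
    if (lb->x == 3) a->w = b[0];             /* the one vertex not shared with tri0 */
    if (lb->y == 3) a->w = b[1];
    if (lb->z == 3) a->w = b[2];
}

int main(void)
{
    uint4_t a; uint3_t lb;
    pack_pair((uint3_t){0, 1, 2}, 0, (uint3_t){2, 1, 3}, 1, &a, &lb);
    printf("a=(%u,%u,%u,%u) lb=(%u,%u,%u)\n", a.x, a.y, a.z, a.w, lb.x, lb.y, lb.z);
    assert(a.w == 3 && lb.x == 2 && lb.y == 1 && lb.z == 3);
    return 0;
}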
GRL_INLINE float InstanceDesc_get_transform(const InstanceDesc *d, const uint32_t row, const uint32_t column)
{
return d->Transform[row][column];
}
GRL_INLINE uint32_t InstanceDesc_get_instanceID(const InstanceDesc *d)
{
return d->InstanceIDAndMask & (0x00FFFFFF);
}
GRL_INLINE uint32_t InstanceDesc_get_InstanceMask(const InstanceDesc *d)
{
return d->InstanceIDAndMask >> 24;
}
GRL_INLINE uint32_t InstanceDesc_get_InstanceContributionToHitGroupIndex(const InstanceDesc *d)
{
return d->InstanceContributionToHitGroupIndexAndFlags & ((1 << 24) - 1);
}
GRL_INLINE uint32_t InstanceDesc_get_InstanceFlags(const InstanceDesc *d)
{
return d->InstanceContributionToHitGroupIndexAndFlags >> 24;
}
GRL_INLINE gpuva_t InstanceDesc_get_AccelerationStructure(const InstanceDesc *d)
{
return d->AccelerationStructureGPUVA;
}
GRL_INLINE void InstanceDesc_set_transform(InstanceDesc *d, const uint32_t row, const uint32_t column, float value)
{
d->Transform[row][column] = value;
}
GRL_INLINE void InstanceDesc_set_instanceID(InstanceDesc *d, const uint32_t id)
{
d->InstanceIDAndMask &= 255 << 24;
d->InstanceIDAndMask |= id & ((1 << 24) - 1);
}
GRL_INLINE void InstanceDesc_set_InstanceMask(InstanceDesc *d, const uint32_t mask)
{
d->InstanceIDAndMask &= ((1 << 24) - 1);
d->InstanceIDAndMask |= mask << 24;
}
GRL_INLINE void InstanceDesc_set_InstanceContributionToHitGroupIndex(InstanceDesc *d, const uint32_t contribution)
{
d->InstanceContributionToHitGroupIndexAndFlags &= 255 << 24;
d->InstanceContributionToHitGroupIndexAndFlags |= contribution & ((1 << 24) - 1);
}
GRL_INLINE void InstanceDesc_set_InstanceFlags(InstanceDesc *d, const uint32_t flags)
{
d->InstanceContributionToHitGroupIndexAndFlags &= ((1 << 24) - 1);
d->InstanceContributionToHitGroupIndexAndFlags |= flags << 24;
}
GRL_INLINE void InstanceDesc_set_AccelerationStructure(InstanceDesc *d, gpuva_t address)
{
d->AccelerationStructureGPUVA = address;
}

View file

@ -1,129 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module copy; // In copy we assume output data structure to be DXR compatible
kernel clone_indirect < source="bvh_copy.cl", kernelFunction="clone_indirect" >
kernel compact < source="bvh_copy.cl", kernelFunction="compact" >
kernel serialize_indirect < source="bvh_copy.cl", kernelFunction="serialize_indirect" >
kernel serialize_for_input_dump_indirect < source="bvh_copy.cl", kernelFunction="serialize_for_input_dump_indirect" >
kernel deserialize_indirect < source="bvh_copy.cl", kernelFunction="deserialize_indirect" >
kernel dxr_decode < source="bvh_copy.cl", kernelFunction="dxr_decode" >
metakernel clone_indirect(
qword dest,
qword src,
qword srcBVHsizedwordAddr)
{
// this has to be compatible with the in-kernel GroupCountForCopy(...)
define byteSize REG0;
define numGroupsRqd REG1;
define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
byteSize = load_dword(srcBVHsizedwordAddr);
numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
DISPATCHDIM_X = numGroupsRqd.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect clone_indirect args(
dest,
src);
}
metakernel compact(
qword dest,
qword src)
{
dispatch compact(32,1,1) args(
dest,
src,
32);
}
metakernel serialize_indirect(
qword dest,
qword src,
qword driverID,
qword srcBVHsizedwordAddr)
{
define byteSize REG0;
define numGroupsRqd REG1;
define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
byteSize = load_dword(srcBVHsizedwordAddr);
numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
DISPATCHDIM_X = numGroupsRqd.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect serialize_indirect args(
dest,
src,
driverID);
}
metakernel serialize_for_input_dump_indirect(
qword batchPtrs,
qword dstOffset,
qword src,
qword driverID,
qword srcBVHsizedwordAddr)
{
define byteSize REG0;
define numGroupsRqd REG1;
define BYTE_PER_GROUP_CHUNK_SHIFT REG2; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
define REMINDER_NUM_GROUPS REG3; REMINDER_NUM_GROUPS = 4;
byteSize = load_dword(srcBVHsizedwordAddr);
numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
DISPATCHDIM_X = numGroupsRqd.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect serialize_for_input_dump_indirect args(
batchPtrs,
dstOffset,
src,
driverID);
}
metakernel deserialize_indirect(
qword dest,
qword src,
qword srcBVHsizedwordAddr)
{
define byteSize REG0;
define numGroupsRqd REG1;
define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
byteSize = load_dword(srcBVHsizedwordAddr);
numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
DISPATCHDIM_X = numGroupsRqd.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect deserialize_indirect args(
dest,
src);
}
metakernel dxr_decode(
qword dest,
qword src)
{
dispatch dxr_decode(1,1,1) args(
dest,
src);
}
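All of the indirect-dispatch metakernels above derive their group count the same way: shift the source BVH byte size right by BYTE_PER_GROUP_CHUNK_SHIFT (one group per 256-byte chunk) and add REMINDER_NUM_GROUPS of slack, which the comment notes has to stay compatible with the in-kernel GroupCountForCopy(). A host-side plain-C sketch of that arithmetic, assuming only the constants used in these metakernels (the function name is hypothetical):

#include <assert.h>
#include <stdint.h>

/* Illustration only: mirrors the metakernel arithmetic above
 * (byteSize >> 8, plus 4 groups of slack for the remainder). */
static uint32_t group_count_for_copy_size(uint64_t byte_size)
{
    const uint32_t chunk_shift = 8;  /* BYTE_PER_GROUP_CHUNK_SHIFT: 256 B per group */
    const uint32_t slack       = 4;  /* REMINDER_NUM_GROUPS                         */
    return (uint32_t)(byte_size >> chunk_shift) + slack;
}

int main(void)
{
    assert(group_count_for_copy_size(0)     == 4);
    assert(group_count_for_copy_size(256)   == 5);
    assert(group_count_for_copy_size(10000) == 43); /* 10000 >> 8 = 39 */
    return 0;
}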

View file

@ -1,525 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "GRLStructs.h"
#include "shared.h"
typedef global void *D3D12_GPU_VIRTUAL_ADDRESS;
typedef void *ID3D12StateObjectPrototype;
enum DXGI_FORMAT
{
DXGI_FORMAT_UNKNOWN,
DXGI_FORMAT_R32G32B32A32_TYPELESS,
DXGI_FORMAT_R32G32B32A32_FLOAT,
DXGI_FORMAT_R32G32B32A32_UINT,
DXGI_FORMAT_R32G32B32A32_SINT,
DXGI_FORMAT_R32G32B32_TYPELESS,
DXGI_FORMAT_R32G32B32_FLOAT,
DXGI_FORMAT_R32G32B32_UINT,
DXGI_FORMAT_R32G32B32_SINT,
DXGI_FORMAT_R16G16B16A16_TYPELESS,
DXGI_FORMAT_R16G16B16A16_FLOAT,
DXGI_FORMAT_R16G16B16A16_UNORM,
DXGI_FORMAT_R16G16B16A16_UINT,
DXGI_FORMAT_R16G16B16A16_SNORM,
DXGI_FORMAT_R16G16B16A16_SINT,
DXGI_FORMAT_R32G32_TYPELESS,
DXGI_FORMAT_R32G32_FLOAT,
DXGI_FORMAT_R32G32_UINT,
DXGI_FORMAT_R32G32_SINT,
DXGI_FORMAT_R32G8X24_TYPELESS,
DXGI_FORMAT_D32_FLOAT_S8X24_UINT,
DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS,
DXGI_FORMAT_X32_TYPELESS_G8X24_UINT,
DXGI_FORMAT_R10G10B10A2_TYPELESS,
DXGI_FORMAT_R10G10B10A2_UNORM,
DXGI_FORMAT_R10G10B10A2_UINT,
DXGI_FORMAT_R11G11B10_FLOAT,
DXGI_FORMAT_R8G8B8A8_TYPELESS,
DXGI_FORMAT_R8G8B8A8_UNORM,
DXGI_FORMAT_R8G8B8A8_UNORM_SRGB,
DXGI_FORMAT_R8G8B8A8_UINT,
DXGI_FORMAT_R8G8B8A8_SNORM,
DXGI_FORMAT_R8G8B8A8_SINT,
DXGI_FORMAT_R16G16_TYPELESS,
DXGI_FORMAT_R16G16_FLOAT,
DXGI_FORMAT_R16G16_UNORM,
DXGI_FORMAT_R16G16_UINT,
DXGI_FORMAT_R16G16_SNORM,
DXGI_FORMAT_R16G16_SINT,
DXGI_FORMAT_R32_TYPELESS,
DXGI_FORMAT_D32_FLOAT,
DXGI_FORMAT_R32_FLOAT,
DXGI_FORMAT_R32_UINT,
DXGI_FORMAT_R32_SINT,
DXGI_FORMAT_R24G8_TYPELESS,
DXGI_FORMAT_D24_UNORM_S8_UINT,
DXGI_FORMAT_R24_UNORM_X8_TYPELESS,
DXGI_FORMAT_X24_TYPELESS_G8_UINT,
DXGI_FORMAT_R8G8_TYPELESS,
DXGI_FORMAT_R8G8_UNORM,
DXGI_FORMAT_R8G8_UINT,
DXGI_FORMAT_R8G8_SNORM,
DXGI_FORMAT_R8G8_SINT,
DXGI_FORMAT_R16_TYPELESS,
DXGI_FORMAT_R16_FLOAT,
DXGI_FORMAT_D16_UNORM,
DXGI_FORMAT_R16_UNORM,
DXGI_FORMAT_R16_UINT,
DXGI_FORMAT_R16_SNORM,
DXGI_FORMAT_R16_SINT,
DXGI_FORMAT_R8_TYPELESS,
DXGI_FORMAT_R8_UNORM,
DXGI_FORMAT_R8_UINT,
DXGI_FORMAT_R8_SNORM,
DXGI_FORMAT_R8_SINT,
DXGI_FORMAT_A8_UNORM,
DXGI_FORMAT_R1_UNORM,
DXGI_FORMAT_R9G9B9E5_SHAREDEXP,
DXGI_FORMAT_R8G8_B8G8_UNORM,
DXGI_FORMAT_G8R8_G8B8_UNORM,
DXGI_FORMAT_BC1_TYPELESS,
DXGI_FORMAT_BC1_UNORM,
DXGI_FORMAT_BC1_UNORM_SRGB,
DXGI_FORMAT_BC2_TYPELESS,
DXGI_FORMAT_BC2_UNORM,
DXGI_FORMAT_BC2_UNORM_SRGB,
DXGI_FORMAT_BC3_TYPELESS,
DXGI_FORMAT_BC3_UNORM,
DXGI_FORMAT_BC3_UNORM_SRGB,
DXGI_FORMAT_BC4_TYPELESS,
DXGI_FORMAT_BC4_UNORM,
DXGI_FORMAT_BC4_SNORM,
DXGI_FORMAT_BC5_TYPELESS,
DXGI_FORMAT_BC5_UNORM,
DXGI_FORMAT_BC5_SNORM,
DXGI_FORMAT_B5G6R5_UNORM,
DXGI_FORMAT_B5G5R5A1_UNORM,
DXGI_FORMAT_B8G8R8A8_UNORM,
DXGI_FORMAT_B8G8R8X8_UNORM,
DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM,
DXGI_FORMAT_B8G8R8A8_TYPELESS,
DXGI_FORMAT_B8G8R8A8_UNORM_SRGB,
DXGI_FORMAT_B8G8R8X8_TYPELESS,
DXGI_FORMAT_B8G8R8X8_UNORM_SRGB,
DXGI_FORMAT_BC6H_TYPELESS,
DXGI_FORMAT_BC6H_UF16,
DXGI_FORMAT_BC6H_SF16,
DXGI_FORMAT_BC7_TYPELESS,
DXGI_FORMAT_BC7_UNORM,
DXGI_FORMAT_BC7_UNORM_SRGB,
DXGI_FORMAT_AYUV,
DXGI_FORMAT_Y410,
DXGI_FORMAT_Y416,
DXGI_FORMAT_NV12,
DXGI_FORMAT_P010,
DXGI_FORMAT_P016,
DXGI_FORMAT_420_OPAQUE,
DXGI_FORMAT_YUY2,
DXGI_FORMAT_Y210,
DXGI_FORMAT_Y216,
DXGI_FORMAT_NV11,
DXGI_FORMAT_AI44,
DXGI_FORMAT_IA44,
DXGI_FORMAT_P8,
DXGI_FORMAT_A8P8,
DXGI_FORMAT_B4G4R4A4_UNORM,
DXGI_FORMAT_P208,
DXGI_FORMAT_V208,
DXGI_FORMAT_V408,
DXGI_FORMAT_FORCE_UINT
};
typedef enum D3D12_RAYTRACING_GEOMETRY_FLAGS
{
D3D12_RAYTRACING_GEOMETRY_FLAG_NONE = 0,
D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE = 0x1,
D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2
} D3D12_RAYTRACING_GEOMETRY_FLAGS;
typedef enum D3D12_RAYTRACING_GEOMETRY_TYPE
{
D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES = 0,
D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS = (D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES + 1)
} D3D12_RAYTRACING_GEOMETRY_TYPE;
typedef enum D3D12_RAYTRACING_INSTANCE_FLAGS
{
D3D12_RAYTRACING_INSTANCE_FLAG_NONE = 0,
D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1,
D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2,
D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_OPAQUE = 0x4,
D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8
} D3D12_RAYTRACING_INSTANCE_FLAGS;
typedef struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE
{
D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
unsigned long StrideInBytes;
} D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE;
typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE
{
D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
unsigned long SizeInBytes;
} D3D12_GPU_VIRTUAL_ADDRESSRANGE;
typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE
{
D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
unsigned long SizeInBytes;
unsigned long StrideInBytes;
} D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE;
typedef struct D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC
{
D3D12_GPU_VIRTUAL_ADDRESS Transform;
enum DXGI_FORMAT IndexFormat;
enum DXGI_FORMAT VertexFormat;
unsigned int IndexCount;
unsigned int VertexCount;
D3D12_GPU_VIRTUAL_ADDRESS IndexBuffer;
struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE VertexBuffer;
} D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC;
typedef struct D3D12_RAYTRACING_AABB
{
float MinX;
float MinY;
float MinZ;
float MaxX;
float MaxY;
float MaxZ;
} D3D12_RAYTRACING_AABB;
GRL_INLINE void D3D12_set_raytracing_aabb(D3D12_RAYTRACING_AABB* dest, struct AABB* source)
{
dest->MinX = source->lower.x;
dest->MinY = source->lower.y;
dest->MinZ = source->lower.z;
dest->MaxX = source->upper.x;
dest->MaxY = source->upper.y;
dest->MaxZ = source->upper.z;
}
typedef struct D3D12_RAYTRACING_GEOMETRY_AABBS_DESC
{
unsigned long AABBCount;
D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE AABBs;
} D3D12_RAYTRACING_GEOMETRY_AABBS_DESC;
typedef struct D3D12_RAYTRACING_GEOMETRY_DESC
{
D3D12_RAYTRACING_GEOMETRY_TYPE Type;
D3D12_RAYTRACING_GEOMETRY_FLAGS Flags;
//unsigned int ShaderIndex : 24; // extension
//unsigned int Mask : 8; // extension
//unsigned int ShaderIndex_Mask; // extension
union {
D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC Triangles;
D3D12_RAYTRACING_GEOMETRY_AABBS_DESC AABBs;
};
} D3D12_RAYTRACING_GEOMETRY_DESC;
GRL_INLINE void D3D12_set_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_TYPE type)
{
geomDesc->Type = type;
}
GRL_INLINE D3D12_RAYTRACING_GEOMETRY_TYPE D3D12_get_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->Type;
}
GRL_INLINE void D3D12_set_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_FLAGS flags)
{
geomDesc->Flags = flags;
}
GRL_INLINE D3D12_RAYTRACING_GEOMETRY_FLAGS D3D12_get_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->Flags;
}
GRL_INLINE void D3D12_set_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS transform)
{
geomDesc->Triangles.Transform = transform;
}
GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->Triangles.Transform;
}
GRL_INLINE void D3D12_set_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, IndexFormat format)
{
switch (format)
{
case INDEX_FORMAT_NONE:
geomDesc->Triangles.IndexFormat = DXGI_FORMAT_UNKNOWN;
break;
case INDEX_FORMAT_R16_UINT:
geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R16_UINT;
break;
case INDEX_FORMAT_R32_UINT:
geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R32_UINT;
break;
}
}
GRL_INLINE IndexFormat D3D12_get_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
switch (geomDesc->Triangles.IndexFormat)
{
case DXGI_FORMAT_R16_UINT:
return INDEX_FORMAT_R16_UINT;
case DXGI_FORMAT_R32_UINT:
return INDEX_FORMAT_R32_UINT;
case DXGI_FORMAT_UNKNOWN:
default:
return INDEX_FORMAT_NONE;
}
}
GRL_INLINE void D3D12_set_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, VertexFormat format)
{
switch (format)
{
case VERTEX_FORMAT_R32G32_FLOAT:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32_FLOAT;
break;
case VERTEX_FORMAT_R32G32B32_FLOAT:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32B32_FLOAT;
break;
case VERTEX_FORMAT_R16G16_FLOAT:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_FLOAT;
break;
case VERTEX_FORMAT_R16G16B16A16_FLOAT:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_FLOAT;
break;
case VERTEX_FORMAT_R16G16_SNORM:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_SNORM;
break;
case VERTEX_FORMAT_R16G16B16A16_SNORM:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_SNORM;
break;
case VERTEX_FORMAT_R16G16B16A16_UNORM:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_UNORM;
break;
case VERTEX_FORMAT_R16G16_UNORM:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_UNORM;
break;
case VERTEX_FORMAT_R10G10B10A2_UNORM:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R10G10B10A2_UNORM;
break;
case VERTEX_FORMAT_R8G8B8A8_UNORM:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_UNORM;
break;
case VERTEX_FORMAT_R8G8_UNORM:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_UNORM;
break;
case VERTEX_FORMAT_R8G8B8A8_SNORM:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_SNORM;
break;
case VERTEX_FORMAT_R8G8_SNORM:
geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_SNORM;
break;
}
}
GRL_INLINE VertexFormat D3D12_get_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
switch(geomDesc->Triangles.VertexFormat)
{
case DXGI_FORMAT_R32G32_FLOAT:
return VERTEX_FORMAT_R32G32_FLOAT;
case DXGI_FORMAT_R32G32B32_FLOAT:
return VERTEX_FORMAT_R32G32B32_FLOAT;
case DXGI_FORMAT_R16G16_FLOAT:
return VERTEX_FORMAT_R16G16_FLOAT;
case DXGI_FORMAT_R16G16B16A16_FLOAT:
return VERTEX_FORMAT_R16G16B16A16_FLOAT;
case DXGI_FORMAT_R16G16_SNORM:
return VERTEX_FORMAT_R16G16_SNORM;
case DXGI_FORMAT_R16G16B16A16_SNORM:
return VERTEX_FORMAT_R16G16B16A16_SNORM;
case DXGI_FORMAT_R16G16B16A16_UNORM:
return VERTEX_FORMAT_R16G16B16A16_UNORM;
case DXGI_FORMAT_R16G16_UNORM:
return VERTEX_FORMAT_R16G16_UNORM;
case DXGI_FORMAT_R10G10B10A2_UNORM:
return VERTEX_FORMAT_R10G10B10A2_UNORM;
case DXGI_FORMAT_R8G8B8A8_UNORM:
return VERTEX_FORMAT_R8G8B8A8_UNORM;
case DXGI_FORMAT_R8G8_UNORM:
return VERTEX_FORMAT_R8G8_UNORM;
case DXGI_FORMAT_R8G8B8A8_SNORM:
return VERTEX_FORMAT_R8G8B8A8_SNORM;
case DXGI_FORMAT_R8G8_SNORM:
return VERTEX_FORMAT_R8G8_SNORM;
default:
return VERTEX_FORMAT_R32G32_FLOAT;
}
}
GRL_INLINE void D3D12_set_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count)
{
geomDesc->Triangles.IndexCount = count;
}
GRL_INLINE unsigned int D3D12_get_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->Triangles.IndexCount;
}
GRL_INLINE void D3D12_set_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count)
{
geomDesc->Triangles.VertexCount = count;
}
GRL_INLINE unsigned int D3D12_get_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->Triangles.VertexCount;
}
GRL_INLINE void D3D12_set_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS buffer)
{
geomDesc->Triangles.IndexBuffer = buffer;
}
GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->Triangles.IndexBuffer;
}
GRL_INLINE void D3D12_set_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address)
{
geomDesc->Triangles.VertexBuffer.StartAddress = address;
}
GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->Triangles.VertexBuffer.StartAddress;
}
GRL_INLINE void D3D12_set_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride)
{
geomDesc->Triangles.VertexBuffer.StrideInBytes = stride;
}
GRL_INLINE unsigned long D3D12_get_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->Triangles.VertexBuffer.StrideInBytes;
}
GRL_INLINE void D3D12_set_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long count)
{
geomDesc->AABBs.AABBCount = count;
}
GRL_INLINE unsigned long D3D12_get_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->AABBs.AABBCount;
}
GRL_INLINE void D3D12_set_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address)
{
geomDesc->AABBs.AABBs.StartAddress = address;
}
GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->AABBs.AABBs.StartAddress;
}
GRL_INLINE void D3D12_set_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride)
{
geomDesc->AABBs.AABBs.StrideInBytes = stride;
}
GRL_INLINE unsigned long D3D12_get_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
return geomDesc->AABBs.AABBs.StrideInBytes;
}
typedef struct D3D12_RAYTRACING_INSTANCE_DESC
{
float Transform[12];
// unsigned int InstanceID : 24;
// unsigned int InstanceMask : 8;
uint32_t DW0;
// unsigned int InstanceContributionToHitGroupIndex : 24;
// unsigned int Flags : 8;
uint32_t DW1;
global char *AccelerationStructure;
} D3D12_RAYTRACING_INSTANCE_DESC;
GRL_INLINE float D3D12_get_transform(const D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column)
{
return d->Transform[row * 4 + column];
}
GRL_INLINE uint32_t D3D12_get_instanceID(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
return d->DW0 & ((1 << 24) - 1);
}
GRL_INLINE uint32_t D3D12_get_InstanceMask(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
return d->DW0 >> 24;
}
GRL_INLINE uint32_t D3D12_get_InstanceContributionToHitGroupIndex(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
return d->DW1 & ((1 << 24) - 1);
}
GRL_INLINE uint32_t D3D12_get_InstanceFlags(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
return d->DW1 >> 24;
}
GRL_INLINE gpuva_t D3D12_get_AccelerationStructure(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
return (gpuva_t)d->AccelerationStructure;
}
GRL_INLINE void D3D12_set_transform(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column, float value)
{
d->Transform[row * 4 + column] = value;
}
GRL_INLINE void D3D12_set_instanceID(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t id)
{
d->DW0 &= 255 << 24;
d->DW0 |= id & ((1 << 24) - 1);
}
GRL_INLINE void D3D12_set_InstanceMask(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t mask)
{
d->DW0 &= ((1 << 24) - 1);
d->DW0 |= mask << 24;
}
GRL_INLINE void D3D12_set_InstanceContributionToHitGroupIndex(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t contribution)
{
d->DW1 &= 255 << 24;
d->DW1 |= contribution & ((1 << 24) - 1);
}
GRL_INLINE void D3D12_set_InstanceFlags(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t flags)
{
d->DW1 &= ((1 << 24) - 1);
d->DW1 |= flags << 24;
}
GRL_INLINE void D3D12_set_AccelerationStructure(D3D12_RAYTRACING_INSTANCE_DESC *d, gpuva_t address)
{
d->AccelerationStructure = (global char*)address;
}
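DW0 and DW1 above use the same 24/8 split: the low 24 bits carry the instance ID or the hit-group contribution and the high 8 bits carry the mask or the flags, and each setter clears only its own field before or-ing in the new value. A tiny plain-C round-trip of that packing (illustration only; the helper names are hypothetical):

#include <assert.h>
#include <stdint.h>

/* Illustration only: the 24-bit / 8-bit field split used by DW0 and DW1 above. */
static uint32_t set_low24(uint32_t dw, uint32_t v) { return (dw & (255u << 24)) | (v & ((1u << 24) - 1)); }
static uint32_t set_high8(uint32_t dw, uint32_t v) { return (dw & ((1u << 24) - 1)) | (v << 24); }

int main(void)
{
    uint32_t dw0 = 0;
    dw0 = set_low24(dw0, 0x123456); /* e.g. InstanceID   */
    dw0 = set_high8(dw0, 0xAB);     /* e.g. InstanceMask */
    assert((dw0 & ((1u << 24) - 1)) == 0x123456);
    assert((dw0 >> 24) == 0xAB);
    dw0 = set_low24(dw0, 0x000042); /* updating one field leaves the other intact */
    assert((dw0 >> 24) == 0xAB);
    return 0;
}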

View file

@ -1,59 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "api_interface.h"
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel CopyGeom(
global struct Geo *src,
global struct Geo *dst,
global float4 *vec,
global ushort *indices,
dword step)
{
src = src + get_group_id(0);
dst = dst + get_group_id(0);
dst->Flags = src->Flags;
dst->Type = src->Type;
if (src->Type == GEOMETRY_TYPE_PROCEDURAL)
{
dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride;
dst->Desc.Procedural.AABBCount = src->Desc.Procedural.AABBCount;
dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride;
}
else
{
dst->Desc.Triangles.pTransformBuffer = src->Desc.Triangles.pTransformBuffer;
if (step == 0)
return;
dst->Desc.Triangles.IndexCount = src->Desc.Triangles.IndexCount;
if (step == 1)
return;
dst->Desc.Triangles.VertexCount = src->Desc.Triangles.VertexCount;
if (step == 2)
return;
dst->Desc.Triangles.IndexFormat = src->Desc.Triangles.IndexFormat;
if (step == 3)
return;
dst->Desc.Triangles.pIndexBuffer = src->Desc.Triangles.pIndexBuffer;
if (step == 4)
return;
dst->Desc.Triangles.pVertexBuffer = src->Desc.Triangles.pVertexBuffer;
if (step == 5)
return;
dst->Desc.Triangles.VertexBufferByteStride = src->Desc.Triangles.VertexBufferByteStride;
dst->Desc.Triangles.VertexFormat = src->Desc.Triangles.VertexFormat;
for (uint t = 0; t * 3 < dst->Desc.Triangles.IndexCount; t++)
{
uint3 tri = GRL_load_triangle(src, t);
vec[t * 3] = GRL_load_vertex(src, tri[0]);
vec[t * 3 + 1] = GRL_load_vertex(src, tri[1]);
vec[t * 3 + 2] = GRL_load_vertex(src, tri[2]);
}
}
}

View file

@ -1,27 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module api_interface_verify;
kernel copy_geom < source="grl_api_interface_verify.cl", kernelFunction="CopyGeom" >
metakernel ifc0_copy(
qword src,
qword dst,
qword vec,
qword srcIndices,
dword numGroups,
dword step)
{
dispatch copy_geom(numGroups,1,1) args(
src,
dst,
vec,
srcIndices,
step
);
}

View file

@ -1,723 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "api_interface.h"
#include "common.h"
#include "d3d12.h"
#include "mem_utils.h"
#include "misc_shared.h"
/// Align value to 128
///
/// @param value value to align
/// @return aligned value
GRL_INLINE ulong AlignTo128(ulong value) { return ((value + 127) / 128) * 128; }
GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs) {
return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch)));
}
/// Finds max used byte in vertex buffer
///
/// @param indexBuffPtr pointer to index buffer
/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers
/// @param IndexCount number of indices in index buffer
/// @param IndexFormat index format
/// @param VertexCount number of vertices in vertex buffer
/// @param VertexBufferByteStride vertex buffer byte stride
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel find_max_used_byte_in_buff(
global void* indexBuffPtr,
global uint* vertexBufferUsedByteEnd,
dword IndexCount,
dword IndexFormat,
dword VertexCount,
qword VertexBufferByteStride)
{
local uint sgMax[16];
uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0);
if (IndexFormat != INDEX_FORMAT_NONE)
{
uint endByte = 0;
if (glob_id < IndexCount)
{
if (IndexFormat == INDEX_FORMAT_R16_UINT)
{
global ushort* indexBuffPtrShort = (global ushort*) indexBuffPtr;
endByte = indexBuffPtrShort[glob_id];
}
else
{
global uint* indexBuffPtrUint = (global uint*) indexBuffPtr;
endByte = indexBuffPtrUint[glob_id];
}
}
endByte = sub_group_reduce_max(endByte);
if (get_sub_group_local_id() == 0) { sgMax[get_sub_group_id()] = endByte; }
barrier(CLK_LOCAL_MEM_FENCE);
if (get_sub_group_id() == 0)
{
endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]);
if (get_sub_group_local_id() == 0)
{
endByte = min(endByte, VertexCount);
if (endByte < VertexCount && IndexCount != 0)
++endByte;
endByte *= (dword)VertexBufferByteStride;
atomic_max(vertexBufferUsedByteEnd, endByte);
}
}
}
else if (glob_id == 0)
{
uint endByte = VertexCount * VertexBufferByteStride;
atomic_max(vertexBufferUsedByteEnd, endByte);
}
}
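Functionally, find_max_used_byte_in_buff reduces to: take the largest vertex index referenced by the index buffer, turn it into a one-past-the-end vertex count clamped to VertexCount, and multiply by the vertex stride; non-indexed geometry simply uses VertexCount. The kernel folds per-thread maxima with subgroup reductions and an atomic_max; the scalar plain-C reference below only states the final value (illustration only, 32-bit indices assumed, hypothetical function name):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Illustration only: scalar equivalent of the reduction above.
 * indices == NULL corresponds to IndexFormat == INDEX_FORMAT_NONE. */
static uint32_t max_used_vertex_bytes(const uint32_t *indices, uint32_t index_count,
                                      uint32_t vertex_count, uint32_t stride)
{
    if (indices == NULL)
        return vertex_count * stride;
    uint32_t end = 0;
    for (uint32_t i = 0; i < index_count; i++)
        if (indices[i] > end) end = indices[i];
    if (end > vertex_count) end = vertex_count;        /* clamp to VertexCount   */
    if (end < vertex_count && index_count != 0) ++end; /* one past the max index */
    return end * stride;
}

int main(void)
{
    const uint32_t idx[] = { 0, 2, 5, 1 };
    assert(max_used_vertex_bytes(idx, 4, 100, 16) == 6 * 16);    /* max index 5 -> 6 verts */
    assert(max_used_vertex_bytes(NULL, 0, 100, 16) == 100 * 16); /* non-indexed path       */
    return 0;
}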
/// Allocates buffer for vertices
///
/// @param batchPtrs batch pointers struct
/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers
/// @param vertexBufferOffset pointer to offsets to vertex buffers
/// @param numVertexBuffers number of vertex buffers
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_linear_offsets_for_vertex_buffers(
global InputBatchPtrs* batchPtrs,
global uint* vertexBufferUsedByteEnd,
global uint* vertexBufferOffset,
dword numVertexBuffers)
{
uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();
if (glob_id < numVertexBuffers)
{
uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]);
uint position = atomic_add_global( &batchPtrs->vertexBuffersSize, numBytes);
vertexBufferOffset[glob_id] = position;
}
}
/// Sets the dst data space for input dump of this batch
///
/// @param inputDumpMainBuffer pointer to main dump buffer
/// @param batchPtrs batch pointers struct
/// @param nonVertexSize size of non-vertex data
/// @param batchIdPtr pointer to batch id
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_data_space_for_inputs(
global DebugBufferHeader* inputDumpMainBuffer,
global InputBatchPtrs* batchPtrs,
uint nonVertexSize,
global qword* batchIdPtr)
{
if (get_sub_group_local_id() == 0)
{
uint vertexBufferSize = batchPtrs->vertexBuffersSize;
uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize;
if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2))
{
inputDumpMainBuffer->overflow = 1;
batchPtrs->dumpDst = 0;
batchPtrs->globalDumpBuffer = 0;
batchPtrs->nonVertexDataStart = 0;
batchPtrs->totalSize = 0;
return;
}
dword prevHead = inputDumpMainBuffer->gpuHead;
dword newHead;
bool circled;
do
{
circled = false;
newHead = prevHead + sizeOfThisBatch;
dword bufferBegin = prevHead;
if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize)
{
circled = true;
newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch;
bufferBegin = inputDumpMainBuffer->headStart;
}
dword bufferEnd = newHead + sizeof(InputBatch);
uint tail;
uint tail2 = 7;
bool wait;
do
{
wait = true;
tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0);
// dead code: workaround so IGC won't hoist the tail load out of the loop
if (tail > inputDumpMainBuffer->totalSize)
{
store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2);
tail2 = tail;
}
if( prevHead >= tail )
{
//collision example:
// ----------T=======H------------
// -------B=====E-----------------
//
if((bufferEnd < tail) || (bufferBegin >= prevHead))
{
wait = false;
}
}
else
{
//collision example:
// ==========H-------T============
// B==============E---------------
// caution: H will never wrap all the way around so that H == T
if((bufferEnd < tail) && (bufferBegin >= prevHead))
{
wait = false;
}
}
} while (wait);
} while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead));
if (circled)
{
global InputBatch* endBufferOp = (global InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead);
endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER;
prevHead = inputDumpMainBuffer->headStart;
}
global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead;
batchPtrs->dumpDst = (qword)thisBatchDump;
batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer;
batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize);
batchPtrs->totalSize = sizeOfThisBatch;
global InputBatch* batchOp = (global InputBatch*) thisBatchDump;
batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH;
batchOp->header.opHeader.endOfData = sizeOfThisBatch;
batchOp->vertexBufferDataSize = vertexBufferSize;
batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize;
batchOp->batchId = *batchIdPtr;
}
}
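The wait loop above admits the new [bufferBegin, bufferEnd) region only when it cannot overlap the live data between the consumer tail and the old head; its two branches match the straight and wrapped layouts drawn in the comments. A plain-C restatement of that admission test, for illustration only (the function name is hypothetical):

#include <assert.h>
#include <stdint.h>

/* Illustration only: the admission test used by the wait loop above.
 * prev_head is the old gpuHead, [begin, end) is the region the new batch
 * wants to claim, and tail is the consumer position. */
static int region_is_free(uint32_t prev_head, uint32_t tail,
                          uint32_t begin, uint32_t end)
{
    if (prev_head >= tail)
        /* live data sits between T and H: ----T====H---- */
        return (end < tail) || (begin >= prev_head);
    /* live data wraps around the buffer:  ====H----T==== */
    return (end < tail) && (begin >= prev_head);
}

int main(void)
{
    assert(region_is_free(600, 200, 600, 900));  /* straight: writing right after H  */
    assert(!region_is_free(600, 200, 100, 300)); /* straight: lands on live [T, H)   */
    assert(region_is_free(100, 800, 100, 500));  /* wrapped: free span is [H, T)     */
    assert(!region_is_free(100, 800, 100, 900)); /* wrapped: would run into the tail */
    return 0;
}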
/// Sets the dst data space for output dump of this batch
///
/// @param outputDumpMainBuffer pointer to main dump buffer
/// @param batchPtrs batch pointers struct
/// @param batchIdPtr pointer to batch id
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_data_space_for_outputs(
global DebugBufferHeader* outputDumpMainBuffer,
global OutputBatchPtrs* batchPtrs,
global qword* batchIdPtr)
{
if (get_sub_group_local_id() == 0)
{
uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize;
if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2))
{
outputDumpMainBuffer->overflow = 1;
batchPtrs->dumpDst = 0;
batchPtrs->dataStart = 0;
batchPtrs->totalSize = 0;
return;
}
dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
dword newHead;
bool circled;
do
{
//mem_fence_gpu_invalidate();
//prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
circled = false;
newHead = prevHead + sizeOfThisBatch;
dword bufferBegin = prevHead;
if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize)
{
circled = true;
newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch;
bufferBegin = outputDumpMainBuffer->headStart;
}
dword bufferEnd = newHead + sizeof(OutputBatch);
uint tail;
uint tail2 = 7;
bool wait;
do
{
wait = true;
tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0);
// dead code: workaround so IGC won't hoist the tail load out of the loop
if (tail > outputDumpMainBuffer->totalSize)
{
store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2);
tail2 = tail;
}
if( prevHead >= tail )
{
//collision example:
// ----------T=======H------------
// -------B=====E-----------------
//
if((bufferEnd < tail) || (bufferBegin >= prevHead))
{
wait = false;
}
}
else
{
//collision example:
// ==========H-------T============
// B==============E---------------
// caution: H will never wrap all the way around so that H == T
if((bufferEnd < tail) && (bufferBegin >= prevHead))
{
wait = false;
}
}
} while (wait);
} while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead));
if (circled)
{
global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead);
endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER;
prevHead = outputDumpMainBuffer->headStart;
}
global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead;
batchPtrs->dumpDst = (qword)thisBatchDump;
batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch)));
batchPtrs->totalSize = sizeOfThisBatch;
global OutputBatch* batchOp = (global OutputBatch*) thisBatchDump;
batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH;
batchOp->header.opHeader.endOfData = sizeOfThisBatch;
batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch));
batchOp->batchId = *batchIdPtr;
}
}
/// Calculates sum of output sizes
///
/// @param pbi pointer to post build infos
/// @param destOffsets per-output offsets in dest buffer
/// @param numOutputs number of outputs
/// @param batchPtrs batch pointers struct
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel calc_outputs_data_size(
global PostbuildInfoSerializationDesc* pbi,
global dword* destOffsets,
qword numOutputs,
global OutputBatchPtrs* batchPtrs)
{
uint offset = 0;
for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH)
{
uint size = 0;
if (i < numOutputs)
{
size = AlignTo128(pbi[i].SerializedSizeInBytes);
size += AlignTo128(sizeof(OutputData));
destOffsets[i] = offset + sub_group_scan_exclusive_add(size);
}
offset += sub_group_reduce_add(size);
}
if (get_sub_group_local_id() == 0)
batchPtrs->dataSize = offset;
}
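calc_outputs_data_size is an exclusive prefix sum over per-output sizes, where each size is the serialized size plus an OutputData header, both rounded up to 128 bytes; output i starts at the sum of all earlier sizes, and the running total becomes the batch's dataSize. A scalar plain-C reference of the same computation (illustration only; the 128-byte header constant stands in for AlignTo128(sizeof(OutputData))):

#include <assert.h>
#include <stdint.h>

#define ALIGN128(x)      (((x) + 127u) / 128u * 128u)
#define OUTPUT_HDR_BYTES 128u /* stand-in for AlignTo128(sizeof(OutputData)) */

/* Illustration only: scalar form of the offset/size computation above. */
static uint32_t calc_output_offsets(const uint32_t *serialized_sizes,
                                    uint32_t *dest_offsets, uint32_t n)
{
    uint32_t offset = 0;
    for (uint32_t i = 0; i < n; i++)
    {
        dest_offsets[i] = offset; /* exclusive prefix sum */
        offset += ALIGN128(serialized_sizes[i]) + OUTPUT_HDR_BYTES;
    }
    return offset;                /* becomes batchPtrs->dataSize */
}

int main(void)
{
    uint32_t sizes[] = { 100, 300 };
    uint32_t offs[2];
    uint32_t total = calc_output_offsets(sizes, offs, 2);
    assert(offs[0] == 0);
    assert(offs[1] == 128 + 128);       /* ALIGN128(100) + header           */
    assert(total   == 256 + 384 + 128); /* offs[1] + ALIGN128(300) + header */
    return 0;
}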
/// Adds output data operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param destOffset offset in dest buffer
/// @param src pointer to source bvh
/// @param pbi pointer to post build info
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_output_data_op(
global OutputBatchPtrs* batchPtrs,
global dword* destOffset,
qword src,
global PostbuildInfoSerializationDesc* pbi)
{
if (batchPtrs->dataStart == 0)
return;
global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset);
out->header.operationType = OUTPUT_DUMP_OP_DATA;
out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes);
out->srcBvhPtr = src;
}
/// Writes indices and transform data, or procedural AABB data
///
/// @param batchPtrs batch pointers struct
/// @param srcDesc description of source geometry
/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer
/// @param dstDescOffset offset to dest geo desc
/// @param dstDataOffset offset to dest geo data
/// @param numThreads number of threads
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_geo_data(
global InputBatchPtrs* batchPtrs,
global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc,
global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
global uint* pVertexBufferSize,
qword dstDescOffset,
qword dstDataOffset,
dword numThreads)
{
if (batchPtrs->dumpDst == 0) return;
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc;
global char* dstDataPtr = (global char*)(
batchPtrs->nonVertexDataStart + dstDataOffset);
global char* srcDataPtr;
global char* dstTransform;
uint bytesToCopy = 0;
if (geoDescToStore.Type == GEOMETRY_TYPE_TRIANGLES)
{
uint sizeOfMatrix = 0;
if (geoDescToStore.Desc.Triangles.pTransformBuffer)
{
sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float));
if (glob_id < 12)
{
global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer;
global float* matrixDst = (global float*)dstDataPtr;
matrixDst[glob_id] = matrixSrc[glob_id];
if (glob_id == 0)
{
geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer;
}
}
}
dstDataPtr += sizeOfMatrix;
srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer;
bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount);
if (bytesToCopy && (glob_id == 0))
{
qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
// for this we remember offset relative to global debug buffer
geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride;
}
else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE && geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0)
{
if (geoDescToStore.Desc.Triangles.pVertexBuffer)
{
qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
// for this we remember offset relative to global debug buffer
geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
}
}
else if (glob_id == 0)
{
geoDescToStore.Desc.Triangles.IndexCount = 0;
geoDescToStore.Desc.Triangles.VertexCount = 0;
geoDescToStore.Desc.Triangles.pVertexBuffer = 0;
geoDescToStore.Desc.Triangles.pIndexBuffer = 0;
}
}
else
{
srcDataPtr = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA;
bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount);
if (glob_id == 0)
{
geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
}
}
if (bytesToCopy)
{
CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads);
}
if (glob_id == 0)
{
global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr = (global GRL_RAYTRACING_GEOMETRY_DESC*)(
batchPtrs->nonVertexDataStart + dstDescOffset);
*dstDescPtr = geoDescToStore;
}
}
/// Adds build operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param buildOpOffset offset in dst buffer
/// @param srcBvh address of src bvh (in case of update)
/// @param dstBvhAddr address of dest bvh buffer
/// @param offsetToEnd offset to end of this operation
/// @param flags build flags
/// @param numGeometries number of geometries in build
/// @param numInstances number of instances in build
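/// @param instArrayOfPtrs non-zero if the instances are provided as an array of pointers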
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_input_build_op(
global InputBatchPtrs* batchPtrs,
qword buildOpOffset,
qword srcBvh,
qword dstBvhAddr,
dword offsetToEnd,
dword flags,
dword numGeometries,
dword numInstances,
dword instArrayOfPtrs)
{
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
if (batchPtrs->dumpDst == 0 || glob_id != 0) return;
global InputBuild* buildOp = (global InputBuild*)(
batchPtrs->nonVertexDataStart + buildOpOffset);
buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD;
buildOp->header.endOfData = offsetToEnd;
buildOp->dstBvhPtr = dstBvhAddr;
buildOp->srcBvhPtr = srcBvh;
buildOp->flags = flags;
buildOp->numGeos = numGeometries;
buildOp->numInstances = numInstances;
buildOp->instArrayOfPtrs = instArrayOfPtrs;
}
/// Copies instance descriptors
///
/// @param batchPtrs batch pointers struct
/// @param instanceDescArr inst desc source
/// @param offset offset in dst buffer
/// @param numInstances number of instances to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
copy_instance_descriptors_array(
global InputBatchPtrs* batchPtrs,
global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr,
qword offset,
dword numInstances)
{
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
if (batchPtrs->dumpDst == 0) return;
global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC* )(
batchPtrs->nonVertexDataStart + offset);
if (glob_id < numInstances)
{
dst[glob_id] = instanceDescArr[glob_id];
}
}
/// Copies instance descriptors, array-of-pointers version
///
/// @param batchPtrs batch pointers struct
/// @param pInstanceDescPtrsArr inst desc source
/// @param offset offset in dst buffer
/// @param numInstances number of instances to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
copy_instance_descriptors_array_of_ptrs(
global InputBatchPtrs* batchPtrs,
global qword* pInstanceDescPtrsArr,
qword offset,
dword numInstances)
{
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
if (batchPtrs->dumpDst == 0) return;
// save gpuva of instance descs for debug
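// dump layout: first the original GPU VAs of the instance descriptors, then (128-byte aligned) the dereferenced descriptors themselves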
global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset);
global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)(
batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset);
global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr = (global GRL_RAYTRACING_INSTANCE_DESC **)pInstanceDescPtrsArr;
if (glob_id < numInstances)
{
gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id];
dst[glob_id] = *(instanceDescPtrsArr[glob_id]);
}
}
/// Adds copy operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param offset offset in dst buffer
/// @param src copy source pointer
/// @param dst copy destination pointer
/// @param copyOpType copy type
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel insert_copy_op(
global InputBatchPtrs* batchPtrs,
qword offset,
global void* src,
global void* dst,
uint copyOpType)
{
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
if (batchPtrs->dumpDst == 0 || glob_id != 0) return;
global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset);
copyOp->header.operationType = copyOpType;
copyOp->header.endOfData = AlignTo128(sizeof(InputCopy));
copyOp->srcBvhPtr = (qword)src;
copyOp->dstBvhPtr = (qword)dst;
}
/// Copies vertex buffer
///
/// @param batchPtrs batch pointers struct
/// @param src input buffer
/// @param offset ptr to offset in dst buffer
/// @param size ptr to number of bytes to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_vertex_data(
global InputBatchPtrs* batchPtrs,
global const char* src,
global const uint* offset,
global const uint* size)
{
if (batchPtrs->dumpDst == 0) return;
global char *dst = (global char *)(GetVertexBuffersStart(batchPtrs) + *offset);
uint numGroups = (*size >> 6) + 1;
CopyMemory(dst, src, *size, numGroups);
}
/// Generates a unique batch id
///
/// @param batchIds array of unique batch ids
/// @param index index of batch id to generate
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel generate_unique_batch_id(global unsigned long *batchIds, unsigned int index) {
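// the 64-bit id combines an atomically incremented counter (upper dword) with the batch index (lower bits)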
global unsigned int *counterPtrs = (global unsigned int *)batchIds;
atomic_add(&counterPtrs[index * 2 + 1], 1);
batchIds[index] |= (unsigned long)index;
}
/// Sets batch as ready to read and moves cpuHead forward, inputs case
///
/// @param batchPtrs batch pointers struct
/// @param dumpMainBuffer pointer to main dump buffer
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel finish_batch_dump_inputs(
global InputBatchPtrs* batchPtrs,
global DebugBufferHeader* dumpMainBuffer)
{
if (batchPtrs->dumpDst == 0)
return;
global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst;
dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
dword seven = 7;
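// wait until cpuHead reaches this batch's offset (or the buffer has wrapped and this batch sits at headStart), then publish the batch by advancing cpuHead past its data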
while (true)
{
dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
{
store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
currentHead = seven;
}
if (currentHead == myDstOffset)
{
mem_fence_evict_to_memory();
dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
break;
}
else if (myDstOffset == dumpMainBuffer->headStart)
{
global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead);
if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER)
{
mem_fence_evict_to_memory();
dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
break;
}
}
}
}
/// Sets batch as ready to read and moves cpuHead forward, outputs case
///
/// @param batchPtrs batch pointers struct
/// @param dumpMainBuffer pointer to main dump buffer
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel finish_batch_dump_outputs(
global OutputBatchPtrs* batchPtrs,
global DebugBufferHeader* dumpMainBuffer)
{
if (batchPtrs->dumpDst == 0)
return;
global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst;
dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
dword seven = 7;
while (true)
{
dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
{
store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
currentHead = seven;
}
if (currentHead == myDstOffset)
{
mem_fence_evict_to_memory();
dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
break;
}
else if (myDstOffset == dumpMainBuffer->headStart)
{
global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead);
if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER)
{
mem_fence_evict_to_memory();
dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
break;
}
}
}
}

View file

@ -1,252 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module input_dump;
kernel_module input_dumper("input_dump.cl")
{
links lsc_intrinsics;
kernel opencl_kernel_find_max_used_byte_in_buff < kernelFunction="find_max_used_byte_in_buff" >;
kernel opencl_kernel_allocate_linear_offsets_for_vertex_buffers < kernelFunction="allocate_linear_offsets_for_vertex_buffers" >;
kernel opencl_kernel_allocate_data_space_for_inputs < kernelFunction="allocate_data_space_for_inputs" >;
kernel opencl_kernel_allocate_data_space_for_outputs < kernelFunction="allocate_data_space_for_outputs" >;
kernel opencl_kernel_calc_outputs_data_size < kernelFunction="calc_outputs_data_size" >;
kernel opencl_kernel_write_output_data_op < kernelFunction="write_output_data_op" >;
kernel opencl_kernel_write_geo_data < kernelFunction="write_geo_data" >;
kernel opencl_kernel_write_input_build_op < kernelFunction="write_input_build_op" >;
kernel opencl_kernel_copy_instance_descriptors_array < kernelFunction="copy_instance_descriptors_array" >;
kernel opencl_kernel_copy_instance_descriptors_array_of_ptrs < kernelFunction="copy_instance_descriptors_array_of_ptrs" >;
kernel opencl_kernel_insert_copy_op < kernelFunction="insert_copy_op" >;
kernel opencl_kernel_copy_vertex_data < kernelFunction="copy_vertex_data" >;
kernel opencl_kernel_generate_unique_batch_id < kernelFunction="generate_unique_batch_id" >;
kernel opencl_kernel_finish_batch_dump_inputs < kernelFunction="finish_batch_dump_inputs" >;
kernel opencl_kernel_finish_batch_dump_outputs < kernelFunction="finish_batch_dump_outputs" >;
}
metakernel find_max_used_byte_in_buff(
qword indexBuffPtr,
qword vertexBufferUsedByteEnd,
dword IndexCount,
dword IndexFormat,
dword VertexCount,
qword VertexBufferByteStride,
dword numPhysThreads)
{
dispatch opencl_kernel_find_max_used_byte_in_buff(numPhysThreads, 1, 1) args(
indexBuffPtr,
vertexBufferUsedByteEnd,
IndexCount,
IndexFormat,
VertexCount,
VertexBufferByteStride);
}
metakernel allocate_linear_offsets_for_vertex_buffers(
qword batchPtrs,
qword m_VertexBufferUsedByteEnd,
qword m_VertexBufferOffset,
dword numVertexBuffers,
dword numPhysThreads)
{
dispatch opencl_kernel_allocate_linear_offsets_for_vertex_buffers(numPhysThreads, 1, 1) args(
batchPtrs,
m_VertexBufferUsedByteEnd,
m_VertexBufferOffset,
numVertexBuffers);
}
metakernel allocate_data_space_for_inputs(
qword inputDumpMainBuffer,
qword batchPtrs,
dword nonVertexSize,
qword batchIdPtr)
{
dispatch opencl_kernel_allocate_data_space_for_inputs(1, 1, 1) args(
inputDumpMainBuffer,
batchPtrs,
nonVertexSize,
batchIdPtr);
}
metakernel allocate_data_space_for_outputs(
qword inputDumpMainBuffer,
qword batchPtrs,
qword batchIdPtr)
{
dispatch opencl_kernel_allocate_data_space_for_outputs(1, 1, 1) args(
inputDumpMainBuffer,
batchPtrs,
batchIdPtr);
}
metakernel calc_outputs_data_size(
qword pbi,
qword destOffsets,
qword numOutputs,
qword batchPtrs)
{
dispatch opencl_kernel_calc_outputs_data_size(1, 1, 1) args(
pbi,
destOffsets,
numOutputs,
batchPtrs);
}
metakernel write_output_data_op(
qword batchPtrs,
qword destOffset,
qword src,
qword pbi)
{
dispatch opencl_kernel_write_output_data_op(1, 1, 1) args(
batchPtrs,
destOffset,
src,
pbi);
}
metakernel write_geo_data(
qword batchPtrs,
qword srcDesc,
qword pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
qword pVertexBufferSize,
qword dstDescOffset,
qword dstDataOffset,
dword numThreads)
{
dispatch opencl_kernel_write_geo_data(numThreads, 1, 1) args(
batchPtrs,
srcDesc,
pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
pVertexBufferSize,
dstDescOffset,
dstDataOffset,
numThreads);
}
metakernel write_input_build_op(
qword batchPtrs,
qword buildOpOffset,
qword srcBvh,
qword dstBvhAddr,
dword offsetToEnd,
dword flags,
dword numGeometries,
dword numInstances,
dword instArrayOfPtrs)
{
dispatch opencl_kernel_write_input_build_op(1, 1, 1) args(
batchPtrs,
buildOpOffset,
srcBvh,
dstBvhAddr,
offsetToEnd,
flags,
numGeometries,
numInstances,
instArrayOfPtrs);
}
metakernel copy_instance_descriptors_array(
qword batchPtrs,
qword instanceDescArr,
qword offset,
dword numInstances,
dword numPhysThreads)
{
dispatch opencl_kernel_copy_instance_descriptors_array(numPhysThreads, 1, 1) args(
batchPtrs,
instanceDescArr,
offset,
numInstances);
}
metakernel copy_instance_descriptors_array_of_ptrs(
qword batchPtrs,
qword instanceDescArrPtrs,
qword offset,
dword numInstances,
dword numPhysThreads)
{
dispatch opencl_kernel_copy_instance_descriptors_array_of_ptrs(numPhysThreads, 1, 1) args(
batchPtrs,
instanceDescArrPtrs,
offset,
numInstances);
}
metakernel insert_copy_op(
qword batchPtrs,
qword offset,
qword src,
qword dst,
dword type)
{
dispatch opencl_kernel_insert_copy_op(1, 1, 1) args(
batchPtrs,
offset,
src,
dst,
type);
}
metakernel copy_vertex_data(
qword desc,
qword src,
qword offset,
qword size)
{
define byteSize REG0;
define numGroupsRqd REG1;
define shift REG2;
define minimum REG3;
shift = 6;
minimum = 1;
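// dispatch one workgroup per 64 bytes of vertex data plus one for the remainder, matching the grouping used by the copy kernel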
byteSize = load_dword(size);
numGroupsRqd = byteSize >> shift;
numGroupsRqd = numGroupsRqd + minimum;
DISPATCHDIM_X = numGroupsRqd.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_copy_vertex_data args(
desc,
src,
offset,
size);
}
metakernel generate_unique_batch_id(
qword batchIds,
dword batchIndex)
{
dispatch opencl_kernel_generate_unique_batch_id(1, 1, 1) args(
batchIds,
batchIndex);
}
metakernel finish_batch_dump_inputs(
qword batchPtrs,
qword dumpMainBuffer)
{
dispatch opencl_kernel_finish_batch_dump_inputs(1, 1, 1) args(
batchPtrs,
dumpMainBuffer);
}
metakernel finish_batch_dump_outputs(
qword batchPtrs,
qword dumpMainBuffer)
{
dispatch opencl_kernel_finish_batch_dump_outputs(1, 1, 1) args(
batchPtrs,
dumpMainBuffer);
}

View file

@ -1,183 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "shared.h"
#include "affinespace.h"
#include "api_interface.h"
#include "qbvh6.h"
#include "libs/lsc_intrinsics.h"
GRL_INLINE uint32_t HwInstanceLeafPart1_getInstanceIndex(struct HwInstanceLeaf *I)
{
return I->part1.instanceIndex;
}
GRL_INLINE void encodeDW0_HwInstanceLeafPart0(
uint32_t shaderIndex,
uint32_t geomMask,
uint4 *dst)
{
(*dst).x = (shaderIndex & ((1 << 24) - 1)) |
(geomMask << 24);
}
GRL_INLINE void encodeDW1_HwInstanceLeafPart0(
uint32_t instanceContributionToHitGroupIndex,
uint32_t notProcedural,
uint32_t geomFlags,
uint4* dst)
{
(*dst).y = (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) |
((notProcedural & 1) << (24 + 5)) |
((geomFlags & 3) << (24 + 5 + 1));
}
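// DW2/DW3 pack the 48-bit root node pointer into the low bits and the instance flags into the top 16 bits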
GRL_INLINE void encodeDW2DW3_HwInstanceLeafPart0(
uint64_t rootNodePtr,
uint32_t instFlags,
uint4* dst)
{
uint64_t flags = instFlags;
uint DW2 = (uint)rootNodePtr;
uint DW3 = ((uint)(rootNodePtr >> 32ul) & 0xffff);
DW3 |= flags << 16ull;
(*dst).z = DW2;
(*dst).w = DW3;
}
GRL_INLINE void HwInstanceLeafPart0_setDW0(struct HwInstanceLeaf *I,
uint32_t shaderIndex,
uint32_t geomMask)
{
I->part0.DW0 =
(shaderIndex & ((1 << 24) - 1)) |
(geomMask << 24);
}
GRL_INLINE void HwInstanceLeafPart0_setDW1(struct HwInstanceLeaf *I,
uint32_t instanceContributionToHitGroupIndex,
uint32_t notProcedural,
uint32_t geomFlags)
{
I->part0.DW1 =
(instanceContributionToHitGroupIndex & ((1 << 24) - 1)) |
((notProcedural & 1) << (24 + 5)) |
((geomFlags & 3) << (24 + 5 + 1));
}
GRL_INLINE void HwInstanceLeafPart1_setDW0DW1(struct HwInstanceLeaf *I,
global char *pBvhPtr)
{
I->part1.DW0_DW1 = ((uint64_t)pBvhPtr) & (((uint64_t)1 << 48) - 1);
}
GRL_INLINE void HwInstanceLeafPart0_setDW2DW3(struct HwInstanceLeaf *I,
uint64_t rootNodePtr,
uint32_t instFlags)
{
uint64_t flags = instFlags;
flags = flags << 48ull;
uint64_t ptr = rootNodePtr & 0x0000ffffffffffff;
I->part0.DW2_DW3 = ptr + flags;
}
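// Builds a hardware instance leaf: part1 (written first) holds the acceleration-structure pointer, instance ID/index and transform data; part0 holds the packed header DWords plus the remaining transform data. All stores use L1-streaming / L3-writeback hints.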
GRL_INLINE void HwInstanceLeaf_Constructor(global struct HwInstanceLeaf* leaf,
global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc,
uint instanceIndex,
uint rootNodeByteOffset,
uint instanceMask)
{
global uint4* InstanceLeaf_4DWparts = (global uint4*) (leaf);
struct AffineSpace3f obj2world = AffineSpace3f_load_row_major(instDesc->Transform);
qword accStructPtr = (qword)instDesc->AccelerationStructure;
uint4 p1_DW0_3 = (uint4)(
(uint)accStructPtr,
(uint)(accStructPtr >> (uint64_t)32),
GRL_get_instanceID(instDesc),
instanceIndex);
struct AffineSpace3f world2obj = AffineSpace3f_invert(obj2world);
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 0 /*part1 + 0DW*/, p1_DW0_3);
uint4 p1_DW4_7 = (uint4)(
as_uint(obj2world.l.vx.x),
as_uint(obj2world.l.vx.y),
as_uint(obj2world.l.vx.z),
as_uint(obj2world.l.vy.x));
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 1 /*part1 + 4DW*/, p1_DW4_7);
uint4 p1_DW8_11 = (uint4)(
as_uint(obj2world.l.vy.y),
as_uint(obj2world.l.vy.z),
as_uint(obj2world.l.vz.x),
as_uint(obj2world.l.vz.y));
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 2 /*part1 + 8DW*/, p1_DW8_11);
uint4 p1_DW12_15 = (uint4)(
as_uint(obj2world.l.vz.z),
as_uint(world2obj.p.x),
as_uint(world2obj.p.y),
as_uint(world2obj.p.z));
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 3 /*part1 + 12DW*/, p1_DW12_15);
uint hit_group_index = GRL_get_InstanceContributionToHitGroupIndex(instDesc);
global struct BVHBase* bvh = (global struct BVHBase*)instDesc->AccelerationStructure;
uint4 p0_DW0_3;
encodeDW0_HwInstanceLeafPart0(
hit_group_index,
instanceMask,
&p0_DW0_3);
encodeDW1_HwInstanceLeafPart0(
hit_group_index, // for HW instance leaf, this field is used to offset the hit-group index
1, // disable opaque culling; necessary for SW instancing, don't-care for HW instancing
0,
&p0_DW0_3);
encodeDW2DW3_HwInstanceLeafPart0(
rootNodeByteOffset == NO_NODE_OFFSET ? 0 : ((uint64_t)bvh) + rootNodeByteOffset, // NO_NODE_OFFSET marks a degenerate instance, so store a null root pointer
GRL_get_InstanceFlags(instDesc),
&p0_DW0_3);
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 0 /*part0 + 0DW*/, p0_DW0_3);
uint4 p0_DW4_7 = (uint4)(
as_uint(world2obj.l.vx.x),
as_uint(world2obj.l.vx.y),
as_uint(world2obj.l.vx.z),
as_uint(world2obj.l.vy.x));
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 1 /*part0 + 4DW*/, p0_DW4_7);
uint4 p0_DW8_11 = (uint4)(
as_uint(world2obj.l.vy.y),
as_uint(world2obj.l.vy.z),
as_uint(world2obj.l.vz.x),
as_uint(world2obj.l.vz.y));
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 2 /*part0 + 8DW*/, p0_DW8_11);
uint4 p0_DW12_15 = (uint4)(
as_uint(world2obj.l.vz.z),
as_uint(obj2world.p.x),
as_uint(obj2world.p.y),
as_uint(obj2world.p.z));
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 3 /*part0 + 12DW*/, p0_DW12_15);
}

View file

@ -1,581 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
// TODO: AABB_work_group_reduce is super slow, remove !!!
#pragma cl_intel_subgroups : enable
#pragma cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
uint intel_sub_group_ballot(bool valid);
// atom_min
float __attribute__((overloadable)) atom_min(volatile __global float *p, float val);
float __attribute__((overloadable)) atom_min(volatile __local float *p, float val);
float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val);
float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val);
// atom_max
float __attribute__((overloadable)) atom_max(volatile __global float *p, float val);
float __attribute__((overloadable)) atom_max(volatile __local float *p, float val);
float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val);
float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val);
// atom_cmpxchg
float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val);
float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val);
float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val);
float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val);
inline uint subgroup_single_atomic_add(global uint *p, uint val)
{
const uint subgroupLocalID = get_sub_group_local_id();
const int v = subgroupLocalID == 0 ? atomic_add(p, val) : 0;
return sub_group_broadcast(v, 0);
}
inline float halfarea(const float3 d)
{
return fma(d.x, (d.y + d.z), d.y * d.z);
}
inline float area(const float3 d)
{
return halfarea(d) * 2.0f;
}
inline uint maxDim(const float3 a)
{
const float3 b = fabs(a);
const bool b_x_y = b.x > b.y;
const float cur_max = b_x_y ? b.x : b.y;
const uint cur_idx = b_x_y ? 0 : 1;
const bool b_x_y_z = b.z > cur_max;
return b_x_y_z ? 2 : cur_idx;
}
inline uint3 sortByMaxDim(const float3 a)
{
const uint kz = maxDim(a);
const uint _kx = (kz + 1) % 3;
const uint _ky = (_kx + 1) % 3;
const bool kz_pos = a[kz] >= 0.0f;
const uint kx = kz_pos ? _ky : _kx;
const uint ky = kz_pos ? _kx : _ky;
return (uint3)(kx, ky, kz);
}
inline uint4 sort4_ascending(const uint4 dist)
{
const uint a0 = dist.s0;
const uint a1 = dist.s1;
const uint a2 = dist.s2;
const uint a3 = dist.s3;
const uint b0 = min(a0, a2);
const uint b1 = min(a1, a3);
const uint b2 = max(a0, a2);
const uint b3 = max(a1, a3);
const uint c0 = min(b0, b1);
const uint c1 = max(b0, b1);
const uint c2 = min(b2, b3);
const uint c3 = max(b2, b3);
const uint d0 = c0;
const uint d1 = min(c1, c2);
const uint d2 = max(c1, c2);
const uint d3 = c3;
return (uint4)(d0, d1, d2, d3);
}
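// The shuffle*/sel* tables below encode the stages of a bitonic sorting network across 8 subgroup lanes: at each stage a lane exchanges with shuffle[slot] and keeps either the min or the max according to sel[slot]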
__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6};
__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4};
__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6};
__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0};
__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5};
__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6};
__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6};
__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1};
__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1};
__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1};
__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1};
inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask)
{
const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
const uint a_min = min(a0, a1);
const uint a_max = max(a0, a1);
return select(a_max, a_min, selectMask);
}
inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask)
{
const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
const uint a_min = min(a0, a1);
const uint a_max = max(a0, a1);
return select(a_min, a_max, selectMask);
}
inline uint sort8_descending(const uint aa)
{
const unsigned int slotID = get_sub_group_local_id() % 8;
const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
const uint dd = compare_exchange_descending(cc, shuffleC[slotID], selAA[slotID]);
const uint ee = compare_exchange_descending(dd, shuffleD[slotID], selF0[slotID]);
const uint ff = compare_exchange_descending(ee, shuffleE[slotID], selCC[slotID]);
const uint gg = compare_exchange_descending(ff, shuffleF[slotID], selAA[slotID]);
return gg;
}
inline uint sort8_ascending(const uint aa)
{
const unsigned int slotID = get_sub_group_local_id() % 8;
const uint bb = compare_exchange_ascending(aa, shuffleA[slotID], selAA[slotID]);
const uint cc = compare_exchange_ascending(bb, shuffleB[slotID], selCC[slotID]);
const uint dd = compare_exchange_ascending(cc, shuffleC[slotID], selAA[slotID]);
const uint ee = compare_exchange_ascending(dd, shuffleD[slotID], selF0[slotID]);
const uint ff = compare_exchange_ascending(ee, shuffleE[slotID], selCC[slotID]);
const uint gg = compare_exchange_ascending(ff, shuffleF[slotID], selAA[slotID]);
return gg;
}
inline uint sort4_descending(const uint aa)
{
const unsigned int slotID = get_sub_group_local_id() % 8;
const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
const uint dd = compare_exchange_descending(cc, shuffleG[slotID], selGG[slotID]);
return dd;
}
inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
{
const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
const ulong a_min = min(a0, a1);
const ulong a_max = max(a0, a1);
return select(a_max, a_min, (ulong)selectMask);
}
inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
{
const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
const ulong a_min = min(a0, a1);
const ulong a_max = max(a0, a1);
return select(a_min, a_max, (ulong)selectMask);
}
inline ulong sort8_ascending_ulong(const ulong aa)
{
const unsigned int slotID = get_sub_group_local_id() % 8;
const ulong bb = compare_exchange_ascending_ulong(aa, shuffleA[slotID], selAA[slotID]);
const ulong cc = compare_exchange_ascending_ulong(bb, shuffleB[slotID], selCC[slotID]);
const ulong dd = compare_exchange_ascending_ulong(cc, shuffleC[slotID], selAA[slotID]);
const ulong ee = compare_exchange_ascending_ulong(dd, shuffleD[slotID], selF0[slotID]);
const ulong ff = compare_exchange_ascending_ulong(ee, shuffleE[slotID], selCC[slotID]);
const ulong gg = compare_exchange_ascending_ulong(ff, shuffleF[slotID], selAA[slotID]);
return gg;
}
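// bitInterleave3D spreads the low 10 bits of x, y and z over every third bit and ORs them together, producing a 30-bit 3D Morton code; the 4D variants below do the same for four 8- or 16-bit inputs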
inline uint bitInterleave3D(const uint4 in)
{
uint x = in.x, y = in.y, z = in.z;
x = (x | (x << 16)) & 0x030000FF;
x = (x | (x << 8)) & 0x0300F00F;
x = (x | (x << 4)) & 0x030C30C3;
x = (x | (x << 2)) & 0x09249249;
y = (y | (y << 16)) & 0x030000FF;
y = (y | (y << 8)) & 0x0300F00F;
y = (y | (y << 4)) & 0x030C30C3;
y = (y | (y << 2)) & 0x09249249;
z = (z | (z << 16)) & 0x030000FF;
z = (z | (z << 8)) & 0x0300F00F;
z = (z | (z << 4)) & 0x030C30C3;
z = (z | (z << 2)) & 0x09249249;
return x | (y << 1) | (z << 2);
}
inline uint bitInterleave4D(const uint4 in)
{
uint x = in.x, y = in.y, z = in.z, w = in.w;
x = x & 0x000000ff;
x = (x ^ (x << 16)) & 0x00c0003f;
x = (x ^ (x << 8)) & 0x00c03807;
x = (x ^ (x << 4)) & 0x08530853;
x = (x ^ (x << 2)) & 0x09090909;
x = (x ^ (x << 1)) & 0x11111111;
y = y & 0x000000ff;
y = (y ^ (y << 16)) & 0x00c0003f;
y = (y ^ (y << 8)) & 0x00c03807;
y = (y ^ (y << 4)) & 0x08530853;
y = (y ^ (y << 2)) & 0x09090909;
y = (y ^ (y << 1)) & 0x11111111;
z = z & 0x000000ff;
z = (z ^ (z << 16)) & 0x00c0003f;
z = (z ^ (z << 8)) & 0x00c03807;
z = (z ^ (z << 4)) & 0x08530853;
z = (z ^ (z << 2)) & 0x09090909;
z = (z ^ (z << 1)) & 0x11111111;
w = w & 0x000000ff;
w = (w ^ (w << 16)) & 0x00c0003f;
w = (w ^ (w << 8)) & 0x00c03807;
w = (w ^ (w << 4)) & 0x08530853;
w = (w ^ (w << 2)) & 0x09090909;
w = (w ^ (w << 1)) & 0x11111111;
return (x | (y << 1) | (z << 2) | (w << 3));
}
inline ulong ulong_bitInterleave4D(const uint4 in)
{
ulong x = in.x, y = in.y, z = in.z, w = in.w;
x = x & 0x0000ffff;
x = (x ^ (x << 32)) & 0x0000f800000007ff;
x = (x ^ (x << 16)) & 0x0000f80007c0003f;
x = (x ^ (x << 8)) & 0x00c0380700c03807;
x = (x ^ (x << 4)) & 0x0843084308430843;
x = (x ^ (x << 2)) & 0x0909090909090909;
x = (x ^ (x << 1)) & 0x1111111111111111;
y = y & 0x0000ffff;
y = (y ^ (y << 32)) & 0x0000f800000007ff;
y = (y ^ (y << 16)) & 0x0000f80007c0003f;
y = (y ^ (y << 8)) & 0x00c0380700c03807;
y = (y ^ (y << 4)) & 0x0843084308430843;
y = (y ^ (y << 2)) & 0x0909090909090909;
y = (y ^ (y << 1)) & 0x1111111111111111;
z = z & 0x0000ffff;
z = (z ^ (z << 32)) & 0x0000f800000007ff;
z = (z ^ (z << 16)) & 0x0000f80007c0003f;
z = (z ^ (z << 8)) & 0x00c0380700c03807;
z = (z ^ (z << 4)) & 0x0843084308430843;
z = (z ^ (z << 2)) & 0x0909090909090909;
z = (z ^ (z << 1)) & 0x1111111111111111;
w = w & 0x0000ffff;
w = (w ^ (w << 32)) & 0x0000f800000007ff;
w = (w ^ (w << 16)) & 0x0000f80007c0003f;
w = (w ^ (w << 8)) & 0x00c0380700c03807;
w = (w ^ (w << 4)) & 0x0843084308430843;
w = (w ^ (w << 2)) & 0x0909090909090909;
w = (w ^ (w << 1)) & 0x1111111111111111;
return (x | (y << 1) | (z << 2) | (w << 3));
}
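// bitCompact undoes the 3D spread: it gathers every third bit back into a contiguous value, so bitCompact3D recovers the x/y/z coordinates of a 3D Morton code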
inline uint bitCompact(uint x)
{
x &= 0x09249249;
x = (x ^ (x >> 2)) & 0x030c30c3;
x = (x ^ (x >> 4)) & 0x0300f00f;
x = (x ^ (x >> 8)) & 0xff0000ff;
x = (x ^ (x >> 16)) & 0x000003ff;
return x;
}
inline uint3 bitCompact3D(const uint in)
{
const uint x = bitCompact(x >> 0);
const uint y = bitCompact(y >> 1);
const uint z = bitCompact(z >> 2);
return (uint3)(x, y, z);
}
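// convertToPushIndices*: invert the lane->ID permutation via ballot + ctz, so lane s receives the index of the (first) lane whose ID equals s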
inline uint convertToPushIndices8(uint ID)
{
const unsigned int slotID = get_sub_group_local_id();
uint index = 0;
for (uint i = 0; i < 8; i++)
{
const uint mask = intel_sub_group_ballot(ID == i);
const uint new_index = ctz(mask);
index = i == slotID ? new_index : index;
}
return index;
}
inline uint convertToPushIndices16(uint ID)
{
const unsigned int slotID = get_sub_group_local_id();
uint index = 0;
for (uint i = 0; i < 16; i++)
{
const uint mask = intel_sub_group_ballot(ID == i);
const uint new_index = ctz(mask);
index = i == slotID ? new_index : index;
}
return index;
}
#define FLOAT_EXPONENT_MASK (0x7F800000) // used to be EXPONENT_MASK
#define FLOAT_MANTISSA_MASK (0x007FFFFF) // used to be MANTISSA_MASK
#define FLOAT_NEG_ONE_EXP_MASK (0x3F000000)
#define FLOAT_BIAS (127)
#define FLOAT_MANTISSA_BITS (23)
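// component-wise frexp implemented with integer bit tricks: returns a mantissa with magnitude in [0.5, 1) and writes the corresponding power-of-two exponent to *exp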
inline float3 frexp_vec3(float3 len, int3* exp)
{
float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK));
mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f)));
mant = copysign(mant, len);
*exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1));
return mant;
}
#ifndef uniform
#define uniform
#endif
#ifndef varying
#define varying
#endif
uint get_sub_group_global_id()
{
return get_sub_group_id() + get_num_sub_groups() * get_group_id( 0 );
}
// each lane contains the number of 1 bits below the corresponding position in 'mask'
uint subgroup_bit_prefix_exclusive(uniform uint mask)
{
varying ushort lane = get_sub_group_local_id();
varying uint lane_mask = (1 << lane) - 1;
varying uint m = mask & lane_mask;
return popcount(m);
}
uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx )
{
varying uint lane_mask = (1 << lane_idx) - 1;
varying uint m = mask & lane_mask;
return popcount(m);
}
uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx)
{
return (uint3)(sub_group_broadcast(v.x,idx),
sub_group_broadcast(v.y,idx),
sub_group_broadcast(v.z,idx));
}
float3 sub_group_broadcast_float3(float3 v, uniform ushort idx)
{
return (float3)(sub_group_broadcast(v.x, idx),
sub_group_broadcast(v.y, idx),
sub_group_broadcast(v.z, idx));
}
float3 sub_group_reduce_min_float3(float3 v)
{
return (float3)(sub_group_reduce_min(v.x),
sub_group_reduce_min(v.y),
sub_group_reduce_min(v.z) );
}
float3 sub_group_reduce_max_float3(float3 v)
{
return (float3)(sub_group_reduce_max(v.x),
sub_group_reduce_max(v.y),
sub_group_reduce_max(v.z));
}
float3 sub_group_shuffle_float3(float3 v, uniform ushort idx)
{
return (float3)(intel_sub_group_shuffle(v.x, idx),
intel_sub_group_shuffle(v.y, idx),
intel_sub_group_shuffle(v.z, idx));
}
uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx)
{
return (uint3)( intel_sub_group_shuffle(v.x, idx),
intel_sub_group_shuffle(v.y, idx),
intel_sub_group_shuffle(v.z, idx));
}
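// *_N6 helpers reduce across the first 6 lanes using shuffle-down steps of 4/2/1 (presumably one QBVH6 node's children per subgroup); the 2xSIMD8_in_SIMD16 variants broadcast the result from lane 0 of each SIMD8 half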
inline uchar sub_group_reduce_or_N6(uchar val)
{
val = val | intel_sub_group_shuffle_down(val, val, 4);
val = val | intel_sub_group_shuffle_down(val, val, 2);
val = val | intel_sub_group_shuffle_down(val, val, 1);
return sub_group_broadcast(val, 0);
}
inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val)
{
uint SIMD8_id = get_sub_group_local_id() / 8;
val = val | intel_sub_group_shuffle_down(val, val, 4);
val = val | intel_sub_group_shuffle_down(val, val, 2);
val = val | intel_sub_group_shuffle_down(val, val, 1);
return intel_sub_group_shuffle(val, SIMD8_id * 8);
}
inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p )
{
return atomic_fetch_add_explicit( (volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group );
}
inline __attribute__((overloadable)) int atomic_inc_local(local int* p)
{
return atomic_fetch_add_explicit( (volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
}
inline __attribute__((overloadable)) uint atomic_dec_local(local uint* p)
{
return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group);
}
inline __attribute__((overloadable)) int atomic_dec_local(local int* p)
{
return atomic_fetch_sub_explicit((volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
}
inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n)
{
return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
}
inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n )
{
return atomic_fetch_sub_explicit( (volatile local atomic_int*) p, n, memory_order_relaxed, memory_scope_work_group);
}
inline uint atomic_add_local( local uint* p, uint n )
{
return atomic_fetch_add_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
}
inline uint atomic_xor_local(local uint* p, uint n)
{
return atomic_fetch_xor_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
}
inline uint atomic_or_local(local uint* p, uint n)
{
return atomic_fetch_or_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
}
inline uint atomic_min_local(local uint* p, uint n)
{
return atomic_fetch_min_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
}
inline uint atomic_max_local(local uint* p, uint n)
{
return atomic_fetch_max_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
}
inline uint atomic_inc_global( global uint* p )
{
return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
}
inline uint atomic_dec_global(global uint* p)
{
return atomic_fetch_sub_explicit( (volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
}
inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired)
{
return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*) p, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device);
}
inline uint atomic_add_global( global uint* p, uint n )
{
return atomic_fetch_add_explicit( (volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
}
inline uint atomic_sub_global(global uint* p, uint n)
{
return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
}
inline uint atomic_or_global(global uint* p, uint n)
{
return atomic_fetch_or_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
}
inline uint atomic_inc_global_acquire(global uint* p)
{
return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_acquire, memory_scope_device);
}
inline uint atomic_inc_global_release(global uint* p)
{
return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
}
inline uint atomic_dec_global_release(global uint* p)
{
return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
}
inline uint generic_atomic_add(uint* p, uint val)
{
if (to_global(p) != NULL)
return atomic_add_global(to_global(p), val);
if (to_local(p) != NULL)
return atomic_add_local(to_local(p), val);
return 0;
}
inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n )
{
n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
return sub_group_broadcast( n, 0 );
}
inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n )
{
n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
return sub_group_broadcast( n, 0 );
}
inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n)
{
n = max(n, intel_sub_group_shuffle_down(n, n, 4));
n = max(n, intel_sub_group_shuffle_down(n, n, 2));
n = max(n, intel_sub_group_shuffle_down(n, n, 1));
return intel_sub_group_shuffle(n, (get_sub_group_local_id() / 8) * 8); // broadcast from lane 0 of each SIMD8 half
}
inline uint generic_atomic_inc(uint* p)
{
if (to_global(p) != NULL)
return atomic_inc_global(to_global(p));
if (to_local(p) != NULL)
return atomic_inc(to_local(p));
return 0;
}
// Built-in GRL function which, if called in a kernel body, will force the kernel
// to be compiled to the minimum SIMD width supported by the platform
void GRL_UseMinimumSIMDWidth();

View file

@ -1,13 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
library lsc_intrinsics
{
default "lsc_intrinsics.cl" ;
fallback "lsc_intrinsics_fallback.cl";
}

File diff suppressed because it is too large

View file

@ -1,207 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
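// Naming convention: load_<type>_L1<hint>_L3<hint> / store_<type>_L1<hint>_L3<hint>, where the hints select the L1/L3 cache-control policy (UC = uncached, C = cached, S = streaming, WT = write-through, WB = write-back, IAR = invalidate-after-read)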
// LSC Loads
uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset);
uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset);
uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset);
uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset);
uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset);
uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset);
uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset);
uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset);
uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset);
uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset);
uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset);
uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset);
uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset);
uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset);
uint load_uint_L1UC_L3UC(global uint* it, int offset);
uint load_uint_L1UC_L3C(global uint* it, int offset);
uint load_uint_L1C_L3UC(global uint* it, int offset);
uint load_uint_L1C_L3C(global uint* it, int offset);
uint load_uint_L1S_L3UC(global uint* it, int offset);
uint load_uint_L1S_L3C(global uint* it, int offset);
uint load_uint_L1IAR_L3C(global uint* it, int offset);
uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset);
uint2 load_uint2_L1UC_L3C(global uint2* it, int offset);
uint2 load_uint2_L1C_L3UC(global uint2* it, int offset);
uint2 load_uint2_L1C_L3C(global uint2* it, int offset);
uint2 load_uint2_L1S_L3UC(global uint2* it, int offset);
uint2 load_uint2_L1S_L3C(global uint2* it, int offset);
uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset);
uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset);
uint3 load_uint3_L1UC_L3C(global uint3* it, int offset);
uint3 load_uint3_L1C_L3UC(global uint3* it, int offset);
uint3 load_uint3_L1C_L3C(global uint3* it, int offset);
uint3 load_uint3_L1S_L3UC(global uint3* it, int offset);
uint3 load_uint3_L1S_L3C(global uint3* it, int offset);
uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset);
uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset);
uint4 load_uint4_L1UC_L3C(global uint4* it, int offset);
uint4 load_uint4_L1C_L3UC(global uint4* it, int offset);
uint4 load_uint4_L1C_L3C(global uint4* it, int offset);
uint4 load_uint4_L1S_L3UC(global uint4* it, int offset);
uint4 load_uint4_L1S_L3C(global uint4* it, int offset);
uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset);
uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset);
uint8 load_uint8_L1UC_L3C(global uint8* it, int offset);
uint8 load_uint8_L1C_L3UC(global uint8* it, int offset);
uint8 load_uint8_L1C_L3C(global uint8* it, int offset);
uint8 load_uint8_L1S_L3UC(global uint8* it, int offset);
uint8 load_uint8_L1S_L3C(global uint8* it, int offset);
uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset);
ulong load_ulong_L1UC_L3UC(global ulong* it, int offset);
ulong load_ulong_L1UC_L3C(global ulong* it, int offset);
ulong load_ulong_L1C_L3UC(global ulong* it, int offset);
ulong load_ulong_L1C_L3C(global ulong* it, int offset);
ulong load_ulong_L1S_L3UC(global ulong* it, int offset);
ulong load_ulong_L1S_L3C(global ulong* it, int offset);
ulong load_ulong_L1IAR_L3C(global ulong* it, int offset);
ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset);
ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset);
ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset);
ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset);
ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset);
ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset);
ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset);
ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset);
ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset);
ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset);
ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset);
ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset);
ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset);
ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset);
ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset);
ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset);
ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset);
ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset);
ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset);
ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset);
ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset);
ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset);
ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset);
ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset);
ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset);
ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset);
ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset);
ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset);
// LSC Stores
void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value);
void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value);
void store_uint_L1UC_L3UC(global uint* it, int offset, uint value);
void store_uint_L1UC_L3WB(global uint* it, int offset, uint value);
void store_uint_L1WT_L3UC(global uint* it, int offset, uint value);
void store_uint_L1WT_L3WB(global uint* it, int offset, uint value);
void store_uint_L1S_L3UC(global uint* it, int offset, uint value);
void store_uint_L1S_L3WB(global uint* it, int offset, uint value);
void store_uint_L1WB_L3WB(global uint* it, int offset, uint value);
void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value);
void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value);
void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value);
void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value);
void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value);
void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value);
void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value);
void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value);
void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value);
void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value);
void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value);
void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value);
void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value);
void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value);
void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value);
void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value);
void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value);
void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value);
void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value);
void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value);
void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value);
void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value);
void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value);
void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value);
void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value);
void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value);
void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value);
void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value);
void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value);
void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value);
void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value);
void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value);
void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value);
void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value);
void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value);
void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value);
// LSC Fence support
void mem_fence_gpu_default();
void mem_fence_workgroup_default();
void mem_fence_gpu_invalidate();
void mem_fence_gpu_evict();
void mem_fence_evict_to_memory();

View file

@ -1,898 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
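// Portable fallback implementations of the LSC load/store helpers: the cache-control hints are ignored and every helper is a plain load or store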
// LSC Loads
// uchar
uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset)
{
return (uint)(it[offset]);
}
uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset)
{
return (uint)(it[offset]);
}
uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset)
{
return (uint)(it[offset]);
}
uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset)
{
return (uint)(it[offset]);
}
uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset)
{
return (uint)(it[offset]);
}
uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset)
{
return (uint)(it[offset]);
}
uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset)
{
return (uint)(it[offset]);
}
// ushort
uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset)
{
return (uint)(it[offset]);
}
uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset)
{
return (uint)(it[offset]);
}
uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset)
{
return (uint)(it[offset]);
}
uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset)
{
return (uint)(it[offset]);
}
uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset)
{
return (uint)(it[offset]);
}
uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset)
{
return (uint)(it[offset]);
}
uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset)
{
return (uint)(it[offset]);
}
// uint
uint load_uint_L1UC_L3UC(global uint* it, int offset)
{
return it[offset];
}
uint load_uint_L1UC_L3C(global uint* it, int offset)
{
return it[offset];
}
uint load_uint_L1C_L3UC(global uint* it, int offset)
{
return it[offset];
}
uint load_uint_L1C_L3C(global uint* it, int offset)
{
return it[offset];
}
uint load_uint_L1S_L3UC(global uint* it, int offset)
{
return it[offset];
}
uint load_uint_L1S_L3C(global uint* it, int offset)
{
return it[offset];
}
uint load_uint_L1IAR_L3C(global uint* it, int offset)
{
return it[offset];
}
// uint2
uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset)
{
return it[offset];
}
uint2 load_uint2_L1UC_L3C(global uint2* it, int offset)
{
return it[offset];
}
uint2 load_uint2_L1C_L3UC(global uint2* it, int offset)
{
return it[offset];
}
uint2 load_uint2_L1C_L3C(global uint2* it, int offset)
{
return it[offset];
}
uint2 load_uint2_L1S_L3UC(global uint2* it, int offset)
{
return it[offset];
}
uint2 load_uint2_L1S_L3C(global uint2* it, int offset)
{
return it[offset];
}
uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset)
{
return it[offset];
}
// uint3
uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset)
{
return it[offset];
}
uint3 load_uint3_L1UC_L3C(global uint3* it, int offset)
{
return it[offset];
}
uint3 load_uint3_L1C_L3UC(global uint3* it, int offset)
{
return it[offset];
}
uint3 load_uint3_L1C_L3C(global uint3* it, int offset)
{
return it[offset];
}
uint3 load_uint3_L1S_L3UC(global uint3* it, int offset)
{
return it[offset];
}
uint3 load_uint3_L1S_L3C(global uint3* it, int offset)
{
return it[offset];
}
uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset)
{
return it[offset];
}
// uint4
uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset)
{
return it[offset];
}
uint4 load_uint4_L1UC_L3C(global uint4* it, int offset)
{
return it[offset];
}
uint4 load_uint4_L1C_L3UC(global uint4* it, int offset)
{
return it[offset];
}
uint4 load_uint4_L1C_L3C(global uint4* it, int offset)
{
return it[offset];
}
uint4 load_uint4_L1S_L3UC(global uint4* it, int offset)
{
return it[offset];
}
uint4 load_uint4_L1S_L3C(global uint4* it, int offset)
{
return it[offset];
}
uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset)
{
return it[offset];
}
// uint8
uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset)
{
return it[offset];
}
uint8 load_uint8_L1UC_L3C(global uint8* it, int offset)
{
return it[offset];
}
uint8 load_uint8_L1C_L3UC(global uint8* it, int offset)
{
return it[offset];
}
uint8 load_uint8_L1C_L3C(global uint8* it, int offset)
{
return it[offset];
}
uint8 load_uint8_L1S_L3UC(global uint8* it, int offset)
{
return it[offset];
}
uint8 load_uint8_L1S_L3C(global uint8* it, int offset)
{
return it[offset];
}
uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset)
{
return it[offset];
}
// ulong
ulong load_ulong_L1UC_L3UC(global ulong* it, int offset)
{
return it[offset];
}
ulong load_ulong_L1UC_L3C(global ulong* it, int offset)
{
return it[offset];
}
ulong load_ulong_L1C_L3UC(global ulong* it, int offset)
{
return it[offset];
}
ulong load_ulong_L1C_L3C(global ulong* it, int offset)
{
return it[offset];
}
ulong load_ulong_L1S_L3UC(global ulong* it, int offset)
{
return it[offset];
}
ulong load_ulong_L1S_L3C(global ulong* it, int offset)
{
return it[offset];
}
ulong load_ulong_L1IAR_L3C(global ulong* it, int offset)
{
return it[offset];
}
// ulong2
ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset)
{
return it[offset];
}
ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset)
{
return it[offset];
}
ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset)
{
return it[offset];
}
ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset)
{
return it[offset];
}
ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset)
{
return it[offset];
}
ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset)
{
return it[offset];
}
ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset)
{
return it[offset];
}
// ulong3
ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset)
{
return it[offset];
}
ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset)
{
return it[offset];
}
ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset)
{
return it[offset];
}
ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset)
{
return it[offset];
}
ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset)
{
return it[offset];
}
ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset)
{
return it[offset];
}
ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset)
{
return it[offset];
}
// ulong4
ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset)
{
return it[offset];
}
ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset)
{
return it[offset];
}
ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset)
{
return it[offset];
}
ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset)
{
return it[offset];
}
ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset)
{
return it[offset];
}
ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset)
{
return it[offset];
}
ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset)
{
return it[offset];
}
// ulong8
ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset)
{
return it[offset];
}
ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset)
{
return it[offset];
}
ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset)
{
return it[offset];
}
ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset)
{
return it[offset];
}
ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset)
{
return it[offset];
}
ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset)
{
return it[offset];
}
ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset)
{
return it[offset];
}
// LSC Stores
// uchar
void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value)
{
it[offset] = (uchar)(value);
}
void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value)
{
it[offset] = (uchar)(value);
}
void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value)
{
it[offset] = (uchar)(value);
}
void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value)
{
it[offset] = (uchar)(value);
}
void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value)
{
it[offset] = (uchar)(value);
}
void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value)
{
it[offset] = (uchar)(value);
}
void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value)
{
it[offset] = (uchar)(value);
}
// ushort
void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value)
{
it[offset] = (ushort)(value);
}
void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value)
{
it[offset] = (ushort)(value);
}
void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value)
{
it[offset] = (ushort)(value);
}
void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value)
{
it[offset] = (ushort)(value);
}
void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value)
{
it[offset] = (ushort)(value);
}
void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value)
{
it[offset] = (ushort)(value);
}
void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value)
{
it[offset] = (ushort)(value);
}
// uint
void store_uint_L1UC_L3UC(global uint* it, int offset, uint value)
{
it[offset] = value;
}
void store_uint_L1UC_L3WB(global uint* it, int offset, uint value)
{
it[offset] = value;
}
void store_uint_L1WT_L3UC(global uint* it, int offset, uint value)
{
it[offset] = value;
}
void store_uint_L1WT_L3WB(global uint* it, int offset, uint value)
{
it[offset] = value;
}
void store_uint_L1S_L3UC(global uint* it, int offset, uint value)
{
it[offset] = value;
}
void store_uint_L1S_L3WB(global uint* it, int offset, uint value)
{
it[offset] = value;
}
void store_uint_L1WB_L3WB(global uint* it, int offset, uint value)
{
it[offset] = value;
}
// uint2
void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value)
{
it[offset] = value;
}
void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value)
{
it[offset] = value;
}
void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value)
{
it[offset] = value;
}
void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value)
{
it[offset] = value;
}
void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value)
{
it[offset] = value;
}
void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value)
{
it[offset] = value;
}
void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value)
{
it[offset] = value;
}
// uint3
void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value)
{
it[offset] = value;
}
void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value)
{
it[offset] = value;
}
void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value)
{
it[offset] = value;
}
void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value)
{
it[offset] = value;
}
void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value)
{
it[offset] = value;
}
void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value)
{
it[offset] = value;
}
void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value)
{
it[offset] = value;
}
// uint4
void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value)
{
it[offset] = value;
}
void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value)
{
it[offset] = value;
}
void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value)
{
it[offset] = value;
}
void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value)
{
it[offset] = value;
}
void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value)
{
it[offset] = value;
}
void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value)
{
it[offset] = value;
}
void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value)
{
it[offset] = value;
}
// uint8
void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value)
{
it[offset] = value;
}
void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value)
{
it[offset] = value;
}
void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value)
{
it[offset] = value;
}
void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value)
{
it[offset] = value;
}
void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value)
{
it[offset] = value;
}
void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value)
{
it[offset] = value;
}
void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value)
{
it[offset] = value;
}
// ulong
void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value)
{
it[offset] = value;
}
void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value)
{
it[offset] = value;
}
void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value)
{
it[offset] = value;
}
void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value)
{
it[offset] = value;
}
void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value)
{
it[offset] = value;
}
void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value)
{
it[offset] = value;
}
void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value)
{
it[offset] = value;
}
// ulong2
void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value)
{
it[offset] = value;
}
void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value)
{
it[offset] = value;
}
void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value)
{
it[offset] = value;
}
void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value)
{
it[offset] = value;
}
void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value)
{
it[offset] = value;
}
void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value)
{
it[offset] = value;
}
void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value)
{
it[offset] = value;
}
// ulong3
void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value)
{
it[offset] = value;
}
void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value)
{
it[offset] = value;
}
void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value)
{
it[offset] = value;
}
void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value)
{
it[offset] = value;
}
void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value)
{
it[offset] = value;
}
void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value)
{
it[offset] = value;
}
void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value)
{
it[offset] = value;
}
// ulong4
void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value)
{
it[offset] = value;
}
void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value)
{
it[offset] = value;
}
void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value)
{
it[offset] = value;
}
void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value)
{
it[offset] = value;
}
void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value)
{
it[offset] = value;
}
void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value)
{
it[offset] = value;
}
void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value)
{
it[offset] = value;
}
// ulong8
void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value)
{
it[offset] = value;
}
void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value)
{
it[offset] = value;
}
void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value)
{
it[offset] = value;
}
void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value)
{
it[offset] = value;
}
void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value)
{
it[offset] = value;
}
void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value)
{
it[offset] = value;
}
void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value)
{
it[offset] = value;
}
// LSC Fence support
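// Note: as with the load/store helpers above, these are generic fallbacks. The L1/L3
// cache-control suffixes encoded in the function names are ignored and every variant
// lowers to a plain OpenCL fence on CLK_GLOBAL_MEM_FENCE.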
void mem_fence_gpu_default()
{
write_mem_fence(CLK_GLOBAL_MEM_FENCE);
}
void mem_fence_workgroup_default()
{
write_mem_fence( CLK_GLOBAL_MEM_FENCE );
}
void mem_fence_gpu_invalidate()
{
read_mem_fence(CLK_GLOBAL_MEM_FENCE);
}
void mem_fence_gpu_evict()
{
read_mem_fence(CLK_GLOBAL_MEM_FENCE);
}
void mem_fence_evict_to_memory()
{
mem_fence(CLK_GLOBAL_MEM_FENCE);
}

View file

@ -1,161 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "shared.h"
/// Write cache line to global memory
/// Assumes subgroup_size is 16
///
/// @param dst 64 bytes aligned output pointer
/// @param val value to write
GRL_INLINE void CacheLineSubgroupWrite(global char* dst, uint val)
{
global uint* addrAligned = (global uint*)(global uint16*)dst;
intel_sub_group_block_write(addrAligned, val);
}
/// Read cache line from global memory
/// Assumes subgroup_size is 16
///
/// @param src 64 bytes aligned input pointer
/// @return uint read from memory
GRL_INLINE uint CacheLineSubgroupRead(const global char* src)
{
const global uint* addrAligned = (const global uint*)(global uint16*)src;
return intel_sub_group_block_read(addrAligned);
}
/// Copy cache line
/// Assumes subgroup_size is 16
///
/// @param dst 64 bytes aligned output pointer
/// @param src input pointer
GRL_INLINE void CopyCacheLine(global char* dst, const global char* src)
{
global const uint* usrc = (global const uint*) (src);
uint data = intel_sub_group_block_read(usrc);
CacheLineSubgroupWrite(dst, data);
}
/// Fast memory copy
///
/// @param dst output pointer
/// @param src input pointer
/// @param size number of bytes to copy
/// @param numGroups number of groups that execute this function
GRL_INLINE void CopyMemory(global char* dst, const global char* src, uint size, uint numGroups)
{
const uint CACHELINE_SIZE = 64;
uint globalID = get_local_size(0) * get_group_id(0) + get_local_id(0);
    // This part copies one cacheline per physical thread per write, starting from dst aligned up to a cacheline boundary.
    // It also copies the remainder.
{
uint alignAdd = ((uint)(uint64_t)dst) & (CACHELINE_SIZE - 1);
alignAdd = (CACHELINE_SIZE - alignAdd) & (CACHELINE_SIZE - 1);
if (size > alignAdd)
{
uint alignedBytesCount = size - alignAdd;
uint alignedDWsCount = alignedBytesCount >> 2;
global uint* dstAlignedPart = (global uint*)(dst + alignAdd);
global uint* srcAlignedPart = (global uint*)(src + alignAdd);
for (uint id = globalID; id < alignedDWsCount; id += get_local_size(0) * numGroups)
{
dstAlignedPart[id] = srcAlignedPart[id];
}
if (globalID < alignedBytesCount - (alignedDWsCount << 2))
{
global uint8_t* dstByteRem = (global uint8_t*)(dstAlignedPart + alignedDWsCount);
global uint8_t* srcByteRem = (global uint8_t*)(srcAlignedPart + alignedDWsCount);
dstByteRem[globalID] = srcByteRem[globalID];
}
}
}
    // Copy the head of dst up to the first cacheline boundary: a byte copy up to DW alignment, then a DW copy up to cacheline alignment.
{
uint misalignmentBytesSize = (4 - (((uint)dst) & /*bytes in DW*/3)) & 3;
if (misalignmentBytesSize)
{
if (globalID < misalignmentBytesSize)
{
dst[globalID] = src[globalID];
}
dst += misalignmentBytesSize;
src += misalignmentBytesSize;
}
uint misalignmentDWSize = (CACHELINE_SIZE - (((uint)dst) & (CACHELINE_SIZE - 1))) & (CACHELINE_SIZE - 1);
if (misalignmentDWSize)
{
if (globalID < (misalignmentDWSize >> 2))
{
((global uint*)dst)[globalID] = ((global uint*)src)[globalID];
}
}
}
}
#define CACHELINE_SIZE 64
#define CACHELINE_PER_BLOCK 4
#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
GRL_INLINE
global const char *getInstanceDataToCopy(global const char *array, global const uint64_t *arrayOfPtrs, const uint byteOffset)
{
if (array != NULL)
{
return array + byteOffset;
}
else
{
return (global char *)arrayOfPtrs[byteOffset >> 6];
}
}
// Assumed:
// dst is always 64-byte aligned
// size is always a multiple of 64 bytes (sizeof(InstanceDesc) is 64 bytes)
GRL_INLINE
void copyInstances(global char *dst, global const char *array, global const uint64_t *arrayOfPtrs, const uint64_t size, const uint numGroups)
{
uint taskId = get_group_id(0);
uint blockedSize = (size) & (~(BLOCK_SIZE - 1));
uint cachelinedTailOffset = blockedSize;
uint cachelinedTailSize = (size - cachelinedTailOffset) & (~(CACHELINE_SIZE - 1));
uint tailCacheLines = cachelinedTailSize >> 6; // divide by CACHELINE_SIZE
uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups - 1)));
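    // Full 256-byte blocks are striped across groups counting up from taskId; the trailing
    // cachelines past the last full block are handled by the highest-numbered groups via
    // reversedTaskId, so both paths run in the same dispatch.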
if (reversedTaskId < tailCacheLines)
{
uint byteOffset = cachelinedTailOffset + (reversedTaskId * CACHELINE_SIZE);
global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
CopyCacheLine(dst + byteOffset, src);
}
uint numBlocks = blockedSize >> 8;
while (taskId < numBlocks)
{
uint byteOffset = (taskId * BLOCK_SIZE);
for (uint cl = 0; cl < CACHELINE_PER_BLOCK; cl++)
{
global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
CopyCacheLine(dst + byteOffset, src);
byteOffset += CACHELINE_SIZE;
}
taskId += numGroups;
}
}

View file

@ -1,367 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "api_interface.h"
#include "common.h"
#include "instance.h"
#include "misc_shared.h"
#include "mem_utils.h"
#define DBG(x)
#define ENABLE_CHECKS 0
#define CACHELINE_SIZE 64
#define CACHELINE_PER_BLOCK 4
#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
GRL_INLINE
uint32_t getGeomDescPrimitiveCountAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
{
return (uint32_t)GRL_get_primitive_count(&geomDesc[index]);
}
GRL_INLINE
uint32_t getGeomDescTypeAndFlagsAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
{
return (uint32_t)GRL_get_Type(&geomDesc[index]) |
(((uint32_t)GRL_get_Flags(&geomDesc[index])) << 16);
}
GRL_INLINE
uint64_t getGeomDescAsUint64t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
{
return (uint64_t)getGeomDescPrimitiveCountAsUint32t(geomDesc, index) |
(((uint64_t)getGeomDescTypeAndFlagsAsUint32t(geomDesc, index)) << 32);
}
// Assumed:
// dst is always 64-byte aligned
GRL_INLINE
void copyGeoMetaData(global char* dst, global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t size, uint numGroups)
{
uint taskId = get_group_id(0);
uint localId = get_sub_group_local_id();
uint cachelinedSize = (size) & (~(CACHELINE_SIZE-1));
uint reminderOffset = cachelinedSize;
uint reminderQWSize = (size - reminderOffset) >> 3;
uint tailCacheLines = cachelinedSize >> 6; // divide by CACHELINE_SIZE
uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups-1)));
if (reversedTaskId == tailCacheLines && localId < reminderQWSize)
{
uint reminderOffsetQW = reminderOffset >> 3;
global uint64_t* dstQW = (global uint64_t*)(dst);
dstQW[localId + reminderOffsetQW] = getGeomDescAsUint64t(geomDesc, localId + reminderOffsetQW);
}
uint numCacheLines = cachelinedSize >> 6;
while (taskId < numCacheLines)
{
uint byteOffset = taskId * CACHELINE_SIZE;
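        // Each packed entry is 8 bytes: even lanes produce the primitive-count DW, odd lanes the
        // type/flags DW, so a 16-wide subgroup writes one 64-byte cacheline (8 entries) per iteration.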
uint geoIdFromOffset = (byteOffset >> 3) + (localId >> 1);
uint32_t data = 0;
if (localId & 1)
{
data = getGeomDescTypeAndFlagsAsUint32t(geomDesc, geoIdFromOffset);
}
else
{
data = getGeomDescPrimitiveCountAsUint32t(geomDesc, geoIdFromOffset);
}
CacheLineSubgroupWrite(dst + byteOffset, data);
taskId += numGroups;
}
}
GRL_INLINE
uint groupCountForInstancesCopySize(uint size)
{
return (size >> 8) + 3;
}
GRL_INLINE
uint groupCountForGeoMetaDataCopySize(uint size)
{
return (size >> 6) + 1;
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instances(global char* dest, global char* instancesArray, uint64_t size)
{
// global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instances_indirect(global char* dest, global char* instancesArray, global const struct IndirectBuildRangeInfo* const indirect_data)
{
uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
instancesArray += indirect_data->primitiveOffset;
uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (tid == 0)
{
struct BVHBase* bvh = (struct BVHBase*)dest;
bvh->Meta.instanceCount = indirect_data->primitiveCount;
}
copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_ptrs(global char* dest, global uint64_t* arrayOfPtrs, uint64_t size)
{
//global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_ptrs_indirect(global char* dest, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data)
{
uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
arrayOfPtrs += indirect_data->primitiveOffset;
uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (tid == 0)
{
struct BVHBase* bvh = (struct BVHBase*)dest;
bvh->Meta.instanceCount = indirect_data->primitiveCount;
}
copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instances_base_ptr(global BVHBase* bvh, global char* instancesArray, uint64_t size)
{
global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instances_base_ptr_indirect(global BVHBase* bvh, global char* instancesArray, global struct IndirectBuildRangeInfo const * const indirect_data)
{
global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
instancesArray += indirect_data->primitiveOffset;
copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_ptrs_base_ptr(global BVHBase* bvh, global uint64_t* arrayOfPtrs, uint64_t size)
{
global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_ptrs_base_ptr_indirect(global BVHBase* bvh, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data)
{
global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
arrayOfPtrs += indirect_data->primitiveOffset;
copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_geo_meta_data(global char* dest, global char* src, uint64_t size)
{
//global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.geoDescsStart);
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc = (global GRL_RAYTRACING_GEOMETRY_DESC *)((unsigned long)src);
copyGeoMetaData(dest, geomDesc, size, groupCountForGeoMetaDataCopySize(size));
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__( ( reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 ) ) )
__attribute__( ( intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH ) ) )
void kernel copy_geo_descs_indirect_build(global char* dest, global char* src, global struct IndirectBuildRangeInfo const * const indirect_data, uint numGeometries)
{
uint32_t gid = get_local_id(0) + get_group_id(0) * get_local_size(0);
if (gid < numGeometries) {
global GRL_RAYTRACING_GEOMETRY_DESC* dstDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(dest);
global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(src);
GRL_RAYTRACING_GEOMETRY_DESC geo = srcDesc[gid];
uint primitiveCount = indirect_data[gid].primitiveCount;
uint primitiveOffset = indirect_data[gid].primitiveOffset;
uint firstVertex = indirect_data[gid].firstVertex;
uint transformOffset = indirect_data[gid].transformOffset;
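        // Patch the descriptor in place with the per-geometry indirect build range
        // (primitiveCount/primitiveOffset/firstVertex/transformOffset, mirroring the
        // Vulkan/DXR build-range structures).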
if (srcDesc[gid].Type == GEOMETRY_TYPE_TRIANGLES)
{
if (geo.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE)
{
geo.Desc.Triangles.VertexCount = primitiveCount * 3;
geo.Desc.Triangles.pVertexBuffer += primitiveOffset
+ firstVertex * geo.Desc.Triangles.VertexBufferByteStride;
}
else
{
geo.Desc.Triangles.IndexCount = primitiveCount * 3;
geo.Desc.Triangles.pIndexBuffer += primitiveOffset;
geo.Desc.Triangles.pVertexBuffer += firstVertex * geo.Desc.Triangles.VertexBufferByteStride;
}
if (geo.Desc.Triangles.pTransformBuffer) {
geo.Desc.Triangles.pTransformBuffer += transformOffset;
}
} else {
// GEOMETRY_TYPE_PROCEDURAL
geo.Desc.Procedural.AABBCount = primitiveCount;
geo.Desc.Procedural.pAABBs_GPUVA += primitiveOffset;
}
dstDesc[gid] = geo;
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel batched_init_globals(global struct BatchedInitGlobalsData *data)
{
uint groupID = get_group_id(0);
struct BatchedInitGlobalsData entry = data[groupID];
global struct Globals* globals = (global struct Globals*)entry.p_build_globals;
global char *bvh_mem = (global char*)entry.p_bvh_buffer;
uint numPrimitives = entry.numPrimitives;
uint numGeometries = entry.numGeometries;
uint numInstances = entry.numInstances;
uint instance_descs_start = entry.instance_descs_start;
uint geo_meta_data_start = entry.geo_meta_data_start;
uint node_data_start = entry.node_data_start;
uint quad_data_start = entry.leaf_data_start;
uint instance_data_start = entry.leaf_data_start;
uint procedural_data_start = entry.procedural_data_start;
uint back_pointer_start = entry.back_pointer_start;
uint build_record_start = entry.leaf_data_start;
uint totalBytes = entry.sizeTotal;
uint leafPrimType = entry.leafType;
uint leafSize = entry.leafSize;
uint root_node_offset = node_data_start;
struct BVHBase *base = (struct BVHBase *)bvh_mem;
base->Meta.instanceCount = numInstances;
base->Meta.geoCount = numGeometries;
base->Meta.instanceDescsStart = instance_descs_start;
base->Meta.geoDescsStart = geo_meta_data_start;
base->Meta.allocationSize = totalBytes;
    // This doesn't work correctly
//ERROR_INFO initErr = { 0, 0, 0, 0xAAABBAAA };
//base->Meta.errors = initErr;
base->Meta.errors.type = 0;
base->Meta.errors.offset_in_BVH = 0; //in 64B units
base->Meta.errors.when = 0;
base->Meta.errors.reserved = 0xAAABBAAA;
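    // All offsets below are stored in 64-byte (cacheline) units.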
base->nodeDataCur = node_data_start / 64;
base->quadLeafStart = quad_data_start / 64;
base->quadLeafCur = quad_data_start / 64;
base->instanceLeafStart = instance_data_start / 64;
base->instanceLeafEnd = instance_data_start / 64;
base->proceduralDataStart = procedural_data_start / 64;
base->proceduralDataCur = procedural_data_start / 64;
base->backPointerDataStart = back_pointer_start / 64;
base->refitTreeletsDataStart = totalBytes / 64;
base->refitStartPointDataStart = totalBytes / 64;
base->BVHDataEnd = totalBytes / 64;
base->refitTreeletCnt = 0;
base->refitTreeletCnt2 = 0;
base->rootNodeOffset = root_node_offset;
base->fatLeafCount = 0;
base->fatLeafTableStart = entry.fatleaf_table_start / 64;
base->innerCount = 0;
base->innerTableStart = entry.innernode_table_start / 64;
base->quadLeftoversCountNewAtomicUpdate = 0;
base->quadTableSizeNewAtomicUpdate = 0;
base->quadIndicesDataStart = entry.quad_indices_data_start / 64;
if (back_pointer_start != totalBytes)
{
BackPointers* back_pointers = BVHBase_GetBackPointers(base);
uint root_node_idx = root_node_offset - node_data_start;
global uint *root_node_backpointer = (global uint *)InnerNode_GetBackPointer(back_pointers,root_node_idx);
*root_node_backpointer = ((uint)-1) << 6;
}
AABB3f_init(&base->Meta.bounds);
AABB_init(&globals->centroidBounds);
globals->build_record_start = build_record_start;
globals->numBuildRecords = 0;
globals->numBuildRecords_extended = 0;
globals->numPrimitives = numPrimitives;
globals->numSplittedPrimitives = 0;
globals->sync = 0;
globals->probThreshold = 0.0f;
globals->leafPrimType = leafPrimType;
globals->leafSize = leafSize;
}
// This is a temporary workaround (WA) for the mock path in DXR
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel copy_mock(global char *dest,
global char *src,
uint32_t size)
{
uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
uint32_t globalSize = get_num_groups(0) * get_local_size(0);
for (uint32_t i = globalId; i < size; i += globalSize)
{
dest[i] = src[i];
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(32, 1, 1)))
void kernel mem_set(global char *dest,
dword byte,
dword size)
{
uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
if (globalId < size)
{
dest[globalId] = (char)byte;
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(32, 1, 1)))
void kernel mem_set_size_ptr(global char *dest,
dword byte,
global qword* sizePtr)
{
uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
if (globalId < *sizePtr)
{
dest[globalId] = (char)byte;
}
}

View file

@ -1,278 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module misc;
kernel_module misc("misc.cl")
{
kernel opencl_kernel_batched_init_globals < kernelFunction="batched_init_globals" >;
kernel opencl_kernel_copy_instances < kernelFunction="copy_instances" >;
kernel opencl_kernel_copy_instances_indirect < kernelFunction="copy_instances_indirect" >;
kernel opencl_kernel_copy_instance_ptrs < kernelFunction="copy_instance_ptrs" >;
kernel opencl_kernel_copy_instance_ptrs_indirect < kernelFunction="copy_instance_ptrs_indirect" >;
kernel opencl_kernel_copy_instances_base_ptr < kernelFunction="copy_instances_base_ptr" >;
kernel opencl_kernel_copy_instances_base_ptr_indirect < kernelFunction="copy_instances_base_ptr_indirect" >;
kernel opencl_kernel_copy_instance_ptrs_base_ptr < kernelFunction="copy_instance_ptrs_base_ptr" >;
kernel opencl_kernel_copy_instance_ptrs_base_ptr_indirect < kernelFunction="copy_instance_ptrs_base_ptr_indirect" >;
kernel opencl_kernel_copy_geo_meta_data < kernelFunction="copy_geo_meta_data" >;
kernel opencl_kernel_copy_geo_descs_indirect_build < source="misc.cl", kernelFunction="copy_geo_descs_indirect_build" >;
kernel opencl_kernel_copy_mock < kernelFunction="copy_mock" >;
kernel opencl_kernel_memset < kernelFunction="mem_set" >;
kernel opencl_kernel_memset_size_ptr < kernelFunction="mem_set_size_ptr" >;
}
import struct MKBuilderState "structs.grl";
import struct MKSizeEstimate "structs.grl";
metakernel batched_init_globals(
qword p_data,
dword numWgs)
{
dispatch opencl_kernel_batched_init_globals(numWgs,1,1) args(p_data);
}
metakernel copy_instances(
qword bvh_buffer,
qword instanceDescsBuffer,
qword totalSizeToCopy,
dword numThreads)
{
dispatch opencl_kernel_copy_instances (numThreads, 1, 1) args(
bvh_buffer,
instanceDescsBuffer,
totalSizeToCopy);
}
metakernel
copy_instances_indirect( qword bvh_buffer, qword instanceDescsBuffer, qword indirectBuildRangeInfo )
{
define num_groups REG0;
define C_2 REG2;
define C_3 REG3;
C_2 = 2;
C_3 = 3;
// sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
// num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
num_groups = load_dword( indirectBuildRangeInfo );
num_groups = num_groups >> C_2;
num_groups = num_groups + C_3;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_copy_instances_indirect args(
bvh_buffer,
instanceDescsBuffer,
indirectBuildRangeInfo);
}
metakernel copy_instance_ptrs(
qword bvh_buffer,
qword instanceDescPtrsBuffer,
qword totalSizeToCopy,
dword numThreads)
{
dispatch opencl_kernel_copy_instance_ptrs (numThreads, 1, 1) args(
bvh_buffer,
instanceDescPtrsBuffer,
totalSizeToCopy);
}
metakernel copy_instance_ptrs_indirect(
qword bvh_buffer,
qword instanceDescPtrsBuffer,
qword indirectBuildRangeInfo)
{
define num_groups REG0;
define C_2 REG2;
define C_3 REG3;
C_2 = 2;
C_3 = 3;
// sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
// num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
num_groups = load_dword( indirectBuildRangeInfo );
num_groups = num_groups >> C_2;
num_groups = num_groups + C_3;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_copy_instance_ptrs_indirect args(
bvh_buffer,
instanceDescPtrsBuffer,
indirectBuildRangeInfo);
}
metakernel copy_instances_base_ptr(
qword bvh_buffer,
qword instanceDescsBuffer,
qword totalSizeToCopy,
dword numThreads)
{
dispatch opencl_kernel_copy_instances_base_ptr (numThreads, 1, 1) args(
bvh_buffer,
instanceDescsBuffer,
totalSizeToCopy);
}
metakernel copy_instances_base_ptr_indirect(
qword bvh_buffer,
qword instanceDescsBuffer,
qword indirectBuildRangeInfo)
{
define num_groups REG0;
define C_2 REG2;
define C_3 REG3;
C_2 = 2;
C_3 = 3;
// sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
// num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
num_groups = load_dword( indirectBuildRangeInfo );
num_groups = num_groups >> C_2;
num_groups = num_groups + C_3;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_copy_instances_base_ptr_indirect args(
bvh_buffer,
instanceDescsBuffer,
indirectBuildRangeInfo);
}
metakernel copy_instance_ptrs_base_ptr(
qword bvh_buffer,
qword instanceDescPtrsBuffer,
qword totalSizeToCopy,
dword numThreads)
{
dispatch opencl_kernel_copy_instance_ptrs_base_ptr (numThreads, 1, 1) args(
bvh_buffer,
instanceDescPtrsBuffer,
totalSizeToCopy);
}
metakernel copy_instance_ptrs_base_ptr_indirect(
qword bvh_buffer,
qword instanceDescPtrsBuffer,
qword indirectBuildRangeInfo)
{
define num_groups REG0;
define C_2 REG2;
define C_3 REG3;
C_2 = 2;
C_3 = 3;
// sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
// num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
num_groups = load_dword( indirectBuildRangeInfo );
num_groups = num_groups >> C_2;
num_groups = num_groups + C_3;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_copy_instance_ptrs_base_ptr_indirect args(
bvh_buffer,
instanceDescPtrsBuffer,
indirectBuildRangeInfo);
}
metakernel copy_geo_descs(
qword private_dest,
qword transient_src,
qword indirectBuildRangeInfo,
dword numGeometries)
{
define num_groups (numGeometries + 16 - 1) / 16;
dispatch opencl_kernel_copy_geo_descs_indirect_build(num_groups, 1, 1) args(
private_dest,
transient_src,
indirectBuildRangeInfo,
numGeometries);
}
metakernel copy_geo_meta_data(
qword bvh_buffer,
qword geomdesc_buffer,
qword totalSizeToCopy,
dword numThreads)
{
dispatch opencl_kernel_copy_geo_meta_data (numThreads, 1, 1) args(
bvh_buffer,
geomdesc_buffer,
totalSizeToCopy);
}
const COPY_MOCK_GROUP_SIZE = 16;
metakernel copy_mock(
qword dest,
qword src,
dword size)
{
define num_groups (size + COPY_MOCK_GROUP_SIZE - 1) / COPY_MOCK_GROUP_SIZE;
dispatch opencl_kernel_copy_mock(num_groups, 1, 1) args(
dest,
src,
size);
}
metakernel memset(
qword dest,
dword byte,
dword size)
{
define num_groups (size + 32 - 1) / 32;
dispatch opencl_kernel_memset(num_groups, 1, 1) args(
dest,
byte,
size);
}
metakernel memset_size_ptr(
qword dest,
dword byte,
qword sizePtr)
{
define byteSize REG0;
define C_32 REG1; C_32 = 32;
define C_1 REG2; C_1 = 1;
define C_4 REG3; C_4 = 4;
define numGroupsRqd REG4;
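    // numGroupsRqd = ceil(byteSize / 32): add 32, subtract 1, then shift right by 5 (done as >> 4 followed by >> 1).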
byteSize = load_dword(sizePtr);
numGroupsRqd = byteSize + C_32;
numGroupsRqd = numGroupsRqd - C_1;
numGroupsRqd = numGroupsRqd >> C_4;
numGroupsRqd = numGroupsRqd >> C_1;
DISPATCHDIM_X = numGroupsRqd.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_kernel_memset_size_ptr args(
dest,
byte,
sizePtr);
}

View file

@ -1,386 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "input_client_structs.h"
#include "common.h"
#include "instance.h"
#define DBG(x)
#define ENABLE_CHECKS 0
/*
This kernel implements an exclusive scan addition operation. The
implementation currently only uses one DSS.
*/
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_scan_exclusive_add(global uint *input,
global uint *output,
const uint N)
{
const uint j = get_local_id(0);
const uint J = get_local_size(0);
const uint BLOCKSIZE = (N + J - 1) / J;
const uint start = min((j + 0) * BLOCKSIZE, N);
const uint end = min((j + 1) * BLOCKSIZE, N);
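    // Each lane serially sums its block, a work-group exclusive scan turns the per-lane sums into
    // block bases, and a second pass over the block emits the final exclusive prefix sums.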
uint base = 0;
for (uint i = start; i < end; i++)
base += input[i];
base = work_group_scan_exclusive_add(base);
uint accu = 0;
for (uint i = start; i < end; i++)
{
output[i] = base + accu;
accu += input[i];
}
}
/*
This kernel implements an exclusive scan addition operation that can use the entire GPU.
*/
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_scan_exclusive_add_phase0(global uint *input,
global uint *output,
global uint *prefix_sums,
const uint N)
{
const uint local_size = get_local_size(0);
const uint numTasks = get_num_groups(0);
const uint groupID = get_group_id(0);
const uint localID = get_local_id(0);
const uint global_startID = (groupID + 0) * N / numTasks;
const uint global_endID = (groupID + 1) * N / numTasks;
uint base = 0;
for (uint i = global_startID + localID; i < global_endID; i += local_size)
base += input[i];
base = work_group_reduce_add(base);
if (localID == 0)
{
prefix_sums[groupID] = base;
printf("%d -> %d \n", groupID, base);
}
}
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_scan_exclusive_add_phase1(global uint *input,
global uint *output,
global uint *prefix_sums,
const uint N)
{
const uint local_size = get_local_size(0);
const uint numTasks = get_num_groups(0);
const uint groupID = get_group_id(0);
const uint localID = get_local_id(0);
const uint global_startID = (groupID + 0) * N / numTasks;
const uint global_endID = (groupID + 1) * N / numTasks;
const uint local_range = global_endID - global_startID;
uint global_base = 0;
for (uint i = 0; i < groupID; i++)
global_base += prefix_sums[i];
const uint j = get_local_id(0);
const uint J = get_local_size(0);
const uint BLOCKSIZE = (local_range + J - 1) / J;
const uint startID = (j + 0) * local_range / J + global_startID;
const uint endID = (j + 1) * local_range / J + global_startID;
uint base = 0;
for (uint i = startID; i < endID; i++)
base += input[i];
base = work_group_scan_exclusive_add(base);
uint accu = 0;
for (uint i = startID; i < endID; i++)
{
output[i] = global_base + base + accu;
accu += input[i];
}
}
/* ========================================================================= */
/* ============================== STATISTICS =============================== */
/* ========================================================================= */
/* ====== STATS config ====== */
#define ENABLE_STAT_CHECKS 1
#define DBG_STATS(x)
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
printBVHStatistics(global struct Globals *globals,
global char *bvh_mem,
global struct StatStackEntry *global_stack0,
global struct StatStackEntry *global_stack1,
const uint presplit)
{
const uint globalID = get_global_id(0);
const uint localID = get_local_id(0);
const uint local_size = get_local_size(0);
struct BVHBase *base = (struct BVHBase *)bvh_mem;
const uint root = base->rootNodeOffset;
local uint stack_items[2];
local uint iterations;
struct AABB root_aabb = getAABB_QBVHNodeN((global struct QBVHNodeN *)(bvh_mem + root));
root_aabb = conservativeAABB(&root_aabb);
const float root_area = AABB_halfArea(&root_aabb);
global struct QBVHNodeN *root_node = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset);
if (root_node->type != BVH_INTERNAL_NODE)
{
const uint numChildren = getNumChildren_QBVHNodeN(root_node);
const uint current = root;
for (uint i = 0; i < numChildren; i++)
{
struct AABB aabb = extractAABB_QBVHNodeN(root_node, i);
const float area = AABB_halfArea(&aabb);
global_stack0[i].node = current + root_node->offset * 64 + i * sizeof(struct Quad);
global_stack0[i].type = root_node->type;
global_stack0[i].area = area;
global_stack0[i].aabb = aabb;
global_stack0[i].depth = 0;
}
stack_items[0] = numChildren;
stack_items[1] = 0;
}
else
{
global_stack0[0].node = root;
global_stack0[0].type = root_node->type;
global_stack0[0].area = root_area;
global_stack0[0].aabb = root_aabb;
global_stack0[0].depth = 1;
stack_items[0] = 1;
stack_items[1] = 0;
}
const uint maxInnerNodeOffset = globals->node_mem_allocator.cur;
const uint maxLeafNodeOffset = globals->quad_mem_allocator.cur;
DBG_STATS(if (localID == 0) printf("diff %d \n", (globals->node_mem_allocator_cur - globals->node_mem_allocator_start) / 64));
iterations = 0;
work_group_barrier(CLK_LOCAL_MEM_FENCE);
float sah_nodes = 0.0f;
float sah_leaves = 0.0f;
uint leaves = 0;
uint inner_nodes = 0;
uint max_depth = 0;
uint leaf_items = 0;
uint inner_nodes_valid_children = 0;
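    // Breadth-first traversal using two ping-pong global stacks; the parity of `iterations`
    // selects which stack is read and which is written.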
while (1)
{
work_group_barrier(CLK_GLOBAL_MEM_FENCE);
const uint buffer_index = (iterations % 2) == 0 ? 0 : 1;
global struct StatStackEntry *input_global_stack = buffer_index == 0 ? global_stack0 : global_stack1;
global struct StatStackEntry *output_global_stack = buffer_index == 0 ? global_stack1 : global_stack0;
const uint local_stack_items = stack_items[buffer_index];
stack_items[1 - buffer_index] = 0;
DBG_STATS(if (globalID == 0) printf("iterations %d local_stack_items %d \n", iterations, local_stack_items));
if (local_stack_items == 0)
break;
//if (iterations == 5) break;
work_group_barrier(CLK_GLOBAL_MEM_FENCE);
if (globalID == 0)
iterations++;
for (uint sindex = localID; sindex < local_stack_items; sindex += local_size)
{
uint current = input_global_stack[sindex].node;
uint type = input_global_stack[sindex].type;
float current_area = input_global_stack[sindex].area;
struct AABB current_aabb = input_global_stack[sindex].aabb;
uint current_depth = input_global_stack[sindex].depth;
//printf("localID %d sindex %d current %d type %d local_stack_items %d \n",localID,sindex,current,type,local_stack_items);
max_depth = max(max_depth, current_depth);
if (type == BVH_QUAD_NODE)
{
unsigned int prims = 1; //getNumLeafPrims(current);
if (prims > BVH_LEAF_N_MAX)
printf("too many items in leaf %d \n", prims);
unsigned int prims_offset = current; //getLeafOffset(current);
//printf("prims_offset %d \n",prims_offset);
leaf_items += prims;
sah_leaves += current_area;
leaves++;
#if ENABLE_STAT_CHECKS == 1
struct AABB leafAABB;
AABB_init(&leafAABB);
global struct Quad *quads = (global struct Quad *)(bvh_mem + prims_offset);
//printf("prims_offset %d \n",prims_offset);
for (uint i = 0; i < prims; i++)
{
struct AABB quadAABB = getAABB_Quad(&quads[i]);
AABB_extend(&leafAABB, &quadAABB);
}
if (!presplit && !AABB_subset(&leafAABB, &current_aabb))
{
printf("leaf error: current %d depth %d \n", current, current_depth);
AABB_print(&current_aabb);
printf("leaf bounds: \n");
AABB_print(&leafAABB);
}
#endif
}
else if (type == BVH_INTERNAL_NODE)
{
inner_nodes++;
sah_nodes += current_area;
global struct QBVHNodeN *nodeN = (global struct QBVHNodeN *)(bvh_mem + current);
uint children = 0;
for (uint i = 0; i < BVH_NODE_N6; i++)
{
if (nodeN->qbounds.lower_x[i] > nodeN->qbounds.upper_x[i])
break;
children++;
}
//printf("children %d \n",children);
#if ENABLE_STAT_CHECKS == 1
if (children > BVH_NODE_N6 || children == 0)
{
printf("#children not in valid range: %d offset %d localID %d \n", children, current, localID);
printQBVHNodeN(nodeN);
}
if (nodeN->offset > globals->totalAllocatedMem || (int)nodeN->offset < 0)
{
printf("offset error %d \n", nodeN->offset);
}
#endif
uint children_offset = atomic_add(&stack_items[1 - buffer_index], children);
for (uint i = 0; i < children; i++)
{
inner_nodes_valid_children++;
struct AABB aabb = extractAABB_QBVHNodeN(nodeN, i);
const float area = AABB_halfArea(&aabb);
aabb = conservativeAABB(&aabb);
#if 0 // ENABLE_STAT_CHECKS == 1 // FIXME: not clear whether parent child property still holds !!!!
// if (aabb.lower.x == (float)(INFINITY))
// {
// printf("aabb inf error %d current %d nodeN %d \n",i, current, children);
// break;
// }
if (!presplit && !AABB_subset(&aabb,&current_aabb))
{
printf("Parent: current %d depth %d children %d \n",current, current_depth, children);
AABB_print(&current_aabb);
printf("Child %d: \n",i);
AABB_print(&aabb);
}
#endif
uint dest_index = children_offset + i;
if (nodeN->type == BVH_QUAD_NODE)
{
output_global_stack[dest_index].node = current + nodeN->offset * 64 + i * sizeof(struct Quad);
if (output_global_stack[dest_index].node >= maxLeafNodeOffset)
{
printf("stack leaf offset error %d %d current %d %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64);
}
}
else if (nodeN->type == BVH_INTERNAL_NODE)
{
output_global_stack[dest_index].node = (current + nodeN->offset * 64 + i * sizeof(struct QBVHNodeN));
if (output_global_stack[dest_index].node >= maxInnerNodeOffset)
{
printf("stack inner node offset error %d %d current %d %d maxInnerNodeOffset %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64, maxInnerNodeOffset);
}
}
output_global_stack[dest_index].type = nodeN->type;
output_global_stack[dest_index].area = area;
output_global_stack[dest_index].aabb = aabb;
output_global_stack[dest_index].depth = current_depth + 1;
//printf("global_stack[dest_index].node %d global_stack[dest_index].type %d \n",global_stack[dest_index].node,global_stack[dest_index].type);
}
}
}
}
sah_nodes = work_group_reduce_add(sah_nodes);
sah_leaves = work_group_reduce_add(sah_leaves);
leaves = work_group_reduce_add(leaves);
inner_nodes = work_group_reduce_add(inner_nodes);
max_depth = work_group_reduce_max(max_depth);
leaf_items = work_group_reduce_add(leaf_items);
inner_nodes_valid_children = work_group_reduce_add(inner_nodes_valid_children);
if (globalID == 0)
{
/*
sah_nodes *= 1.0f / root_area;
sah_leaves *= 1.0f / root_area;
float sah = sah_nodes + sah_leaves;
const uint globalLeafMemAllocatorOffset = globals->quad_mem_allocator.start;
const uint totalAllocatedMem = globals->totalAllocatedMem;
printf("BVH_NODE_N6 %d BVH_LEAF_N_MIN %d BVH_LEAF_N_MAX %d \n",BVH_NODE_N6,BVH_LEAF_N_MIN,BVH_LEAF_N_MAX);
float node_util = 100.0f * (float)inner_nodes_valid_children / (inner_nodes * BVH_NODE_N6);
float leaf_util = 100.0f * (float)leaf_items / (leaves);
printf("allocators: node %d -> %d ; leaf %d -> %d \n",globals->node_mem_allocator_cur,globals->node_mem_allocator_start,globals->leaf_mem_allocator_cur,globals->leaf_mem_allocator_start);
printf("inner nodes %d leaves %d sah %f sah_node %f sah_leaves %f max_depth %d leaf_items %d node util %f leaf util %f (%f) \n",inner_nodes,leaves,sah,sah_nodes,sah_leaves,max_depth,leaf_items,node_util,leaf_util,(float)leaf_items / leaves);
uint node_mem = globals->node_mem_allocator_cur;
uint max_node_mem = globalLeafMemAllocatorOffset;
float node_mem_ratio = 100.0f * (float)node_mem / max_node_mem;
uint leaf_mem = globals->leaf_mem_allocator.cur - globalLeafMemAllocatorOffset;
uint max_leaf_mem = totalAllocatedMem - globalLeafMemAllocatorOffset;
float leaf_mem_ratio = 100.0f * (float)leaf_mem / max_leaf_mem;
uint total_mem = node_mem + leaf_mem;
float total_mem_ratio = 100.0f * (float)total_mem / totalAllocatedMem;
printf("used node memory %d (%f) / used leaf memory %d (%f) / total memory used %d (%f) / total memory allocated %d \n",node_mem, node_mem_ratio, leaf_mem, leaf_mem_ratio, total_mem, total_mem_ratio, totalAllocatedMem);
*/
}
}

View file

@ -1,196 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
//
// This file contains structure definitions shared by GRL OCL kernels and host code
//
#pragma once
#include "GRLGen12.h"
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
GRL_NAMESPACE_BEGIN(MISC)
struct BatchedInitGlobalsData
{
qword p_build_globals;
qword p_bvh_buffer;
dword numPrimitives;
dword numGeometries;
dword numInstances;
dword instance_descs_start;
dword geo_meta_data_start;
dword node_data_start;
dword leaf_data_start;
dword procedural_data_start;
dword back_pointer_start;
dword sizeTotal;
dword leafType;
dword leafSize;
dword fatleaf_table_start;
dword innernode_table_start;
dword quad_indices_data_start;
};
/// Header of debug buffer
///
/// The header is placed at the beginning of the debug buffer.
/// After the header there is circular buffer space.
typedef struct DebugBufferHeader
{
/// Offset to begin of buffer (after header)
dword headStart;
/// Offset to free memory in buffer (used by gpu)
dword gpuHead;
    /// Offset to the end of data in the buffer that is ready to read (read on CPU, set on GPU, might be behind gpuHead)
dword cpuHead;
/// Flag for buffer overflow
dword overflow;
/// Total size of buffer
dword totalSize;
    /// Padding needed because otherwise the GPU overwrites the tail with a cacheline flush
dword pad[11];
/// Offset to begin of data in buffer
dword tail;
} DebugBufferHeader;
enum InputDumpOperationType
{
INPUT_DUMP_OP_NOP,
INPUT_DUMP_OP_BATCH,
INPUT_DUMP_OP_BUILD,
INPUT_DUMP_OP_UPDATE,
INPUT_DUMP_OP_CLONE,
INPUT_DUMP_OP_COMPACT,
INPUT_DUMP_OP_SERIALIZE,
INPUT_DUMP_OP_DESERIALIZE,
INPUT_DUMP_OP_END_BUFFER
};
// each operation starts with the same header structure and looks like this
// some defined struct { <-----------------start
// OpHeader
// .... struct type specific data
// }
// ... auxiliary data of variable length
// <-------------------------------------- end - indicated by endOfData
typedef struct OpHeader
{
dword operationType;
dword endOfData; // offset to end of this primitive
} OpHeader;
// header for batch operations
typedef struct BatchOpHeader
{
OpHeader opHeader;
} BatchOpHeader;
// interpretation for operationType INPUT_DUMP_OP_BATCH
typedef struct InputBatch
{
BatchOpHeader header;
qword batchId;
dword vertexBufferDataSize;
dword firstContainedOpOffset;
// layout of batch is as below, each line is 128B aligned:
//
// InputBatch <-------------------------------- start
// optional: batchVertexData
// InputBuildDesc/InputCopy <------------------ start + firstContainedOpOffset
// optional: extra data of above token
// InputBuildDesc/InputCopy
// optional: extra data of above token
// ...
// InputBuildDesc/InputCopy
// optional: extra data of above token
// <-------------------------------------------- end = start + endOfData
} InputBatch;
// for operationType:
// INPUT_DUMP_OP_BUILD,
// INPUT_DUMP_OP_UPDATE,
// followed by auxiliary data of variable length
typedef struct InputBuild
{
OpHeader header;
qword srcBvhPtr;
qword dstBvhPtr;
dword flags;
dword numGeos;
dword numInstances;
dword instArrayOfPtrs;
} InputBuild;
// for operationType:
// INPUT_DUMP_OP_CLONE,
// INPUT_DUMP_OP_COMPACT,
// INPUT_DUMP_OP_SERIALIZE,
//
// Not for INPUT_DUMP_OP_DESERIALIZE!
typedef struct InputCopy
{
OpHeader header;
qword srcBvhPtr;
qword dstBvhPtr;
} InputCopy;
// for INPUT_DUMP_OP_DESERIALIZE
// decode for debug tools follows this format
typedef struct InputDeserialize
{
OpHeader header;
qword dstBvhPtr;
} InputDeserialize;
typedef struct InputBatchPtrs
{
qword dumpDst;
qword globalDumpBuffer;
qword nonVertexDataStart;
dword vertexBuffersSize;
dword totalSize;
} InputBatchPtrs;
enum OutputDumpOperationType
{
OUTPUT_DUMP_OP_NOP,
OUTPUT_DUMP_OP_BATCH,
OUTPUT_DUMP_OP_DATA,
OUTPUT_DUMP_OP_END_BUFFER
};
// interpretation for operationType OUTPUT_DUMP_OP_BATCH
typedef struct OutputBatch {
BatchOpHeader header;
qword batchId;
dword firstContainedOpOffset;
} OutputBatch;
// interpretation for operationType OUTPUT_DUMP_OP_DATA
typedef struct OutputData
{
OpHeader header;
qword srcBvhPtr;
} OutputData;
typedef struct OutputBatchPtrs
{
qword dumpDst;
qword dataStart;
dword dataSize;
dword totalSize;
} OutputBatchPtrs;
GRL_NAMESPACE_END(MISC)
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)

View file

@ -1,245 +0,0 @@
//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "common.h"
#define MORTON_DEBUG_CHECKS 0
#define MORTON_VERBOSE_LOG 0
GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift )
{
#if 0 // turn off, because current hierarchy build requires full sort
// Difference between max iterations needed for LSB sorting and
// number of iterations needed for LSB sorting without primIDs
// This indicates how many of first iterations would be skipped in LSB
return 8 - (8 - (shift >> 3));
#else
return 0;
#endif
}
typedef struct BuildRecordLocalMortonFlattener
{
unsigned int leftChild; // global
unsigned int rightChild; // global
unsigned int rangeStart; // global
unsigned int local_parent_index__numItems;
} BuildRecordLocalMortonFlattener;
// TODO: sizeof(UPerNodeData) is currently 32; the AABB struct allocates more data than needed and could be reduced
typedef union UPerNodeData {
float4 four_DWs;
BuildRecordLocalMortonFlattener buildRecord;
MortonFlattenedBoxlessNode boxlessNode;
struct AABB box;
} UPerNodeData;
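// childOffset_type packs the node type in the low 6 bits and the offset to the first child in the remaining upper bits.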
GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn)
{
return bn.childOffset_type >> 6;
}
GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn)
{
return bn.childOffset_type & ((1<<6) -1);
}
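// The *_2xSG_arr helpers use one uint per lane to store 2 * sub_group_size 16-bit entries:
// entries [0, sub_group_size) live in the low halves, the rest in the high halves.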
GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
{
short lane_used = index % get_sub_group_size();
short shift = (index / get_sub_group_size()) * get_sub_group_size();
if (lane_used == lane) {
*arr |= (val << shift);
}
}
GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane)
{
short r = 0;
short lane_used = index % get_sub_group_size();
short shift = (index / get_sub_group_size()) * get_sub_group_size();
r = arr >> shift;
r = sub_group_broadcast(r, lane_used);
return r;
}
GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst)
{
if (lane < count)
{
dst[lane]=(ushort)(arr & 0xFFFF);
short hi_idx = lane + get_sub_group_size();
if (hi_idx < count) {
dst[hi_idx] = (ushort)(arr >> 16);
}
}
}
GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane)
{
if (lane < count)
{
*arr = src[lane];
short hi_idx = lane + get_sub_group_size();
if (hi_idx < count) {
*arr |= ((uint)(src[hi_idx])) << 16u;
}
}
}
GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane)
{
short lane_used = index % get_sub_group_size();
short shift = (index / get_sub_group_size()) * get_sub_group_size();
if (lane_used == lane) {
        uint rem_val = (*arr) & (0xFFFF0000 >> shift); // keep the remaining other half of the uint
*arr = (val << shift) | rem_val;
}
}
GRL_INLINE void SUBGROUP_refit_bottom_up_local(
uniform struct QBVHNodeN* globalNodeData,
uniform struct BackPointers* backPointers,
uniform uint treeletRootGlobalIndex,
uniform uint globalBaseForInternalNodes,
varying ushort lane,
uniform local union UPerNodeData* local_nodes,
varying uint sg_bu_startpoints,
uniform uint sg_bu_startpoints_cnt)
{
if(sg_bu_startpoints_cnt == 0)
return;
const uint head_lane = 0;
uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
uniform uint prev_loc_index = 0;
uniform struct AABB child_aabb; // this carries reduced aabb between loop turns
uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
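    // Walk from each start point towards the treelet root. A node is only ascended through once all
    // of its children have been refitted, tracked by atomically bumping the refit counter in the
    // parent's backpointer.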
while (curNodeIndex != 0)
{
uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode);
uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane;
uint numChildren = BackPointer_GetNumChildren(backpointer);
if (child_loc_idx != prev_loc_index &&
lane < numChildren)
{
child_aabb = local_nodes[child_loc_idx].box;
}
else if (lane >= numChildren) {
AABB_init(&child_aabb);
child_aabb.lower.w = as_float(0u);
}
// TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 );
uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
reduced_bounds.lower.w = as_float((uint)instMask);
uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0);
local uint* pbox = (local uint*)(local_nodes+ curNodeIndex);
if (lane < 8)
{
pbox[lane] = reduce_bounds_lane;
}
uint global_node_idx = globalBaseForInternalNodes + curNodeIndex;
/* get bounds of all children from child nodes directly */
struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false);
child_aabb = reduced_bounds;
uint parentIndex = BackPointer_GetParentIndex(backpointer);
write_mem_fence(CLK_LOCAL_MEM_FENCE);
if (lane == 0)
{
backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer));
uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex;
uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3);
/* set global back pointer */
*InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer;
#if MORTON_VERBOSE_LOG
printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n",
global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx);
#endif
}
backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
prev_loc_index = curNodeIndex;
curNodeIndex = parentIndex;
/* if all children got refitted, then continue */
uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
if (numChildrenRefitted != numChildrenTotal)
{
if(sg_bu_startpoints_cnt)
{
curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
}
else
return;
}
}
// process root of the treelet
{
#if MORTON_DEBUG_CHECKS
if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
#endif
uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode);
varying uint child_loc_idx = lead_child_loc_offset + 0 + lane;
uint numChildren = BackPointer_GetNumChildren(backpointer);
if (child_loc_idx != prev_loc_index &&
lane < numChildren)
{
child_aabb = local_nodes[child_loc_idx].box;
}
else if (lane >= numChildren) {
AABB_init(&child_aabb);
child_aabb.lower.w = as_float(0u);
}
// TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
uint global_node_idx = treeletRootGlobalIndex;
uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset;
/* get bounds of all children from child nodes directly */
struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, instMask, qnode, false);
/* reset refit counter for next refit */
if (lane == 0)
{
/* set global back pointer */
*InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u);
// TODO: Move AABBs to separate buffer, but for now communicate bottom-tip boxes through qnodes
#if MORTON_VERBOSE_LOG
printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
#endif
}
}
}

View file

@ -1,400 +0,0 @@
//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"
GRL_INLINE void SUBGROUP_create_node_phase0(
uniform global struct Globals* globals,
uniform global struct BinaryMortonCodeHierarchy* bnodes,
uniform global char* bvh_mem,
uniform global uint *global_refit_startpoints,
uniform uint rID,
uniform local uint* local_numRecords,
uniform local uint* local_QNodeOffset,
uniform global struct BuildRecordMorton* records,
uniform struct BuildRecordMorton current,
uniform local uint* local_startpoints_num)
{
uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
uniform const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
varying ushort lane = get_sub_group_local_id();
/* initialize child array */
uniform uint numChildren = 2;
varying struct BuildRecordMorton sg_children;
sg_children.items = 0;
sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;
if ( lane < numChildren )
sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );
/* fill QBVH6 node with up to 6 children */
while ( numChildren < BVH_NODE_N6 )
{
varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
if ( sub_group_all( sg_is_leaf ) )
break;
uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );
varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;
if ( lane == numChildren || lane == bestChild )
{
sg_children.nodeID = nodeID;
sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
}
numChildren++;
}
const uint current_index = current.current_index;
struct QBVHNodeN* qnode = nodeData + current_index;
SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
uniform uint global_offset;
uniform uint child_node_offset;
    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
    // used later in the global refit after phase1
varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);
if ( lane == 0 )
{
child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
        /* create node, but do not set bounds yet as these get calculated during refit */
QBVH6Node_set_type( qnode, BVH_INTERNAL_NODE );
QBVH6Node_set_offset( qnode, (global struct QBVHNodeN*)(bvh_mem + child_node_offset) );
/* set back pointers */
uint backpointer = (current.parent_index << 6) | (numChildren << 3);
global_offset = atomic_add_local( local_numRecords, numChildren - 1 );
#if MORTON_VERBOSE_LOG
printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren);
#endif
if(children_roots_num == numChildren)
{
uint startpoints_offset = atomic_inc_local( local_startpoints_num );
global_refit_startpoints[startpoints_offset] = current_index;
}
else
{
backpointer += children_roots_num;
}
*InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
}
child_node_offset = sub_group_broadcast( child_node_offset, 0 );
global_offset = sub_group_broadcast( global_offset, 0 );
uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);
sg_children.current_index = childNodes - nodeData + lane;
sg_children.parent_index = current_index;
if ( lane < numChildren )
{
uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
records[write_position] = sg_children;
}
}
GRL_INLINE void SUBGROUP_create_node_phase0_local_sync(
uniform global struct Globals* globals,
uniform global struct BinaryMortonCodeHierarchy* bnodes,
uniform global char* bvh_mem,
uniform uint rID,
uniform local uint* local_numRecords,
uniform local uint* local_QNodeOffset,
uniform global struct BuildRecordMorton* records,
uniform struct BuildRecordMorton current,
uniform local uint* local_p0_total,
uniform global struct MortonFlattenedBoxlessNode *boxless_nodes,
uniform uint nodeDataStart)
{
uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
uniform const uint rootNodeOffset = bvh->rootNodeOffset;
uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
varying ushort lane = get_sub_group_local_id();
/* initialize child array */
uniform uint numChildren = 2;
varying struct BuildRecordMorton sg_children;
sg_children.items = 0;
sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;
if ( lane < numChildren )
sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );
/* fill QBVH6 node with up to 6 children */
while ( numChildren < BVH_NODE_N6 )
{
varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
if ( sub_group_all( sg_is_leaf ) )
break;
uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );
varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;
if ( lane == numChildren || lane == bestChild )
{
sg_children.nodeID = nodeID;
sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
}
numChildren++;
}
const uint current_index = current.current_index;
uniform uint global_offset;
uniform uint child_node_offset;
    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
    // used later in the global refit after phase1
varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
uniform uchar rootMask = sub_group_reduce_or_N6(is_children_root << lane);
uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);
if ( lane == 0 )
{
child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
/* Do not create qnodes here */
uint backpointer = (current.parent_index << 6) | (numChildren << 3);
global_offset = atomic_add_local( local_numRecords, numChildren - 1 );
#if MORTON_VERBOSE_LOG
printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, nodeDataStart: %d\n",
rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren, nodeDataStart);
#endif
MortonFlattenedBoxlessNode flattened_node;
if(children_roots_num != numChildren)
backpointer += children_roots_num;
flattened_node.binary_hierarchy_index = (current_index << 6) | rootMask;
uint loc_id = atomic_inc_local( local_p0_total );
flattened_node.childOffset_type = ((((child_node_offset - nodeDataStart * 64) / 64) - current_index) << 6) | BVH_INTERNAL_NODE;
flattened_node.backPointer = backpointer;
        // TODO: change these writes to L1WB or streaming
boxless_nodes[loc_id] = flattened_node;
*InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
}
child_node_offset = sub_group_broadcast( child_node_offset, 0 );
global_offset = sub_group_broadcast( global_offset, 0 );
uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);
sg_children.current_index = childNodes - nodeData + lane;
sg_children.parent_index = current_index;
if ( lane < numChildren )
{
uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
records[write_position] = sg_children;
}
}
/*
In this phase a single large work group performs the construction of
the top of the BVH and creates a build record array.
    Two variants of this kernel:
    1. Refit with global synchronization - used for big BVHs, where the number of allocated nodes will not fit
       in SLM in phase2. Phase0 creates qnodes in the BVH and provides startpoints for the bottom-up phase
       that is executed after phase1. This refit uses global synchronization and mem_fence_gpu_invalidate,
       which is not efficient.
    2. Refit with local synchronization - flattened boxless nodes are passed via global memory, along with
       the number of created nodes. Phase0 does not create qnodes in the BVH; that is done in phase2 during refit.
       In phase2, the flattened boxless nodes are moved to SLM, along with the bounding boxes from phase1.
       Refit is performed with local synchronization only.
*/
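/*
    Illustrative sketch (not part of the original file): both variants encode inner-node
    back pointers as a packed uint, as the shifts and masks used throughout this builder
    imply: bits [31:6] parent node index, bits [5:3] total child count, bits [2:0] count
    of children already refitted. The helpers below are an assumption about what the
    BackPointer_GetParentIndex / BackPointer_GetNumChildren accessors from morton_common.h
    compute; they only restate the layout.
*/
GRL_INLINE uint example_BackPointer_Make( uint parentIndex, uint numChildren )
{
    return (parentIndex << 6) | (numChildren << 3); /* refitted-children count starts at 0 */
}
GRL_INLINE uint example_BackPointer_ParentIndex( uint bp ) { return bp >> 6; }
GRL_INLINE uint example_BackPointer_NumChildren( uint bp ) { return (bp >> 3) & 0x7; }
GRL_INLINE uint example_BackPointer_NumRefitted( uint bp ) { return bp & 0x7; }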
__attribute__((reqd_work_group_size(512, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
parallel_build_phase0(global struct Globals *globals,
global struct BinaryMortonCodeHierarchy *bnodes,
global char *bvh_mem,
global uint *global_refit_startpoints)
{
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
/* a queue of build records in global memory */
global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
local uint local_numRecords;
local uint local_QNodeOffset;
local uint local_startpoints_num;
/* initialize first build record */
if (get_local_id(0) == 0)
{
/* allocate root node */
uint root_node_offset = 64*bvh->nodeDataCur;
global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);
//assert(root_node_offset == 0);
records[0].nodeID = globals->binary_hierarchy_root;
records[0].items = globals->numPrimitives;
records[0].current_index = rootNode - nodeData;
records[0].parent_index = -1;
local_numRecords = 1;
local_QNodeOffset = root_node_offset + 64;
local_startpoints_num = 0;
mem_fence_workgroup_default();
}
uint num_records = 1;
/* terminate when all subtrees are under size threshold */
while(true)
{
work_group_barrier(CLK_LOCAL_MEM_FENCE);
/* all work items in the work group pick a subtree to build */
for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
{
/* small subtrees will get built in next phase */
if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
continue;
/* create QBVH node */
SUBGROUP_create_node_phase0(globals, bnodes, bvh_mem, global_refit_startpoints, ID, &local_numRecords, &local_QNodeOffset,
records, records[ID], &local_startpoints_num);
}
work_group_barrier( CLK_LOCAL_MEM_FENCE );
mem_fence_workgroup_default();
uint old_num_records = num_records;
num_records = local_numRecords;
if( old_num_records == num_records )
break;
}
/* remember number of build records for next phase */
if (get_local_id( 0 ) == 0)
{
globals->numBuildRecords = local_numRecords;
globals->p0_created_num = local_startpoints_num;
bvh->nodeDataCur = local_QNodeOffset / 64;
#if MORTON_VERBOSE_LOG
printf("PHASE_0: allocated %d nodes. globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num);
#endif
}
}
__attribute__((reqd_work_group_size(512, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
parallel_build_phase0_local_sync(global struct Globals *globals,
global struct BinaryMortonCodeHierarchy *bnodes,
global char *bvh_mem,
global struct MortonFlattenedBoxlessNode *boxless_nodes)
{
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
/* a queue of build records in global memory */
global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
local uint local_numRecords;
local uint local_QNodeOffset;
local uint local_p0_total;
/* initialize first build record */
if (get_local_id(0) == 0)
{
/* allocate root node */
uint root_node_offset = 64*bvh->nodeDataCur;
global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);
//assert(root_node_offset == 0);
records[0].nodeID = globals->binary_hierarchy_root;
records[0].items = globals->numPrimitives;
records[0].current_index = rootNode - nodeData;
records[0].parent_index = -1;
local_numRecords = 1;
local_QNodeOffset = root_node_offset + 64;
local_p0_total = 0;
mem_fence_workgroup_default();
}
uint num_records = 1;
/* terminate when all subtrees are under size threshold */
while(true)
{
work_group_barrier(CLK_LOCAL_MEM_FENCE);
/* all work items in the work group pick a subtree to build */
for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
{
/* small subtrees will get built in next phase */
if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
continue;
/* create QBVH node */
SUBGROUP_create_node_phase0_local_sync(globals, bnodes, bvh_mem, ID, &local_numRecords, &local_QNodeOffset, records,
records[ID], &local_p0_total, boxless_nodes, nodeDataStart);
}
mem_fence_workgroup_default();
work_group_barrier( CLK_LOCAL_MEM_FENCE );
uint old_num_records = num_records;
num_records = local_numRecords;
if( old_num_records == num_records )
break;
}
/* remember number of build records for next phase */
if (get_local_id( 0 ) == 0)
{
globals->numBuildRecords = local_numRecords;
bvh->nodeDataCur = local_QNodeOffset / 64;
globals->p0_allocated_num = BVHBase_numNodes(bvh);
globals->p0_created_num = local_p0_total;
#if MORTON_VERBOSE_LOG
printf("PHASE_0_LOCAL_SYNC: allocated %d nodes. globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->global_refit_startpoints);
#endif
}
}

@@ -1,785 +0,0 @@
//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"
// Caution: rec.local_parent_index__numItems needs its high 16 bits filled in afterwards (see the layout sketch after this helper)
BuildRecordLocalMortonFlattener TranslateToLocalRecord(struct BinaryMortonCodeHierarchy srcRec)
{
BuildRecordLocalMortonFlattener rec;
rec.leftChild = srcRec.leftChild;
rec.rightChild = srcRec.rightChild;
rec.rangeStart = srcRec.range.start;
rec.local_parent_index__numItems = (srcRec.range.end - srcRec.range.start) + 1;
return rec;
}
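/*
    Illustrative sketch (not part of the original file): local_parent_index__numItems packs
    two 16-bit fields, as the masking elsewhere in this file implies: bits [15:0] hold the
    number of items in the range, bits [31:16] hold the local parent index that is filled
    in afterwards. The helpers below only restate that layout.
*/
GRL_INLINE uint example_pack_parent_and_items( uint localParentIndex, uint numItems )
{
    return (localParentIndex << 16) | (numItems & 0xFFFF);
}
GRL_INLINE uint example_get_numItems( uint packed )         { return packed & 0xFFFF; }
GRL_INLINE uint example_get_localParentIndex( uint packed ) { return packed >> 16; }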
GRL_INLINE BuildRecordLocalMortonFlattener MortonFlattenedBoxlessNode_reinterpret_as_BR(MortonFlattenedBoxlessNode boxless)
{
BuildRecordLocalMortonFlattener rec;
rec.leftChild = boxless.binary_hierarchy_index;
rec.rightChild = boxless.childOffset_type;
rec.rangeStart = boxless.backPointer;
rec.local_parent_index__numItems = 0;
return rec;
}
GRL_INLINE void SUBGROUP_create_boxless_node_phase1(
uniform global struct Globals* globals,
uniform global struct BinaryMortonCodeHierarchy* bnodes,
uniform global char* bvh_mem,
uniform BuildRecordLocalMortonFlattener currentRecord,
    uniform uint currQnodeLocalId, // local index of the flattened qnode; do not confuse with the nodeIndex stored in the morton build record
uniform local uint* local_numRecords,
uniform uint tictoc,
uniform uint* sg_bu_startpoint_arr,
uniform uint* sg_bu_startpoint_cnt,
uniform uint parentOfRoot,
uniform bool processRoot,
uniform UPerNodeData* nodeData)
{
varying ushort lane = get_sub_group_local_id();
/* initialize child array */
uniform uint numChildren = 2;
varying struct BuildRecordLocalMortonFlattener sg_children;
sg_children.local_parent_index__numItems = 0;
uint binary_hierarchy_child_idx = (lane == 0) ? currentRecord.leftChild : currentRecord.rightChild;
if (lane >= numChildren) binary_hierarchy_child_idx = 1 << 31;
sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, binary_hierarchy_child_idx));
/* fill QBVH6 node with up to 6 children */
while (numChildren < BVH_NODE_N6)
{
        // we don't have to mask local_parent_index__numItems with 0xFFFF because the local_parent_index part is still 0 at this point
uint childNumItems = sg_children.local_parent_index__numItems;
varying bool sg_is_leaf = childNumItems <= cfg_minLeafSize;
if (sub_group_all(sg_is_leaf)) { break; }
uniform uint bestItems = sub_group_reduce_max_N6(childNumItems);
uniform ushort bestChild = ctz(intel_sub_group_ballot(childNumItems == bestItems));
varying uint leftOfBest = sg_children.leftChild; // val important only for (lane == bestChild), not valid for other lanes
uniform uint rightOfBest = sub_group_broadcast(sg_children.rightChild, bestChild);
varying uint nodeID = (lane == bestChild) ? leftOfBest : rightOfBest;
if (lane == numChildren || lane == bestChild)
{
sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, nodeID));
}
numChildren++;
}
uniform uint global_offset;
uniform uint child_node_index;
bool isFatleafChild = (sg_children.local_parent_index__numItems <= cfg_minLeafSize) && (lane < numChildren);
uint numFatleafChildren = popcount(intel_sub_group_ballot(isFatleafChild));
if (lane <= numChildren) {
uint writeIDX = 0;
if (lane == numChildren)
{
/* create nodes in local structure, to be used later in the bottom up to create nodes in actual bvh */
MortonFlattenedBoxlessNode flattened_node;
uint parentIDX;
if (processRoot)
{
*local_numRecords = numChildren + 1;
child_node_index = 1;
writeIDX = 0;
flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
flattened_node.childOffset_type = (1 << 6) | BVH_INTERNAL_NODE;
parentIDX = parentOfRoot;
}
else
{
uint shift = (16 * tictoc);
uint mask = 0xFFFF;
uint atomicAddVal = numChildren << shift;
child_node_index = atomic_add_local(local_numRecords, atomicAddVal);
sub_group_barrier(0);
writeIDX = currQnodeLocalId;
parentIDX = currentRecord.local_parent_index__numItems >> 16;
flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
sub_group_barrier(0);
child_node_index = (child_node_index >> 16) + (child_node_index & mask);
flattened_node.childOffset_type = ((child_node_index - currQnodeLocalId) << 6) | BVH_INTERNAL_NODE;
}
#if MORTON_VERBOSE_LOG
printf("wg %d: SUBGROUP_create_boxless_node_phase1: writeIDX %d, child_node_index %d - %d\n", get_group_id(0), writeIDX, child_node_index, child_node_index + numChildren);
#endif
flattened_node.backPointer = (parentIDX << 6) | (numChildren << 3) | numFatleafChildren;
sg_children = MortonFlattenedBoxlessNode_reinterpret_as_BR(flattened_node);
}
child_node_index = sub_group_broadcast(child_node_index, numChildren);
if (lane != numChildren)
{
writeIDX = child_node_index + lane;
sg_children.local_parent_index__numItems |= currQnodeLocalId << 16;
}
nodeData[writeIDX].buildRecord = sg_children;
}
if (numFatleafChildren == numChildren) {
uint arridx = *sg_bu_startpoint_cnt;
// GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
set_2xSG_arr_first_write(arridx, sg_bu_startpoint_arr, (ushort)currQnodeLocalId, lane);
*sg_bu_startpoint_cnt = arridx + 1;
}
}
// TODO_OPT: Consider having phase 0 bucket the build records by number of primitives, and dispatch different variants
// of this kernel with different WG sizes. There are many records produced that generate only 1 or 2 subtrees, so 8 SGs is
// probably often wasted
GRL_INLINE void phase1_process_fatleaf(
uint globalBaseForInternalNodes, // for root node this is indexOfRoot
uint globalParent , // for root this should be parentOfRoot
bool isInstancePrimLeafType, //
uint leafPrimType, //
uint leafStride, //
global struct QBVHNodeN* nodeData, // per group
uint nodeDataStart, //
struct AABB* primref, //
BackPointers* backPointers, //
global struct MortonCodePrimitive* mc,//
uint nodesToLeafsGap, //
local union UPerNodeData* perNodeData,//
bool processRoot, //
short localNodeId, //
BuildRecordLocalMortonFlattener fatleafRecord, // per node
uint primID ) //
{
uint lane = get_sub_group_local_id();
uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
uniform uint mcID = fatleafRecord.rangeStart;
uint pseudolane = lane < numChildren ? lane : 0;
varying struct AABB sg_bounds = primref[primID];
uint local_parent_idx = (fatleafRecord.local_parent_index__numItems >> 16);
uint globalNodeId = globalBaseForInternalNodes + localNodeId;
uniform global struct QBVHNodeN* qnode = nodeData + globalNodeId;
uint children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId;
{
/* For all primitives in a fat leaf we store a back
* pointer. This way we can modify the fat leaf node at leaf construction time. */
uint back_pointer = globalNodeId + nodeDataStart;
/* Store back pointer and primID inside morton code array to
* be later used by leaf creation. */
mc[mcID + pseudolane].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
}
struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&sg_bounds);
reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
uint8_t instMask;
if (isInstancePrimLeafType)
{
instMask = lane < numChildren ? PRIMREF_instanceMask(&sg_bounds) : 0;
subgroup_setInstanceQBVHNodeN(children_offset, &sg_bounds, numChildren, qnode, instMask);
instMask = sub_group_reduce_or_N6(instMask);
}
else
{
instMask = 0xFF;
subgroup_setQBVHNodeN_setFields_reduced_bounds(children_offset, leafPrimType, &sg_bounds, numChildren, instMask, qnode, false, reduce_bounds);
}
reduce_bounds.lower.w = as_float((uint)instMask);
uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduce_bounds, 0);
local uint* boxUint = (local uint*)(perNodeData + localNodeId);
if (get_sub_group_size() == 8 || lane < 8)
{
boxUint[lane] = reduce_bounds_lane;
uint globalParentIdx;
if (processRoot) {
// for root, treeletRootGlobalIndex is index of rootsParent in global space
globalParentIdx = globalParent;
}
else {
// for non root, raw_parent_idx is in local space
globalParentIdx = (local_parent_idx > 0) ? (globalBaseForInternalNodes + local_parent_idx) : globalParent;
}
if (lane == 0) {
*InnerNode_GetBackPointer(backPointers, globalNodeId) = (globalParentIdx << 6) | (numChildren << 3);
}
}
}
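/*
    Illustrative sketch (not part of the original file): phase1 reuses the 64-bit
    MortonCodePrimitive::index_code slot. Before fat-leaf processing its low bits
    (selected by globals->shift_mask) hold the primref ID; phase1_process_fatleaf then
    rewrites the entry so that leaf creation can recover both the owning fat-leaf node
    and the primitive. The helpers below only restate that encoding.
*/
GRL_INLINE ulong example_encode_leaf_backpointer( uint fatleaf_node_index, uint primID )
{
    /* mirrors mc[...].index_code = ((ulong)back_pointer) << 32 | (ulong)primID above */
    return (((ulong)fatleaf_node_index) << 32) | (ulong)primID;
}
GRL_INLINE uint example_decode_leaf_primID( ulong index_code )      { return (uint)(index_code & 0xFFFFFFFFul); }
GRL_INLINE uint example_decode_leaf_backpointer( ulong index_code ) { return (uint)(index_code >> 32); }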
GRL_INLINE void perform_phase1(global struct Globals* globals,
global struct MortonCodePrimitive* mc,
global struct AABB* primref,
global struct BinaryMortonCodeHierarchy* bnodes,
global char* bvh_mem,
local union UPerNodeData* perNodeData,
local uint* local_records_head,
local uint* local_globalOffsetForNodes,
BuildRecordLocalMortonFlattener rootRecord,
uint treeletRootGlobalIndex,
uint parentOfRootIndex,
const uint leafPrimType,
bool isInstancePrimLeafType)
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
varying ushort lane = get_sub_group_local_id();
// array that will keep 2x8 shorts indices
varying uint sg_fatleaf_array = 0x0;
uniform uint8_t sg_fatleaf_cnt = 0;
/* terminate when all subtrees are leaves */
uint subgroupId = get_sub_group_id();
uint ID = subgroupId;
uint sg_bu_startpoints = 0;
uniform uint sg_bu_startpoints_cnt = 0;
const uint shift_mask = globals->shift_mask;
const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
global struct QBVHNodeN* nodeData = BVHBase_nodeData(bvh);
uint* pLeafStart = (!isInstancePrimLeafType) ? &bvh->quadLeafStart : &bvh->instanceLeafStart;
uint leafStart = *pLeafStart;
uint leafStride = (!isInstancePrimLeafType) ? 1 : (sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode));
uint nodesToLeafsGap = leafStart - nodeDataStart;
if (ID == 0)
{
BuildRecordLocalMortonFlattener current = rootRecord;
if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
{
*local_records_head = 1;
#if MORTON_DEBUG_CHECKS
if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
#endif
BuildRecordLocalMortonFlattener fatleafRecord = current;
uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
uint pseudolane = lane < numChildren ? lane : 0;
uniform const uint mcID = fatleafRecord.rangeStart;
varying uint primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
phase1_process_fatleaf(
treeletRootGlobalIndex, parentOfRootIndex, isInstancePrimLeafType, leafPrimType, leafStride,
nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
true, 0, fatleafRecord, primID);
}
else
{
#if MORTON_VERBOSE_LOG
if (get_local_id(0) == 0) { printf("wg %d perform_phase1: starting collapsing subtree with root at node %d \n", get_group_id(0), rootIndex); }
#endif
//printf("local_records_head = %d\n", *local_records_head);
SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, 0, &sg_bu_startpoints, &sg_bu_startpoints_cnt, parentOfRootIndex, true, perNodeData);
*local_globalOffsetForNodes = treeletRootGlobalIndex;
}
ID += get_num_sub_groups();
}
uniform uint priv_records_tail = 1;
/* wait for all work items to have updated local_records array */
work_group_barrier(CLK_LOCAL_MEM_FENCE);
uniform uint priv_records_head = *local_records_head & 0xFFFF;
treeletRootGlobalIndex = *local_globalOffsetForNodes; // propagated from subgroup 1
uniform uint priv_records_tail_prev = priv_records_tail;
uniform uint other_records_head = priv_records_head;
uint ticToc = 1;
if (priv_records_head == priv_records_tail)
{
return;
}
else
{
do
{
for (; ID < priv_records_head; ID += get_num_sub_groups())
{
BuildRecordLocalMortonFlattener current = (perNodeData[ID].buildRecord);
if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
{
set_2xSG_arr_first_write(sg_fatleaf_cnt++, &sg_fatleaf_array, ID, lane);
#if MORTON_VERBOSE_LOG
if (lane == 0)printf("wg %d, sg %d, perform_phase1: node ID %d is fatleaf \n", get_group_id(0), get_sub_group_id(), ID);
#endif
#if MORTON_DEBUG_CHECKS
if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
#endif
}
else
{
SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, ticToc, &sg_bu_startpoints, &sg_bu_startpoints_cnt, 0, 0, perNodeData);
}
}
priv_records_tail = priv_records_head;
/* wait for all work items to have updated local_records array */
work_group_barrier(CLK_LOCAL_MEM_FENCE);
{
uint records_as_in_mem = *local_records_head;
priv_records_head = (records_as_in_mem >> (16 * ticToc)) & 0xFFFF;
uint other_records_head_temp = priv_records_head;
priv_records_head += other_records_head;
other_records_head = other_records_head_temp;
ticToc = ticToc ^ 1;
#if MORTON_VERBOSE_LOG
                if(get_local_id(0) == 0)printf("wg %d, sg %d, perform_phase1: priv_records_tail %d, priv_records_head %d, records_as_in_mem %x\n", get_group_id(0), get_sub_group_id(), priv_records_tail, priv_records_head, records_as_in_mem);
#endif
}
} while (priv_records_tail != priv_records_head); // get out of the loop if the tail reached the head
}
bool atomicNodeAllocation = treeletRootGlobalIndex > 0;
bool atomicNodeAllocationProduce = (get_sub_group_id() + lane == 0) && atomicNodeAllocation;
uint singleTreeletBumpBVHnodeCnt = (!atomicNodeAllocation && (get_sub_group_id() + lane == 0)) ? nodeDataStart + priv_records_tail : 0;
uniform uint globalBaseForInternalNodes = 0;
    // We distinguish multi-treelet from single-treelet builds here by looking at treeletRootGlobalIndex:
    // if the treelet's root is the whole tree's root (treeletRootGlobalIndex == 0) then we are the only treelet,
    // so there is no need to synchronize node allocations across treelets with atomics.
if (atomicNodeAllocationProduce)
{
*local_globalOffsetForNodes = allocate_inner_nodes(bvh, priv_records_tail - 1);
}
    // Because the root is allocated elsewhere, the first node placed in global memory is the node with local index 1.
    // Mapping local to global:
    //   local space                       global space
    //   [0] - treelet root                [treeletRootGlobalIndex]
    //   ... possibly very long distance ...
    //   [1] - first non-root node         [globalBaseForInternalNodes + 1] - this index is returned by the atomic allocator above
    //   [2] - second non-root node        [globalBaseForInternalNodes + 2]
    //   ...
    //   [numToAllocate] - last node       [globalBaseForInternalNodes + numToAllocate]
if (atomicNodeAllocation)
{
work_group_barrier(CLK_LOCAL_MEM_FENCE);
globalBaseForInternalNodes = *local_globalOffsetForNodes -(nodeDataStart+1);
}
#if MORTON_VERBOSE_LOG
if (get_local_id(0) == 0) { printf("wg %d perform_phase1: globalBaseForInternalNodes %d, num local nodes %d\n", get_group_id(0), globalBaseForInternalNodes, priv_records_tail - 1); }
#endif
if (sg_fatleaf_cnt)
{
short localNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane);
//if (localNodeId >= MORTON_BUILDER_SUBTREE_THRESHOLD * 2) continue;
//if(local_startpoints_cnt > 1) return;
BuildRecordLocalMortonFlattener fatleafRecord = perNodeData[localNodeId].buildRecord;
varying uint primID;
{
uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
uint pseudolane = lane < numChildren ? lane : 0;
uniform const uint mcID = fatleafRecord.rangeStart;
primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
}
// process fatleafs, and store their boxes to SLM
// also put startpoints for bottom up
//uint fatleaf_cnt = *local_startpoints_cnt;
while (sg_fatleaf_cnt-- > 1)
{
short nextLocalNodeId = get_from_2xSG_arr(sg_fatleaf_cnt-1, sg_fatleaf_array, lane);
BuildRecordLocalMortonFlattener nextfatleafRecord = perNodeData[nextLocalNodeId].buildRecord;
varying uint nextPrimId;
{
uint numChildren = (nextfatleafRecord.local_parent_index__numItems & 0xFFFF);
uint pseudolane = lane < numChildren ? lane : 0;
uniform const uint mcID = nextfatleafRecord.rangeStart;
nextPrimId = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
}
phase1_process_fatleaf(
globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
false, localNodeId, fatleafRecord, primID);
fatleafRecord = nextfatleafRecord;
localNodeId = nextLocalNodeId;
primID = nextPrimId;
}
phase1_process_fatleaf(
globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
false, localNodeId, fatleafRecord, primID);
}
#if 0
    // put collected bottom-up startpoints into the wg-shared array to later distribute the work evenly across the groups.
{
ushort myStartpointWriteSite = 0;
if (lane == 0)
{
myStartpointWriteSite = atomic_add_local((local uint*)local_startpoints_cnt, (ushort)sg_bu_startpoints_cnt);
}
myStartpointWriteSite = sub_group_broadcast(myStartpointWriteSite, 0);
unpack_from_2xSG_arr(sg_bu_startpoints_cnt, sg_bu_startpoints, lane, local_startpoints_arr + myStartpointWriteSite);
}
#endif
work_group_barrier(CLK_LOCAL_MEM_FENCE);
// distribute bottom-up startpoints
#if 0
{
short sp_count_to_divide = (*local_startpoints_cnt);
//calculate the chunk for each sg.
sg_bu_startpoints_cnt = sp_count_to_divide / get_num_sub_groups();
uint sg_bu_startpoints_cnt_reminder = sp_count_to_divide % get_num_sub_groups();
uint myReadSite = get_sub_group_id() * sg_bu_startpoints_cnt;
if (get_sub_group_id() < sg_bu_startpoints_cnt_reminder) {
            // from the remainder elements: if the sg idx is < sg_bu_startpoints_cnt_reminder then the sg gets one extra idx
// and all sgs before it also have one extra
myReadSite += get_sub_group_id();
sg_bu_startpoints_cnt++;
}
else
{
            // all remainder elements are consumed by previous sgs
myReadSite += sg_bu_startpoints_cnt_reminder;
}
pack_from_2xSG_arr(local_startpoints_arr + myReadSite, sg_bu_startpoints_cnt, &sg_bu_startpoints, lane);
}
#endif
SUBGROUP_refit_bottom_up_local(nodeData, backPointers, treeletRootGlobalIndex, globalBaseForInternalNodes, lane, perNodeData, sg_bu_startpoints, sg_bu_startpoints_cnt);
if (singleTreeletBumpBVHnodeCnt)
{
bvh->nodeDataCur = singleTreeletBumpBVHnodeCnt;
}
}
GRL_INLINE void update_empty_blas(global struct BVHBase* bvh, uint leafPrimType)
{
if (get_sub_group_id() == 0 )
{
global struct QBVHNodeN* qnode = BVHBase_nodeData(bvh);
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
//set required fields to mark that blas is empty
uint k = (get_sub_group_local_id() < BVH_NODE_N6) ? get_sub_group_local_id() : 0;
qnode->type = leafPrimType;
qnode->instMask = 0;
qnode->qbounds.lower_x[k] = 0x80;
qnode->qbounds.upper_x[k] = 0;
*InnerNode_GetBackPointer(backPointers, 0) = (((uint)-1) << 6);
}
}
/*
POSTSORT PHASE1:
Two kernels here, selected by MORTON_BUILDER_SUBTREE_THRESHOLD.
1. parallel_build_phase1_Indirect_SG - record[0] is set to the subtree tip
2. parallel_build_phase1_Indirect_global_root - record[0] is set to the bvh root (no phase2 needed afterwards)
*/
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase1_Indirect_SG( global struct Globals* globals,
global struct MortonCodePrimitive* mc,
global struct AABB* primref,
global struct BinaryMortonCodeHierarchy* bnodes,
global char* bvh_mem)
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
const uint leafPrimType = globals->leafPrimType;
//special case for empty blas
if(globals->numPrimitives == 0)
{
bvh->nodeDataCur = BVH_ROOT_NODE_OFFSET / 64 + 1;
update_empty_blas(bvh, leafPrimType);
return;
}
local union UPerNodeData perNodeData[(MORTON_BUILDER_SUBTREE_THRESHOLD * 2) -1];
local uint local_records_head;
// Two separate SLM variables for local_globalOffsetForNodes to remove one of the barriers
local uint local_globalOffsetForNodes, local_globalOffsetForNodes2;
uint rootIndex = 0;
uint parentOfRoot = 0;
BuildRecordLocalMortonFlattener rootBuildRecord;
/* add start build record to local stack */
if (get_sub_group_id() == 0 )
{
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64 * bvh->quadLeafStart);
uint recordID = get_group_id(0);
struct BuildRecordMorton mortonGlobalRecord = records[recordID];
rootBuildRecord = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, mortonGlobalRecord.nodeID));
parentOfRoot = mortonGlobalRecord.parent_index;
rootIndex = mortonGlobalRecord.current_index;
#if MORTON_VERBOSE_LOG
printf("P1_STARTPOINTS: current_index: %d, buildRecord.numItems: %d, buildRecord.binary_hierarchy_index: %d, buildRecord.local_parent_index: %d\n",
local_globalOffsetForNodes, buildRecord.numItems, buildRecord.binary_hierarchy_index, buildRecord.local_parent_index);
#endif
}
if (leafPrimType == NODE_TYPE_INSTANCE)
{
perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
&local_records_head, &local_globalOffsetForNodes,
rootBuildRecord, rootIndex, parentOfRoot, NODE_TYPE_INSTANCE, true);
}
else
{
perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
&local_records_head, &local_globalOffsetForNodes,
rootBuildRecord, rootIndex, parentOfRoot, leafPrimType, false);
}
}
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase1_Indirect_global_root( global struct Globals* globals,
global struct MortonCodePrimitive* mc,
global struct AABB* primref,
global struct BinaryMortonCodeHierarchy* bnodes,
global char* bvh_mem)
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
const uint leafPrimType = globals->leafPrimType;
const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
bvh->nodeDataCur = nodeDataStart + 1;
//special case for empty blas
if(globals->numPrimitives == 0)
{
update_empty_blas(bvh, leafPrimType);
return;
}
local union UPerNodeData perNodeData[MORTON_BUILDER_SUBTREE_THRESHOLD * 2 - 1];
local uint local_records_head;
local uint local_globalOffsetForNodes;
BuildRecordLocalMortonFlattener rootBuildRecord;
if (get_sub_group_id() == 0 )
{
struct BinaryMortonCodeHierarchy binaryNode = BinaryMortonCodeHierarchy_getEntry(bnodes, globals->binary_hierarchy_root);
rootBuildRecord = TranslateToLocalRecord(binaryNode);
local_globalOffsetForNodes = 0;
}
if (leafPrimType == NODE_TYPE_INSTANCE)
{
perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
&local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, NODE_TYPE_INSTANCE, true);
}
else
{
perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
&local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, leafPrimType, false);
}
}
#if 0
GRL_INLINE void
DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
global struct MortonCodePrimitive* mc,
global struct AABB* primref,
global struct BinaryMortonCodeHierarchy* bnodes,
global char* bvh_mem,
uint startID, uint endID,
local uint* local_numRecords,
local uint* local_numRecordsOld,
local struct BuildRecordMorton* local_records
)
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);
/* iterate over all subtrees this workgroup should build */
for ( uint recordID = startID; recordID < endID; recordID++ )
{
/* add start build record to local stack */
if ( get_local_id( 0 ) == 0 )
{
local_records[0] = records[recordID];
*local_numRecords = 1;
*local_numRecordsOld = 0;
}
work_group_barrier( CLK_LOCAL_MEM_FENCE );
/* terminate when all subtrees are leaves */
while ( *local_numRecords != *local_numRecordsOld )
{
/* remember the old number of build records to detect later
* whether we are done */
if ( get_local_id( 0 ) == 0 )
{
*local_numRecordsOld = *local_numRecords;
}
work_group_barrier( CLK_LOCAL_MEM_FENCE );
/* all work items in the sub group pick a subtree to build */
for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
{
/* ignore small subtrees */
if ( local_records[ID].items <= BVH_NODE_N6 )
continue;
/* create QBVH node */
create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
}
/* wait for all work items to have updated local_records array */
work_group_barrier( CLK_LOCAL_MEM_FENCE );
}
const uint shift_mask = globals->shift_mask;
const uint leafPrimType = globals->leafPrimType;
const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
/* create all fat leaf nodes and initiate refit */
for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
{
struct BuildRecordMorton current = local_records[ID];
const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );
global struct QBVHNodeN* qnode = nodeData + current.current_index;
/* get bounds of all children of the fat leaf node */
struct AABB bounds[BVH_NODE_N6];
for ( uint i = 0; i < current.items; i++ )
{
/* get primID and bounds of primitive */
const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
bounds[i] = primref[primID];
/* For all primitives in a fat leaf we store a back
* pointer. This way we can modify the fat leaf node at leaf construction time. */
const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;
/* Store back pointer and primID inside morton code array to
* be later used by leaf creation. */
mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
}
/* update fat leaf node */
QBVHNodeN_setType( qnode, leafPrimType );
global void* offset;
if ( leafPrimType != BVH_INSTANCE_NODE )
{
offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
QBVHNodeN_setChildIncr1( qnode );
}
else
{
offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
QBVHNodeN_setChildIncr2( qnode );
}
QBVH6Node_set_offset( qnode, offset );
QBVHNodeN_setBounds( qnode, bounds, current.items );
/* set back pointers for fat leaf nodes */
*InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);
/* bottom up refit */
refit_bottom_up( qnode, bvh, bounds, current.items );
}
}
}
/*
This phase takes the build records calculated in phase0 as input and
    finishes the BVH construction for all these subtrees.
*/
__attribute__((reqd_work_group_size(8, 1, 1))) void kernel
old_parallel_build_phase1(global struct Globals *globals,
global struct MortonCodePrimitive *mc,
global struct AABB *primref,
global struct BinaryMortonCodeHierarchy *bnodes,
global char *bvh_mem)
{
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
/* a queue of build records */
local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
local uint local_numRecords;
local uint local_numRecordsOld;
/* construct range of build records that each sub group will process */
const uint numRecords = globals->numBuildRecords;
const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);
DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
}
__attribute__( (reqd_work_group_size( 8, 1, 1 )) ) void kernel
old_parallel_build_phase1_Indirect( global struct Globals* globals,
global struct MortonCodePrimitive* mc,
global struct AABB* primref,
global struct BinaryMortonCodeHierarchy* bnodes,
global char* bvh_mem )
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);
/* a queue of build records */
local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
local uint local_numRecords;
local uint local_numRecordsOld;
/* construct range of build records that each sub group will process */
const uint numRecords = globals->numBuildRecords;
uint startID = get_group_id( 0 );
uint endID = startID + 1;
DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
}
#endif

@@ -1,314 +0,0 @@
//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "bvh_build_refit.h"
#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"
/*
POSTSORT PHASE2:
    Two kernels here, selected by MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD, which is set to a very big value.
    1. parallel_build_phase2_refit       - performs the refit using global synchronization and mem_fence_gpu_invalidate.
                                           This kernel should be used only for very big BVHs; it is faster than the
                                           non-SLM fallback in parallel_build_phase2_refit_local.
    2. parallel_build_phase2_refit_local - should be used for most cases: we usually fit into SLM with the number of
                                           nodes allocated in phase0, but there is also a non-SLM fallback there, since
                                           the decision on which kernel to run is based on node-count estimates on the
                                           host side.
*/
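/*
    Illustrative sketch (not part of the original file): the MortonFlattenedBoxlessNode
    records handed over by parallel_build_phase0_local_sync pack their fields as the
    producer and consumer code in this builder imply:
      binary_hierarchy_index : bits [31:6] index of the node created in phase0,
                               bits [5:0]  mask of children that are phase1 subtree roots
      childOffset_type       : bits [31:6] offset (in nodes) from this node to its first child,
                               bits [5:0]  node type (e.g. BVH_INTERNAL_NODE)
      backPointer            : same layout as the BVH back pointers,
                               (parent << 6) | (numChildren << 3) | numRefitted
    The helpers below only restate that layout; the real accessors live in morton_common.h.
*/
GRL_INLINE uint example_flattened_node_index( MortonFlattenedBoxlessNode n )         { return n.binary_hierarchy_index >> 6; }
GRL_INLINE uint example_flattened_subtree_root_mask( MortonFlattenedBoxlessNode n )  { return n.binary_hierarchy_index & 0x3F; }
GRL_INLINE uint example_flattened_child_offset( MortonFlattenedBoxlessNode n )       { return n.childOffset_type >> 6; }
GRL_INLINE uint example_flattened_node_type( MortonFlattenedBoxlessNode n )          { return n.childOffset_type & 0x3F; }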
GRL_INLINE void refit_bottom_up_global_sync(
global char* bvh_mem,
global uint* global_refit_startpoints,
uniform uint nodeId,
uniform ushort lane)
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
// Get the node idx that was put here in phase1
const uint innerNodeIdx = global_refit_startpoints[nodeId];
// Get the qnode and backpointer
uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx;
uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
varying struct AABB childrenAABB; // one child AABB per lane
AABB_init(&childrenAABB);
uniform uint numChildren = (backPointer >> 3) & 0x7;
if(numChildren == 0) return;
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
varying ushort child_idx = (lane < numChildren) ? lane : 0;
childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
#if MORTON_VERBOSE_LOG
if(lane == 0)
printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx);
#endif
struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane);
uint children_mask = qnode_child[child_idx].instMask;
qnode->instMask = sub_group_reduce_or_N6(children_mask);
SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 );
}
__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel
parallel_build_phase2_refit( global char* bvh_mem,
global uint* global_refit_startpoints )
{
refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0));
}
GRL_INLINE void SUBGROUP_refit_bottom_up_global(
uniform global struct QBVHNodeN* globalNodeData,
uniform struct BackPointers* backPointers,
varying ushort lane,
varying uint curNodeIndex)
{
uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
const uint head_lane = 0;
uniform struct AABB child_aabb; // this carries reduced aabb between loop turns
while (curNodeIndex != 0)
{
global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex;
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
uint numChildren = BackPointer_GetNumChildren(backpointer);
varying ushort child_idx = (lane < numChildren) ? lane : 0;
child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
/* get bounds of all children from child nodes directly */
subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane);
uchar childrenMask = qnode_child[child_idx].instMask;
qnode->instMask = sub_group_reduce_or_N6(childrenMask);
uint parentIndex = BackPointer_GetParentIndex(backpointer);
mem_fence_gpu_invalidate();
if (lane == 0)
{
backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex));
uint globalBackpointer = (parentIndex << 6) | (numChildren << 3);
/* set global back pointer */
*InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer;
#if MORTON_VERBOSE_LOG
printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n",
curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x);
#endif
}
backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
curNodeIndex = parentIndex;
/* if all children got refitted, then continue */
uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
if (numChildrenRefitted != numChildrenTotal)
return;
}
// process root of the treelet
{
#if MORTON_DEBUG_CHECKS
if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
#endif
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData );
uint numChildren = BackPointer_GetNumChildren(backpointer);
varying ushort child_idx = (lane < numChildren) ? lane : 0;
child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
/* get bounds of all children from child nodes directly */
subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane);
uchar childrenMask = qnode_child[child_idx].instMask;
globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask);
/* reset refit counter for next refit */
if (lane == 0)
{
/* set global back pointer */
*InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u);
#if MORTON_VERBOSE_LOG
printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
#endif
}
}
}
// TODO: Check why 512 wg size has worse performance than 256
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase2_refit_local( global struct Globals* globals,
global char* bvh_mem,
global struct MortonFlattenedBoxlessNode *boxless_nodes)
{
// Number of nodes created in P0, to be refitted in this stage
uint p0_created_num = globals->p0_created_num;
// Return immediately if host executed this kernel but there is nothing to do
if(p0_created_num == 0)
return;
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
varying ushort lane = get_sub_group_local_id();
// Hardcode SLM to max here as we do not know upfront how much mem will be needed
local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */
// Number of allocated nodes in phase0 (p0_created_num + children)
uint p0_allocated_num = globals->p0_allocated_num;
// array that will keep 2x8 shorts indices
varying uint sg_fatleaf_array = 0x0;
uniform uint8_t sg_bu_startpoints_cnt = 0;
    // Determine if we can fit into SLM with all the nodes allocated in phase0.
    // There are two paths here:
    // 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse the bottom-up local pass,
    //    which does the refit and creates qnodes in the BVH.
    // 2. If we do not fit into SLM, first create qnodes in the BVH, then perform the bottom-up refit with global atomic
    //    synchronization. This is not performant, but it is kept as a guardrail here. On the host side we fall back
    //    to the old separate refit path with wg_size 8, which has better EU reuse.
if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM)
{
for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
{
MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
uint current_id = boxless_node.binary_hierarchy_index >> 6;
            // The mask of children that are subtree roots is packed into the otherwise unused low bits of binary_hierarchy_index
uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
if(lane == 0)
perNodeData[current_id].boxlessNode = boxless_node;
// When no children are subtree roots, we are done and skip to the next iteration
if(children_root_mask == 0x0)
{
continue;
}
// When all children are subtree roots, put them to sg_fatleaf_array
else if(children_root_mask == 0x3F)
{
set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
}
uniform global struct QBVHNodeN* qnode = nodeData + current_id;
uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
varying ushort child_idx = (lane < numChildren) ? lane : 0;
varying struct AABB childrenAABB; // one child AABB per lane
AABB_init(&childrenAABB);
uint lead_child_global_id = current_id + lead_child_offset;
uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id;
childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
// Get only AABBs of children that are p1 subtree roots
bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx);
if(lane_active)
{
uint child_global_id = lead_child_global_id + child_idx;
perNodeData[child_global_id].box = childrenAABB;
perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask);
}
#if MORTON_VERBOSE_LOG
if(lane == 0)
printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset);
#endif
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt);
}
else
{
for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
{
MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
uint current_id = boxless_node.binary_hierarchy_index >> 6;
            // The mask of children that are subtree roots is packed into the otherwise unused low bits of binary_hierarchy_index
uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
uniform global struct QBVHNodeN* qnode = nodeData + current_id;
uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node);
uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
if(lane == 0)
{
QBVH6Node_set_type( qnode, nodeType );
qnode->offset = lead_child_offset;
}
// When no children are subtree roots, we are done and skip to the next iteration
if(children_root_mask == 0x0)
{
continue;
}
// When all children are subtree roots, put them to sg_fatleaf_array
else if(children_root_mask == 0x3F)
{
set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
}
#if MORTON_VERBOSE_LOG
if(lane == 0)
printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, child_idx: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, boxless_node.backPointer >> 6, reduce_bounds.lower.x, qnode->offset);
#endif
}
while (sg_bu_startpoints_cnt > 0)
{
uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane);
SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex);
}
}
}

@@ -1,521 +0,0 @@
//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"
////////////////////////////////////////////////////////////////////////////////////////////////////////
/*
This kernel constructs a binary hierarchy in bottom up fashion from
the morton codes.
*/
////////////////////////////////////////////////////////////////////////////////////////////////////////
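/*
    Illustrative note (not part of the original file): Delta(i0,i1) below is the length of
    the common leading-bit prefix of two 64-bit morton keys, computed as clz(key0 ^ key1).
    Worked example on the top bits of two keys:
      key0 = 0b0010...   key1 = 0b0011...   =>   key0 ^ key1 = 0b0001...   =>   clz = 3,
    i.e. the keys share a 3-bit prefix. The binary searches below use these prefix lengths
    to find the key range and the split position of each internal node, and sign() picks
    the direction of the neighbour that shares the longer prefix.
*/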
int Delta(global struct MortonCodePrimitive* mc, const uint64_t key0, const uint i1 )
{
const uint64_t key1 = mc[i1].index_code;
return clz(key0 ^ key1);
}
int sign( int d )
{
return (d > 0) ? 1 : -1;
}
__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) )
void kernel build_bottom_up_indirect( global struct Globals* globals,
global struct BinaryMortonCodeHierarchy* bnodes,
global struct MortonCodePrimitive* mc )
{
/* construct range of primitives that each work group will process */
const uint numPrimitives = globals->numPrimitives;
uint i = get_group_id( 0 ) * get_local_size(0) + get_local_id( 0 );
if (i == 0)
{
globals->binary_hierarchy_root = 0;
if (numPrimitives == 1)
{
// special kludge for 1-prim tree. Make sure the one leaf node is initialized
bnodes[i].range.start = 0;
bnodes[i].range.end = 0;
bnodes[i].leftChild = -1;
bnodes[i].rightChild = -1;
}
// store pointer to the binary hierarchy in the globals struct.
        // This will be used later.
globals->binary_hierarchy_buffer = (gpuva_t) bnodes;
}
uint num_inner_nodes = numPrimitives-1;
if ( i < num_inner_nodes )
{
//
// direction is 1 if this morton code is the node's first key, -1 if it's the last
// By construction every internal node is either the start or the end of a given key range
// direction should be towards the neighbor with the most bits in common
uint64_t ki = mc[i].index_code;
int direction, delta_min;
uint lmax;
if( i == 0 )
{
direction = 1;
delta_min = -1;
lmax = numPrimitives;
}
else
{
direction = sign( Delta( mc, ki, i + 1 ) - Delta( mc, ki, i - 1 ) );
delta_min = Delta( mc, ki, i - direction );
// find upper bound for length of this node's key range
lmax = 8;
while ( (i+lmax*direction) < numPrimitives && Delta( mc, ki, i+lmax*direction ) > delta_min)
lmax = lmax * 2;
}
// clamp max length so that the binary searches are fully in-bounds
uint maxLen = (direction>0) ? (numPrimitives - i) : (i+1);
lmax = min(lmax, maxLen);
// find end of range using binary search
uint length = 0;
uint end = lmax-1;
while (length != end)
{
uint mid = length + ((end-length)/2) + ((end-length)%2);
bool bigger = Delta( mc, ki, i+mid*direction) > delta_min;
length = bigger ? mid : length;
end = bigger ? end : mid-1;
}
uint j = i + length*direction ;
// find split position using binary search
uint split = 0;
end = length-1;
int delta_node = Delta(mc, ki, j);
while (split != end)
{
uint mid = split + ((end-split)/2) + ((end-split)%2);
bool bigger = Delta( mc, ki, i+mid*direction) > delta_node;
split = bigger ? mid : split;
end = bigger ? end : mid-1;
}
split = i + split*direction + min(direction,0);
uint left = split;
uint right = split+1;
// mark leaves
if( min(i,j) == split )
left = left | (1<<31);
if( max(i,j) == split+1 )
right = right | (1<<31);
bnodes[i].range.start = min(i,j);
bnodes[i].range.end = max(i,j);
bnodes[i].leftChild = left;
bnodes[i].rightChild = right;
}
}
#if 0
__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) )
void kernel build_bottom_up_indirect( global struct Globals* globals,
global struct BinaryMortonCodeHierarchy* bnodes,
global struct MortonCodePrimitive* mc )
{
/* construct range of primitives that each work group will process */
const uint numPrimitives = globals->numPrimitives;
    // rangeFactor determines the distance between adjacent nodeIds within a work group.
    // The aim of this node distribution, for rangeFactor > 1, is to ensure that half of the
    // work groups are dropped off entirely at the bottom layer of the graph. This way the
    // EUs can be reused faster. The factor needs to be smaller than MAX_HW_SIMD_WIDTH.
const uint rangeFactor = 2;
const uint numGroups = ((numPrimitives + MAX_HW_SIMD_WIDTH - 1) / MAX_HW_SIMD_WIDTH);
const uint globalId = get_group_id( 0 ) * MAX_HW_SIMD_WIDTH + get_local_id( 0 );
const uint numPrimitivesAlignedToWGSize = MAX_HW_SIMD_WIDTH * numGroups;
const uint groupsRange = numPrimitivesAlignedToWGSize / rangeFactor;
/* iterate over all primitives the work group should process */
const uint i = (globalId * rangeFactor) % numPrimitivesAlignedToWGSize + globalId / groupsRange;
if ( i < numPrimitives )
{
uint node = i | ((uint)1 << 31);
uint start = i;
uint end = i;
/* bottom up */
while ( true )
{
/* goto parent node and link parent node to current node */
node = updateParent( bnodes, mc, node, start, end, numPrimitives - 1 );
/* do not continue if we reached this node the first time */
if ( node == -1 )
break;
mem_fence_gpu_invalidate();
/* update range */
start = bnodes[node].range.start;
end = bnodes[node].range.end;
/* stop when we reached the root node */
if ( start == 0 && end == numPrimitives - 1 )
{
globals->binary_hierarchy_root = node;
break;
}
}
}
}
#endif
/*
This function builds one QBVH6 node by opening the provided binary
BVH nodes until the QBVH node is full.
*/
GRL_INLINE void create_node(global struct Globals *globals,
global struct BinaryMortonCodeHierarchy *bnodes,
global char *bvh_mem,
uint rID,
local uint *local_numRecords,
local uint *local_QNodeOffset,
struct BuildRecordMorton *records,
struct BuildRecordMorton *current)
{
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
BackPointers *backPointers = BVHBase_GetBackPointers(bvh);
/* initialize child array */
uint numChildren = 2;
struct BuildRecordMorton children[BVH_NODE_N6];
children[0].nodeID = bnodes[current->nodeID].leftChild;
children[0].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[0].nodeID);
children[1].nodeID = bnodes[current->nodeID].rightChild;
children[1].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[1].nodeID);
/* fill QBVH6 node with up to 6 children */
while (numChildren < BVH_NODE_N6)
{
/*! find best child to split */
uint bestItems = 0;
int bestChild = -1;
for (int i = 0; i < numChildren; i++)
{
const uint items = children[i].items;
/* ignore leaves as they cannot get split */
if (items <= cfg_minLeafSize)
continue;
/* find child with largest number of items */
if (items > bestItems)
{
bestItems = items;
bestChild = i;
}
}
if (bestChild == -1)
break;
/* perform best found split */
const uint bestNodeID = children[bestChild].nodeID;
struct BuildRecordMorton *lrecord = &children[bestChild];
struct BuildRecordMorton *rrecord = &children[numChildren];
lrecord->nodeID = bnodes[bestNodeID].leftChild;
lrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, lrecord->nodeID);
rrecord->nodeID = bnodes[bestNodeID].rightChild;
rrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, rrecord->nodeID);
numChildren++;
}
/* allocate memory for all children */
const uint child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
global struct QBVHNodeN *childNodes = (global struct QBVHNodeN *)(bvh_mem + child_node_offset);
/* create node, but do not set bounds yet as these get calculated during refit */
const uint current_index = current->current_index;
struct QBVHNodeN *qnode = nodeData + current_index;
QBVH6Node_set_type(qnode, BVH_INTERNAL_NODE);
QBVHNodeN_setChildIncr1(qnode);
QBVH6Node_set_offset(qnode, childNodes);
/* set back pointers */
*InnerNode_GetBackPointer(backPointers, current_index) = (current->parent_index << 6) | (numChildren << 3);
/* update parent pointer of build records of all children */
for (uint ID = 0; ID < numChildren; ID++)
{
children[ID].current_index = childNodes - nodeData + ID;
children[ID].parent_index = current_index;
}
/* write out child build records */
const uint global_offset = atomic_add_local(local_numRecords, numChildren - 1);
records[rID] = children[0];
for (uint i = 1; i < numChildren; i++)
records[global_offset + i - 1] = children[i];
mem_fence_workgroup_default();
}
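A compact sketch of the greedy collapse that create_node performs, with hypothetical types and none of the leaf/back-pointer bookkeeping: repeatedly open the child subtree with the most primitives until the wide node has 6 children or every remaining child is small enough to become a leaf.
#include <stdint.h>
#define WIDE_NODE_ARITY 6
struct bin_node { uint32_t left, right, num_prims; };
static uint32_t collapse_children(const struct bin_node *nodes, uint32_t root,
                                  uint32_t min_leaf_size,
                                  uint32_t out_children[WIDE_NODE_ARITY])
{
    out_children[0] = nodes[root].left;
    out_children[1] = nodes[root].right;
    uint32_t num = 2;
    while (num < WIDE_NODE_ARITY) {
        int best = -1;
        uint32_t best_items = 0;
        for (uint32_t i = 0; i < num; i++) {
            const uint32_t items = nodes[out_children[i]].num_prims;
            if (items <= min_leaf_size)
                continue;                 /* already small enough to be a leaf */
            if (items > best_items) {     /* pick the child with the most primitives */
                best_items = items;
                best = (int)i;
            }
        }
        if (best < 0)
            break;                        /* nothing left to open */
        const uint32_t opened = out_children[best];
        out_children[best] = nodes[opened].left;   /* replace with its two children */
        out_children[num++] = nodes[opened].right;
    }
    return num;                           /* number of children of the wide node */
}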
#if 0
/* This function calculates the similarity between two morton
* codes. It essentially counts how many bits of the morton codes are
* equal starting at the top. The more bits are equal, the more similar the
* codes are, and the closer the primitives are located spatially. */
GRL_INLINE uint64_t delta(global struct MortonCodePrimitive *mc,
const uint id)
{
const uint64_t key0 = mc[id + 0].index_code;
const uint64_t key1 = mc[id + 1].index_code;
return clz(key0 ^ key1);
}
/* This function checks for a range [left,right] of morton codes, if
* it is spatially closer to the left or to the right nodes. */
GRL_INLINE bool merge_to_right(global struct MortonCodePrimitive *mc,
const uint left,
const uint right,
const uint last)
{
/* merge to right if we are at the left end of the array */
if (left == 0)
return true;
/* merge to left if we are at the right end of the array */
if (right == last)
return false;
/* otherwise merge to the side where the morton code sequence has
* the largest number of equal bits from the top */
return delta(mc, right) > delta(mc, left - 1);
}
GRL_INLINE uint updateParent(global struct BinaryMortonCodeHierarchy *bnodes,
global struct MortonCodePrimitive *mc,
const uint nodeID,
const uint left,
const uint right,
const uint last)
{
uint parent;
/* check if we should merge this node to the left or right */
if (merge_to_right(mc, left, right, last))
{
parent = right;
bnodes[parent].leftChild = nodeID;
bnodes[parent].range.start = left;
}
else
{
parent = left - 1;
bnodes[parent].rightChild = nodeID;
bnodes[parent].range.end = right;
}
mem_fence_gpu_default();
/* stop ascending the tree if we reached this node the first time */
const bool first = atomic_inc_global((global uint *)&bnodes[parent].flag) == 0;
return first ? -1 : parent;
}
GRL_INLINE void
DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
global struct MortonCodePrimitive* mc,
global struct AABB* primref,
global struct BinaryMortonCodeHierarchy* bnodes,
global char* bvh_mem,
uint startID, uint endID,
local uint* local_numRecords,
local uint* local_numRecordsOld,
local struct BuildRecordMorton* local_records
)
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);
/* iterate over all subtrees this workgroup should build */
for ( uint recordID = startID; recordID < endID; recordID++ )
{
/* add start build record to local stack */
if ( get_local_id( 0 ) == 0 )
{
local_records[0] = records[recordID];
*local_numRecords = 1;
*local_numRecordsOld = 0;
}
work_group_barrier( CLK_LOCAL_MEM_FENCE );
/* terminate when all subtrees are leaves */
while ( *local_numRecords != *local_numRecordsOld )
{
/* remember the old number of build records to detect later
* whether we are done */
if ( get_local_id( 0 ) == 0 )
{
*local_numRecordsOld = *local_numRecords;
}
work_group_barrier( CLK_LOCAL_MEM_FENCE );
/* all work items in the sub group pick a subtree to build */
for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
{
/* ignore small subtrees */
if ( local_records[ID].items <= BVH_NODE_N6 )
continue;
/* create QBVH node */
create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
}
/* wait for all work items to have updated local_records array */
work_group_barrier( CLK_LOCAL_MEM_FENCE );
}
const uint shift_mask = globals->shift_mask;
const uint leafPrimType = globals->leafPrimType;
const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
/* create all fat leaf nodes and initiate refit */
for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
{
struct BuildRecordMorton current = local_records[ID];
const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );
global struct QBVHNodeN* qnode = nodeData + current.current_index;
/* get bounds of all children of the fat leaf node */
struct AABB bounds[BVH_NODE_N6];
for ( uint i = 0; i < current.items; i++ )
{
/* get primID and bounds of primitive */
const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
bounds[i] = primref[primID];
/* For all primitives in a fat leaf we store a back
* pointer. This way we can modify the fat leaf node at leaf construction time. */
const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;
/* Store back pointer and primID inside morton code array to
* be later used by leaf creation. */
mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
}
/* update fat leaf node */
QBVHNodeN_setType( qnode, leafPrimType );
global void* offset;
if ( leafPrimType != BVH_INSTANCE_NODE )
{
offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
QBVHNodeN_setChildIncr1( qnode );
}
else
{
offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
QBVHNodeN_setChildIncr2( qnode );
}
QBVH6Node_set_offset( qnode, offset );
QBVHNodeN_setBounds( qnode, bounds, current.items );
/* set back pointers for fat leaf nodes */
*InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);
/* bottom up refit */
refit_bottom_up( qnode, bvh, bounds, current.items );
}
}
}
/*
This phase takes the build records calculated in phase0 as input and
finishes the BVH construction for all these subtrees.
*/
__attribute__((reqd_work_group_size(8, 1, 1)))
void kernel old_parallel_build_phase1(global struct Globals *globals,
global struct MortonCodePrimitive *mc,
global struct AABB *primref,
global struct BinaryMortonCodeHierarchy *bnodes,
global char *bvh_mem)
{
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
/* a queue of build records */
local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
local uint local_numRecords;
local uint local_numRecordsOld;
/* construct range of build records that each sub group will process */
const uint numRecords = globals->numBuildRecords;
const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);
DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
}
__attribute__( (reqd_work_group_size( 8, 1, 1 )) )
void kernel old_parallel_build_phase1_Indirect( global struct Globals* globals,
global struct MortonCodePrimitive* mc,
global struct AABB* primref,
global struct BinaryMortonCodeHierarchy* bnodes,
global char* bvh_mem )
{
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);
/* a queue of build records */
local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
local uint local_numRecords;
local uint local_numRecordsOld;
/* construct range of build records that each sub group will process */
const uint numRecords = globals->numBuildRecords;
uint startID = get_group_id( 0 );
uint endID = startID + 1;
DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
}
#endif

View file

@@ -1,117 +0,0 @@
//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "morton/morton_common.h"
GRL_INLINE uint get_morton_shift( uint numPrimitives )
{
return 32 - clz( numPrimitives );
}
GRL_INLINE uint get_morton_shift_mask( uint numPrimitives )
{
uint shift = get_morton_shift( numPrimitives );
uint mask = (uint)((ulong)1 << shift);
return mask - 1; // separated due to problems in DX
}
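Worked example of the shift/mask computation: for numPrimitives = 1000, clz(1000) = 22, so shift = 10 and the mask is 0x3FF; the primitive index occupies the low 10 bits of the 64-bit key and the remaining 54 bits hold the morton code. A host-side sketch with a hypothetical helper, assuming a GCC/Clang __builtin_clz and numPrimitives > 0:
#include <assert.h>
#include <stdint.h>
static void check_shift_mask(uint32_t numPrimitives)
{
    const uint32_t shift = 32 - (uint32_t)__builtin_clz(numPrimitives);
    const uint32_t mask  = (uint32_t)(((uint64_t)1 << shift) - 1);
    /* every primitive index fits under the mask; the remaining 64 - shift
     * bits of the key are left for the interleaved morton code */
    assert(numPrimitives - 1 <= mask);
}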
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel init( global struct Globals *globals )
{
/* variable shift for putting morton code + index to 64 bit */
const uint shift = 32 - clz(globals->numPrimitives);
globals->shift = shift;
globals->shift_mask = (uint)(((ulong)1 << shift));
globals->shift_mask -= 1; // separated due to problems in DX
globals->binary_hierarchy_root = 0;
globals->morton_sort_in_flight = 0;
globals->sort_iterations = get_morton_sort_lsb_req_iterations(shift);
}
/*
This kernel create a morton code array containing a morton code and
index into the primref array.
The code uses the maximal number of bits for the morton code, such
that the morton code and index can still both get stored in 64 bits.
The algorithm first maps the centroids of the primitives and their
bounding box diagonal into a 4D grid, and then interleaves all 4
grid coordinates to construct the morton code.
*/
__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
__attribute__( (intel_reqd_sub_group_size( 16 )) ) void kernel
create_morton_codes_indirect( global struct Globals* globals,
global struct BVHBase* bvh,
global struct AABB* primref,
global struct MortonCodePrimitive* morton_codes,
global struct MortonCodePrimitive* morton_codes_tmp,
uint use_new_morton_sort)
{
/* construct range of morton codes each work group should create */
const uint numPrimitives = globals->numPrimitives;
const uint startID = get_group_id( 0 ) * get_local_size( 0 );
const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives);
/* get lower and upper bounds of geometry and length of scene diagonal */
const float3 lower = globals->centroidBounds.lower.xyz;
const float3 upper = globals->centroidBounds.upper.xyz;
const float diag = length( AABB3f_size( &bvh->Meta.bounds ).xyz );
/* calculates the 4D grid */
const uint shift = get_morton_shift( numPrimitives );
const uint grid_size = 1 << (64 - shift) / 4;
const float4 grid_base = (float4)(lower, 0.0f);
const float4 grid_extend = (float4)(upper - lower, diag);
const float4 grid_scale = select( (grid_size * 0.99f) / grid_extend, 0.0f, grid_extend == 0.0f ); // FIXME: 0.99f!!!!!
const uint req_iterations = get_morton_sort_lsb_req_iterations(shift);
/* each work group iterates over its range of morton codes to create */
uint primID = startID + get_local_id( 0 );
if( primID < endID )
{
/* calculate position inside 4D grid */
float4 centroid2 = AABB_centroid2( &primref[primID] );
centroid2.w = length( AABB_size( &primref[primID] ).xyz );
const uint4 gridpos = convert_uint4_rtz( (centroid2 - grid_base) * grid_scale );
/* calculate and store morton code */
const ulong code = ulong_bitInterleave4D( gridpos );
const ulong index_code = ((ulong)code << shift) | (ulong)primID;
// The morton codes must end up in the morton_codes buffer after the LSB sort finishes.
// If an odd number of sort iterations is needed, some iterations are skipped;
// for an odd iteration count, start with the morton_codes_tmp buffer.
if(req_iterations & 1 && !use_new_morton_sort)
morton_codes_tmp[primID].index_code = index_code;
else
morton_codes[primID].index_code = index_code;
}
}
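ulong_bitInterleave4D is a GRL helper whose implementation is not part of this hunk; a generic 4D interleave that illustrates what such a helper does (bit b of each grid coordinate lands at bit 4*b + channel of the result) might look like this sketch, which is not the GRL implementation:
#include <stdint.h>
static uint64_t interleave4d(uint32_t x, uint32_t y, uint32_t z, uint32_t w,
                             unsigned bits_per_channel /* <= 16 */)
{
    uint64_t code = 0;
    for (unsigned b = 0; b < bits_per_channel; b++) {
        code |= (uint64_t)((x >> b) & 1) << (4 * b + 0);
        code |= (uint64_t)((y >> b) & 1) << (4 * b + 1);
        code |= (uint64_t)((z >> b) & 1) << (4 * b + 2);
        code |= (uint64_t)((w >> b) & 1) << (4 * b + 3);
    }
    return code;
}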
/*
Initialization of the binary morton code hierarchy.
*/
__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) void kernel init_bottom_up_indirect( global struct Globals* globals,
global struct BinaryMortonCodeHierarchy* bnodes )
{
/* construct range each work group will process */
const uint numPrimitives = globals->numPrimitives;
const uint startID = get_group_id( 0 ) * get_local_size(0);
const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives);
/* each workgroup iterates over its range to initialize the binary BVH */
uint i = startID + get_local_id( 0 );
if( i < endID )
BinaryMortonCodeHierarchy_init( &bnodes[i], 0, numPrimitives - 1 );
}

View file

@@ -1,335 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module morton_builder;
kernel_module morton_kernels ("morton/pre_sort.cl")
{
kernel opencl_build_kernel_init < kernelFunction="init" >;
kernel opencl_build_morton_kernel_create_morton_codes_indirect < kernelFunction="create_morton_codes_indirect" >;
kernel opencl_build_morton_kernel_init_bottom_up_indirect < kernelFunction="init_bottom_up_indirect" >;
}
kernel_module morton_kernels ("morton/post_sort.cl")
{
links lsc_intrinsics;
kernel opencl_build_morton_kernel_build_bottom_up_indirect < kernelFunction="build_bottom_up_indirect" >;
}
kernel_module morton_kernels ("morton/phase0.cl")
{
links lsc_intrinsics;
kernel opencl_build_morton_kernel_parallel_build_phase0 < kernelFunction="parallel_build_phase0" >;
kernel opencl_build_morton_kernel_parallel_build_phase0_local_sync < kernelFunction="parallel_build_phase0_local_sync" >;
}
kernel_module morton_kernels ("morton/phase1.cl")
{
links lsc_intrinsics;
kernel opencl_build_morton_kernel_parallel_build_phase1_Indirect < kernelFunction="parallel_build_phase1_Indirect_SG" >;
kernel opencl_build_morton_kernel_parallel_build_phase1_root < kernelFunction="parallel_build_phase1_Indirect_global_root" >;
}
kernel_module morton_kernels ("morton/phase2.cl")
{
links lsc_intrinsics;
kernel opencl_build_morton_kernel_parallel_build_phase2_refit < kernelFunction="parallel_build_phase2_refit" >;
kernel opencl_build_morton_kernel_parallel_build_phase2_refit_local < kernelFunction="parallel_build_phase2_refit_local" >;
}
import struct MKBuilderState "structs.grl";
/*
metakernel begin(
MKBuilderState state,
qword morton_code_buffer,
dword primLeafType,
dword numHwThreads)
{
dispatch opencl_build_kernel_init(1, 1, 1) args(
state.build_globals
);
control(wait_idle);
dispatch opencl_build_morton_kernel_create_morton_codes(numHwThreads, 1, 1) args(
state.build_globals,
state.bvh_buffer,
state.build_primref_buffer,
morton_code_buffer);
control(wait_idle);
}
metakernel build_bottom_up(
MKBuilderState state,
qword buildrecords_bottom_up,
qword morton_code_buffer,
dword numHwThreads)
{
dispatch opencl_build_morton_kernel_init_bottom_up(numHwThreads, 1, 1) args(
state.build_globals,
buildrecords_bottom_up);
control(wait_idle);
dispatch opencl_build_morton_kernel_build_bottom_up(numHwThreads, 1, 1) args(
state.build_globals,
buildrecords_bottom_up,
morton_code_buffer);
control(wait_idle);
}
metakernel parallel_build(
MKBuilderState state,
qword buildrecords_bottom_up,
qword morton_code_buffer,
dword numHwThreads)
{
dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
state.build_globals,
buildrecords_bottom_up,
state.bvh_buffer);
control(wait_idle);
dispatch opencl_build_morton_kernel_parallel_build_phase1(numHwThreads, 1, 1) args(
state.build_globals,
morton_code_buffer,
state.build_primref_buffer,
buildrecords_bottom_up,
state.bvh_buffer);
control(wait_idle);
}
*/
metakernel NewMorton_pre_sort(
qword num_primrefs_counter,
MKBuilderState state,
qword morton_code_buffer,
qword morton_code_buffer_tmp,
qword buildrecords_bottom_up,
dword use_new_morton_sort)
{
{
REG1 = 15;
REG2 = 4;
REG0 = load_dword( num_primrefs_counter );
REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
REG1 = ~REG1;
REG0 = REG0 & REG1;
REG0 = REG0 >> REG2;
}
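// REG0 now holds (num_primrefs + 15) / 16, used below as the indirect dispatch width (one workgroup per 16 primrefs)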
dispatch opencl_build_kernel_init(1, 1, 1) args( state.build_globals );
DISPATCHDIM_X = REG0.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
/*
// new bottom-up kernel does not need this
dispatch_indirect opencl_build_morton_kernel_init_bottom_up_indirect args(
state.build_globals,
buildrecords_bottom_up);
*/
dispatch_indirect opencl_build_morton_kernel_create_morton_codes_indirect args(
state.build_globals,
state.bvh_buffer,
state.build_primref_buffer,
morton_code_buffer,
morton_code_buffer_tmp,
use_new_morton_sort);
}
metakernel NewMorton_post_sort(
qword num_primrefs_counter,
qword num_buildrecords_counter,
MKBuilderState state,
qword buildrecords_bottom_up,
qword morton_code_buffer )
{
{
REG1 = 15;
REG2 = 4;
REG0 = load_dword( num_primrefs_counter );
REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
REG1 = ~REG1;
REG0 = REG0 & REG1;
REG0 = REG0 >> REG2;
}
DISPATCHDIM_X = REG0.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args(
state.build_globals,
buildrecords_bottom_up,
morton_code_buffer);
/*
dispatch opencl_build_morton_kernel_build_bottom_up(16, 1, 1) args(
state.build_globals,
buildrecords_bottom_up,
morton_code_buffer);
*/
control(wait_idle);
dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
state.build_globals,
buildrecords_bottom_up,
state.bvh_buffer);
control(wait_idle);
DISPATCHDIM_X = load_dword( num_buildrecords_counter );
dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args(
state.build_globals,
morton_code_buffer,
state.build_primref_buffer,
buildrecords_bottom_up,
state.bvh_buffer);
control(wait_idle);
}
metakernel NewMorton_bottom_up(
qword num_primrefs_counter,
MKBuilderState state,
qword buildrecords_bottom_up,
qword morton_code_buffer )
{
{
REG1 = 15;
REG2 = 4;
REG0 = load_dword( num_primrefs_counter );
REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
REG1 = ~REG1;
REG0 = REG0 & REG1;
REG0 = REG0 >> REG2;
}
DISPATCHDIM_X = REG0.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args(
state.build_globals,
buildrecords_bottom_up,
morton_code_buffer);
}
metakernel NewMorton_phase0(
MKBuilderState state,
qword buildrecords_bottom_up,
qword morton_p0_refit_startpoints)
{
dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
state.build_globals,
buildrecords_bottom_up,
state.bvh_buffer,
morton_p0_refit_startpoints);
}
metakernel NewMorton_phase0_local_sync(
MKBuilderState state,
qword buildrecords_bottom_up,
qword p0_boxless_nodes)
{
dispatch opencl_build_morton_kernel_parallel_build_phase0_local_sync(1, 1, 1) args(
state.build_globals,
buildrecords_bottom_up,
state.bvh_buffer,
p0_boxless_nodes);
}
metakernel NewMorton_phase1(
qword num_buildrecords_counter,
MKBuilderState state,
qword buildrecords_bottom_up,
qword morton_code_buffer)
{
DISPATCHDIM_X = load_dword( num_buildrecords_counter );
dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args(
state.build_globals,
morton_code_buffer,
state.build_primref_buffer,
buildrecords_bottom_up,
state.bvh_buffer);
}
metakernel NewMorton_phase1_root(
qword num_buildrecords_counter,
MKBuilderState state,
qword buildrecords_bottom_up,
qword morton_code_buffer)
{
dispatch opencl_build_morton_kernel_parallel_build_phase1_root(1, 1, 1) args(
state.build_globals,
morton_code_buffer,
state.build_primref_buffer,
buildrecords_bottom_up,
state.bvh_buffer);
}
metakernel NewMorton_phase2(
qword num_leaves_counter,
MKBuilderState state,
qword bottom_node_ids )
{
DISPATCHDIM_X = load_dword( num_leaves_counter );
dispatch_indirect opencl_build_morton_kernel_parallel_build_phase2_refit args(
state.bvh_buffer,
bottom_node_ids);
}
metakernel NewMorton_phase2_local(
MKBuilderState state,
qword p0_boxless_nodes)
{
dispatch opencl_build_morton_kernel_parallel_build_phase2_refit_local(1, 1, 1) args(
state.build_globals,
state.bvh_buffer,
p0_boxless_nodes);
}

View file

@@ -1,9 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
// just inlines the kernels that are there in the header
#include "morton_msb_radix_bitonic_sort.h"

View file

@@ -1,924 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "common.h"
#include "morton_msb_radix_bitonic_sort_shared.h"
#include "libs/lsc_intrinsics.h"
///////////////////////////////////////////////////////////////////////////////
//
// Configuration switches
//
///////////////////////////////////////////////////////////////////////////////
#define DEBUG 0
#define MERGE_BLS_WITHIN_SG 0
///////////////////////////////////////////////////////////////////////////////
#if DEBUG
#define DEBUG_CODE(A) A
#else
#define DEBUG_CODE(A)
#endif
#define BOTTOM_LEVEL_SORT_WG_SIZE 512
// this kernel is only used in the metakernel for debugging, to print that execution reached this point
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel debug_print_kernel(uint variable)
{
if(get_local_id(0) == 0)
printf("I'm here! %d\n", variable);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel check_bls_sort(global struct Globals* globals, global ulong* input)
{
uint prims_num = globals->numPrimitives;
printf("in check_bls_sort kernel. Values count: %d\n", prims_num);
ulong left = input[0];
ulong right;
for (int i = 0; i < prims_num - 1; i++)
{
right = input[i + 1];
printf("sorted val: %llu\n", left);
if (left > right)
{
printf("element %d is bigger than %d: %llu > %llu\n", i, i+1, left, right);
}
left = right;
}
}
inline uint wg_scan_inclusive_add_opt(local uint* tmp, uint val, uint SG_SIZE, uint WG_SIZE)
{
const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE;
const uint sg_local_id = get_local_id(0) % SG_SIZE;
const uint NUM_HW_THREADS_IN_WG = WG_SIZE / SG_SIZE;
uint acc = sub_group_scan_inclusive_add(val);
if (NUM_HW_THREADS_IN_WG == 1)
{
return acc;
}
tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc, SG_SIZE - 1);
barrier(CLK_LOCAL_MEM_FENCE);
uint loaded_val = sg_local_id < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0;
uint wgs_acc = sub_group_scan_exclusive_add(loaded_val);
uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id);
// for > 256 workitems in SIMD16 the per-thread partials no longer fit in a single 16-wide subgroup, so we need additional iterations
// the same applies for > 64 workitems in SIMD8
uint num_iterations = (NUM_HW_THREADS_IN_WG + SG_SIZE - 1) / SG_SIZE;
for (int i = 1; i < num_iterations; i++)
{
// need to add tmp[] because of "exclusive" scan, so last element misses it
uint prev_max_sum = sub_group_broadcast(wgs_acc, SG_SIZE - 1) + tmp[(i * SG_SIZE) - 1];
loaded_val = (sg_local_id + i * SG_SIZE) < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0;
wgs_acc = sub_group_scan_exclusive_add(loaded_val);
wgs_acc += prev_max_sum;
uint new_acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id % SG_SIZE);
if (hw_thread_in_wg_id >= i * SG_SIZE)
acc_for_this_hw_thread = new_acc_for_this_hw_thread;
}
return acc + acc_for_this_hw_thread;
}
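For clarity, a sequential C reference of what wg_scan_inclusive_add_opt computes across the work group (illustrative only; the kernel computes the same result hierarchically with subgroup scans):
#include <stdint.h>
static void inclusive_scan_ref(const uint32_t *val, uint32_t *out, uint32_t n)
{
    uint32_t acc = 0;
    for (uint32_t i = 0; i < n; i++) {
        acc += val[i];
        out[i] = acc;   /* value returned to work item i */
    }
}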
struct MSBDispatchArgs
{
global struct MSBRadixContext* context;
uint num_of_wgs; // this is the number of workgroups that were dispatched for this context
ulong* wg_key_start; // this is where keys to process start for current workgroup
ulong* wg_key_end;
uint shift_bit;
};
struct MSBDispatchArgs get_msb_dispatch_args(global struct VContextScheduler* scheduler)
{
global struct MSBDispatchQueue* queue = &scheduler->msb_queue;
uint group = get_group_id(0);
struct MSBDispatchRecord record;
// TODO_OPT: Load this entire prefix array into SLM instead of searching..
// Or use sub-group ops
uint i = 0;
while (i < queue->num_records)
{
uint n = queue->records[i].wgs_to_dispatch;
if (group < n)
{
record = queue->records[i];
break;
}
group -= n;
i++;
}
uint context_id = i;
global struct MSBRadixContext* context = &scheduler->contexts[context_id];
// moving to ulongs to avoid uint overflow
ulong group_id_in_dispatch = group;
ulong start_offset = context->start_offset;
ulong num_keys = context->num_keys;
ulong wgs_to_dispatch = record.wgs_to_dispatch;
struct MSBDispatchArgs args;
args.context = context;
args.num_of_wgs = record.wgs_to_dispatch;
args.wg_key_start = context->keys_in + start_offset + (group_id_in_dispatch * num_keys / wgs_to_dispatch);
args.wg_key_end = context->keys_in + start_offset + ((group_id_in_dispatch+1) * num_keys / wgs_to_dispatch);
args.shift_bit = MSB_SHIFT_BYTE_START_OFFSET - context->iteration * MSB_BITS_PER_ITERATION;
return args;
}
void BLSDispatchQueue_push(global struct BLSDispatchQueue* queue, struct BLSDispatchRecord* record)
{
uint new_idx = atomic_inc_global(&queue->num_records);
queue->records[new_idx] = *record;
DEBUG_CODE(printf("adding bls of size: %d\n", record->count));
}
void DO_CountSort(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output)
{
uint tid = get_local_id(0);
global ulong* in = ((global ulong*)(dispatchRecord.keys_in)) + dispatchRecord.start_offset;
ulong a = tid < dispatchRecord.count ? in[tid] : ULONG_MAX;
SLM_shared[tid] = a;
uint counter = 0;
barrier(CLK_LOCAL_MEM_FENCE);
ulong curr = SLM_shared[get_sub_group_local_id()];
for (uint i = 16; i < dispatchRecord.count; i += 16)
{
ulong next = SLM_shared[i + get_sub_group_local_id()];
for (uint j = 0; j < 16; j++)
{
// some older drivers have a bug when shuffling ulong, so we shuffle 2x uint instead
uint2 curr_as_uint2 = as_uint2(curr);
uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j));
ulong c = as_ulong(sg_curr_as_uint2);
if (c < a)
counter++;
}
curr = next;
}
// last iter
for (uint j = 0; j < 16; j++)
{
// some older drivers have a bug when shuffling ulong, so we shuffle 2x uint instead
uint2 curr_as_uint2 = as_uint2(curr);
uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j));
ulong c = as_ulong(sg_curr_as_uint2);
if (c < a)
counter++;
}
// save elements to its sorted positions
if (tid < dispatchRecord.count)
output[dispatchRecord.start_offset + counter] = a;
}
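A sequential C reference for the ranking idea behind DO_CountSort, assuming distinct keys (illustrative only; the kernel computes the same ranks with subgroup broadcasts and ULONG_MAX padding):
#include <stdint.h>
static void count_sort_ref(const uint64_t *in, uint64_t *out, uint32_t n)
{
    for (uint32_t i = 0; i < n; i++) {
        uint32_t rank = 0;
        for (uint32_t j = 0; j < n; j++)
            if (in[j] < in[i])
                rank++;           /* number of smaller keys = final position */
        out[rank] = in[i];
    }
}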
void DO_Bitonic(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output)
{
uint lid = get_local_id(0);
uint elements_to_sort = BOTTOM_LEVEL_SORT_THRESHOLD;
while ((elements_to_sort >> 1) >= dispatchRecord.count && elements_to_sort >> 1 >= BOTTOM_LEVEL_SORT_WG_SIZE)
{
elements_to_sort >>= 1;
}
for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
{
uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
if (tid >= dispatchRecord.count)
SLM_shared[tid] = ULONG_MAX;
else
SLM_shared[tid] = ((global ulong*)(dispatchRecord.keys_in))[dispatchRecord.start_offset + tid];
}
barrier(CLK_LOCAL_MEM_FENCE);
uint k_iterations = elements_to_sort;
while(k_iterations >> 1 >= dispatchRecord.count && k_iterations != 0)
{
k_iterations >>= 1;
}
for (unsigned int k = 2; k <= k_iterations; k *= 2)
{
for (unsigned int j = k / 2; j > 0; j /= 2)
{
// this loop is needed when we can't create a big enough workgroup, so each workitem processes multiple elements
for (uint i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
{
uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
unsigned int ixj = tid ^ j;
if (ixj > tid)
{
if ((tid & k) == 0)
{
if (SLM_shared[tid] > SLM_shared[ixj])
{
ulong tmp = SLM_shared[tid];
SLM_shared[tid] = SLM_shared[ixj];
SLM_shared[ixj] = tmp;
}
}
else
{
if (SLM_shared[tid] < SLM_shared[ixj])
{
ulong tmp = SLM_shared[tid];
SLM_shared[tid] = SLM_shared[ixj];
SLM_shared[ixj] = tmp;
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
{
uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
if (tid < dispatchRecord.count)
output[dispatchRecord.start_offset + tid] = SLM_shared[tid];
}
}
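A host-side C sketch of the same bitonic network for a power-of-two element count (illustrative only; the kernel additionally pads with ULONG_MAX and strides the loops by the workgroup size):
#include <stdint.h>
static void bitonic_sort_host(uint64_t *a, uint32_t n)   /* n must be a power of two */
{
    for (uint32_t k = 2; k <= n; k *= 2) {
        for (uint32_t j = k / 2; j > 0; j /= 2) {
            for (uint32_t tid = 0; tid < n; tid++) {
                const uint32_t ixj = tid ^ j;
                if (ixj > tid) {
                    const int ascending = (tid & k) == 0;
                    if ((ascending && a[tid] > a[ixj]) ||
                        (!ascending && a[tid] < a[ixj])) {
                        const uint64_t tmp = a[tid];   /* compare-and-swap */
                        a[tid] = a[ixj];
                        a[ixj] = tmp;
                    }
                }
            }
        }
    }
}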
void DO_Create_Separate_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
{
uint lid = get_local_id(0);
uint start = context->start[lid];
uint count = context->count[lid];
uint start_offset = context->start_offset + start;
struct BLSDispatchRecord record;
record.start_offset = start_offset;
record.count = count;
record.keys_in = context->keys_out;
if (count == 0) // we don't have elements so don't do anything
{
}
else if (count == 1) // single element so just write it out
{
input[start_offset] = ((global ulong*)record.keys_in)[start_offset];
}
else if (count <= BOTTOM_LEVEL_SORT_THRESHOLD)
{
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
}
}
// We try to merge small BLS into larger one within the sub_group
void DO_Create_SG_Merged_BLS_Work_Parallel(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
{
uint lid = get_local_id(0);
uint sid = get_sub_group_local_id();
uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
uint start = context->start[lid];
uint count = context->count[lid];
uint ctx_start_offset = context->start_offset;
if (sid == 0 || create_msb_work) // these SIMD lanes are the beginning of a merged BLS
{
struct BLSDispatchRecord record;
if (create_msb_work)
{
record.start_offset = ctx_start_offset + start + count;
record.count = 0;
}
else // SIMD lane 0 case
{
record.start_offset = ctx_start_offset + start;
record.count = count;
}
record.keys_in = context->keys_out;
uint loop_idx = 1;
while (sid + loop_idx < 16) // loop over subgroup
{
uint _create_msb_work = intel_sub_group_shuffle_down(create_msb_work, 0u, loop_idx);
uint _count = intel_sub_group_shuffle_down(count, 0u, loop_idx);
uint _start = intel_sub_group_shuffle_down(start, 0u, loop_idx);
if (_create_msb_work) // found the next MSB work, so the range of merges ends
break;
// need to push record since nothing more will fit
if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD)
{
if (record.count == 1)
{
input[record.start_offset] = record.keys_in[record.start_offset];
}
else if (record.count > 1)
{
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
}
record.start_offset = ctx_start_offset + _start;
record.count = _count;
}
else
{
record.count += _count;
}
loop_idx++;
}
// if we have any elements left, then schedule them
if (record.count == 1) // only one element, so just write it out
{
input[record.start_offset] = record.keys_in[record.start_offset];
}
else if (record.count > 1)
{
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
}
}
}
// We try to merge small BLS into larger one within the sub_group
void DO_Create_SG_Merged_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
{
uint lid = get_local_id(0);
uint sid = get_sub_group_local_id();
uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
uint start = context->start[lid];
uint count = context->count[lid];
uint ctx_start_offset = context->start_offset;
if (sid == 0)
{
struct BLSDispatchRecord record;
record.start_offset = ctx_start_offset + start;
record.count = 0;
record.keys_in = context->keys_out;
for (int i = 0; i < 16; i++)
{
uint _create_msb_work = sub_group_broadcast(create_msb_work, i);
uint _count = sub_group_broadcast(count, i);
uint _start = sub_group_broadcast(start, i);
if (_create_msb_work)
{
if (record.count == 1) // only one element, so just write it out
{
input[record.start_offset] = record.keys_in[record.start_offset];
}
else if (record.count > 1)
{
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
}
record.start_offset = ctx_start_offset + _start + _count;
record.count = 0;
continue;
}
// need to push record since nothing more will fit
if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD)
{
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
record.start_offset = ctx_start_offset + _start;
record.count = _count;
}
else
{
record.count += _count;
}
}
// if we have any elements left, then schedule them
if (record.count == 1) // only one element, so just write it out
{
input[record.start_offset] = record.keys_in[record.start_offset];
}
else if (record.count > 1)
{
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
}
}
}
void DO_Create_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input, local uint* slm_for_wg_scan, uint sg_size, uint wg_size)
{
uint lid = get_local_id(0);
uint iteration = context->iteration + 1;
uint start = context->start[lid];
uint count = context->count[lid];
uint start_offset = context->start_offset + start;
uint create_msb_work = count > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
#if MERGE_BLS_WITHIN_SG
DO_Create_SG_Merged_BLS_Work_Parallel(scheduler, context, input);
#else
DO_Create_Separate_BLS_Work(scheduler, context, input);
#endif
uint new_entry_id = wg_scan_inclusive_add_opt(slm_for_wg_scan, create_msb_work, sg_size, wg_size);//work_group_scan_inclusive_add(create_msb_work);
uint stack_begin_entry;
// last workitem in wg contains number of all new entries
if (lid == (MSB_RADIX_NUM_BINS - 1))
{
stack_begin_entry = atomic_add_global(&scheduler->msb_stack.num_entries, new_entry_id);
}
stack_begin_entry = work_group_broadcast(stack_begin_entry, (MSB_RADIX_NUM_BINS - 1));
new_entry_id += stack_begin_entry -1;
if (create_msb_work)
{
scheduler->msb_stack.entries[new_entry_id].start_offset = start_offset;
scheduler->msb_stack.entries[new_entry_id].count = count;
scheduler->msb_stack.entries[new_entry_id].iteration = iteration;
}
if (lid == 0) {
DEBUG_CODE(printf("num of new bls: %d\n", scheduler->next_bls_queue->num_records));
}
}
struct BatchedBLSDispatchEntry
{
/////////////////////////////////////////////////////////////
// State data used for communication with command streamer
// NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
/////////////////////////////////////////////////////////////
qword p_data_buffer;
qword num_elements; // number of elements in p_data_buffer
};
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel sort_morton_codes_batched_BLS_dispatch(global struct BatchedBLSDispatchEntry* bls_dispatches)
{
uint dispatch_id = get_group_id(0);
uint lid = get_local_id(0);
local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
struct BatchedBLSDispatchEntry dispatchArgs = bls_dispatches[dispatch_id];
struct BLSDispatchRecord dispatchRecord;
dispatchRecord.start_offset = 0;
dispatchRecord.count = dispatchArgs.num_elements;
dispatchRecord.keys_in = (ulong*)dispatchArgs.p_data_buffer;
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_batched_BLS_dispatch for %d elements\n", dispatchRecord.count));
if(dispatchRecord.count > 1)
DO_Bitonic(dispatchRecord, SLM_shared, (global ulong*)dispatchRecord.keys_in);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel sort_morton_codes_bottom_level_single_wg(global struct Globals* globals, global ulong* input, global ulong* output)
{
uint lid = get_local_id(0);
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", globals->numPrimitives));
local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
struct BLSDispatchRecord dispatchRecord;
dispatchRecord.start_offset = 0;
dispatchRecord.count = globals->numPrimitives;
dispatchRecord.keys_in = (ulong*)input;
//TODO: count or bitonic here?
//DO_Bitonic(dispatchRecord, SLM_shared, output);
DO_CountSort(dispatchRecord, SLM_shared, output);
}
// This kernel initializes first context to start up the whole execution
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel sort_morton_codes_msb_begin(
global struct Globals* globals,
global struct VContextScheduler* scheduler,
global ulong* buf0,
global ulong* buf1)
{
uint lid = get_local_id(0);
uint gid = get_group_id(0);
DEBUG_CODE(if (lid == 0)printf("running sort_morton_codes_msb_begin\n"));
scheduler->contexts[gid].count[lid] = 0;
if (gid == 0 && lid == 0)
{
global struct MSBRadixContext* context = &scheduler->contexts[lid];
const uint num_prims = globals->numPrimitives;
scheduler->bls_queue0.num_records = 0;
scheduler->bls_queue1.num_records = 0;
scheduler->curr_bls_queue = &scheduler->bls_queue1;
scheduler->next_bls_queue = &scheduler->bls_queue0;
context->start_offset = 0;
context->num_wgs_in_flight = 0;
context->num_keys = num_prims;
context->iteration = 0;
context->keys_in = buf0;
context->keys_out = buf1;
uint msb_wgs_to_dispatch = (num_prims + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD;
scheduler->msb_queue.records[0].wgs_to_dispatch = msb_wgs_to_dispatch;
scheduler->num_wgs_msb = msb_wgs_to_dispatch;
scheduler->num_wgs_bls = 0;
scheduler->msb_stack.num_entries = 0;
scheduler->msb_queue.num_records = 1;
}
}
__attribute__((reqd_work_group_size(MSB_RADIX_NUM_VCONTEXTS, 1, 1)))
kernel void
scheduler(global struct VContextScheduler* scheduler, global ulong* buf0, global ulong* buf1)
{
uint lid = get_local_id(0);
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler\n"));
uint context_idx = lid;
const uint num_of_stack_entries = scheduler->msb_stack.num_entries;
uint msb_wgs_to_dispatch = 0;
if (lid < num_of_stack_entries)
{
struct MSBStackEntry entry = scheduler->msb_stack.entries[(num_of_stack_entries-1) - lid];
global struct MSBRadixContext* context = &scheduler->contexts[lid];
context->start_offset = entry.start_offset;
context->num_wgs_in_flight = 0;
context->num_keys = entry.count;
context->iteration = entry.iteration;
context->keys_in = entry.iteration % 2 == 0 ? buf0 : buf1;
context->keys_out = entry.iteration % 2 == 0 ? buf1 : buf0;
msb_wgs_to_dispatch = (entry.count + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD;
scheduler->msb_queue.records[lid].wgs_to_dispatch = msb_wgs_to_dispatch;
}
msb_wgs_to_dispatch = work_group_reduce_add(msb_wgs_to_dispatch); // TODO: if the compiler implementation is slow, consider writing it manually
if (lid == 0)
{
// swap queue for next iteration
struct BLSDispatchQueue* tmp = scheduler->curr_bls_queue;
scheduler->curr_bls_queue = scheduler->next_bls_queue;
scheduler->next_bls_queue = tmp;
scheduler->next_bls_queue->num_records = 0;
scheduler->num_wgs_bls = scheduler->curr_bls_queue->num_records;
scheduler->num_wgs_msb = msb_wgs_to_dispatch;
if (num_of_stack_entries < MSB_RADIX_NUM_VCONTEXTS)
{
scheduler->msb_queue.num_records = num_of_stack_entries;
scheduler->msb_stack.num_entries = 0;
}
else
{
scheduler->msb_queue.num_records = MSB_RADIX_NUM_VCONTEXTS;
scheduler->msb_stack.num_entries -= MSB_RADIX_NUM_VCONTEXTS;
}
}
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler finished, to spawn %d MSB wgs in %d contexts and %d BLS wgs, MSB records on stack %d\n",
scheduler->num_wgs_msb, scheduler->msb_queue.num_records, scheduler->num_wgs_bls, scheduler->msb_stack.num_entries));
}
// this is the lowest-level sub-task, which in the end returns the sorted codes
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel sort_morton_codes_bottom_level( global struct VContextScheduler* scheduler, global ulong* output)
{
uint lid = get_local_id(0);
DEBUG_CODE(if (get_group_id(0) == 0 && lid == 0) printf("running sort_morton_codes_bottom_level\n"));
local struct BLSDispatchRecord l_dispatchRecord;
if (lid == 0)
{
uint record_idx = get_group_id(0);
l_dispatchRecord = scheduler->curr_bls_queue->records[record_idx];
//l_dispatchRecord = BLSDispatchQueue_pop((global struct BLSDispatchQueue*)scheduler->curr_bls_queue);
atomic_dec_global(&scheduler->num_wgs_bls);
}
barrier(CLK_LOCAL_MEM_FENCE);
struct BLSDispatchRecord dispatchRecord = l_dispatchRecord;
local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
// right now use only the count sort (the bitonic path below is kept as an alternative)
// TODO: maybe implement something else
if (1)
{
//DO_Bitonic(dispatchRecord, SLM_shared, output);
DO_CountSort(dispatchRecord, SLM_shared, output);
}
}
#define MSB_COUNT_WG_SIZE MSB_RADIX_NUM_BINS
#define MSB_COUNT_SG_SIZE 16
// count how many elements per bucket we have
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MSB_COUNT_WG_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MSB_COUNT_SG_SIZE)))
void kernel sort_morton_codes_msb_count_items( global struct VContextScheduler* scheduler)
{
uint lid = get_local_id(0);
uint lsz = MSB_RADIX_NUM_BINS;
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_count_items\n"));
local uint bucket_count[MSB_RADIX_NUM_BINS];
local uint finish_count;
bucket_count[lid] = 0;
if (lid == 0)
{
finish_count = 0;
}
struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler);
global struct MSBRadixContext* context = dispatchArgs.context;
global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid;
global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end;
uint shift_bit = dispatchArgs.shift_bit;
uchar shift_byte = shift_bit / 8; // the byte of the key this iteration bins on
barrier(CLK_LOCAL_MEM_FENCE);
global uchar* ks = (global uchar*)key_start;
ks += shift_byte;
global uchar* ke = (global uchar*)key_end;
ke += shift_byte;
// double buffering on value loading
if (ks < ke)
{
uchar bucket_id = *ks;
ks += lsz * sizeof(ulong);
for (global uchar* k = ks; k < ke; k += lsz * sizeof(ulong))
{
uchar next_bucket_id = *k;
atomic_inc_local(&bucket_count[bucket_id]);
bucket_id = next_bucket_id;
}
atomic_inc_local(&bucket_count[bucket_id]);
}
barrier(CLK_LOCAL_MEM_FENCE);
//update global counters for context
uint count = bucket_count[lid];
if (count > 0)
atomic_add_global(&context->count[lid], bucket_count[lid]);
mem_fence_gpu_invalidate();
work_group_barrier(0);
bool final_wg = true;
// count WGs which have reached the end
if (dispatchArgs.num_of_wgs > 1)
{
if (lid == 0)
finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1;
barrier(CLK_LOCAL_MEM_FENCE);
final_wg = finish_count == dispatchArgs.num_of_wgs;
}
local uint partial_dispatches[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE];
// if this is last wg for current dispatch, update context
if (final_wg)
{
// code below does work_group_scan_exclusive_add(context->count[lid]);
{
uint lane_val = context->count[lid];
uint sg_result = sub_group_scan_inclusive_add(lane_val);
partial_dispatches[get_sub_group_id()] = sub_group_broadcast(sg_result, MSB_COUNT_SG_SIZE - 1);
barrier(CLK_LOCAL_MEM_FENCE);
uint slm_result = sub_group_scan_exclusive_add(partial_dispatches[get_sub_group_local_id()]);
slm_result = sub_group_broadcast(slm_result, get_sub_group_id());
uint result = slm_result + sg_result - lane_val;
context->start[lid] = result;//work_group_scan_exclusive_add(context->count[lid]);
}
context->count[lid] = 0;
if(lid == 0)
context->num_wgs_in_flight = 0;
}
}
// sort elements into appropriate buckets
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel sort_morton_codes_msb_bin_items(
global struct VContextScheduler* scheduler, global ulong* input)
{
uint lid = get_local_id(0);
uint lsz = get_local_size(0);
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_bin_items\n"));
local uint finish_count;
if (lid == 0)
{
finish_count = 0;
}
struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler);
global struct MSBRadixContext* context = dispatchArgs.context;
global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid;
global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end;
uint shift_bit = dispatchArgs.shift_bit;
barrier(CLK_LOCAL_MEM_FENCE);
global ulong* sorted_keys = (global ulong*)context->keys_out + context->start_offset;
#if MSB_RADIX_NUM_BINS == MSB_WG_SORT_ELEMENTS_THRESHOLD // special case meaning that we process exactly 1 element per workitem
// here we'll do local counting, then move to global
local uint slm_counters[MSB_RADIX_NUM_BINS];
slm_counters[lid] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
uint place_in_slm_bucket;
uint bucket_id;
ulong val;
bool active_lane = key_start < key_end;
if (active_lane)
{
val = *key_start;
bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
place_in_slm_bucket = atomic_inc_local(&slm_counters[bucket_id]);
}
barrier(CLK_LOCAL_MEM_FENCE);
// override slm_counters with global counters - we don't need to override counters with 0 elements since we won't use them anyway
if (slm_counters[lid])
slm_counters[lid] = atomic_add_global(&context->count[lid], slm_counters[lid]);
barrier(CLK_LOCAL_MEM_FENCE);
uint id_in_bucket = slm_counters[bucket_id] + place_in_slm_bucket;//atomic_inc_global(&context->count[bucket_id]);
if (active_lane)
sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
#else
// double buffering on value loading
if (key_start < key_end)
{
ulong val = *key_start;
key_start += lsz;
for (global ulong* k = key_start; k < key_end; k += lsz)
{
ulong next_val = *k;
uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]);
//printf("dec: %llu, val: %llX bucket_id: %X", *k, *k, bucket_id);
sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
val = next_val;
}
uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]);
sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
}
#endif
// make sure all groups' "counters" and "starts" are visible to the final workgroup
mem_fence_gpu_invalidate();
work_group_barrier(0);
bool final_wg = true;
// count WGs which have reached the end
if (dispatchArgs.num_of_wgs > 1)
{
if (lid == 0)
finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1;
barrier(CLK_LOCAL_MEM_FENCE);
final_wg = finish_count == dispatchArgs.num_of_wgs;
}
local uint slm_for_wg_funcs[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE];
// if this is last wg for current dispatch, then prepare sub-tasks
if (final_wg)
{
DO_Create_Work(scheduler, context, input, slm_for_wg_funcs, 16, MSB_RADIX_NUM_BINS);
// clear context's counters for future execution
context->count[lid] = 0;
}
}

View file

@@ -1,135 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
//
// This file contains structure definitions shared by GRL OCL kernels and host code
//
#pragma once
#include "GRLGen12.h"
// NOTE:
// MSB (most significant byte) - refers here to the part of the sort that performs MSB radix sorting and can spawn additional work
// BLS (bottom level sort) - refers here to the last part of sorting a particular range (currently bitonic), which cannot spawn additional work
//
#define MSB_RADIX_NUM_BINS 256
#define MSB_BITS_PER_ITERATION 8 // how many bits are sorted per iteration
#define MSB_SHIFT_BYTE_START_OFFSET 56 // start offset for byte shifting, first iteration will start from here
#define MSB_RADIX_NUM_VCONTEXTS 8 // NOTE: mkulikow: maybe expand/shrink? More means more MSB processed in parallel but more memory used
#define MSB_STACK_ENTRIES_NUM (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS * 7) // first level doesn't get spawned, so 7 iterations must fit here,
// since at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS we need 7 of these
#define MSB_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS) // one per context
#define BLS_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS) // each context can spawn MSB_RADIX_NUM_BINS,
// so at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS
#define MSB_WG_SORT_ELEMENTS_THRESHOLD 256 // This tells us how many elements at max we can process in a single workgroup.
// If a single MSB entry needs more, then it will spawn more WGs
// after updating this, also update msb_radix_bitonic_sort.grl's computation of the initial workgroup count
#define BOTTOM_LEVEL_SORT_THRESHOLD 512 // TODO: is 4096 the best value? On SKL it gives the best performance
// Right now we use 256 workitems in SIMD16, which gives us 16 hw threads; assuming 2KB per thread, we have 32KB of SLM to play with.
// Since we use ulong (8 bytes) we can store 4096 elements.
// This also tells us that if the number of elements to sort is less than this, we don't need to allocate the scheduler.
// Needs to be kept in sync with the GRL const BOTTOM_LEVEL_SORT_THRESHOLD
#define BOTTOM_LEVEL_SORT_MERGING_THRESHOLD 512 // This is the limit up to which we'll merge small BLSes produced by MSB into a single bigger BLS
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
GRL_NAMESPACE_BEGIN(MORTON_MSB_RADIX_BITONIC_SORT)
struct MSBStackEntry
{
uint start_offset;
uint count;
uint iteration;
};
struct MSBStack
{
dword num_entries;
struct MSBStackEntry entries[MSB_STACK_ENTRIES_NUM];
};
struct MSBRadixContext
{
uint start[MSB_RADIX_NUM_BINS];
uint count[MSB_RADIX_NUM_BINS];
uint num_wgs_in_flight; // this is used to identify which msb wg is last
uint num_keys; // number of keys to process
uint iteration;
ulong* keys_in;
ulong* keys_out;
uint start_offset; //offset from the beginning of the buffer
};
struct MSBDispatchRecord
{
uint wgs_to_dispatch; // amount of workgroups to dispatch for this current record
};
struct MSBDispatchQueue
{
dword num_records;
struct MSBDispatchRecord records[MSB_RADIX_NUM_VCONTEXTS]; // each context have its own record
};
// BLS(Bottom Level Sort) - last stage of sorting which will not spawn any new tasks
struct BLSDispatchRecord
{
uint start_offset; // offset from the beginning of the buffer
uint count;
ulong* keys_in; // we don't need keys_out since we will write always to the same output buffer
};
struct BLSDispatchQueue
{
dword num_records;
struct BLSDispatchRecord records[BLS_DISPATCH_QUEUE_NUM_RECORDS];
};
struct VContextScheduler
{
/////////////////////////////////////////////////////////////
// State data used for communication with command streamer
// NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
/////////////////////////////////////////////////////////////
dword num_wgs_msb; // number of MSB workgroups being processed by current iteration
dword num_wgs_bls; // number of BLS workgroups being processed by current iteration
dword scheduler_postsync;
dword _pad1;
/////////////////////////////////////////////////////////////
struct MSBDispatchQueue msb_queue;
struct BLSDispatchQueue bls_queue0;
struct BLSDispatchQueue bls_queue1;
struct BLSDispatchQueue* curr_bls_queue;
struct BLSDispatchQueue* next_bls_queue;
struct MSBStack msb_stack;
struct MSBRadixContext contexts[MSB_RADIX_NUM_VCONTEXTS];
};
GRL_NAMESPACE_END(MORTON_MSB_RADIX_BITONIC_SORT)
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)

View file

@@ -1,9 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
// just inlines the kernels that are there in the header
#include "morton_radix_sort.h"

View file

@@ -1,855 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "common.h"
#include "libs/lsc_intrinsics.h"
/* ============================================================================= */
/* ============================== LSB RADIX SORT =============================== */
/* ============================================================================= */
#define RADIX_BINS 256
#define SCATTER_WG_SIZE 512
#define MORTON_LSB_SORT_NO_SHIFT_THRESHOLD 0xFFFFFFFF // turned off, because the current hierarchy build requires a full sort
uint2 get_thread_range( uint numItems, uint numGroups, uint taskID )
{
uint items_per_group = (numItems / numGroups);
uint remainder = numItems - (items_per_group * numGroups);
uint startID = taskID * items_per_group + min(taskID, remainder);
uint endID = startID + items_per_group + ((taskID < remainder) ? 1 : 0);
return (uint2)(startID,endID);
}
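Worked example of the partitioning above: 10 items over 4 groups yields the ranges [0,3) [3,6) [6,8) [8,10). A host-side C sketch with hypothetical names:
#include <stdint.h>
static void thread_range(uint32_t numItems, uint32_t numGroups, uint32_t taskID,
                         uint32_t *startID, uint32_t *endID)
{
    const uint32_t per_group = numItems / numGroups;
    const uint32_t remainder = numItems - per_group * numGroups;
    /* the first 'remainder' groups take one extra item each */
    *startID = taskID * per_group + (taskID < remainder ? taskID : remainder);
    *endID   = *startID + per_group + (taskID < remainder ? 1 : 0);
}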
GRL_INLINE void sort_morton_codes_bin_items_taskID_func(global struct Globals* globals,
global uint* global_histogram,
global uchar* input,
local uint* histogram,
uint iteration,
uint numGroups,
uint numItems,
bool shift_primID,
uint taskID,
uint startID,
uint endID)
{
const uint shift = globals->shift;
for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
histogram[i] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
if (shift_primID)
{
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
{
// Read the input as ulong and shift it, so the bits representing the primID are not
// taken into account during sorting; this results in fewer sort iterations
// when the morton shift is larger than 8 bits
ulong* ptr_ul = (ulong*)&input[8 * i];
ulong code = *ptr_ul;
uchar* ptr = (uchar*)&code;
code >>= shift;
uchar bin = ptr[iteration];
atomic_inc_local(&histogram[bin]);
}
}
else
{
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
{
uchar bin = input[8 * i + iteration];
atomic_inc_local(&histogram[bin]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
global_histogram[RADIX_BINS * taskID + i] = histogram[i];
}
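// Editor's note: illustrative host-side C sketch (not from the original GRL sources)
// of the binning step above: one byte of each 64-bit morton key is extracted per
// iteration and counted into a 256-entry histogram. The GPU kernel does the same in
// parallel with local atomics; this sequential version only shows the arithmetic.
#include <stddef.h>
#include <stdint.h>

static void bin_items(const uint64_t *keys, size_t count, unsigned iteration,
                      unsigned shift /* globals->shift */, int shift_primID,
                      uint32_t histogram[256])
{
    for (int i = 0; i < 256; i++)
        histogram[i] = 0;
    for (size_t i = 0; i < count; i++) {
        // When primIDs are packed into the low bits, shift them out first.
        uint64_t code = shift_primID ? (keys[i] >> shift) : keys[i];
        uint8_t bin = (uint8_t)(code >> (8 * iteration));
        histogram[bin]++;
    }
}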
GRL_INLINE void sort_morton_codes_bin_items_func(global struct Globals* globals,
global uint* global_histogram,
global uint* wg_flags,
global uchar* input,
local uint* histogram,
uint iteration,
uint numGroups,
uint numItems,
bool shift_primID,
bool update_wg_flags)
{
if (shift_primID)
{
// This check is present in the other LSB sort functions as well; its purpose is
// to skip the first n iterations, where n is the difference between the maximum
// number of iterations and the number actually needed to sort without primIDs
const uint req_iterations = globals->sort_iterations;
if (iteration < req_iterations)
return;
// iteration needs to be adjusted to reflect the skipped cycles
iteration -= req_iterations;
}
const uint taskID = get_group_id(0);
if (taskID == 0 && update_wg_flags)
{
for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
wg_flags[i] = 0;
}
uint2 ids = get_thread_range(numItems, numGroups, taskID);
uint startID = ids.x;
uint endID = ids.y;
sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, shift_primID,
taskID, startID, endID);
}
__attribute__((reqd_work_group_size(512, 1, 1)))
void kernel
sort_morton_codes_bin_items(
global struct Globals* globals,
global uint* global_histogram,
global uint* wg_flags,
global uchar* input,
uint iteration,
uint numGroups,
uint update_wg_flags
)
{
local uint histogram[RADIX_BINS];
const uint numItems = globals->numPrimitives;
if(numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, false, update_wg_flags);
else
sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, true, update_wg_flags);
}
GRL_INLINE void sort_morton_codes_reduce_bins_func(global struct Globals* globals,
global uint* global_histogram,
local uint* partials,
uint numTasks,
uint iteration,
bool shift_primID)
{
const uint localID = get_local_id(0);
if (shift_primID)
{
const uint req_iterations = globals->sort_iterations;
if (iteration < req_iterations)
return;
}
uint t = 0;
for (uint j = 0; j < numTasks; j++)
{
const uint count = load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + localID], 0);
store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + localID], 0, t);
t += count;
}
// each lane now contains the number of elements in the corresponding bin
// prefix sum this for use in the subsequent scattering pass.
uint global_count = t;
partials[get_sub_group_id()] = sub_group_reduce_add(global_count);
barrier(CLK_LOCAL_MEM_FENCE);
uint lane = get_sub_group_local_id();
uint p = partials[lane];
p = (lane < get_sub_group_id()) ? p : 0;
global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numTasks + localID], 0, global_count);
}
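// Editor's note: illustrative host-side C sketch (not from the original GRL sources)
// of the reduction above: per-task bin counts become exclusive per-task offsets, and
// an exclusive prefix sum across bins is appended as row 'numTasks' of the table.
#include <stdint.h>

#define SKETCH_RADIX_BINS 256

static void reduce_bins_sketch(uint32_t *global_histogram, unsigned numTasks)
{
    uint32_t bin_total[SKETCH_RADIX_BINS];
    for (int bin = 0; bin < SKETCH_RADIX_BINS; bin++) {
        uint32_t t = 0;
        for (unsigned task = 0; task < numTasks; task++) {
            uint32_t count = global_histogram[SKETCH_RADIX_BINS * task + bin];
            global_histogram[SKETCH_RADIX_BINS * task + bin] = t; // exclusive offset within the bin
            t += count;
        }
        bin_total[bin] = t;
    }
    uint32_t prefix = 0;
    for (int bin = 0; bin < SKETCH_RADIX_BINS; bin++) {
        global_histogram[SKETCH_RADIX_BINS * numTasks + bin] = prefix; // global base of the bin
        prefix += bin_total[bin];
    }
}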
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(256, 1, 1)))
void kernel
sort_morton_codes_reduce_bins(global struct Globals* globals,
uint numTasks,
global uint* global_histogram,
uint iteration)
{
local uint partials[RADIX_BINS];
const uint numItems = globals->numPrimitives;
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, false);
else
sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, true);
}
#if 1
GRL_INLINE void sort_morton_codes_scatter_items_func(
global struct Globals* globals,
global uint* global_histogram,
global ulong* input,
global ulong* output,
local uint* local_offset,
local uint* flags,
uint iteration,
uint numGroups,
uint numItems,
bool shift_primID,
bool update_morton_sort_in_flight)
{
const uint gID = get_local_id(0) + get_group_id(0) * get_local_size(0);
const uint global_shift = globals->shift;
const uint localID = get_local_id(0);
const uint taskID = get_group_id(0);
if (gID == 0 && update_morton_sort_in_flight)
globals->morton_sort_in_flight = 0;
uint2 ids = get_thread_range(numItems, numGroups, taskID);
uint startID = ids.x;
uint endID = ids.y;
if (shift_primID)
{
const uint req_iterations = globals->sort_iterations;
if (iteration < req_iterations)
return;
iteration -= req_iterations;
}
const uint shift = 8 * iteration;
// load the global bin counts, and add each bin's global prefix
// to the local prefix
{
uint global_prefix = 0, local_prefix = 0;
if (localID < RADIX_BINS)
{
local_prefix = global_histogram[RADIX_BINS * taskID + localID];
global_prefix = global_histogram[RADIX_BINS * numGroups + localID];
local_offset[localID] = global_prefix + local_prefix;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Move elements in WG-sized chunks. The elements need to be moved in order (atomics
// can't be used here) because relative order has to be preserved for LSB radix sort
// to work; see the host-side sketch after this function.
// For each bin, a bit vector indicates which elements belong to that bin
for (uint block_base = startID; block_base < endID; block_base += get_local_size(0))
{
// initialize bit vectors
for (uint i = 4 * localID; i < RADIX_BINS * SCATTER_WG_SIZE / 32; i += 4 * get_local_size(0))
{
flags[i + 0] = 0;
flags[i + 1] = 0;
flags[i + 2] = 0;
flags[i + 3] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
// read sort key, determine which bin it goes into, scatter into the bit vector
// and pre-load the local offset
uint ID = localID + block_base;
ulong key = 0;
uint bin_offset = 0;
uint bin = 0;
uint bin_word = localID / 32;
uint bin_bit = 1 << (localID % 32);
if (ID < endID)
{
key = input[ID];
if (shift_primID)
bin = ((key >> global_shift) >> shift) & (RADIX_BINS - 1);
else
bin = (key >> shift) & (RADIX_BINS - 1);
atomic_add_local(&flags[(SCATTER_WG_SIZE / 32) * bin + bin_word], bin_bit);
bin_offset = local_offset[bin];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (ID < endID)
{
// each key reads the bit vectors for its bin and:
// - computes a local prefix sum to determine its output location
// - computes the number of items added to its bin (the last thread adjusts the bin position)
uint prefix = 0;
uint count = 0;
for (uint i = 0; i < (SCATTER_WG_SIZE / 32); i++)
{
uint bits = flags[(SCATTER_WG_SIZE / 32) * bin + i];
uint bc = popcount(bits);
uint pc = popcount(bits & (bin_bit - 1));
prefix += (i < bin_word) ? bc : 0;
prefix += (i == bin_word) ? pc : 0;
count += bc;
}
// store the key in its proper place
output[prefix + bin_offset] = key;
// last item for each bin adjusts local offset for next outer loop iteration
if (prefix == count - 1)
local_offset[bin] += count;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
/* uint local_offset[RADIX_BINS]; */
/* uint offset_global = 0; */
/* for (int i=0;i<RADIX_BINS;i++) */
/* { */
/* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
/* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
/* local_offset[i] = offset_global + offset_local; */
/* offset_global += count_global; */
/* } */
/* for (uint ID=startID;ID<endID;ID++) */
/* { */
/* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
/* const uint offset = local_offset[bin]; */
/* output[offset] = input[ID]; */
/* local_offset[bin]++; */
/* } */
}
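// Editor's note: illustrative host-side C sketch (not from the original GRL sources)
// of the scatter step above, without the SLM bit-vector machinery: keys are written
// out in their original relative order (a stable scatter), which is what LSB radix
// sort requires for correctness.
#include <stddef.h>
#include <stdint.h>

static void scatter_pass(const uint64_t *input, uint64_t *output, size_t count,
                         unsigned iteration, uint32_t bin_offset[256])
{
    // bin_offset[] holds each bin's starting position in 'output', i.e. the combined
    // global + per-task prefix computed by the reduce pass.
    for (size_t i = 0; i < count; i++) {
        uint8_t bin = (uint8_t)(input[i] >> (8 * iteration));
        output[bin_offset[bin]++] = input[i]; // sequential, so relative order is preserved
    }
}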
#else
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
sort_morton_codes_scatter_items(
global struct Globals* globals,
uint shift,
global uint* global_histogram,
global char* input0,
global char* input1,
unsigned int input0_offset,
unsigned int input1_offset,
uint iteration)
{
const uint numItems = globals->numPrimitives;
const uint local_size = get_local_size(0);
const uint taskID = get_group_id(0);
const uint numTasks = get_num_groups(0);
const uint localID = get_local_id(0);
const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
const uint subgroupLocalID = get_sub_group_local_id();
const uint subgroup_size = get_sub_group_size();
const uint startID = (taskID + 0) * numItems / numTasks;
const uint endID = (taskID + 1) * numItems / numTasks;
global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
global ulong* output = (global ulong*)((iteration % 2) == 0 ? input1 + input1_offset : input0 + input0_offset);
local uint local_offset[RADIX_BINS];
uint off = 0;
for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
{
const uint count = global_histogram[RADIX_BINS * numTasks + i];
const uint offset_task = global_histogram[RADIX_BINS * taskID + i];
const uint sum = sub_group_reduce_add(count);
const uint prefix_sum = sub_group_scan_exclusive_add(count);
local_offset[i] = off + offset_task + prefix_sum;
off += sum;
}
for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
{
const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1);
const uint offset = atomic_add_local(&local_offset[bin], 1);
output[offset] = input[ID];
}
/* uint local_offset[RADIX_BINS]; */
/* uint offset_global = 0; */
/* for (int i=0;i<RADIX_BINS;i++) */
/* { */
/* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
/* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
/* local_offset[i] = offset_global + offset_local; */
/* offset_global += count_global; */
/* } */
/* for (uint ID=startID;ID<endID;ID++) */
/* { */
/* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
/* const uint offset = local_offset[bin]; */
/* output[offset] = input[ID]; */
/* local_offset[bin]++; */
/* } */
}
#endif
#if 1
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(SCATTER_WG_SIZE, 1, 1)))
void kernel
sort_morton_codes_scatter_items(
global struct Globals *globals,
global uint *global_histogram,
global ulong *input,
global ulong *output,
uint iteration,
uint numGroups,
uint update_morton_sort_in_flight)
{
local uint local_offset[RADIX_BINS];
local uint flags[RADIX_BINS*SCATTER_WG_SIZE/32];
const uint numItems = globals->numPrimitives;
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset,
flags, iteration, numGroups, numItems, false, update_morton_sort_in_flight);
else
sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset,
flags, iteration, numGroups, numItems, true, update_morton_sort_in_flight);
}
#else
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
sort_morton_codes_scatter_items(
global struct Globals *globals,
uint shift,
global uint *global_histogram,
global char *input0,
global char *input1,
unsigned int input0_offset,
unsigned int input1_offset,
uint iteration)
{
const uint numItems = globals->numPrimitives;
const uint local_size = get_local_size(0);
const uint taskID = get_group_id(0);
const uint numTasks = get_num_groups(0);
const uint localID = get_local_id(0);
const uint globalID = get_local_id(0) + get_group_id(0)*get_local_size(0);
const uint subgroupLocalID = get_sub_group_local_id();
const uint subgroup_size = get_sub_group_size();
const uint startID = (taskID + 0) * numItems / numTasks;
const uint endID = (taskID + 1) * numItems / numTasks;
global ulong *input = (global ulong *)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
global ulong *output = (global ulong *)((iteration % 2) == 0 ? input1 + input1_offset : input0 + input0_offset);
local uint local_offset[RADIX_BINS];
uint off = 0;
for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
{
const uint count = global_histogram[RADIX_BINS * numTasks + i];
const uint offset_task = global_histogram[RADIX_BINS * taskID + i];
const uint sum = sub_group_reduce_add(count);
const uint prefix_sum = sub_group_scan_exclusive_add(count);
local_offset[i] = off + offset_task + prefix_sum;
off += sum;
}
for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
{
const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1);
const uint offset = atomic_add_local(&local_offset[bin], 1);
output[offset] = input[ID];
}
/* uint local_offset[RADIX_BINS]; */
/* uint offset_global = 0; */
/* for (int i=0;i<RADIX_BINS;i++) */
/* { */
/* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
/* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
/* local_offset[i] = offset_global + offset_local; */
/* offset_global += count_global; */
/* } */
/* for (uint ID=startID;ID<endID;ID++) */
/* { */
/* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
/* const uint offset = local_offset[bin]; */
/* output[offset] = input[ID]; */
/* local_offset[bin]++; */
/* } */
}
#endif
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(512, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel
sort_morton_codes_merged(
global struct Globals* globals,
global uint* global_histogram,
global uchar* input,
uint iteration,
uint numGroups
)
{
const uint numItems = globals->numPrimitives;
const uint taskID = get_group_id(0);
const uint loc_id = get_local_id(0);
const uint lane = get_sub_group_local_id();
uint2 ids = get_thread_range(numItems, numGroups, taskID);
uint startID = ids.x;
uint endID = ids.y;
local uint histogram[RADIX_BINS];
local uint hist_tmp[RADIX_BINS];
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
{
sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, false,
taskID, startID, endID);
}
else
{
const uint req_iterations = globals->sort_iterations;
if (iteration < req_iterations)
return;
iteration -= req_iterations;
sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, true,
taskID, startID, endID);
}
uint last_group = 0;
if (loc_id == 0)
last_group = atomic_inc_global(&globals->morton_sort_in_flight);
write_mem_fence(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
last_group = work_group_broadcast(last_group, 0);
bool isLastGroup = (loc_id < RADIX_BINS) && (last_group == numGroups - 1);
uint global_count = 0;
if (isLastGroup)
{
for (uint j = 0; j < numGroups; j++)
{
const uint count = (j == taskID) ? histogram[loc_id] : load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + loc_id], 0);
store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + loc_id], 0, global_count);
global_count += count;
}
hist_tmp[get_sub_group_id()] = (get_sub_group_id() < MAX_HW_SIMD_WIDTH) ? sub_group_reduce_add(global_count) : 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (isLastGroup)
{
uint p = hist_tmp[lane];
p = (lane < get_sub_group_id()) ? p : 0;
global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numGroups + loc_id], 0, global_count);
}
}
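// Editor's note: illustrative C sketch (not from the original GRL sources) of the
// "last group" pattern used above: every group atomically increments a shared counter
// once its partial result is visible, and the group that draws the final ticket
// performs the global reduction. Here pthreads stand in for workgroups.
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define SKETCH_NUM_GROUPS 8

static atomic_uint groups_done;

static void *group_main(void *arg)
{
    (void)arg;
    // ... per-group partial work and a memory fence would go here ...
    unsigned ticket = atomic_fetch_add(&groups_done, 1);
    if (ticket == SKETCH_NUM_GROUPS - 1)
        printf("ticket %u: last group, run the final reduction\n", ticket);
    return NULL;
}

int main(void)
{
    pthread_t threads[SKETCH_NUM_GROUPS];
    for (int i = 0; i < SKETCH_NUM_GROUPS; i++)
        pthread_create(&threads[i], NULL, group_main, NULL);
    for (int i = 0; i < SKETCH_NUM_GROUPS; i++)
        pthread_join(threads[i], NULL);
    return 0;
}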
#if 0
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
sort_morton_codes_bin_items(
global struct Globals* globals,
uint shift,
global uint* global_histogram,
global char* input0,
global char* input1,
unsigned int input0_offset,
unsigned int input1_offset,
uint iteration)
{
const uint numItems = globals->numPrimitives;
const uint local_size = get_local_size(0);
const uint taskID = get_group_id(0);
const uint numTasks = get_num_groups(0);
const uint localID = get_local_id(0);
const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
const uint subgroupLocalID = get_sub_group_local_id();
const uint subgroup_size = get_sub_group_size();
const uint startID = (taskID + 0) * numItems / numTasks;
const uint endID = (taskID + 1) * numItems / numTasks;
global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
#if 1
local uint histogram[RADIX_BINS];
for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
histogram[i] = 0;
for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
{
const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1);
atomic_add(&histogram[bin], 1);
}
for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
global_histogram[RADIX_BINS * taskID + i] = histogram[i];
#else
uint histogram[RADIX_BINS];
for (int i = 0; i < RADIX_BINS; i++)
histogram[i] = 0;
for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
{
const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1);
histogram[bin]++;
}
for (uint i = 0; i < RADIX_BINS; i++)
{
const uint reduced_counter = sub_group_reduce_add(histogram[i]);
global_histogram[RADIX_BINS * taskID + i] = reduced_counter;
}
#endif
}
#endif
#define WG_SIZE_WIDE 256
#define SG_SIZE_SCAN 16
// Fast implementation of work_group_scan_exclusive using SLM for WG size 256 and SG size 16
GRL_INLINE uint work_group_scan_exclusive_add_opt(local uint* tmp, uint val)
{
const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE_SCAN;
const uint sg_local_id = get_local_id(0) % SG_SIZE_SCAN;
const uint NUM_HW_THREADS_IN_WG = WG_SIZE_WIDE / SG_SIZE_SCAN;
uint acc = sub_group_scan_exclusive_add(val);
uint acc2 = acc + val;
tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc2, SG_SIZE_SCAN - 1);
barrier(CLK_LOCAL_MEM_FENCE);
uint loaded_val = tmp[sg_local_id];
uint wgs_acc = sub_group_scan_exclusive_add(loaded_val);
uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id);
return acc + acc_for_this_hw_thread;
}
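// Editor's note: illustrative host-side C reference (not from the original GRL
// sources) for the work-group exclusive add scan implemented above with SLM:
// element i of the result is the sum of all inputs before i.
#include <stddef.h>
#include <stdint.h>

static void exclusive_scan_add(const uint32_t *in, uint32_t *out, size_t n)
{
    uint32_t acc = 0;
    for (size_t i = 0; i < n; i++) {
        out[i] = acc; // sum of in[0..i-1]
        acc += in[i];
    }
}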
// The wide reduce algorithm is divided into 2 kernels:
// 1. First, partial exclusive add scans are computed within each work group using SLM.
// Then the last work group for each histogram bin performs an exclusive add scan
// along the bins, using the separate global_histogram_partials buffer.
// The last work group is determined using global atomics on the wg_flags buffer.
// 2. The second kernel globally adds the values from global_histogram_partials to the
// histogram buffer holding the partial sums. Then the last work group performs one more
// work-group scan-and-add so the histogram buffer values are adjusted with the global ones.
// (A host-side sketch of this two-phase scan follows the two kernels below.)
GRL_INLINE void sort_morton_codes_reduce_bins_wide_partial_sum_func(
global struct Globals* globals,
global uint* global_histogram,
global uint* global_histogram_partials,
global uint* wg_flags,
local uint* exclusive_scan_tmp,
uint numTasks,
uint numGroups,
uint iteration,
bool shift_primID)
{
if (shift_primID)
{
const uint req_iterations = globals->sort_iterations;
if (iteration < req_iterations)
return;
iteration -= req_iterations;
}
const uint groupID = get_group_id(0) % RADIX_BINS;
const uint scanGroupID = get_group_id(0) / RADIX_BINS;
uint localID = get_local_id(0);
uint globalID = localID + (scanGroupID * WG_SIZE_WIDE);
const uint lastGroup = (numGroups / WG_SIZE_WIDE);
const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1;
uint temp = 0;
uint last_count = 0;
if (globalID < numTasks)
{
temp = global_histogram[RADIX_BINS * globalID + groupID];
// Store the last value of the work group; it is either the last element of the histogram or the last item in the work group
if (globalID == endID)
last_count = temp;
}
uint val = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp);
if (globalID <= numTasks)
{
global_histogram[RADIX_BINS * globalID + groupID] = val;
// Store the block sum value to separate buffer
if (globalID == endID)
global_histogram_partials[scanGroupID * WG_SIZE_WIDE + groupID] = val + last_count;
}
// Make sure that global_histogram_partials is updated in all work groups
write_mem_fence(CLK_GLOBAL_MEM_FENCE);
barrier(0);
// Now, wait for the last group for each histogram bin, so we know that
// all work groups already updated the global_histogram_partials buffer
uint last_group = 0;
if (localID == 0)
last_group = atomic_inc_global(&wg_flags[groupID]);
last_group = work_group_broadcast(last_group, 0);
bool isLastGroup = (last_group == lastGroup - 1);
// Each of the last groups computes the scan exclusive add for each partial sum we have
if (isLastGroup)
{
uint temp1 = 0;
if (localID < lastGroup)
temp1 = global_histogram_partials[localID * WG_SIZE_WIDE + groupID];
uint val2 = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp1);
if (localID < lastGroup)
global_histogram_partials[localID * WG_SIZE_WIDE + groupID] = val2;
}
}
GRL_INLINE void sort_morton_codes_reduce_bins_wide_add_reduce_func(
global struct Globals* globals,
global uint* global_histogram,
global uint* global_histogram_partials,
local uint* partials,
uint numTasks,
uint numGroups,
uint iteration,
bool shift_primID)
{
if (shift_primID)
{
const uint req_iterations = globals->sort_iterations;
if (iteration < req_iterations)
return;
iteration -= req_iterations;
}
const uint groupID = get_group_id(0) % RADIX_BINS;
const uint scanGroupID = get_group_id(0) / RADIX_BINS;
const uint lastGroup = (numGroups / WG_SIZE_WIDE);
uint localID = get_local_id(0);
uint globalID = localID + (scanGroupID * WG_SIZE_WIDE);
const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1;
// Add the global sums to the partials; skip the first scanGroupID since the first
// added value is 0 in the case of exclusive add scans
if (scanGroupID > 0 && globalID <= numTasks)
{
uint add_val = global_histogram_partials[scanGroupID * RADIX_BINS + groupID];
atomic_add_global(&global_histogram[globalID * RADIX_BINS + groupID], add_val);
}
// Wait for the last group
uint last_group = 0;
if (localID == 0)
last_group = atomic_inc_global(&globals->morton_sort_in_flight);
last_group = work_group_broadcast(last_group, 0);
bool isLastGroup = (last_group == numGroups - 1);
// Do the exclusive scan within all bins with global data now
if (isLastGroup)
{
mem_fence_gpu_invalidate();
uint global_count = global_histogram[numTasks * RADIX_BINS + localID];
partials[get_sub_group_id()] = sub_group_reduce_add(global_count);
barrier(CLK_LOCAL_MEM_FENCE);
uint lane = get_sub_group_local_id();
uint p = partials[lane];
p = (lane < get_sub_group_id()) ? p : 0;
global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
store_uint_L1WB_L3WB(&global_histogram[numTasks * RADIX_BINS + localID], 0, global_count);
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN)))
void kernel
sort_morton_codes_reduce_bins_wide_partial_sum(
global struct Globals* globals,
uint numTasks,
uint numGroups,
global uint* global_histogram,
global uint* global_histogram_partials,
global uint* wg_flags,
uint iteration)
{
local uint exclusive_scan_tmp[WG_SIZE_WIDE / SG_SIZE_SCAN];
const uint numItems = globals->numPrimitives;
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, false);
else
sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, true);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN)))
void kernel
sort_morton_codes_reduce_bins_wide_add_reduce(
global struct Globals* globals,
uint numTasks,
uint numGroups,
global uint* global_histogram,
global uint* global_histogram_partials,
uint iteration)
{
local uint partials[RADIX_BINS];
const uint numItems = globals->numPrimitives;
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, false);
else
sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, true);
}
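// Editor's note: illustrative host-side C sketch (not from the original GRL sources)
// of the two-phase "wide" scan described above: per-block exclusive scans first, then
// the block totals are scanned and added back to each block.
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

#define SKETCH_BLOCK 256 // stands in for WG_SIZE_WIDE

static void wide_exclusive_scan(uint32_t *data, size_t n)
{
    size_t numBlocks = (n + SKETCH_BLOCK - 1) / SKETCH_BLOCK;
    uint32_t *block_sum = calloc(numBlocks, sizeof(*block_sum));

    // Phase 1: exclusive scan inside each block; remember each block's total.
    for (size_t b = 0; b < numBlocks; b++) {
        size_t end = (b + 1) * SKETCH_BLOCK < n ? (b + 1) * SKETCH_BLOCK : n;
        uint32_t acc = 0;
        for (size_t i = b * SKETCH_BLOCK; i < end; i++) {
            uint32_t v = data[i];
            data[i] = acc;
            acc += v;
        }
        block_sum[b] = acc;
    }
    // Phase 2: exclusive scan of the block totals, added back to every element of the
    // corresponding block (the GPU version does this with global atomics plus a final
    // fix-up by the last work group).
    uint32_t prefix = 0;
    for (size_t b = 0; b < numBlocks; b++) {
        size_t end = (b + 1) * SKETCH_BLOCK < n ? (b + 1) * SKETCH_BLOCK : n;
        for (size_t i = b * SKETCH_BLOCK; i < end; i++)
            data[i] += prefix;
        prefix += block_sum[b];
    }
    free(block_sum);
}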

View file

@ -1,297 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module msb_radix_bitonic_sort;
kernel_module msb_radix_sort ("morton_msb_radix_bitonic_sort.cl")
{
links lsc_intrinsics;
kernel opencl_debug_print < kernelFunction="debug_print_kernel">;
kernel opencl_check_bls < kernelFunction="check_bls_sort">;
kernel opencl_bottom_level_sort_single_wg < kernelFunction="sort_morton_codes_bottom_level_single_wg">;
kernel opencl_build_morton_kernel_sort_msb_init < kernelFunction="sort_morton_codes_msb_begin">;
kernel opencl_build_morton_kernel_sort_msb_scheduler < kernelFunction="scheduler">;
kernel opencl_build_morton_kernel_sort_bottom_level < kernelFunction="sort_morton_codes_bottom_level">;
kernel opencl_build_morton_kernel_sort_msb_count_items < kernelFunction="sort_morton_codes_msb_count_items">;
kernel opencl_build_morton_kernel_sort_msb_bin_items < kernelFunction="sort_morton_codes_msb_bin_items">;
kernel opencl_build_morton_kernel_sort_batched_bls_dispatch < kernelFunction="sort_morton_codes_batched_BLS_dispatch">;
}
const MSB_RADIX_NUM_VCONTEXTS = 8;
const BOTTOM_LEVEL_SORT_THRESHOLD = 512;
struct MSBRadixScheduler
{
dword num_wgs_msb;
dword num_wgs_bls;
dword scheduler_postsync;
dword _pad1;
};
struct MSBRadixArgs
{
qword p_scheduler;
qword p_num_primitives;
};
struct BatchedBLSDispatchEntry
{
qword p_data_buffer;
qword num_elements; // number of elements in p_data_buffer
};
metakernel add_bls_dispatch_init(qword p_storage)
{
define REG_numWgs REG14;
define REG_p_storage REG15;
REG_numWgs = 0;
REG_p_storage = p_storage;
}
// basically this code does:
// bls_args_for_dispatches[dispatchID] = { bls_new_pointer, numPrimitives };
// dispatchID++;
// (see the host-side sketch after this metakernel)
metakernel add_bls_dispatch(
qword p_data,
qword p_num_primitives
)
{
define C_1 REG0;
define C_8 REG1;
define C_MIN_PRIMREFS REG2;
define REG_p_data REG3;
define REG_num_prims REG4;
define REG_no_dispatch REG5;
define REG_numWgs REG14;
define REG_p_storage REG15;
C_MIN_PRIMREFS = 2;
REG_num_prims = 0;
REG_num_prims.lo = load_dword(p_num_primitives);
REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
goto l_finish if(REG_no_dispatch.lo);
C_1 = 1;
C_8 = 8;
// pseudocode: BatchedBLSDispatchEntry.p_data_buffer = p_data
REG_p_data = p_data;
store_qword( REG_p_storage, REG_p_data ); // store the data pointer
REG_p_storage = REG_p_storage + C_8; // point to next member in BatchedBLSDispatchEntry struct
// pseudocode: BatchedBLSDispatchEntry.num_elements = *p_num_primitives
store_qword( REG_p_storage, REG_num_prims );
REG_p_storage = REG_p_storage + C_8; // point to next BatchedBLSDispatchEntry instance
REG_numWgs = REG_numWgs + C_1;
l_finish:
}
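// Editor's note: illustrative host-side C sketch (not part of the original GRL
// sources) of what the add_bls_dispatch metakernel above records: one
// BatchedBLSDispatchEntry per batched bottom-level sort, skipped when there are
// fewer than 2 primitives.
#include <stdint.h>

struct batched_bls_dispatch_entry {
    uint64_t p_data_buffer; // GPU address of the data buffer
    uint64_t num_elements;  // number of elements in p_data_buffer
};

static void add_bls_dispatch_sketch(struct batched_bls_dispatch_entry *entries,
                                    uint32_t *num_wgs, uint64_t p_data,
                                    uint64_t num_primitives)
{
    if (num_primitives < 2)
        return; // nothing to sort
    entries[*num_wgs].p_data_buffer = p_data;
    entries[*num_wgs].num_elements = num_primitives;
    (*num_wgs)++;
}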
metakernel batched_bls_dispatch(
qword private_mem
)
{
define REG_numWgs REG14;
DISPATCHDIM_X = REG_numWgs;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect opencl_build_morton_kernel_sort_batched_bls_dispatch args(private_mem);
}
metakernel sort_bottom_level(
qword build_globals,
qword input,
qword p_num_primitives)
{
define REG_num_prims REG0;
define C_MIN_PRIMREFS REG1;
define REG_no_dispatch REG2;
REG_num_prims = load_dword( p_num_primitives );
C_MIN_PRIMREFS = 2;
REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
goto l_finish if(REG_no_dispatch.lo);
dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
l_finish:
}
metakernel sort(
qword build_globals,
qword input,
qword tmp,
MSBRadixArgs sort_args)
{
define REG_num_prims REG0;
{
define C_MIN_PRIMREFS REG1;
define C_MAX_PRIMREFS REG2;
define REG_no_dispatch REG3;
define REG_dispatch_single_wg REG4;
REG_num_prims = load_dword( sort_args.p_num_primitives );
C_MIN_PRIMREFS = 2;
C_MAX_PRIMREFS = BOTTOM_LEVEL_SORT_THRESHOLD;
REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
REG_dispatch_single_wg = REG_num_prims < C_MAX_PRIMREFS;
goto l_sort_finish if(REG_no_dispatch.lo);
goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
goto l_full_sort;
}
l_dispatch_single_wg:
{
dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
goto l_sort_finish;
}
l_full_sort:
define p_scheduler sort_args.p_scheduler;
define p_scheduler_postsync (sort_args.p_scheduler + offsetof(MSBRadixScheduler.scheduler_postsync) );
define p_num_wgs_bls (sort_args.p_scheduler + offsetof(MSBRadixScheduler.num_wgs_bls) );
define REG_scheduler_postsync REG3;
REG_scheduler_postsync = p_scheduler_postsync;
define C_0 REG4;
define C_8 REG5;
define C_255 REG6;
C_0 = 0;
C_8 = 8;
C_255 = 255;
store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
REG_num_prims = REG_num_prims + C_255;
REG_num_prims = REG_num_prims >> C_8;
DISPATCHDIM_X = REG_num_prims.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
control( cs_store_fence ); // commit the semaphore write
// initialize the whole execution
dispatch opencl_build_morton_kernel_sort_msb_init (MSB_RADIX_NUM_VCONTEXTS, 1, 1) args(build_globals, sort_args.p_scheduler, input, tmp)
postsync store_dword( p_scheduler_postsync, 1 );
// wait on the init kernel
semaphore_wait while( *p_scheduler_postsync != 1 );
dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
postsync store_dword( p_scheduler_postsync, 2 );
// wait on count_items kernel
semaphore_wait while( *p_scheduler_postsync != 2 );
dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
postsync store_dword( p_scheduler_postsync, 0 );
define C_MASK_HI REG4;
C_MASK_HI = 0x00000000ffffffff;
l_build_loop:
{
semaphore_wait while( *p_scheduler_postsync != 0 );
{
dispatch opencl_build_morton_kernel_sort_msb_scheduler(1,1,1) args( sort_args.p_scheduler, input, tmp )
postsync store_dword( p_scheduler_postsync, 1 );
// wait on scheduler kernel
semaphore_wait while( *p_scheduler_postsync != 1 );
}
// load and process the scheduler results
define REG_wg_counts REG0;
define REG_num_msb_wgs REG0.lo;
define REG_num_bls_wgs REG0.hi;
define REG_p_scheduler REG1;
define REG_no_msb_wgs REG2;
{
REG_p_scheduler = p_scheduler;
REG_wg_counts = load_qword( REG_p_scheduler );
REG_no_msb_wgs = REG_wg_counts & C_MASK_HI;
REG_no_msb_wgs = REG_no_msb_wgs == 0;
}
// dispatch new bls WGs
DISPATCHDIM_X = REG_num_bls_wgs;
dispatch_indirect opencl_build_morton_kernel_sort_bottom_level args( p_scheduler, input );
// jump out if there are no msb WGs
goto l_sort_finish if (REG_no_msb_wgs);
DISPATCHDIM_X = REG_num_msb_wgs;
dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
postsync store_dword( p_scheduler_postsync, 2 );
// wait on count_items kernel
semaphore_wait while( *p_scheduler_postsync != 2 );
dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
postsync store_dword( p_scheduler_postsync, 0 );
// wait till all BLS finished launching
semaphore_wait while( *p_num_wgs_bls != 0 );
goto l_build_loop;
}
l_sort_finish:
}
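// Editor's note: illustrative C sketch (not from the original GRL sources) of the
// workgroup-count arithmetic used by the metakernels: ceil(n / 256) is computed as
// (n + 255) >> 8 above, and new_sah_build below computes ceil(n / 512) as
// (n + 511) >> 8 >> 1, splitting the shift because the DG2 shifter only supports
// POW2 shift amounts.
#include <stdint.h>

static uint32_t wgs_of_256(uint32_t num_prims) { return (num_prims + 255u) >> 8; }
static uint32_t wgs_of_512(uint32_t num_prims) { return ((num_prims + 511u) >> 8) >> 1; }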

View file

@ -1,665 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module new_sah_builder;
kernel_module bfs_kernels ("bvh_build_BFS.cl")
{
links lsc_intrinsics;
kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial < kernelFunction="BFS_pass1_initial" > ;
kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed < kernelFunction="BFS_pass1_indexed" > ;
kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial < kernelFunction="BFS_pass2_initial" > ;
kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed < kernelFunction="BFS_pass2_indexed" > ;
kernel opencl_build_kernel_BinnedSAH_DFS < kernelFunction="DFS" >;
// kernel opencl_build_kernel_BinnedSAH_BuildQNodes < kernelFunction="build_qnodes" >;
kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff < kernelFunction="build_qnodes_pc_kickoff" >;
kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify < kernelFunction="build_qnodes_pc_amplify" >;
kernel opencl_build_kernel_BinnedSAH_begin < kernelFunction = "begin" >;
kernel opencl_build_kernel_BinnedSAH_scheduler < kernelFunction = "scheduler" >;
kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch < kernelFunction="BFS_pass1_initial_batchable" >;
kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch < kernelFunction="BFS_pass1_indexed_batchable" >;
kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch < kernelFunction="BFS_pass2_initial_batchable" >;
kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch < kernelFunction="BFS_pass2_indexed_batchable" >;
kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >;
kernel opencl_build_kernel_BinnedSAH_begin_batched < kernelFunction="begin_batchable" >;
kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched < kernelFunction="build_qnodes_init_scheduler_batched" >;
kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched < kernelFunction="build_qnodes_begin_batchable" >;
kernel opencl_build_kernel_BinnedSAH_qnode_scheduler < kernelFunction="build_qnodes_scheduler" >;
kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch < kernelFunction="build_qnodes_pc_amplify_batched" >;
kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >;
}
kernel opencl_build_kernel_DFS_single_wg < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" >
kernel opencl_build_kernel_DFS_trivial < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial" >
kernel opencl_build_kernel_DFS_single_wg_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" >
kernel opencl_build_kernel_DFS_trivial_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable" >
kernel single_pass_binsah < source="bvh_build_DFS.cl", kernelFunction="DFS" >
const DFS_MIN_PRIMREFS = 6;
const DFS_MAX_PRIMREFS = 256;
const BFS_WG_SIZE_SHIFT = 9;
struct Scheduler
{
dword num_bfs_wgs;
dword num_dfs_wgs;
dword scheduler_postsync;
dword _pad1;
dword num_trivial_builds;
dword num_single_builds;
dword batched_build_wg_count;
dword batched_build_loop_mask;
};
struct SAHBuildArgs
{
qword p_num_primitives;
qword p_qnode_child_buffer;
qword p_scheduler;
qword p_sah_globals;
qword p_globals;
qword p_primref_buffer;
qword p_primref_index_buffers;
qword p_bvh_base;
qword p_bvh2;
qword p_root_buffer_counters;
dword sah_build_flags;
dword leaf_size;
dword leaf_type;
dword max_internal_nodes;
};
metakernel single_pass_binsah(
qword build_globals,
qword bvh_buffer,
qword build_primref_buffer,
qword build_primref_index_buffers,
dword alloc_backpointers )
{
dispatch single_pass_binsah(1, 1, 1) args(
build_globals,
bvh_buffer,
build_primref_buffer,
build_primref_index_buffers,
alloc_backpointers
);
}
metakernel new_sah_build( SAHBuildArgs build_args )
{
define REG_num_prims REG0;
{
define C_MIN_PRIMREFS REG1;
define C_MAX_PRIMREFS REG2;
define REG_dispatch_trivial REG3;
define REG_dispatch_single_wg REG4;
REG_num_prims = load_dword( build_args.p_num_primitives );
C_MIN_PRIMREFS = DFS_MIN_PRIMREFS;
C_MAX_PRIMREFS = DFS_MAX_PRIMREFS;
REG_dispatch_trivial = REG_num_prims <= C_MIN_PRIMREFS;
REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS;
goto l_dispatch_trivial if(REG_dispatch_trivial.lo);
goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
goto l_full_build;
}
l_dispatch_trivial:
{
dispatch opencl_build_kernel_DFS_trivial (1,1,1)
args( build_args.p_globals,
build_args.p_bvh_base,
build_args.p_primref_buffer,
build_args.p_primref_index_buffers,
build_args.sah_build_flags
);
control( wait_idle );
goto l_done;
}
l_dispatch_single_wg:
{
dispatch opencl_build_kernel_DFS_single_wg (1,1,1)
args( build_args.p_globals,
build_args.p_bvh_base,
build_args.p_primref_buffer,
build_args.p_primref_index_buffers,
build_args.sah_build_flags
);
control( wait_idle );
goto l_done;
}
l_full_build:
{
define p_scheduler build_args.p_scheduler;
define p_num_dfs_wgs build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs);
define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
define C_0 REG1;
define C_8 REG2;
C_8 = 8;
C_0 = 0;
//
// Init pass
//
store_dword( p_scheduler_postsync, C_0.lo );
// compute number of BFS WGs from prim-count
// NOTE: This code hardcodes the BFS WG size of 512.
// If the BFS WG size ever changes, this code needs to be updated.
// The shift is split in two below because the DG2 shifter only supports POW2 shift amounts.
{
define REG_scheduler_postsync REG3;
define C_511 REG4;
define C_1 REG5;
REG_scheduler_postsync = p_scheduler_postsync;
C_511 = 511;
C_1 = 1;
store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
REG_num_prims = REG_num_prims + C_511;
REG_num_prims = REG_num_prims >> C_8;
REG_num_prims = REG_num_prims >> C_1;
DISPATCHDIM_X = REG_num_prims.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
control( cs_store_fence ); // commit the semaphore write
// launch scheduler init kernel
dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1)
args(
build_args.p_scheduler,
build_args.leaf_size,
build_args.leaf_type,
build_args.p_primref_index_buffers,
build_args.p_primref_buffer,
build_args.p_bvh2,
build_args.p_bvh_base,
build_args.p_globals,
build_args.p_sah_globals,
build_args.p_qnode_child_buffer,
build_args.sah_build_flags
)
postsync store_dword( p_scheduler_postsync, 1 );
// wait on init kernel
semaphore_wait while( *p_scheduler_postsync != 1 );
// launch BFS1 pass1
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial
args( build_args.p_scheduler,
build_args.p_sah_globals)
postsync store_dword( p_scheduler_postsync, 0 );
// wait on BFS pass1
semaphore_wait while( *p_scheduler_postsync != 0 );
// launch BFS pass2
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial
args( build_args.p_scheduler,
build_args.p_sah_globals )
postsync store_dword( p_scheduler_postsync, 1 );
}
// after BFS pass 2 we drop into a scheduling loop
l_build_loop:
{
semaphore_wait while( *p_scheduler_postsync != 1 );
{
dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
args( build_args.p_scheduler, build_args.p_sah_globals )
postsync store_dword( p_scheduler_postsync, 0 );
// wait on the scheduler
semaphore_wait while( *p_scheduler_postsync != 0 );
}
// load and process the scheduler results
define REG_wg_counts REG0;
define REG_num_bfs_wgs REG0.lo;
define REG_num_dfs_wgs REG0.hi;
define REG_loop_break REG1;
define REG_p_scheduler REG2;
{
REG_p_scheduler = p_scheduler;
REG_wg_counts = load_qword( REG_p_scheduler );
define C_MASK_LO REG3 ;
C_MASK_LO = 0xffffffff;
REG_loop_break = REG_wg_counts & C_MASK_LO;
REG_loop_break = REG_loop_break == 0;
}
// dispatch new DFS WGs
DISPATCHDIM_X = REG_num_dfs_wgs;
dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
args( p_scheduler,
build_args.p_sah_globals );
// jump out if there are no bfs WGs
goto l_build_qnodes if (REG_loop_break);
// dispatch new BFS1 WGs
DISPATCHDIM_X = REG_num_bfs_wgs;
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed
args( p_scheduler,
build_args.p_sah_globals )
postsync store_dword( p_scheduler_postsync, 2 );
semaphore_wait while( *p_scheduler_postsync != 2 );
// dispatch new BFS2 WGs
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed
args( p_scheduler,
build_args.p_sah_globals )
postsync store_dword( p_scheduler_postsync, 1 );
//goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore
// wait until all upcoming DFS WGs have finished launching
// so that the scheduler can refill the launch array
// TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
semaphore_wait while( *p_num_dfs_wgs != 0 );
goto l_build_loop;
}
}
l_build_qnodes:
control( wait_idle );
// P/C qnode build
dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1)
args( build_args.p_sah_globals,
build_args.p_qnode_child_buffer,
build_args.sah_build_flags );
{
define p_pc_counters ( build_args.p_root_buffer_counters );
define REG_addr REG0;
define REG_produced REG1;
define REG_consumed REG2;
define REG_have_work REG3;
define REG_wg_count REG4;
define C_8 REG5;
define C_16 REG6;
define C_1 REG7;
C_1 = 1;
C_8 = 8;
C_16 = 16;
REG_addr = build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address
REG_consumed = 0;
l_qnode_loop:
control( wait_idle ); // wait for previous pass
// load counters and compute number of wgs to respawn
REG_produced = load_qword( REG_addr ); REG_addr = REG_addr + C_8;
REG_wg_count = REG_produced - REG_consumed;
REG_have_work = REG_wg_count > 0;
goto l_done if not(REG_have_work.lo);
// save REG_consumed as a starting position in p_qnode_child_buffer
store_qword(REG_addr, REG_consumed); REG_addr = REG_addr + C_8;
// save REG_produced as ending position in p_qnode_child_buffer
store_qword(REG_addr, REG_produced); REG_addr = REG_addr - C_16;
REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration
// calculate amount of workgroups to schedule
REG_wg_count = REG_wg_count + C_1;
REG_wg_count = REG_wg_count >> C_1;
DISPATCHDIM_X = REG_wg_count.lo;
control( cs_store_fence ); // commit the stores
dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify
args( build_args.p_sah_globals,
build_args.p_qnode_child_buffer,
build_args.sah_build_flags);
goto l_qnode_loop;
}
l_done:
}
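// Editor's note: illustrative host-side C sketch (not from the original GRL sources)
// of the producer/consumer loop at the end of new_sah_build: the GPU kernels append
// children to p_qnode_child_buffer and advance a 'produced' counter; the metakernel
// keeps dispatching amplification passes over the newly produced range (two entries
// per workgroup) until no new work appears.
#include <stdint.h>

struct pc_counters {
    uint64_t produced;      // advanced by the GPU kernels
    uint64_t consume_start; // written back: start of the range to process next
    uint64_t consume_end;   // written back: end of the range to process next
};

static void qnode_pc_loop(struct pc_counters *c,
                          void (*dispatch_amplify)(uint64_t num_wgs))
{
    uint64_t consumed = 0;
    for (;;) {
        uint64_t produced = c->produced; // the metakernel wait_idle's before this read
        uint64_t new_items = produced - consumed;
        if (new_items == 0)
            return; // nothing left to amplify
        c->consume_start = consumed;
        c->consume_end = produced;
        consumed = produced;
        dispatch_amplify((new_items + 1) / 2); // two children handled per workgroup
    }
}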
struct SAHBuildArgsBatchable
{
qword p_globals_ptrs;
qword p_scheduler;
qword p_buffers_info;
qword p_sah_globals;
dword num_max_qnode_global_root_buffer_entries;
dword num_builds;
};
metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args )
{
define p_scheduler build_args.p_scheduler;
define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
define p_num_dfs_wgs (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs));
// initialize scheduler semaphore
REG0.lo = 0;
store_dword( p_scheduler_postsync, REG0.lo );
// dispatch categorization pass
dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1)
args(
build_args.p_scheduler,
build_args.p_globals_ptrs,
build_args.p_buffers_info,
build_args.p_sah_globals,
build_args.num_builds
)
postsync store_dword( p_scheduler_postsync, 1 );
// wait on the categorization pass
semaphore_wait while( *p_scheduler_postsync != 1 );
// dispatch the trivial and single-WG passes
{
REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) );
DISPATCHDIM_X = REG0.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
// dispatch trivial builds
dispatch_indirect opencl_build_kernel_DFS_trivial_batch
args( build_args.p_sah_globals );
control( wait_idle );
// dispatch single-wg builds
DISPATCHDIM_X = REG0.hi;
dispatch_indirect opencl_build_kernel_DFS_single_wg_batch
args( build_args.p_sah_globals, build_args.p_scheduler );
}
// compute the number of builds not covered by the trivial passes
// skip the builder loop if all builds are satisfied by trivial passes
{
REG1 = REG0.lo;
REG2 = REG0.hi;
REG3 = build_args.num_builds;
REG5 = REG2 + REG1;
REG5 = REG3 - REG5;
REG4 = REG5 == 0 ;
goto l_done if (REG4.lo);
}
// REG5 (number of non-trivial builds) will be used to launch build_qnodes kernel after the build loop
define REG_num_nontrivial REG5;
l_build_outer_loop:
{
// configure the scheduler to initiate a new block of builds
dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1)
args( build_args.p_scheduler, build_args.p_sah_globals )
postsync store_dword( p_scheduler_postsync, 0 );
// wait on init kernel
semaphore_wait while( *p_scheduler_postsync != 0 );
// read results produced by scheduler init kernel
// lo == BFS wg count. hi == all ones if we need to loop again
//
REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
REG4 = load_qword( REG0 );
// launch BFS1 pass1
DISPATCHDIM_X = REG4.lo;
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch
args( build_args.p_scheduler,
build_args.p_sah_globals)
postsync store_dword( p_scheduler_postsync, 1 );
// wait on BFS pass1
semaphore_wait while( *p_scheduler_postsync != 1 );
// launch BFS pass2
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch
args( build_args.p_scheduler,
build_args.p_sah_globals )
postsync store_dword( p_scheduler_postsync, 0 );
l_build_loop:
{
semaphore_wait while( *p_scheduler_postsync != 0 );
{
dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
args( build_args.p_scheduler, build_args.p_sah_globals )
postsync store_dword( p_scheduler_postsync, 1 );
// wait on the scheduler
semaphore_wait while( *p_scheduler_postsync != 1 );
}
// load and process the scheduler results
define REG_wg_counts REG0;
define REG_num_bfs_wgs REG0.lo;
define REG_num_dfs_wgs REG0.hi;
define REG_loop_break REG1;
define REG_p_scheduler REG2;
{
REG_p_scheduler = p_scheduler;
REG_wg_counts = load_qword( REG_p_scheduler );
define C_MASK_LO REG3 ;
C_MASK_LO = 0xffffffff;
REG_loop_break = REG_wg_counts & C_MASK_LO;
REG_loop_break = REG_loop_break == 0;
}
// dispatch new DFS WGs
DISPATCHDIM_X = REG_num_dfs_wgs;
dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
args( p_scheduler,
build_args.p_sah_globals );
// jump out if there are no bfs WGs
goto l_continue_outer_loop if (REG_loop_break);
// dispatch new BFS1 WGs
DISPATCHDIM_X = REG_num_bfs_wgs;
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch
args( p_scheduler,
build_args.p_sah_globals )
postsync store_dword( p_scheduler_postsync, 2 );
semaphore_wait while( *p_scheduler_postsync != 2 );
// dispatch new BFS2 WGs
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch
args( p_scheduler,
build_args.p_sah_globals )
postsync store_dword( p_scheduler_postsync, 0 );
//goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore
// wait until all upcoming DFS WGs have finished launching
// so that the scheduler can refill the launch array
// TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
semaphore_wait while( *p_num_dfs_wgs != 0 );
goto l_build_loop;
}
l_continue_outer_loop:
goto l_build_outer_loop if(REG4.hi);
}
////////
//
// Qnode build phase
//
////////
// Wait for all outstanding DFS dispatches to complete, then build the QNodes
control( wait_idle );
define REG_wg_counts REG1;
define REG_p_scheduler REG2;
define REG_have_work REG3;
define REG_GRB_NUM_MAX_ENTRIES REG4;
// init scheduler for qnode phase
dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1)
args( build_args.p_scheduler,
build_args.num_builds,
build_args.num_max_qnode_global_root_buffer_entries);
REG_p_scheduler = p_scheduler;
control( wait_idle );
REG_wg_counts = load_qword( REG_p_scheduler );
DISPATCHDIM_X = REG_wg_counts.lo;
// configure the scheduler to initiate a new block of builds
dispatch_indirect opencl_build_kernel_BinnedSAH_qnode_begin_batched
args( build_args.p_scheduler,
build_args.p_sah_globals);
// read results produced by init scheduler kernel
// lo == num of builds processed. hi == num of maximum global root buffer entries
//
REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
REG5 = load_qword( REG0 );
REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi;
REG_GRB_NUM_MAX_ENTRIES.hi = 0;
l_qnode_loop:
{
control( wait_idle ); // wait for previous pass
dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1) args( build_args.p_scheduler );
control( wait_idle );
REG_wg_counts = load_qword( REG_p_scheduler );
REG_have_work = REG_wg_counts > 0;
goto l_done if not(REG_have_work.lo);
DISPATCHDIM_X = REG_wg_counts.lo;
dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch
args( build_args.p_sah_globals,
build_args.p_scheduler );
control( wait_idle );
REG_wg_counts = load_qword( REG_p_scheduler ); // reload values
REG_wg_counts.lo = REG_wg_counts.hi;
REG_wg_counts.hi = 0;
REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES;
goto l_qnode_loop if not(REG_have_work.lo);
DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled
dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched
args( build_args.p_sah_globals,
build_args.p_scheduler );
goto l_qnode_loop;
}
////////
//
// Old implementation - TODO: maybe add switch between two implementations?
//
////////
// Wait for all outstanding DFS dispatches to complete, then build the QNodes
//DISPATCHDIM_X = REG5.lo;
//dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes
// args( build_args.p_sah_globals, build_args.p_scheduler );
l_done:
control( wait_idle );
}

View file

@ -1,49 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module postbuild_info; // In postbuild we assume the output data structure is DXR compatible
kernel compacted_size < source="bvh_postbuild_info.cl", kernelFunction="compacted_size" >
kernel current_size < source="bvh_postbuild_info.cl", kernelFunction="current_size" >
kernel serialized_size < source="bvh_postbuild_info.cl", kernelFunction="serialized_size" >
kernel decoded_size < source="bvh_postbuild_info.cl", kernelFunction="decoded_size" >
metakernel compacted_size(
qword bvh,
qword postbuildInfo)
{
dispatch compacted_size(1,1,1) args(
bvh,
postbuildInfo);
}
metakernel current_size(
qword bvh,
qword postbuildInfo)
{
dispatch current_size(1,1,1) args(
bvh,
postbuildInfo);
}
metakernel serialized_size(
qword bvh,
qword postbuildInfo)
{
dispatch serialized_size(1,1,1) args(
bvh,
postbuildInfo);
}
metakernel decoded_size(
qword bvh,
qword postbuildInfo)
{
dispatch decoded_size(1,1,1) args(
bvh,
postbuildInfo);
}

View file

@ -1,62 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module presplit;
kernel_module presplit_kernels ("bvh_build_presplit.cl")
{
links lsc_intrinsics;
kernel opencl_kernel_compute_num_presplits < kernelFunction="compute_num_presplits" >;
kernel opencl_kernel_priority_sum < kernelFunction="priority_sum" >;
kernel opencl_kernel_perform_presplits < kernelFunction="perform_presplits" >;
}
import struct MKBuilderState "structs.grl";
import struct MKSizeEstimate "structs.grl";
metakernel compute_num_presplits(
MKBuilderState state,
qword presplit_buffer,
dword numHwThreads )
{
dispatch opencl_kernel_compute_num_presplits ( numHwThreads, 1, 1 ) args(
state.build_globals,
state.bvh_buffer,
state.build_primref_buffer,
presplit_buffer,
state.geomDesc_buffer );
}
metakernel priority_sum(
MKBuilderState state,
MKSizeEstimate estimate,
qword presplit_buffer )
{
dispatch opencl_kernel_priority_sum ( 1, 1, 1 ) args(
state.build_globals,
presplit_buffer,
estimate.numPrimitivesToSplit / 2 );
}
metakernel perform_presplits(
MKBuilderState state,
MKSizeEstimate estimate,
qword presplit_buffer,
dword numHwThreads )
{
dispatch opencl_kernel_perform_presplits ( numHwThreads, 1, 1 ) args(
state.build_globals,
state.bvh_buffer,
state.build_primref_buffer,
presplit_buffer,
state.bvh_buffer,
state.geomDesc_buffer,
estimate.numPrimitivesToSplit / 2 );
}

View file

@ -1,933 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "GRLGen12.h"
#include "shared.h"
#include "quad.h"
/* ====== GENERAL BVH config ====== */
#define BVH_NODE_N6 6
#define BVH_NODE_N 8
#define BVH_NODE_N_LOG 3
#define SAH_LOG_BLOCK_SHIFT 2
#define BVH_LEAF_N_MIN BVH_NODE_N6
#define BVH_LEAF_N_MAX BVH_NODE_N6
#define BVH_NODE_DEFAULT_MASK 0xff
#define BVH_NODE_DEGENERATED_MASK 0x00
/* ====== QUANTIZATION config ====== */
#define QUANT_BITS 8
#define QUANT_MIN 0
#define QUANT_MAX 255
#define QUANT_MAX_MANT (255.0f / 256.0f)
#define NO_NODE_OFFSET 0
/* ======================================================================= */
/* ============================== BVH BASE =============================== */
/* ======================================================================= */
GRL_INLINE void setBVHBaseBounds(struct BVHBase *base, struct AABB *aabb)
{
base->Meta.bounds.lower[0] = aabb->lower.x;
base->Meta.bounds.lower[1] = aabb->lower.y;
base->Meta.bounds.lower[2] = aabb->lower.z;
base->Meta.bounds.upper[0] = aabb->upper.x;
base->Meta.bounds.upper[1] = aabb->upper.y;
base->Meta.bounds.upper[2] = aabb->upper.z;
}
GRL_INLINE global struct QBVHNodeN *BVHBase_nodeData(struct BVHBase *bvh)
{
return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET);
}
GRL_INLINE global struct QBVHNodeN *BVHBase_rootNode(struct BVHBase *bvh)
{
return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET);
}
GRL_INLINE global struct Quad *BVHBase_quadLeaves(struct BVHBase *bvh)
{
return (global struct Quad *)((void *)bvh + 64 * (ulong)bvh->quadLeafStart);
}
GRL_INLINE uint64_t BVHBase_numNodes(struct BVHBase *bvh)
{
return bvh->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64;
}
GRL_INLINE uint64_t BVHBase_numQuads(struct BVHBase *bvh)
{
return bvh->quadLeafCur - bvh->quadLeafStart;
}
GRL_INLINE uint64_t BVHBase_numProcedurals(struct BVHBase *bvh)
{
return bvh->proceduralDataCur - bvh->proceduralDataStart;
}
GRL_INLINE uint64_t BVHBase_numInstances(struct BVHBase *bvh)
{
return bvh->instanceLeafEnd - bvh->instanceLeafStart;
}
/* =================================================================== */
/* ============================== QBVH =============================== */
/* =================================================================== */
__constant const float ulp = FLT_EPSILON;
GRL_INLINE struct AABB conservativeAABB(struct AABB *aabb)
{
struct AABB box;
const float4 v4 = max(fabs(aabb->lower), fabs(aabb->upper));
const float v = ulp * max(v4.x, max(v4.y, v4.z));
box.lower = aabb->lower - (float4)v;
box.upper = aabb->upper + (float4)v;
return box;
}
GRL_INLINE struct AABB3f conservativeAABB3f(struct AABB3f* aabb3d)
{
struct AABB aabb4d = AABBfromAABB3f(*aabb3d);
struct AABB box = conservativeAABB(&aabb4d);
return AABB3fFromAABB(box);
}
struct QBVH_AABB
{
uchar lower_x[BVH_NODE_N6];
uchar upper_x[BVH_NODE_N6];
uchar lower_y[BVH_NODE_N6];
uchar upper_y[BVH_NODE_N6];
uchar lower_z[BVH_NODE_N6];
uchar upper_z[BVH_NODE_N6];
};
struct QBVHNodeN
{
float lower[3];
int offset;
// 16 bytes
uchar type;
uchar pad;
// 18 bytes
char exp[3];
uchar instMask;
// 22 bytes
uchar childData[6];
// 28 bytes
struct QBVH_AABB qbounds; // + 36 bytes
// 64 bytes
};
GRL_INLINE uint QBVHNodeN_blockIncr(struct QBVHNodeN *This, uint childID)
{
return This->childData[childID] & 0x3;
}
GRL_INLINE uint QBVHNodeN_startPrim(struct QBVHNodeN *This, uint childID)
{
return (This->childData[childID] >> 2) & 0xF;
}
GRL_INLINE void initQBVHNodeN(struct QBVHNodeN *qnode)
{
uint *ptr = (uint *)qnode;
for (uint i = 0; i < 16; i++)
ptr[i] = 0;
}
GRL_INLINE struct AABB extractAABB_QBVHNodeN(struct QBVHNodeN *qnode, uint i)
{
struct AABB aabb;
const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f);
const int4 lower_i = (int4)(qnode->qbounds.lower_x[i], qnode->qbounds.lower_y[i], qnode->qbounds.lower_z[i], 0);
const int4 upper_i = (int4)(qnode->qbounds.upper_x[i], qnode->qbounds.upper_y[i], qnode->qbounds.upper_z[i], 0);
const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f);
aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
return aabb;
}
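// Editor's note: illustrative host-side C sketch (not from the original GRL sources)
// of the 8-bit child-bound quantization used by QBVHNodeN: child bounds are stored as
// integer grid coordinates relative to the node origin at a per-axis scale of
// 2^(exp - 8), rounding lower bounds down and upper bounds up so the dequantized box
// always contains the original (the real code additionally pads by one ulp).
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    float org = 1.0f;  // node origin on one axis (qnode->lower[axis])
    int exp_bits = 3;  // per-axis exponent (qnode->exp[axis]); grid step = 2^(exp_bits - 8)
    float child_lo = 1.3f, child_hi = 4.7f;

    // Quantize, clamped to [0, 255].
    uint8_t qlo = (uint8_t)fminf(fmaxf(floorf(ldexpf(child_lo - org, 8 - exp_bits)), 0.0f), 255.0f);
    uint8_t qhi = (uint8_t)fminf(fmaxf(ceilf(ldexpf(child_hi - org, 8 - exp_bits)), 0.0f), 255.0f);

    // Dequantize as in extractAABB_QBVHNodeN(); the result contains [1.3, 4.7].
    float dlo = org + ldexpf((float)qlo, exp_bits - 8);
    float dhi = org + ldexpf((float)qhi, exp_bits - 8);
    printf("quantized [%u, %u] -> dequantized [%f, %f]\n", qlo, qhi, dlo, dhi);
    return 0;
}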
GRL_INLINE struct AABB getAABB_QBVHNodeN(struct QBVHNodeN *qnode)
{
struct AABB aabb;
#if 0
AABB_init(&aabb);
for (uint i = 0; i < BVH_NODE_N6; i++)
{
struct AABB v = extractAABB_QBVHNodeN(qnode, i);
AABB_extend(&aabb, &v);
}
#else
uint lower_x = qnode->qbounds.lower_x[0];
uint lower_y = qnode->qbounds.lower_y[0];
uint lower_z = qnode->qbounds.lower_z[0];
uint upper_x = qnode->qbounds.upper_x[0];
uint upper_y = qnode->qbounds.upper_y[0];
uint upper_z = qnode->qbounds.upper_z[0];
for (uint i = 1; i < BVH_NODE_N6; i++)
{
uint lx = qnode->qbounds.lower_x[i];
uint ly = qnode->qbounds.lower_y[i];
uint lz = qnode->qbounds.lower_z[i];
uint ux = qnode->qbounds.upper_x[i];
uint uy = qnode->qbounds.upper_y[i];
uint uz = qnode->qbounds.upper_z[i];
bool valid = lx <= ux;
if (valid)
{
lower_x = min(lower_x, lx);
lower_y = min(lower_y, ly);
lower_z = min(lower_z, lz);
upper_x = max(upper_x, ux);
upper_y = max(upper_y, uy);
upper_z = max(upper_z, uz);
}
}
const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f);
const int4 lower_i = (int4)(lower_x, lower_y, lower_z, 0);
const int4 upper_i = (int4)(upper_x, upper_y, upper_z, 0);
const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f);
aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
#endif
return aabb;
}
GRL_INLINE struct AABB3f InternalNode_getAABB3f(struct InternalNode* node)
{
return AABB3fFromAABB(getAABB_QBVHNodeN((struct QBVHNodeN*)node));
}
GRL_INLINE uint getNumChildren_QBVHNodeN(struct QBVHNodeN *qnode)
{
uint children = 0;
for (uint i = 0; i < BVH_NODE_N6; i++)
{
uint lx = qnode->qbounds.lower_x[i];
uint ux = qnode->qbounds.upper_x[i];
bool valid = lx <= ux;
if (valid)
children++;
}
return children;
}
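// Note: the child offset below is expressed in 64-byte node slots
// (sizeof(struct QBVHNodeN)), hence the << 6 when converting to bytes and
// the direct pointer arithmetic on struct QBVHNodeN * in
// QBVHNodeN_childrenPointer().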
GRL_INLINE long extractQBVHNodeN_offset(struct QBVHNodeN *qnode)
{
return ((long)qnode->offset) << 6;
}
GRL_INLINE void *QBVHNodeN_childrenPointer(struct QBVHNodeN *qnode)
{
const int offset = qnode->offset;
return (void *)(qnode + offset);
}
GRL_INLINE void subgroup_setQBVHNodeN_setFields_reduced_bounds(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, struct AABB reduced_aabb)
{
const uint subgroupLocalID = get_sub_group_local_id();
const uint k = subgroupLocalID;
const float up = 1.0f + ulp;
const float down = 1.0f - ulp;
struct AABB aabb = reduced_aabb; // needs to execute with full subgroup width
aabb = AABB_sub_group_broadcast(&aabb, 0);
if (subgroupLocalID < BVH_NODE_N6)
{
struct AABB conservative_aabb = conservativeAABB(&aabb);
const float3 len = AABB_size(&conservative_aabb).xyz * up;
int3 exp;
const float3 mant = frexp_vec3(len, &exp);
const float3 org = conservative_aabb.lower.xyz;
exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
qbvh_node->offset = offset;
qbvh_node->type = type;
qbvh_node->lower[0] = org.x;
qbvh_node->lower[1] = org.y;
qbvh_node->lower[2] = org.z;
qbvh_node->exp[0] = exp.x;
qbvh_node->exp[1] = exp.y;
qbvh_node->exp[2] = exp.z;
qbvh_node->instMask = mask;
uchar3 lower_uchar = (uchar3)(0x80);
uchar3 upper_uchar = (uchar3)(0);
if (subgroupLocalID < numChildren)
{
struct AABB child_aabb = conservativeAABB(input_aabb);
float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
lower_uchar = convert_uchar3_rtn(lower);
upper_uchar = convert_uchar3_rtp(upper);
if (degenerated)
{
lower_uchar = upper_uchar = 0;
}
}
qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1;
#if ENABLE_CONVERSION_CHECKS == 1
if (!(exp.x >= -128 && exp.x <= 127))
printf("exp_x error \n");
if (!(exp.y >= -128 && exp.y <= 127))
printf("exp_y error \n");
if (!(exp.z >= -128 && exp.z <= 127))
printf("exp_z error \n");
struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
if (!AABB_subset(&child_aabb, &child_qaabb))
{
uint3 lower_i = convert_uint3(lower_uchar);
uint3 upper_i = convert_uint3(upper_uchar);
printf("\n ERROR %d\n", k);
printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
printf("%i uncompressed \n", k);
AABB_print(&child_aabb);
printf("%i compressed \n", k);
AABB_print(&child_qaabb);
printf("%i uncompressed (as int) \n", k);
AABB_printasInt(&child_aabb);
printf("%i compressed (as int) \n", k);
AABB_printasInt(&child_qaabb);
int4 e0 = child_aabb.lower < child_qaabb.lower;
int4 e1 = child_aabb.upper > child_qaabb.upper;
printf("e0 %d e1 %d \n", e0, e1);
}
#endif
}
}
GRL_INLINE void subgroup_setQBVHNodeN_setFields(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated)
{
struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb);
subgroup_setQBVHNodeN_setFields_reduced_bounds(offset, type, input_aabb, numChildren, mask, qbvh_node, degenerated, aabb);
}
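// The _2xSIMD8_in_SIMD16 variants below pack two independent QBVH nodes
// into one SIMD16 subgroup: lanes 0..7 handle the first node and lanes
// 8..15 the second, hence the per-node lane index of
// get_sub_group_local_id() % 8.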
GRL_INLINE void subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, bool active_lane)
{
const uint lane = get_sub_group_local_id() % 8;
const uint node_in_sg = get_sub_group_local_id() / 8;
const uint k = lane;
const float up = 1.0f + ulp;
const float down = 1.0f - ulp;
struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); // needs to execute with full subgroup width
aabb = AABB_sub_group_shuffle(&aabb, node_in_sg * 8);
if (lane < BVH_NODE_N6 && active_lane)
{
struct AABB conservative_aabb = conservativeAABB(&aabb);
const float3 len = AABB_size(&conservative_aabb).xyz * up;
int3 exp;
const float3 mant = frexp_vec3(len, &exp);
const float3 org = conservative_aabb.lower.xyz;
exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
qbvh_node->offset = offset;
qbvh_node->type = type;
qbvh_node->lower[0] = org.x;
qbvh_node->lower[1] = org.y;
qbvh_node->lower[2] = org.z;
qbvh_node->exp[0] = exp.x;
qbvh_node->exp[1] = exp.y;
qbvh_node->exp[2] = exp.z;
qbvh_node->instMask = mask;
uchar3 lower_uchar = (uchar3)(0x80);
uchar3 upper_uchar = (uchar3)(0);
if (lane < numChildren)
{
struct AABB child_aabb = conservativeAABB(input_aabb);
float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
lower_uchar = convert_uchar3_rtn(lower);
upper_uchar = convert_uchar3_rtp(upper);
if (degenerated)
{
lower_uchar = upper_uchar = 0;
}
}
qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1;
#if ENABLE_CONVERSION_CHECKS == 1
if (!(exp.x >= -128 && exp.x <= 127))
printf("exp_x error \n");
if (!(exp.y >= -128 && exp.y <= 127))
printf("exp_y error \n");
if (!(exp.z >= -128 && exp.z <= 127))
printf("exp_z error \n");
struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
if (!AABB_subset(&child_aabb, &child_qaabb))
{
uint3 lower_i = convert_uint3(lower_uchar);
uint3 upper_i = convert_uint3(upper_uchar);
printf("\n ERROR %d\n", k);
printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
printf("%i uncompressed \n", k);
AABB_print(&child_aabb);
printf("%i compressed \n", k);
AABB_print(&child_qaabb);
printf("%i uncompressed (as int) \n", k);
AABB_printasInt(&child_aabb);
printf("%i compressed (as int) \n", k);
AABB_printasInt(&child_qaabb);
int4 e0 = child_aabb.lower < child_qaabb.lower;
int4 e1 = child_aabb.upper > child_qaabb.upper;
printf("e0 %d e1 %d \n", e0, e1);
}
#endif
}
}
GRL_INLINE void subgroup_setInstanceQBVHNodeN(const int offset, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node, const uint instMask)
{
const uint subgroupLocalID = get_sub_group_local_id();
// For a degenerated (or inactive) instance, ignore its box when computing the node exponent and origin, and collapse its box to a point at the node origin.
// If the instance becomes non-degenerated on update, the tree topology is equivalent to what it would be had this degenerated node been accounted for here.
bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
struct AABB aabb;
AABB_init(&aabb);
// if every child is degenerated (or inactive) instance, we need to init aabb with origin point
uchar commonMask = sub_group_reduce_or_N6(instMask);
if (subgroupLocalID < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK))
aabb = *input_aabb;
subgroup_setQBVHNodeN_setFields(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated);
}
// return true if is degenerated
GRL_INLINE bool subgroup_setInstanceBox_2xSIMD8_in_SIMD16(struct AABB* input_aabb, const uint numChildren, uchar* mask, const uint instMask, bool active_lane)
{
const uint lane = get_sub_group_local_id() % 8;
// For a degenerated (or inactive) instance, ignore its box when computing the node exponent and origin, and collapse its box to a point at the node origin.
// If the instance becomes non-degenerated on update, the tree topology is equivalent to what it would be had this degenerated node been accounted for here.
bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
// if every child is degenerated (or inactive) instance, we need to init aabb with origin point
uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask);
if (active_lane)
*mask = commonMask;
if (active_lane && (degenerated && commonMask != BVH_NODE_DEGENERATED_MASK))
AABB_init(input_aabb);
return active_lane ? degenerated : false;
}
GRL_INLINE void subgroup_setInstanceQBVHNodeN_x2(const int offset, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, const uint instMask, bool active_lane)
{
const uint lane = get_sub_group_local_id() % 8;
// For a degenerated (or inactive) instance, ignore its box when computing the node exponent and origin, and collapse its box to a point at the node origin.
// If the instance becomes non-degenerated on update, the tree topology is equivalent to what it would be had this degenerated node been accounted for here.
bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
struct AABB aabb;
AABB_init(&aabb);
// if every child is degenerated (or inactive) instance, we need to init aabb with origin point
uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask);
if (lane < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK))
aabb = *input_aabb;
subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated, active_lane);
}
GRL_INLINE void subgroup_setQBVHNodeN(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, uint mask)
{
const uint subgroupLocalID = get_sub_group_local_id();
struct AABB aabb;
AABB_init(&aabb);
if (subgroupLocalID < numChildren)
aabb = *input_aabb;
subgroup_setQBVHNodeN_setFields(offset, type, &aabb, numChildren, mask, qbvh_node, false);
}
GRL_INLINE void subgroup_setQBVHNodeN_x2(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, bool active_lane)
{
const uint lane = get_sub_group_local_id() % 8;
struct AABB aabb;
AABB_init(&aabb);
if (lane < numChildren)
aabb = *input_aabb;
subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, type, &aabb, numChildren, BVH_NODE_DEFAULT_MASK, qbvh_node, false, active_lane);
}
GRL_INLINE void subgroup_QBVHNodeN_setBounds( uniform struct QBVHNodeN* qbvh_node,
uniform struct AABB reduced_bounds,
varying struct AABB input_aabb,
uniform uint numChildren,
varying ushort lane )
{
const float up = 1.0f + ulp;
const float down = 1.0f - ulp;
int3 exp;
struct AABB conservative_aabb = conservativeAABB( &reduced_bounds);
const float3 len = AABB_size( &conservative_aabb ).xyz * up;
const float3 mant = frexp_vec3( len, &exp );
const float3 org = conservative_aabb.lower.xyz;
exp += (mant > ( float3 )QUANT_MAX_MANT ? (int3)1 : (int3)0);
qbvh_node->lower[0] = org.x;
qbvh_node->lower[1] = org.y;
qbvh_node->lower[2] = org.z;
qbvh_node->exp[0] = exp.x;
qbvh_node->exp[1] = exp.y;
qbvh_node->exp[2] = exp.z;
qbvh_node->instMask = 0xff;
uchar3 lower_uchar = 0x80;
uchar3 upper_uchar = 0;
if ( lane < BVH_NODE_N6 )
{
ushort k = lane;
if( lane < numChildren )
{
struct AABB child_aabb = conservativeAABB( &input_aabb ); // conservative ???
float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) );
lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) );
float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) );
upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) );
lower_uchar = convert_uchar3_rtn( lower );
upper_uchar = convert_uchar3_rtp( upper );
}
qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
}
}
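// Quantization sketch (same scheme as above): the per-axis exponent is
// chosen via frexp() of the slightly padded node extent, bumped by one when
// the mantissa exceeds QUANT_MAX_MANT, so that 2^exp covers the extent.
// Lower bounds are then rounded toward -inf and upper bounds toward +inf
// (with the extra ulp-based down/up factors), so every quantized child box
// conservatively encloses the original box.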
GRL_INLINE void QBVHNodeN_setBounds(struct QBVHNodeN *qbvh_node, struct AABB *input_aabb, const uint numChildren)
{
const float up = 1.0f + ulp;
const float down = 1.0f - ulp;
int3 exp;
struct AABB aabb;
AABB_init(&aabb);
for (uint i = 0; i < numChildren; i++)
AABB_extend(&aabb, &input_aabb[i]);
struct AABB conservative_aabb = conservativeAABB(&aabb);
const float3 len = AABB_size(&conservative_aabb).xyz * up;
const float3 mant = frexp_vec3(len, &exp);
const float3 org = conservative_aabb.lower.xyz;
exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
qbvh_node->lower[0] = org.x;
qbvh_node->lower[1] = org.y;
qbvh_node->lower[2] = org.z;
qbvh_node->exp[0] = exp.x;
qbvh_node->exp[1] = exp.y;
qbvh_node->exp[2] = exp.z;
qbvh_node->instMask = 0xff;
for (uint k = 0; k < numChildren; k++)
{
struct AABB child_aabb = conservativeAABB(&input_aabb[k]); // conservative ???
float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
uchar3 lower_uchar = convert_uchar3_rtn(lower);
uchar3 upper_uchar = convert_uchar3_rtp(upper);
qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
#if ENABLE_CONVERSION_CHECKS == 1
if (!(exp.x >= -128 && exp.x <= 127))
printf("exp_x error \n");
if (!(exp.y >= -128 && exp.y <= 127))
printf("exp_y error \n");
if (!(exp.z >= -128 && exp.z <= 127))
printf("exp_z error \n");
struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
if (!AABB_subset(&child_aabb, &child_qaabb))
{
uint3 lower_i = convert_uint3(lower_uchar);
uint3 upper_i = convert_uint3(upper_uchar);
printf("\n ERROR %d\n", k);
printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
printf("%i uncompressed \n", k);
AABB_print(&child_aabb);
printf("%i compressed \n", k);
AABB_print(&child_qaabb);
printf("%i uncompressed (as int) \n", k);
AABB_printasInt(&child_aabb);
printf("%i compressed (as int) \n", k);
AABB_printasInt(&child_qaabb);
int4 e0 = child_aabb.lower < child_qaabb.lower;
int4 e1 = child_aabb.upper > child_qaabb.upper;
printf("e0 %d e1 %d \n", e0, e1);
}
#endif
}
for (uint k = numChildren; k < BVH_NODE_N6; k++)
{
qbvh_node->qbounds.lower_x[k] = 0x80;
qbvh_node->qbounds.lower_y[k] = 0x80;
qbvh_node->qbounds.lower_z[k] = 0x80;
qbvh_node->qbounds.upper_x[k] = 0;
qbvh_node->qbounds.upper_y[k] = 0;
qbvh_node->qbounds.upper_z[k] = 0;
}
}
GRL_INLINE void QBVHNodeN_setChildren(struct QBVHNodeN *qbvh_node, const int offset, const uint numChildren)
{
qbvh_node->offset = offset;
for (uint k = 0; k < BVH_NODE_N6; k++)
qbvh_node->childData[k] = 1;
}
GRL_INLINE void QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node)
{
for (uint k = 0; k < BVH_NODE_N6; k++)
qbvh_node->childData[k] = 1;
}
GRL_INLINE void SUBGROUP_QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node)
{
if( get_sub_group_local_id() < BVH_NODE_N6 )
qbvh_node->childData[get_sub_group_local_id()] = 1;
}
GRL_INLINE void QBVHNodeN_setChildIncr2(struct QBVHNodeN *qbvh_node)
{
for (uint k = 0; k < BVH_NODE_N6; k++)
qbvh_node->childData[k] = 2;
}
GRL_INLINE void QBVHNodeN_setType(struct QBVHNodeN *qbvh_node, const uint type)
{
qbvh_node->type = type;
}
GRL_INLINE void setQBVHNodeN(const int offset, const uint type, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node)
{
QBVHNodeN_setType(qbvh_node, type);
QBVHNodeN_setChildren(qbvh_node, offset, numChildren);
QBVHNodeN_setBounds(qbvh_node, input_aabb, numChildren);
}
GRL_INLINE void printQBVHNodeN(struct QBVHNodeN *qnode)
{
printf(" offset %d type %d \n", qnode->offset, (int)qnode->type);
printf(" lower %f %f %f \n", qnode->lower[0], qnode->lower[1], qnode->lower[2]);
printf(" exp %d %d %d \n", (int)qnode->exp[0], (int)qnode->exp[1], (int)qnode->exp[2]);
printf(" instMask %d \n", qnode->instMask);
struct AABB aabb0 = extractAABB_QBVHNodeN(qnode, 0);
struct AABB aabb1 = extractAABB_QBVHNodeN(qnode, 1);
struct AABB aabb2 = extractAABB_QBVHNodeN(qnode, 2);
struct AABB aabb3 = extractAABB_QBVHNodeN(qnode, 3);
struct AABB aabb4 = extractAABB_QBVHNodeN(qnode, 4);
struct AABB aabb5 = extractAABB_QBVHNodeN(qnode, 5);
printf(" lower_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_x[0], qnode->qbounds.lower_x[1], qnode->qbounds.lower_x[2], qnode->qbounds.lower_x[3], qnode->qbounds.lower_x[4], qnode->qbounds.lower_x[5], aabb0.lower.x, aabb1.lower.x, aabb2.lower.x, aabb3.lower.x, aabb4.lower.x, aabb5.lower.x);
printf(" upper_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_x[0], qnode->qbounds.upper_x[1], qnode->qbounds.upper_x[2], qnode->qbounds.upper_x[3], qnode->qbounds.upper_x[4], qnode->qbounds.upper_x[5], aabb0.upper.x, aabb1.upper.x, aabb2.upper.x, aabb3.upper.x, aabb4.upper.x, aabb5.upper.x);
printf(" lower_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_y[0], qnode->qbounds.lower_y[1], qnode->qbounds.lower_y[2], qnode->qbounds.lower_y[3], qnode->qbounds.lower_y[4], qnode->qbounds.lower_y[5], aabb0.lower.y, aabb1.lower.y, aabb2.lower.y, aabb3.lower.y, aabb4.lower.y, aabb5.lower.y);
printf(" upper_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_y[0], qnode->qbounds.upper_y[1], qnode->qbounds.upper_y[2], qnode->qbounds.upper_y[3], qnode->qbounds.upper_y[4], qnode->qbounds.upper_y[5], aabb0.upper.y, aabb1.upper.y, aabb2.upper.y, aabb3.upper.y, aabb4.upper.y, aabb5.upper.y);
printf(" lower_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_z[0], qnode->qbounds.lower_z[1], qnode->qbounds.lower_z[2], qnode->qbounds.lower_z[3], qnode->qbounds.lower_z[4], qnode->qbounds.lower_z[5], aabb0.lower.z, aabb1.lower.z, aabb2.lower.z, aabb3.lower.z, aabb4.lower.z, aabb5.lower.z);
printf(" upper_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_z[0], qnode->qbounds.upper_z[1], qnode->qbounds.upper_z[2], qnode->qbounds.upper_z[3], qnode->qbounds.upper_z[4], qnode->qbounds.upper_z[5], aabb0.upper.z, aabb1.upper.z, aabb2.upper.z, aabb3.upper.z, aabb4.upper.z, aabb5.upper.z);
}
GRL_INLINE int encodeOffset(global char *bvh_mem, global void *parent, int global_child_offset)
{
long global_parent_offset = (long)parent - (long)bvh_mem;
global_parent_offset = global_parent_offset & (~(64 - 1)); // FIXME: (sw) this should not be necessary?
int relative_offset = global_child_offset - global_parent_offset; // FIXME: this limits BVH size to 4GB
//if ((int)relative_offset <= 0) printf("relative offset <= 0 %d global_child_offset %d global_parent_offset %d \n", relative_offset,global_child_offset,global_parent_offset);
return relative_offset;
}
GRL_INLINE void QBVH6Node_set_offset(struct QBVHNodeN *qnode, void *children)
{
int ofs = (struct QBVHNodeN *)children - qnode;
qnode->offset = ofs;
}
GRL_INLINE void QBVH6Node_set_type(struct QBVHNodeN *qnode, uint type)
{
qnode->type = type;
}
GRL_INLINE uint sortBVHChildrenIDs(uint input)
{
#if BVH_NODE_N == 8
return sort8_descending(input);
#else
return sort4_descending(input);
#endif
}
enum XFM_BOX_OPTION {
XFM_BOX_NO_CLIP = 0,
    XFM_BOX_NOT_REFINED_CLIPPED = 1, //<< use clipbox; for non-refined nodes, compute the bbox from the children, extend into one box, then transform
    XFM_BOX_NOT_REFINED_TAKE_CLIPBOX = 2 //<< use clipbox; for non-refined nodes, just transform the clipbox, don't take the children boxes into account
};
#define DEB_PRINTFS 0
#ifndef FINE_TRANSFORM_NODE_BOX
#define FINE_TRANSFORM_NODE_BOX 0
#endif
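// compute_xfm_bbox() below has two paths: an optional "fine" path that
// transforms each child AABB separately and unions the results (tighter,
// but more matrix work), and a coarse fallback that unions the children
// (or takes the clip box) in local space and transforms the single result.
// The matrixTransformOverhead heuristic picks between them when
// FINE_TRANSFORM_NODE_BOX is enabled.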
GRL_INLINE struct AABB3f GRL_OVERLOADABLE compute_xfm_bbox(const float* xfm, InternalNode* pnode, enum XFM_BOX_OPTION clipOpt, const AABB3f* clipBox, float matrixTransformOverhead)
{
AABB3f childrenbox;
#if FINE_TRANSFORM_NODE_BOX
struct AffineSpace3f axfm = AffineSpace3f_load_row_major(xfm);
bool computeFine = matrixTransformOverhead < 0.6f;
computeFine = sub_group_any(computeFine);
if (computeFine)
{
bool clip = clipOpt != XFM_BOX_NO_CLIP;
InternalNode node = *pnode;
#if DEB_PRINTFS
if (InternalNode_IsChildValid(&node, 5) && !InternalNode_IsChildValid(&node, 4))
printf("child 5 valid && child 4 invalid\n");
if (InternalNode_IsChildValid(&node, 4) && !InternalNode_IsChildValid(&node, 3))
printf("child 4 valid && child 3 invalid\n");
if (InternalNode_IsChildValid(&node, 3) && !InternalNode_IsChildValid(&node, 2))
printf("child 3 valid && child 2 invalid\n");
if (InternalNode_IsChildValid(&node, 2) && !InternalNode_IsChildValid(&node, 1))
printf("child 2 valid && child 1 invalid\n");
if (InternalNode_IsChildValid(&node, 1) && !InternalNode_IsChildValid(&node, 0))
printf("child 1 valid && child 0 invalid\n");
#endif
#if DEB_PRINTFS
printf("F");
#endif
AABB3f child_bounds0 = InternalNode_GetChildAABB(&node, 0);
AABB3f child_bounds1 = InternalNode_GetChildAABB(&node, 1);
AABB3f child_bounds2 = InternalNode_GetChildAABB(&node, 2);
AABB3f child_bounds3 = InternalNode_GetChildAABB(&node, 3);
AABB3f child_bounds4 = InternalNode_GetChildAABB(&node, 4);
AABB3f child_bounds5 = InternalNode_GetChildAABB(&node, 5);
// we bravely assume there are at least 2 valid children here.
if(!InternalNode_IsChildValid(&node, 2)) child_bounds2 = child_bounds0;
if(!InternalNode_IsChildValid(&node, 3)) child_bounds3 = child_bounds0;
if(!InternalNode_IsChildValid(&node, 4)) child_bounds4 = child_bounds0;
if(!InternalNode_IsChildValid(&node, 5)) child_bounds5 = child_bounds0;
if (clip)
{
AABB3f_trim_upper(&child_bounds0, clipBox->upper);
AABB3f_trim_upper(&child_bounds1, clipBox->upper);
AABB3f_trim_upper(&child_bounds2, clipBox->upper);
AABB3f_trim_upper(&child_bounds3, clipBox->upper);
AABB3f_trim_upper(&child_bounds4, clipBox->upper);
AABB3f_trim_upper(&child_bounds5, clipBox->upper);
}
child_bounds0 = transform_aabb(child_bounds0, xfm);
child_bounds1 = transform_aabb(child_bounds1, xfm);
child_bounds2 = transform_aabb(child_bounds2, xfm);
child_bounds3 = transform_aabb(child_bounds3, xfm);
child_bounds4 = transform_aabb(child_bounds4, xfm);
child_bounds5 = transform_aabb(child_bounds5, xfm);
AABB3f_extend(&child_bounds0, &child_bounds1);
AABB3f_extend(&child_bounds2, &child_bounds3);
AABB3f_extend(&child_bounds4, &child_bounds5);
AABB3f_extend(&child_bounds0, &child_bounds2);
AABB3f_extend(&child_bounds0, &child_bounds4);
return child_bounds0;
}
#endif
#if DEB_PRINTFS
printf("0");
#endif
struct AABB3f child_bounds;
if (clipOpt != XFM_BOX_NOT_REFINED_TAKE_CLIPBOX)
{
// XFM_BOX_NOT_REFINED_CLIPPED || XFM_BOX_NO_CLIP
child_bounds = InternalNode_getAABB3f(pnode);
if (clipOpt != XFM_BOX_NO_CLIP)
{
AABB3f_intersect(&child_bounds, *clipBox);
}
}
else
{
//XFM_BOX_NOT_REFINED_TAKE_CLIPBOX
child_bounds = *clipBox;
}
child_bounds = transform_aabb(child_bounds, xfm);
//child_bounds = conservativeAABB3f(&child_bounds);
return child_bounds;
}
GRL_INLINE AABB3f GRL_OVERLOADABLE compute_xfm_bbox(struct AffineSpace3f xfm, InternalNode* pnode, bool clip, AABB3f* clipBox, float matOverhead)
{
float transform[12];
load_row_major_from_AffineSpace3f(xfm, transform);
return compute_xfm_bbox(transform, pnode, clip, clipBox, matOverhead);
}
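// The size computations below round each sub-buffer up to a 64-byte
// multiple with the usual (x + 63) & ~63 idiom, matching the 64-byte
// alignment of the BVH nodes.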
GRL_INLINE uint64_t compute_refit_structs_compacted_size(BVHBase* base)
{
uint dataSize = 0;
if (BVHBase_HasBackPointers(base))
{
const uint fatleafEntrySize = (base->fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63;
const uint innerEntrySize = (base->innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63;
// New atomic update
if(base->quadIndicesDataStart > base->backPointerDataStart)
{
uint numQuads = BVHBase_GetNumQuads(base);
const uint quadTableMainBufferSize = (numQuads + 255) & ~255;
const uint quadLeftoversSize = (base->quadLeftoversCountNewAtomicUpdate + 255) & ~255;
const uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63);
const uint quadIndicesDataSize = (numQuads * sizeof(QuadDataIndices) + 63) & ~63;
dataSize += quadTableEntriesSize + quadIndicesDataSize;
}
dataSize +=
((BVHBase_GetNumInternalNodes(base) * sizeof(uint) + 63) & ~63)
+ fatleafEntrySize + innerEntrySize;
}
return (uint64_t)dataSize;
}
GRL_INLINE uint64_t compute_compacted_size(BVHBase* base)
{
uint64_t size = sizeof(BVHBase);
size += BVHBase_GetNumHWInstanceLeaves(base) * sizeof(HwInstanceLeaf);
size += BVHBase_GetNumProcedurals(base) * sizeof(ProceduralLeaf);
size += BVHBase_GetNumQuads(base) * sizeof(QuadLeaf);
size += compute_refit_structs_compacted_size(base);
size += BVHBase_GetNumInternalNodes(base) * sizeof(InternalNode);
size += sizeof(InstanceDesc) * base->Meta.instanceCount;
size += (sizeof(GeoMetaData) * base->Meta.geoCount + 63) & ~63; // align to 64
size = (size + 63) & ~63;
return size;
}

View file

@ -1,127 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "shared.h"
#include "intrinsics.h"
#include "AABB.h"
#include "AABB3f.h"
// JDB TODO: Use corresponding GRL structures!!!
struct Quad
{
unsigned int shaderIndex; // note: also mask
unsigned int geomIndex; // note: also geom flags in upper 2 bits
unsigned int primIndex0;
unsigned int primIndex1Delta;
float v[4][3];
};
GRL_INLINE unsigned int Quad_getGeomIndex(global struct Quad *quad)
{
return quad->geomIndex;
}
GRL_INLINE unsigned int Quad_getPrimIndex0(global struct Quad *quad)
{
return quad->primIndex0;
}
GRL_INLINE unsigned int Quad_getPrimIndex1(global struct Quad *quad)
{
return quad->primIndex0 + (quad->primIndex1Delta & 0xFFFF);
}
GRL_INLINE float3 load_float3(float *p)
{
return (float3)(p[0], p[1], p[2]);
}
GRL_INLINE float3 load_perm_float3(float *p, const uint3 perm)
{
return (float3)(p[perm.x], p[perm.y], p[perm.z]);
}
GRL_INLINE float2 load_perm_float2(float *p, const uint2 perm)
{
return (float2)(p[perm.x], p[perm.y]);
}
GRL_INLINE float load_perm_float(float *p, const uint perm)
{
return p[perm];
}
GRL_INLINE struct AABB getAABB_Quad(struct Quad *q)
{
struct AABB aabb;
const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3])));
const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3])));
aabb.lower = (float4)(lower, 0.0f);
aabb.upper = (float4)(upper, 0.0f);
return aabb;
}
GRL_INLINE void Quad_ExtendAABB(struct Quad* q, struct AABB* box)
{
struct AABB aabb;
const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3])));
const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3])));
aabb.lower = (float4)(lower, 0.0f);
aabb.upper = (float4)(upper, 0.0f);
AABB_extend(box, &aabb);
}
GRL_INLINE float4 getCentroid2_Quad(struct Quad *q)
{
struct AABB aabb = getAABB_Quad(q);
return aabb.lower + aabb.upper;
}
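// setQuad() below packs several fields into the leaf header words:
// shaderIndex carries the geometry mask in its top 8 bits, geomIndex
// carries the geometry flags in its top 2 bits, and primIndex1Delta packs
// the primID1 - primID0 delta (low 16 bits), the vertex permutation j
// (next 6 bits) and a "single prim in leaf" flag (bit 22).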
GRL_INLINE void setQuad(struct Quad *quad, const float4 v0, const float4 v1, const float4 v2, const float4 v3,
const uchar j0, const uchar j1, const uchar j2,
const uint geomID, const uint primID0, const uint primID1, const uint geomMask, const uint geomFlags )
{
quad->v[0][0] = v0.x;
quad->v[0][1] = v0.y;
quad->v[0][2] = v0.z;
quad->v[1][0] = v1.x;
quad->v[1][1] = v1.y;
quad->v[1][2] = v1.z;
quad->v[2][0] = v2.x;
quad->v[2][1] = v2.y;
quad->v[2][2] = v2.z;
quad->v[3][0] = v3.x;
quad->v[3][1] = v3.y;
quad->v[3][2] = v3.z;
quad->shaderIndex = (geomMask << 24) | geomID;
quad->geomIndex = geomID | (geomFlags << 30);
quad->primIndex0 = primID0;
const uint delta = primID1 - primID0;
const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4));
quad->primIndex1Delta = delta | (j << 16) | (1 << 22); // single prim in leaf
}
GRL_INLINE void setQuadVertices(struct Quad *quad, const float3 v0, const float3 v1, const float3 v2, const float3 v3)
{
quad->v[0][0] = v0.x;
quad->v[0][1] = v0.y;
quad->v[0][2] = v0.z;
quad->v[1][0] = v1.x;
quad->v[1][1] = v1.y;
quad->v[1][2] = v1.z;
quad->v[2][0] = v2.x;
quad->v[2][1] = v2.y;
quad->v[2][2] = v2.z;
quad->v[3][0] = v3.x;
quad->v[3][1] = v3.y;
quad->v[3][2] = v3.z;
}

View file

@ -1,163 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module radix_sort;
kernel_module radix_kernels ("morton_radix_sort.cl")
{
links lsc_intrinsics;
kernel opencl_build_morton_kernel_sort_bin_items < kernelFunction="sort_morton_codes_bin_items">;
kernel opencl_build_morton_kernel_sort_reduce_bins < kernelFunction="sort_morton_codes_reduce_bins">;
kernel opencl_build_morton_kernel_sort_scatter_items < kernelFunction="sort_morton_codes_scatter_items">;
kernel opencl_build_morton_codes_sort_merged < kernelFunction="sort_morton_codes_merged">;
kernel opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum < kernelFunction="sort_morton_codes_reduce_bins_wide_partial_sum">;
kernel opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce < kernelFunction="sort_morton_codes_reduce_bins_wide_add_reduce">;
}
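// The sort metakernel below runs one radix iteration as three passes:
// bin_items builds per-workgroup digit histograms, reduce_bins turns them
// into global offsets, and scatter_items moves the keys to their new
// positions; the driver presumably re-dispatches this once per radix digit
// of the 64-bit morton codes.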
metakernel sort(
qword build_globals,
dword shift,
qword global_histogram,
qword input0,
qword input1,
dword input0_offset,
dword input1_offset,
dword iteration,
dword threads)
{
dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args(
build_globals,
shift,
global_histogram,
input0,
input1,
input0_offset,
input1_offset,
iteration);
control(wait_idle);
dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args(
threads,
global_histogram);
control(wait_idle);
dispatch opencl_build_morton_kernel_sort_scatter_items (threads, 1, 1) args(
build_globals,
shift,
global_histogram,
input0,
input1,
input0_offset,
input1_offset,
iteration);
control(wait_idle);
}
metakernel sort_bin_items(
qword build_globals,
qword global_histogram,
qword wg_flags,
qword input0,
dword iteration,
dword threads,
dword update_wg_flags
)
{
dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args(
build_globals,
global_histogram,
wg_flags,
input0,
iteration,
threads,
update_wg_flags
);
}
metakernel sort_reduce_bins(
qword build_globals,
qword global_histogram,
dword threads,
dword iteration)
{
dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args(
build_globals,
threads,
global_histogram,
iteration);
}
metakernel sort_scatter_items(
qword build_globals,
qword global_histogram,
qword input0,
qword input1,
dword iteration,
dword threads,
dword update_morton_sort_in_flight )
{
dispatch opencl_build_morton_kernel_sort_scatter_items( threads, 1, 1 ) args(
build_globals,
global_histogram,
input0,
input1,
iteration,
threads,
update_morton_sort_in_flight
);
}
metakernel sort_bin_items_merged(
qword build_globals,
qword global_histogram,
qword input0,
dword iteration,
dword threads)
{
dispatch opencl_build_morton_codes_sort_merged (threads, 1, 1) args(
build_globals,
global_histogram,
input0,
iteration,
threads
);
}
metakernel sort_reduce_bins_wide(
qword build_globals,
qword global_histogram,
qword global_histogram_tmp,
qword wg_flags,
dword threads,
dword threads_groups,
dword iteration)
{
dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum(threads_groups, 1, 1) args(
build_globals,
threads,
threads_groups,
global_histogram,
global_histogram_tmp,
wg_flags,
iteration);
control(wait_idle);
dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce(threads_groups, 1, 1) args(
build_globals,
threads,
threads_groups,
global_histogram,
global_histogram_tmp,
iteration);
}

View file

@ -1,167 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module rebraid;
kernel init_scratch < source="bvh_rebraid.cl", kernelFunction="rebraid_init_scratch" >
kernel chase_instance_ptrs < source="bvh_rebraid.cl", kernelFunction="rebraid_chase_instance_pointers" >
kernel calc_aabb < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances" >
kernel calc_aabb_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_indirect" >
kernel calc_aabb_ptr < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers" >
kernel calc_aabb_ptr_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers_indirect" >
kernel count_splits < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits" >
kernel count_splits_SG < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG" >
kernel count_splits_SG_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG_indirect" >
kernel build_primrefs < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs" >
kernel build_primrefs_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs_indirect" >
//kernel ISA_TEST < source="bvh_rebraid.cl", kernelFunction="ISA_TEST" >
//kernel DEBUG_PRINT < source="bvh_rebraid.cl", kernelFunction="DEBUG_PRINT" >
const PRIMREF_GROUP_SIZE = 256;
const COUNT_SPLITS_GROUP_SIZE = 16;
struct MKRebraidArgs
{
qword bvh_buffer;
qword primref_buffer;
qword global_buffer;
qword instances_buffer;
qword rebraid_scratch;
qword flat_instances_buffer;
dword num_instances;
dword num_extra_primrefs;
};
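// Rebraid sketch: instead of emitting one primref per instance, the kernels
// below descend a few levels into each instance's BVH and emit one primref
// per retained subtree root, reducing overlap between large instances in
// the top-level BVH. count_splits* decides how many primrefs each instance
// gets, build_primrefs* then emits them.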
metakernel rebraid(
MKRebraidArgs Args
)
{
dispatch init_scratch(1,1,1) args( Args.rebraid_scratch );
dispatch calc_aabb(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer );
control( wait_idle );
//define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE);
//dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.num_instances );
dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch );
control( wait_idle );
define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE);
dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
control( wait_idle );
//dispatch DEBUG_PRINT(1,1,1) args( Args.global_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
}
metakernel rebraid_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo)
{
dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch);
define num_groups REG0;
num_groups = load_dword(indirectBuildRangeInfo);
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect calc_aabb_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo);
control(wait_idle);
dispatch_indirect count_splits_SG_indirect
args(Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo);
define groupsize_1 REG1; // groupsize - 1
define C_8 REG2;
groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1
C_8 = 8; // log_2(PRIMREF_GROUP_SIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE;
DISPATCHDIM_X = num_groups.lo;
control(wait_idle);
dispatch_indirect build_primrefs_indirect args(
Args.global_buffer,
Args.bvh_buffer,
Args.instances_buffer,
Args.rebraid_scratch,
Args.primref_buffer,
indirectBuildRangeInfo,
Args.num_extra_primrefs);
control(wait_idle);
}
metakernel rebraid_ptrs(
MKRebraidArgs Args
)
{
dispatch init_scratch(1,1,1) args( Args.rebraid_scratch );
dispatch chase_instance_ptrs( Args.num_instances, 1, 1) args( Args.instances_buffer, Args.flat_instances_buffer );
dispatch calc_aabb_ptr(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer );
control( wait_idle );
//define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE);
//dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch );
dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch );
control( wait_idle );
define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE);
dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
control( wait_idle );
}
metakernel rebraid_ptrs_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo)
{
dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch);
define num_groups REG0;
num_groups = load_dword(indirectBuildRangeInfo);
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect chase_instance_ptrs
args(Args.instances_buffer, Args.flat_instances_buffer, indirectBuildRangeInfo);
dispatch_indirect calc_aabb_ptr_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo);
control(wait_idle);
dispatch_indirect count_splits_SG_indirect
args(Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo);
define groupsize_1 REG1; // groupsize - 1
define C_8 REG2;
groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1
C_8 = 8; // log_2(PRIMREF_GROUP_SIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE;
DISPATCHDIM_X = num_groups.lo;
control(wait_idle);
dispatch_indirect build_primrefs_indirect args(
Args.global_buffer,
Args.bvh_buffer,
Args.flat_instances_buffer,
Args.rebraid_scratch,
Args.primref_buffer,
Args.num_extra_primrefs,
indirectBuildRangeInfo,
Args.num_instances);
control(wait_idle);
}

View file

@ -1,182 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "GRLGen12.h"
#pragma once
#define sizeof_Quad 64
#define sizeof_Procedural 64
#define sizeof_PrimRef 32
#define sizeof_PresplitItem 8
#define sizeof_HwInstanceLeaf 128
#define MORTON_BUILDER_SUBTREE_THRESHOLD 256
#define MORTON_BUILDER_P2_ELEMENTS_IN_SLM (16 * 1024 / 32)
// Temporarily disable localized phase2 due to issues on ELG pre-silicon
// This implementation would be replaced with bottom_up + bounding box approach without the need for phase2 refit
#define MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD /*100000*/ 0
#define BVH_QUAD_NODE 4
#define BVH_INSTANCE_NODE 1
#define BVH_INTERNAL_NODE 0
#define BVH_PROCEDURAL_NODE 3
#define BUILDRECORD_STACK_SIZE 48
#define BINS 16
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
GRL_NAMESPACE_BEGIN(GPUBVHBuilder)
struct AABB
{
float4 lower;
float4 upper;
};
typedef struct BlockAllocator
{
unsigned int start;
unsigned int cur;
} BlockAllocator;
struct Globals
{
struct AABB centroidBounds;
unsigned int build_record_start;
unsigned int numPrimitives;
unsigned int leafPrimType;
unsigned int leafSize;
unsigned int numSplittedPrimitives;
unsigned int numBuildRecords;
// spatial split state
unsigned int numOriginalPrimitives;
float presplitPrioritySum;
float probThreshold;
// binned-sah bfs state
unsigned int counter;
unsigned int numBuildRecords_extended;
// sync variable used for global-sync on work groups
unsigned int sync;
/* morton code builder state */
unsigned int shift; // used by adaptive mc-builder
unsigned int shift_mask; // used by adaptive mc-builder
unsigned int binary_hierarchy_root;
unsigned int p0_allocated_num;
unsigned int p0_created_num;
unsigned int morton_sort_in_flight;
unsigned int sort_iterations;
gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
};
struct Range
{
unsigned int start, end;
};
struct Triangle
{
unsigned int vtx[3];
//unsigned int primID;
//unsigned int geomID;
};
struct MortonCodePrimitive
{
uint64_t index_code; // 64bit code + index combo
};
struct BuildRecord
{
struct AABB centroidBounds;
unsigned int start, end;
__global void *current;
};
struct BinaryMortonCodeHierarchy
{
struct Range range;
unsigned int leftChild;
unsigned int rightChild;
// unsigned int flag;
};
typedef struct MortonFlattenedBoxlessNode {
uint binary_hierarchy_index; // only needed when type != BVH_INTERNAL_NODE
uint childOffset_type; // childOffset : 26, type : 6
uint backPointer; // same usage as in bvh
} MortonFlattenedBoxlessNode;
struct StatStackEntry
{
struct AABB aabb;
unsigned int node;
unsigned int type;
unsigned int depth;
float area;
};
struct BuildRecordMorton
{
unsigned int nodeID;
unsigned int items;
unsigned int current_index;
unsigned int parent_index;
};
struct Split
{
float sah;
int dim;
int pos;
};
struct BinMapping
{
float4 ofs, scale;
};
struct BinInfo
{
struct AABB3f boundsX[BINS];
struct AABB3f boundsY[BINS];
struct AABB3f boundsZ[BINS];
uint3 counts[BINS];
};
struct BinInfo2
{
struct AABB3f boundsX[BINS * 2];
struct AABB3f boundsY[BINS * 2];
struct AABB3f boundsZ[BINS * 2];
uint3 counts[BINS * 2];
};
struct GlobalBuildRecord
{
struct BinInfo2 binInfo;
struct BinMapping binMapping;
struct Split split;
struct Range range;
struct AABB leftCentroid;
struct AABB rightCentroid;
struct AABB leftGeometry;
struct AABB rightGeometry;
unsigned int atomicCountLeft;
unsigned int atomicCountRight;
unsigned int buildRecordID;
};
GRL_NAMESPACE_END(GPUBVHBuilder)
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)

View file

@ -1,38 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module structs;
struct MKBuilderState {
qword geomDesc_buffer;
qword build_primref_buffer;
qword build_globals;
qword bvh_buffer;
dword leaf_type;
dword leaf_size;
};
struct MKSizeEstimate {
dword numTriangles;
dword numProcedurals;
dword numPrimitives;
dword numMeshes;
dword numBuildPrimitives;
dword numPrimitivesToSplit;
dword instance_descs_start;
dword geo_meta_data_start;
dword node_data_start;
dword leaf_data_start;
dword procedural_data_start;
dword back_pointer_start;
dword sizeTotal;
dword updateScratchSizeTotal;
dword fatleaf_table_start;
dword innernode_table_start;
dword max_fatleaves;
dword quad_indices_data_start;
};

View file

@ -1,277 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#include "instance.h"
#include "api_interface.h"
#include "bvh_build_primref.h"
#include "bvh_build_refit.h"
/*
Create primrefs from array of instance descriptors.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
TS_primrefs_from_instances(
global struct Globals* globals,
global struct BVHBase* bvh,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
uint numInstances,
global struct AABB* primrefs,
global uchar* pAABBs,
global uchar* pIsProcedural,
dword aabb_stride,
uint allowUpdate
)
{
const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (instanceIndex < numInstances)
{
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
if ( pIsProcedural[instanceIndex] )
{
procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
}
primrefs_from_instances(
globals,
bvh,
instance,
instanceIndex,
primrefs,
procedural_bb,
allowUpdate);
}
}
/*
Create primrefs from array of instance descriptors.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel
TS_primrefs_from_instances_indirect(
global struct Globals* globals,
global struct BVHBase* bvh,
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
uint numInstances,
global struct AABB* primrefs,
global uchar* pAABBs,
global uchar* pIsProcedural,
dword aabb_stride,
uint allowUpdate,
global struct IndirectBuildRangeInfo* indirect_data
)
{
const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (instanceIndex < indirect_data->primitiveCount)
{
instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
(((global char*)instances) + indirect_data->primitiveOffset);
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
if ( pIsProcedural[instanceIndex] )
{
procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
}
primrefs_from_instances(
globals,
bvh,
instance,
instanceIndex,
primrefs,
procedural_bb,
allowUpdate);
}
}
/*
Create primrefs from array of pointers to instance descriptors.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
TS_primrefs_from_instances_pointers(global struct Globals* globals,
global struct BVHBase* bvh,
global void* instances_in,
uint numInstances,
global struct AABB* primrefs,
global uchar* pAABBs,
global uchar* pIsProcedural,
dword aabb_stride,
uint allowUpdate
)
{
global const struct GRL_RAYTRACING_INSTANCE_DESC** instances =
(global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in;
const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (instanceIndex < numInstances)
{
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
if (pIsProcedural[instanceIndex])
{
procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
}
primrefs_from_instances(
globals,
bvh,
instance,
instanceIndex,
primrefs,
procedural_bb,
allowUpdate);
}
}
/*
Create primrefs from array of pointers to instance descriptors.
*/
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
void kernel
TS_primrefs_from_instances_pointers_indirect(global struct Globals* globals,
global struct BVHBase* bvh,
global void* instances_in,
global struct AABB* primrefs,
global uchar* pAABBs,
global uchar* pIsProcedural,
dword aabb_stride,
uint allowUpdate,
global struct IndirectBuildRangeInfo* indirect_data
)
{
const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
if (instanceIndex < indirect_data->primitiveCount)
{
instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset;
global const struct GRL_RAYTRACING_INSTANCE_DESC** instances =
(global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in;
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
if (pIsProcedural[instanceIndex])
{
procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
}
primrefs_from_instances(
globals,
bvh,
instance,
instanceIndex,
primrefs,
procedural_bb,
allowUpdate);
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel
TS_update_instance_leaves(global struct BVHBase* bvh,
uint64_t dxrInstancesArray,
uint64_t dxrInstancesPtr,
global struct AABB3f* instance_aabb_scratch,
global uchar* aabbs,
global uchar* is_procedural,
dword aabb_stride
)
{
uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
if (id >= num_leaves)
return;
struct HwInstanceLeaf* leaves = BVHBase_GetHWInstanceLeaves(bvh);
uint idx = HwInstanceLeaf_GetInstanceIndex(&leaves[id]);
global GRL_RAYTRACING_AABB* procedural_box = 0;
if (is_procedural[idx])
{
procedural_box = (global GRL_RAYTRACING_AABB*)(aabbs + (aabb_stride * idx));
}
DO_update_instance_leaves(
bvh,
dxrInstancesArray,
dxrInstancesPtr,
instance_aabb_scratch,
id,
procedural_box);
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel
TS_fixup_leaves( global struct BVHBase* bvh,
global uchar* primref_index,
global PrimRef* primrefs,
uint stride )
{
uint num_inners = BVHBase_GetNumInternalNodes(bvh);
uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
// assign 8 lanes to each inner node, 6 of which will do useful work
uint node_id = id / 8;
uint child_id = id % 8;
bool node_valid = (node_id < num_inners);
if (node_valid )
{
global InternalNode* nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh);
global InternalNode* my_node = nodes + node_id;
if (my_node->nodeType == BVH_INSTANCE_NODE)
{
bool child_valid = (child_id < 6) && InternalNode_IsChildValid(my_node, child_id);
if (child_valid)
{
global HwInstanceLeaf* leaves = (global HwInstanceLeaf*)InternalNode_GetChildren(my_node);
uint leafIndex = (leaves - BVHBase_GetHWInstanceLeaves(bvh)) + child_id;
const uint primrefID = *(uint*)(primref_index + leafIndex * stride);
uint type = PRIMREF_isProceduralInstance(&primrefs[primrefID]) ?
BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
InternalNode_SetChildType(my_node, child_id, type);
}
if (child_id == 0)
my_node->nodeType = BVH_INTERNAL_NODE;
}
}
}
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel
TS_Refit_per_one_startpoint_sg(
global struct BVHBase* bvh,
global struct AABB3f* instance_leaf_aabbs,
global uchar* procedural_instance_enable_buffer )
{
DO_Refit_per_one_startpoint_sg(bvh, (global GRL_RAYTRACING_GEOMETRY_DESC*) bvh, instance_leaf_aabbs, procedural_instance_enable_buffer );
}

View file

@ -1,244 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module traversal_shader;
kernel_module morton_kernels ("traversal_shader.cl")
{
links lsc_intrinsics;
kernel TS_primrefs_from_instances < kernelFunction = "TS_primrefs_from_instances" >;
kernel TS_primrefs_from_instances_indirect < kernelFunction = "TS_primrefs_from_instances_indirect" >;
kernel TS_primrefs_from_instances_ptrs < kernelFunction = "TS_primrefs_from_instances_pointers" >;
kernel TS_primrefs_from_instances_ptrs_indirect < kernelFunction = "TS_primrefs_from_instances_pointers_indirect" >;
kernel TS_update_instance_leaves < kernelFunction = "TS_update_instance_leaves" >;
kernel TS_Refit_per_one_startpoint_sg < kernelFunction = "TS_Refit_per_one_startpoint_sg" >;
kernel TS_fixup_leaves < kernelFunction = "TS_fixup_leaves" >;
}
struct MKTSBuildArgs
{
qword build_globals;
qword bvh_buffer;
qword instance_descs;
qword build_primref_buffer;
qword aabb_buffer;
qword is_procedural_buffer;
qword leaf_creation_index_buffer;
dword aabb_stride;
dword num_instances;
dword leaf_creation_index_stride;
};
const BUILD_PRIMREFS_GROUPSIZE = 16;
metakernel TS_build_primrefs( MKTSBuildArgs build_state, dword allowUpdate )
{
define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE);
dispatch TS_primrefs_from_instances(num_groups, 1, 1) args(
build_state.build_globals,
build_state.bvh_buffer,
build_state.instance_descs,
build_state.num_instances,
build_state.build_primref_buffer,
build_state.aabb_buffer,
build_state.is_procedural_buffer,
build_state.aabb_stride,
allowUpdate
);
}
metakernel TS_build_primrefs_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate)
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1
C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect TS_primrefs_from_instances_indirect args(
build_state.build_globals,
build_state.bvh_buffer,
build_state.instance_descs,
build_state.build_primref_buffer,
build_state.aabb_buffer,
build_state.is_procedural_buffer,
build_state.aabb_stride,
allowUpdate,
indirectBuildRangeInfo
);
}
metakernel TS_build_primrefs_array_of_pointers( MKTSBuildArgs build_state, dword allowUpdate )
{
define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE);
dispatch TS_primrefs_from_instances_ptrs(num_groups, 1, 1) args(
build_state.build_globals,
build_state.bvh_buffer,
build_state.instance_descs,
build_state.num_instances,
build_state.build_primref_buffer,
build_state.aabb_buffer,
build_state.is_procedural_buffer,
build_state.aabb_stride,
allowUpdate
);
}
metakernel
TS_build_primrefs_array_of_pointers_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate)
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1
C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect TS_primrefs_from_instances_ptrs_indirect args(
build_state.build_globals,
build_state.bvh_buffer,
build_state.instance_descs,
build_state.build_primref_buffer,
build_state.aabb_buffer,
build_state.is_procedural_buffer,
build_state.aabb_stride,
allowUpdate,
indirectBuildRangeInfo
);
}
const UPDATE_INSTANCE_LEAVES_GROUPSIZE = 16;
struct MKTSUpdateArgs
{
qword bvh_buffer;
qword instance_descs;
qword instance_descs_ptrs;
qword aabb_buffer;
qword is_procedural_buffer;
qword refit_scratch;
dword aabb_stride;
dword num_instances;
};
metakernel TS_update_instance_leaves( MKTSUpdateArgs update_state )
{
define num_groups((update_state.num_instances + UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1) / UPDATE_INSTANCE_LEAVES_GROUPSIZE);
dispatch TS_update_instance_leaves(num_groups, 1, 1) args(
update_state.bvh_buffer,
update_state.instance_descs,
update_state.instance_descs_ptrs,
update_state.refit_scratch,
update_state.aabb_buffer,
update_state.is_procedural_buffer,
update_state.aabb_stride
);
}
metakernel TS_update_instance_leaves_indirect( MKTSUpdateArgs update_state, qword indirectBuildRangeInfo )
{
define num_groups REG0;
define groupsize_1 REG1; // groupsize - 1
define C_4 REG2;
// init with primitiveCount
num_groups = load_dword(indirectBuildRangeInfo);
groupsize_1 = 15; // UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1
C_4 = 4; // log_2(UPDATE_INSTANCE_LEAVES_GROUPSIZE)
num_groups = num_groups + groupsize_1;
num_groups = num_groups >> C_4; // num_groups / UPDATE_INSTANCE_LEAVES_GROUPSIZE;
DISPATCHDIM_X = num_groups.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
// need to add indirect offset?
dispatch_indirect TS_update_instance_leaves args(
update_state.bvh_buffer,
update_state.instance_descs,
update_state.instance_descs_ptrs,
update_state.refit_scratch,
update_state.aabb_buffer,
update_state.is_procedural_buffer,
update_state.aabb_stride
);
}
metakernel TS_refit(MKTSUpdateArgs update_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end )
{
REG0 = bvh_inner_nodes_start_value;
REG1.lo = load_dword(bvh_inner_nodes_end);
REG1.hi = 0;
REG2 = REG1 - REG0;
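// REG2 = number of inner nodes (end - start); one group is dispatched per refit startpoint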
DISPATCHDIM_X = REG2.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect TS_Refit_per_one_startpoint_sg
args(
update_state.bvh_buffer,
update_state.refit_scratch,
update_state.is_procedural_buffer
);
}
const FIXUP_LEAVES_NODES_PER_GROUP = 2;
metakernel TS_fixup_leaves(MKTSBuildArgs build_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end )
{
define ONE REG3;
ONE = 1;
REG0 = bvh_inner_nodes_start_value;
REG1.lo = load_dword(bvh_inner_nodes_end);
REG1.hi = 0;
REG2 = REG1 - REG0;
REG2 = REG2 + ONE;
REG2 = REG2 >> ONE;
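// DISPATCHDIM_X = ceil(inner_node_count / FIXUP_LEAVES_NODES_PER_GROUP)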
DISPATCHDIM_X = REG2.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect TS_fixup_leaves
args(
build_state.bvh_buffer,
build_state.leaf_creation_index_buffer,
build_state.build_primref_buffer,
build_state.leaf_creation_index_stride
);
}

View file

@ -1,226 +0,0 @@
COPYRIGHT = """\
/*
* Copyright 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
"""
import argparse
import os
from grl_parser import parse_grl_file
from mako.template import Template
TEMPLATE_H = Template(COPYRIGHT + """
/* This file is generated from ${filename}, don't edit directly. */
#ifndef GRL_CL_KERNEL_H
#define GRL_CL_KERNEL_H
#include "genxml/gen_macros.h"
#include "compiler/brw_kernel.h"
#ifdef __cplusplus
extern "C" {
#endif
enum grl_cl_kernel {
% for k in kernels:
GRL_CL_KERNEL_${k.upper()},
% endfor
GRL_CL_KERNEL_MAX,
};
const char *genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel);
const char *genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id);
void genX(grl_get_cl_kernel)(struct brw_kernel *kernel, enum grl_cl_kernel id);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* GRL_CL_KERNEL_H */
""")
TEMPLATE_C = Template(COPYRIGHT + """
/* This file is generated from ${filename}, don't edit directly. */
#include "grl_cl_kernel.h"
% for k in kernels:
#include "${prefix}_${k}.h"
% endfor
const char *
genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel)
{
switch (kernel) {
% for k in kernels:
case GRL_CL_KERNEL_${k.upper()}: return "${k}";
% endfor
default: return "unknown";
}
}
const char *
genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id)
{
switch (id) {
% for k in kernels:
case GRL_CL_KERNEL_${k.upper()}: return ${prefix}_${k}_sha1;
% endfor
default:
unreachable("Invalid GRL kernel enum");
}
};
void
${prefix}_grl_get_cl_kernel(struct brw_kernel *kernel, enum grl_cl_kernel id)
{
switch (id) {
% for k in kernels:
case GRL_CL_KERNEL_${k.upper()}:
*kernel = ${prefix}_${k};
break;
% endfor
default:
unreachable("Invalid GRL kernel enum");
}
}
""")
def get_libraries_files(kernel_module):
lib_files = []
for item in kernel_module[3]:
if item[0] != 'library':
continue
default_file = None
fallback_file = None
path_directory = None
for props in item[2]:
if props[0] == 'fallback':
fallback_file = props[1]
elif props[0] == 'default':
default_file = props[1]
elif props[0] == 'path':
path_directory = props[1]
assert path_directory
assert default_file or fallback_file
if fallback_file:
lib_files.append(os.path.join(path_directory, fallback_file))
else:
lib_files.append(os.path.join(path_directory, default_file))
return lib_files
def add_kernels(kernels, cl_file, entrypoint, libs):
assert cl_file.endswith('.cl')
for lib_file in libs:
assert lib_file.endswith('.cl')
kernels.append((cl_file, entrypoint, ','.join(libs)))
def get_kernels(grl_nodes):
kernels = []
for item in grl_nodes:
assert isinstance(item, tuple)
if item[0] == 'kernel':
ann = item[2]
add_kernels(kernels, ann['source'], ann['kernelFunction'], [])
elif item[0] == 'kernel-module':
cl_file = item[2]
libfiles = get_libraries_files(item)
for kernel_def in item[3]:
if kernel_def[0] == 'kernel':
ann = kernel_def[2]
add_kernels(kernels, cl_file, ann['kernelFunction'], libfiles)
return kernels
def parse_libraries(filenames):
libraries = {}
for fname in filenames:
lib_package = parse_grl_file(fname, [])
for lib in lib_package:
assert lib[0] == 'library'
# Add the directory of the library so that CL files can be found.
lib[2].append(('path', os.path.dirname(fname)))
libraries[lib[1]] = lib
return libraries
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--out-c', help='Output C file')
parser.add_argument('--out-h', help='Output H file')
parser.add_argument('--ls-kernels', action='store_const', const=True,
help='List all openCL kernels')
parser.add_argument('--prefix', help='Prefix')
parser.add_argument('--library', dest='libraries', action='append',
default=[], help='Libraries to include')
parser.add_argument('files', type=str, nargs='*', help='GRL files')
args = parser.parse_args()
libraries = parse_libraries(args.libraries)
kernels = []
for fname in args.files:
kernels += get_kernels(parse_grl_file(fname, libraries))
# Make the list of kernels unique and sorted
kernels = sorted(list(set(kernels)))
if args.ls_kernels:
for cl_file, entrypoint, libs in kernels:
if not os.path.isabs(cl_file):
cl_file = os.path.join(os.path.dirname(fname), cl_file)
print('{}:{}:{}'.format(cl_file, entrypoint, libs))
kernel_c_names = []
for cl_file, entrypoint, libs in kernels:
cl_file = os.path.splitext(cl_file)[0]
cl_file_name = cl_file.replace('/', '_')
kernel_c_names.append('_'.join([cl_file_name, entrypoint]))
try:
if args.out_h:
with open(args.out_h, 'w', encoding='utf-8') as f:
f.write(TEMPLATE_H.render(kernels=kernel_c_names,
filename=os.path.basename(__file__)))
if args.out_c:
with open(args.out_c, 'w', encoding='utf-8') as f:
f.write(TEMPLATE_C.render(kernels=kernel_c_names,
prefix=args.prefix,
filename=os.path.basename(__file__)))
except Exception:
        # If an error occurs, import some helpers from mako to print a useful
        # stack trace and exit with status 1 when python runs in debug mode
        # (__debug__); otherwise just re-raise the exception.
if __debug__:
import sys
from mako import exceptions
sys.stderr.write(exceptions.text_error_template().render() + '\n')
sys.exit(1)
raise
if __name__ == '__main__':
main()
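# Example invocation (added for illustration; the script and file names below
# are hypothetical, not taken from the build system):
#
#   python grl_cl_kernel_gen.py --prefix gfx125 --library library.grl \
#       --out-h grl_cl_kernel.h --out-c grl_cl_kernel.c input.grl
#
# With --ls-kernels it instead prints one "<cl file>:<entrypoint>:<libs>" line
# per kernel, which a build system can consume to drive the OpenCL
# compilation step.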

View file

@@ -1,930 +0,0 @@
#!/usr/bin/env python
COPYRIGHT = """\
/*
* Copyright 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
"""
import argparse
import os.path
import re
import sys
from grl_parser import parse_grl_file
class Writer(object):
def __init__(self, file):
self._file = file
self._indent = 0
self._new_line = True
def push_indent(self, levels=4):
self._indent += levels
def pop_indent(self, levels=4):
self._indent -= levels
def write(self, s, *fmt):
if self._new_line:
s = '\n' + s
self._new_line = False
if s.endswith('\n'):
self._new_line = True
s = s[:-1]
if fmt:
s = s.format(*fmt)
self._file.write(s.replace('\n', '\n' + ' ' * self._indent))
# Internal Representation
class Value(object):
def __init__(self, name=None, zone=None):
self.name = name
self._zone = zone
self.live = False
@property
def zone(self):
assert self._zone is not None
return self._zone
def is_reg(self):
return False
def c_val(self):
if not self.name:
print(self)
assert self.name
return self.name
def c_cpu_val(self):
assert self.zone == 'cpu'
return self.c_val()
def c_gpu_val(self):
if self.zone == 'gpu':
return self.c_val()
else:
return 'mi_imm({})'.format(self.c_cpu_val())
class Constant(Value):
def __init__(self, value):
super().__init__(zone='cpu')
self.value = value
def c_val(self):
if self.value < 100:
return str(self.value)
elif self.value < (1 << 32):
return '0x{:x}u'.format(self.value)
else:
return '0x{:x}ull'.format(self.value)
class Register(Value):
def __init__(self, name):
super().__init__(name=name, zone='gpu')
def is_reg(self):
return True
class FixedGPR(Register):
def __init__(self, num):
super().__init__('REG{}'.format(num))
self.num = num
def write_c(self, w):
w.write('UNUSED struct mi_value {} = mi_reserve_gpr(&b, {});\n',
self.name, self.num)
class GroupSizeRegister(Register):
def __init__(self, comp):
super().__init__('DISPATCHDIM_' + 'XYZ'[comp])
self.comp = comp
class Member(Value):
def __init__(self, value, member):
super().__init__(zone=value.zone)
self.value = value
self.member = member
def is_reg(self):
return self.value.is_reg()
def c_val(self):
c_val = self.value.c_val()
if self.zone == 'gpu':
assert isinstance(self.value, Register)
if self.member == 'hi':
return 'mi_value_half({}, true)'.format(c_val)
elif self.member == 'lo':
return 'mi_value_half({}, false)'.format(c_val)
else:
assert False, 'Invalid member: {}'.format(self.member)
else:
return '.'.join([c_val, self.member])
class OffsetOf(Value):
def __init__(self, mk, expr):
super().__init__(zone='cpu')
assert isinstance(expr, tuple) and expr[0] == 'member'
self.type = mk.m.get_type(expr[1])
self.field = expr[2]
def c_val(self):
return 'offsetof({}, {})'.format(self.type.c_name, self.field)
class Scope(object):
def __init__(self, m, mk, parent):
self.m = m
self.mk = mk
self.parent = parent
self.defs = {}
def add_def(self, d, name=None):
if name is None:
name = d.name
assert name not in self.defs
self.defs[name] = d
def get_def(self, name):
if name in self.defs:
return self.defs[name]
assert self.parent, 'Unknown definition: "{}"'.format(name)
return self.parent.get_def(name)
class Statement(object):
def __init__(self, srcs=[]):
assert isinstance(srcs, (list, tuple))
self.srcs = list(srcs)
class SSAStatement(Statement, Value):
_count = 0
def __init__(self, zone, srcs):
Statement.__init__(self, srcs)
Value.__init__(self, None, zone)
self.c_name = '_tmp{}'.format(SSAStatement._count)
SSAStatement._count += 1
def c_val(self):
return self.c_name
def write_c_refs(self, w):
assert self.zone == 'gpu'
assert self.uses > 0
if self.uses > 1:
w.write('mi_value_add_refs(&b, {}, {});\n',
self.c_name, self.uses - 1)
class Half(SSAStatement):
def __init__(self, value, half):
assert half in ('hi', 'lo')
super().__init__(None, [value])
self.half = half
@property
def zone(self):
return self.srcs[0].zone
def write_c(self, w):
assert self.half in ('hi', 'lo')
if self.zone == 'cpu':
if self.half == 'hi':
w.write('uint32_t {} = (uint64_t)({}) >> 32;\n',
self.c_name, self.srcs[0].c_cpu_val())
else:
w.write('uint32_t {} = {};\n',
self.c_name, self.srcs[0].c_cpu_val())
else:
if self.half == 'hi':
w.write('struct mi_value {} = mi_value_half({}, true);\n',
self.c_name, self.srcs[0].c_gpu_val())
else:
w.write('struct mi_value {} = mi_value_half({}, false);\n',
self.c_name, self.srcs[0].c_gpu_val())
self.write_c_refs(w)
class Expression(SSAStatement):
def __init__(self, mk, op, *srcs):
super().__init__(None, srcs)
self.op = op
@property
def zone(self):
zone = 'cpu'
for s in self.srcs:
if s.zone == 'gpu':
zone = 'gpu'
return zone
def write_c(self, w):
if self.zone == 'cpu':
c_cpu_vals = [s.c_cpu_val() for s in self.srcs]
# There is one bitfield that is a uint64_t, but only holds 2 bits.
# In practice we won't overflow, but let's help the compiler (and
# coverity) out here.
if self.op == '<<':
w.write(f'assume({c_cpu_vals[0]} < (1 << 8));')
w.write('uint64_t {} = ', self.c_name)
if len(self.srcs) == 1:
w.write('({} {})', self.op, c_cpu_vals[0])
elif len(self.srcs) == 2:
w.write('({} {} {})', c_cpu_vals[0], self.op, c_cpu_vals[1])
else:
                assert len(self.srcs) == 3 and self.op == '?'
w.write('({} ? {} : {})', *c_cpu_vals)
w.write(';\n')
return
w.write('struct mi_value {} = ', self.c_name)
if self.op == '~':
w.write('mi_inot(&b, {});\n', self.srcs[0].c_gpu_val())
elif self.op == '+':
w.write('mi_iadd(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
elif self.op == '-':
w.write('mi_isub(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
elif self.op == '&':
w.write('mi_iand(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
elif self.op == '|':
w.write('mi_ior(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
elif self.op == '<<':
if self.srcs[1].zone == 'cpu':
w.write('mi_ishl_imm(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val())
else:
w.write('mi_ishl(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
elif self.op == '>>':
if self.srcs[1].zone == 'cpu':
w.write('mi_ushr_imm(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val())
else:
w.write('mi_ushr(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
elif self.op == '==':
w.write('mi_ieq(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
elif self.op == '<':
w.write('mi_ult(&b, {}, {});\n',
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
elif self.op == '>':
w.write('mi_ult(&b, {}, {});\n',
self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val())
elif self.op == '<=':
w.write('mi_uge(&b, {}, {});\n',
self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val())
else:
assert False, 'Unknown expression opcode: {}'.format(self.op)
self.write_c_refs(w)
class StoreReg(Statement):
def __init__(self, mk, reg, value):
super().__init__([mk.load_value(value)])
self.reg = mk.parse_value(reg)
assert self.reg.is_reg()
def write_c(self, w):
value = self.srcs[0]
w.write('mi_store(&b, {}, {});\n',
self.reg.c_gpu_val(), value.c_gpu_val())
class LoadMem(SSAStatement):
def __init__(self, mk, bit_size, addr):
super().__init__('gpu', [mk.load_value(addr)])
self.bit_size = bit_size
def write_c(self, w):
addr = self.srcs[0]
w.write('struct mi_value {} = ', self.c_name)
if addr.zone == 'cpu':
w.write('mi_mem{}(anv_address_from_u64({}));\n',
self.bit_size, addr.c_cpu_val())
else:
assert self.bit_size == 64
w.write('mi_load_mem64_offset(&b, anv_address_from_u64(0), {});\n',
addr.c_gpu_val())
self.write_c_refs(w)
class StoreMem(Statement):
def __init__(self, mk, bit_size, addr, src):
super().__init__([mk.load_value(addr), mk.load_value(src)])
self.bit_size = bit_size
def write_c(self, w):
addr, data = tuple(self.srcs)
if addr.zone == 'cpu':
w.write('mi_store(&b, mi_mem{}(anv_address_from_u64({})), {});\n',
self.bit_size, addr.c_cpu_val(), data.c_gpu_val())
else:
assert self.bit_size == 64
w.write('mi_store_mem64_offset(&b, anv_address_from_u64(0), {}, {});\n',
addr.c_gpu_val(), data.c_gpu_val())
class GoTo(Statement):
def __init__(self, mk, target_id, cond=None, invert=False):
cond = [mk.load_value(cond)] if cond is not None else []
super().__init__(cond)
self.target_id = target_id
self.invert = invert
self.mk = mk
def write_c(self, w):
# Now that we've parsed the entire metakernel, we can look up the
# actual target from the id
target = self.mk.get_goto_target(self.target_id)
if self.srcs:
cond = self.srcs[0]
if self.invert:
w.write('mi_goto_if(&b, mi_inot(&b, {}), &{});\n', cond.c_gpu_val(), target.c_name)
else:
w.write('mi_goto_if(&b, {}, &{});\n', cond.c_gpu_val(), target.c_name)
else:
w.write('mi_goto(&b, &{});\n', target.c_name)
class GoToTarget(Statement):
def __init__(self, mk, name):
super().__init__()
self.name = name
self.c_name = '_goto_target_' + name
self.goto_tokens = []
        mk.add_goto_target(self)
def write_decl(self, w):
w.write('struct mi_goto_target {} = MI_GOTO_TARGET_INIT;\n',
self.c_name)
def write_c(self, w):
w.write('mi_goto_target(&b, &{});\n', self.c_name)
class Dispatch(Statement):
def __init__(self, mk, kernel, group_size, args, postsync):
if group_size is None:
srcs = [mk.scope.get_def('DISPATCHDIM_{}'.format(d)) for d in 'XYZ']
else:
srcs = [mk.load_value(s) for s in group_size]
srcs += [mk.load_value(a) for a in args]
super().__init__(srcs)
self.kernel = mk.m.kernels[kernel]
self.indirect = group_size is None
self.postsync = postsync
def write_c(self, w):
w.write('{\n')
w.push_indent()
group_size = self.srcs[:3]
args = self.srcs[3:]
if not self.indirect:
w.write('const uint32_t _group_size[3] = {{ {}, {}, {} }};\n',
*[s.c_cpu_val() for s in group_size])
gs = '_group_size'
else:
gs = 'NULL'
w.write('const struct anv_kernel_arg _args[] = {\n')
w.push_indent()
for arg in args:
w.write('{{ .u64 = {} }},\n', arg.c_cpu_val())
w.pop_indent()
w.write('};\n')
w.write('genX(grl_dispatch)(cmd_buffer, {},\n', self.kernel.c_name)
w.write(' {}, ARRAY_SIZE(_args), _args);\n', gs)
w.pop_indent()
w.write('}\n')
class SemWait(Statement):
def __init__(self, scope, wait):
super().__init__()
self.wait = wait
class Control(Statement):
def __init__(self, scope, wait):
super().__init__()
self.wait = wait
def write_c(self, w):
w.write('cmd_buffer->state.pending_pipe_bits |=\n')
w.write(' ANV_PIPE_CS_STALL_BIT |\n')
w.write(' ANV_PIPE_DATA_CACHE_FLUSH_BIT |\n')
w.write(' ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;\n')
w.write('genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);\n')
TYPE_REMAPS = {
'dword' : 'uint32_t',
'qword' : 'uint64_t',
}
class Module(object):
def __init__(self, grl_dir, elems):
assert isinstance(elems[0], tuple)
assert elems[0][0] == 'module-name'
self.grl_dir = grl_dir
self.name = elems[0][1]
self.kernels = {}
self.structs = {}
self.constants = []
self.metakernels = []
self.regs = {}
scope = Scope(self, None, None)
for e in elems[1:]:
if e[0] == 'kernel':
k = Kernel(self, *e[1:])
assert k.name not in self.kernels
self.kernels[k.name] = k
elif e[0] == 'kernel-module':
m = KernelModule(self, *e[1:])
for k in m.kernels:
assert k.name not in self.kernels
self.kernels[k.name] = k
elif e[0] == 'struct':
s = Struct(self, *e[1:])
assert s.name not in self.kernels
self.structs[s.name] = s
elif e[0] == 'named-constant':
c = NamedConstant(*e[1:])
scope.add_def(c)
self.constants.append(c)
elif e[0] == 'meta-kernel':
mk = MetaKernel(self, scope, *e[1:])
self.metakernels.append(mk)
elif e[0] == 'import':
assert e[2] == 'struct'
self.import_struct(e[1], e[3])
else:
                assert False, 'Invalid module-level token: {}'.format(e[0])
def import_struct(self, filename, struct_name):
elems = parse_grl_file(os.path.join(self.grl_dir, filename), [])
assert elems
for e in elems[1:]:
if e[0] == 'struct' and e[1] == struct_name:
s = Struct(self, *e[1:])
assert s.name not in self.kernels
self.structs[s.name] = s
return
assert False, "Struct {0} not found in {1}".format(struct_name, filename)
def get_type(self, name):
if name in self.structs:
return self.structs[name]
return BasicType(TYPE_REMAPS.get(name, name))
def get_fixed_gpr(self, num):
assert isinstance(num, int)
if num in self.regs:
return self.regs[num]
reg = FixedGPR(num)
self.regs[num] = reg
return reg
def optimize(self):
progress = True
while progress:
progress = False
# Copy Propagation
for mk in self.metakernels:
if mk.opt_copy_prop():
progress = True
# Dead Code Elimination
for r in self.regs.values():
r.live = False
for c in self.constants:
c.live = False
for mk in self.metakernels:
mk.opt_dead_code1()
for mk in self.metakernels:
if mk.opt_dead_code2():
progress = True
for n in list(self.regs.keys()):
if not self.regs[n].live:
del self.regs[n]
progress = True
self.constants = [c for c in self.constants if c.live]
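    # Illustrative example (added, not from the original sources): for
    # metakernel statements equivalent to
    #     REG2 = 4;  store_qword(some_addr, REG2);
    # copy propagation replaces the REG2 operand of the store with the
    # constant 4, dead-code elimination then drops the now-unread store to
    # REG2, and compact_regs() stops reserving that GPR altogether.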
def compact_regs(self):
old_regs = self.regs
self.regs = {}
for i, reg in enumerate(old_regs.values()):
reg.num = i
self.regs[i] = reg
def write_h(self, w):
for s in self.structs.values():
s.write_h(w)
for mk in self.metakernels:
mk.write_h(w)
def write_c(self, w):
for c in self.constants:
c.write_c(w)
for mk in self.metakernels:
mk.write_c(w)
class Kernel(object):
def __init__(self, m, name, ann):
self.name = name
self.source_file = ann['source']
self.kernel_name = self.source_file.replace('/', '_')[:-3].upper()
self.entrypoint = ann['kernelFunction']
assert self.source_file.endswith('.cl')
self.c_name = '_'.join([
'GRL_CL_KERNEL',
self.kernel_name,
self.entrypoint.upper(),
])
class KernelModule(object):
def __init__(self, m, name, source, kernels):
self.name = name
self.kernels = []
self.libraries = []
for k in kernels:
if k[0] == 'kernel':
k[2]['source'] = source
self.kernels.append(Kernel(m, *k[1:]))
elif k[0] == 'library':
# Skip this for now.
pass
class BasicType(object):
def __init__(self, name):
self.name = name
self.c_name = name
class Struct(object):
def __init__(self, m, name, fields, align):
assert align == 0
self.name = name
self.c_name = 'struct ' + '_'.join(['grl', m.name, self.name])
self.fields = [(m.get_type(t), n) for t, n in fields]
def write_h(self, w):
w.write('{} {{\n', self.c_name)
w.push_indent()
for f in self.fields:
w.write('{} {};\n', f[0].c_name, f[1])
w.pop_indent()
w.write('};\n')
class NamedConstant(Value):
def __init__(self, name, value):
super().__init__(name, 'cpu')
self.name = name
self.value = Constant(value)
self.written = False
def set_module(self, m):
pass
def write_c(self, w):
if self.written:
return
w.write('static const uint64_t {} = {};\n',
self.name, self.value.c_val())
self.written = True
class MetaKernelParameter(Value):
def __init__(self, mk, type, name):
super().__init__(name, 'cpu')
self.type = mk.m.get_type(type)
class MetaKernel(object):
def __init__(self, m, m_scope, name, params, ann, statements):
self.m = m
self.name = name
self.c_name = '_'.join(['grl', m.name, self.name])
self.goto_targets = {}
self.num_tmps = 0
mk_scope = Scope(m, self, m_scope)
self.params = [MetaKernelParameter(self, *p) for p in params]
for p in self.params:
mk_scope.add_def(p)
mk_scope.add_def(GroupSizeRegister(0), name='DISPATCHDIM_X')
mk_scope.add_def(GroupSizeRegister(1), name='DISPATCHDIM_Y')
mk_scope.add_def(GroupSizeRegister(2), name='DISPATCHDIM_Z')
self.statements = []
self.parse_stmt(mk_scope, statements)
self.scope = None
def get_tmp(self):
tmpN = '_tmp{}'.format(self.num_tmps)
self.num_tmps += 1
return tmpN
def add_stmt(self, stmt):
self.statements.append(stmt)
return stmt
def parse_value(self, v):
if isinstance(v, Value):
return v
elif isinstance(v, str):
if re.match(r'REG\d+', v):
return self.m.get_fixed_gpr(int(v[3:]))
else:
return self.scope.get_def(v)
elif isinstance(v, int):
return Constant(v)
elif isinstance(v, tuple):
if v[0] == 'member':
return Member(self.parse_value(v[1]), v[2])
elif v[0] == 'offsetof':
return OffsetOf(self, v[1])
else:
op = v[0]
srcs = [self.parse_value(s) for s in v[1:]]
return self.add_stmt(Expression(self, op, *srcs))
else:
assert False, 'Invalid value: {}'.format(v[0])
def load_value(self, v):
v = self.parse_value(v)
if isinstance(v, Member) and v.zone == 'gpu':
v = self.add_stmt(Half(v.value, v.member))
return v
def parse_stmt(self, scope, s):
self.scope = scope
if isinstance(s, list):
subscope = Scope(self.m, self, scope)
for stmt in s:
self.parse_stmt(subscope, stmt)
elif s[0] == 'define':
scope.add_def(self.parse_value(s[2]), name=s[1])
elif s[0] == 'assign':
self.add_stmt(StoreReg(self, *s[1:]))
elif s[0] == 'dispatch':
self.add_stmt(Dispatch(self, *s[1:]))
elif s[0] == 'load-dword':
v = self.add_stmt(LoadMem(self, 32, s[2]))
self.add_stmt(StoreReg(self, s[1], v))
elif s[0] == 'load-qword':
v = self.add_stmt(LoadMem(self, 64, s[2]))
self.add_stmt(StoreReg(self, s[1], v))
elif s[0] == 'store-dword':
self.add_stmt(StoreMem(self, 32, *s[1:]))
elif s[0] == 'store-qword':
self.add_stmt(StoreMem(self, 64, *s[1:]))
elif s[0] == 'goto':
self.add_stmt(GoTo(self, s[1]))
elif s[0] == 'goto-if':
self.add_stmt(GoTo(self, s[1], s[2]))
elif s[0] == 'goto-if-not':
self.add_stmt(GoTo(self, s[1], s[2], invert=True))
elif s[0] == 'label':
self.add_stmt(GoToTarget(self, s[1]))
elif s[0] == 'control':
self.add_stmt(Control(self, s[1]))
elif s[0] == 'sem-wait-while':
self.add_stmt(Control(self, s[1]))
else:
assert False, 'Invalid statement: {}'.format(s[0])
def add_goto_target(self, t):
assert t.name not in self.goto_targets
self.goto_targets[t.name] = t
def get_goto_target(self, name):
return self.goto_targets[name]
def opt_copy_prop(self):
progress = False
copies = {}
for stmt in self.statements:
for i in range(len(stmt.srcs)):
src = stmt.srcs[i]
if isinstance(src, FixedGPR) and src.num in copies:
stmt.srcs[i] = copies[src.num]
progress = True
if isinstance(stmt, StoreReg):
reg = stmt.reg
if isinstance(reg, Member):
reg = reg.value
if isinstance(reg, FixedGPR):
copies.pop(reg.num, None)
if not stmt.srcs[0].is_reg():
copies[reg.num] = stmt.srcs[0]
elif isinstance(stmt, (GoTo, GoToTarget)):
copies = {}
return progress
def opt_dead_code1(self):
for stmt in self.statements:
# Mark every register which is read as live
for src in stmt.srcs:
if isinstance(src, Register):
src.live = True
# Initialize every SSA statement to dead
if isinstance(stmt, SSAStatement):
stmt.live = False
def opt_dead_code2(self):
def yield_live(statements):
gprs_read = set(self.m.regs.keys())
for stmt in statements:
if isinstance(stmt, SSAStatement):
if not stmt.live:
continue
elif isinstance(stmt, StoreReg):
reg = stmt.reg
if isinstance(reg, Member):
reg = reg.value
                    if not reg.live:
continue
if isinstance(reg, FixedGPR):
if reg.num in gprs_read:
gprs_read.remove(reg.num)
else:
continue
elif isinstance(stmt, (GoTo, GoToTarget)):
gprs_read = set(self.m.regs.keys())
for src in stmt.srcs:
src.live = True
if isinstance(src, FixedGPR):
gprs_read.add(src.num)
yield stmt
old_stmt_list = self.statements
old_stmt_list.reverse()
self.statements = list(yield_live(old_stmt_list))
self.statements.reverse()
return len(self.statements) != len(old_stmt_list)
def count_ssa_value_uses(self):
for stmt in self.statements:
if isinstance(stmt, SSAStatement):
stmt.uses = 0
for src in stmt.srcs:
if isinstance(src, SSAStatement):
src.uses += 1
def write_h(self, w):
w.write('void\n')
w.write('genX({})(\n', self.c_name)
w.push_indent()
w.write('struct anv_cmd_buffer *cmd_buffer')
for p in self.params:
w.write(',\n{} {}', p.type.c_name, p.name)
w.write(');\n')
w.pop_indent()
def write_c(self, w):
w.write('void\n')
w.write('genX({})(\n', self.c_name)
w.push_indent()
w.write('struct anv_cmd_buffer *cmd_buffer')
for p in self.params:
w.write(',\n{} {}', p.type.c_name, p.name)
w.write(')\n')
w.pop_indent()
w.write('{\n')
w.push_indent()
w.write('struct mi_builder b;\n')
w.write('mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);\n')
        w.write('/* TODO: use anv_mocs? */\n')
        w.write('const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);\n')
        w.write('mi_builder_set_mocs(&b, mocs);\n')
w.write('\n')
for r in self.m.regs.values():
r.write_c(w)
w.write('\n')
for t in self.goto_targets.values():
t.write_decl(w)
w.write('\n')
self.count_ssa_value_uses()
for s in self.statements:
s.write_c(w)
w.pop_indent()
w.write('}\n')
HEADER_PROLOGUE = COPYRIGHT + '''
#include "anv_private.h"
#include "grl/genX_grl.h"
#ifndef {0}
#define {0}
#ifdef __cplusplus
extern "C" {{
#endif
'''
HEADER_EPILOGUE = '''
#ifdef __cplusplus
}}
#endif
#endif /* {0} */
'''
C_PROLOGUE = COPYRIGHT + '''
#include "{0}"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/genX_rt_pack.h"
#include "genX_mi_builder.h"
#define MI_PREDICATE_RESULT mi_reg32(0x2418)
#define DISPATCHDIM_X mi_reg32(0x2500)
#define DISPATCHDIM_Y mi_reg32(0x2504)
#define DISPATCHDIM_Z mi_reg32(0x2508)
'''
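# Note (added): 0x2500/0x2504/0x2508 are the GPGPU dispatch-dimension
# registers and 0x2418 is MI_PREDICATE_RESULT, so an mi_store() to
# DISPATCHDIM_X/Y/Z programs the workgroup counts consumed by the following
# indirect dispatch.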
def parse_libraries(filenames):
libraries = {}
for fname in filenames:
lib_package = parse_grl_file(fname, [])
for lib in lib_package:
assert lib[0] == 'library'
# Add the directory of the library so that CL files can be found.
lib[2].append(('path', os.path.dirname(fname)))
libraries[lib[1]] = lib
return libraries
def main():
argparser = argparse.ArgumentParser()
argparser.add_argument('--out-c', help='Output C file')
    argparser.add_argument('--out-h', help='Output H file')
argparser.add_argument('--library', dest='libraries', action='append',
default=[], help='Libraries to include')
argparser.add_argument('grl', help="Input file")
args = argparser.parse_args()
grl_dir = os.path.dirname(args.grl)
libraries = parse_libraries(args.libraries)
ir = parse_grl_file(args.grl, libraries)
m = Module(grl_dir, ir)
m.optimize()
m.compact_regs()
with open(args.out_h, 'w') as f:
guard = os.path.splitext(os.path.basename(args.out_h))[0].upper()
w = Writer(f)
w.write(HEADER_PROLOGUE, guard)
m.write_h(w)
w.write(HEADER_EPILOGUE, guard)
with open(args.out_c, 'w') as f:
w = Writer(f)
w.write(C_PROLOGUE, os.path.basename(args.out_h))
m.write_c(w)
if __name__ == '__main__':
main()
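# Rough shape of the generated code (added, illustrative and heavily
# abbreviated): a metakernel such as
#     metakernel foo(qword buf) { REG0 = load_qword(buf); ... }
# is lowered to a C function along the lines of
#     void genX(grl_<module>_foo)(struct anv_cmd_buffer *cmd_buffer,
#                                 uint64_t buf)
#     {
#         struct mi_builder b;
#         mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
#         UNUSED struct mi_value REG0 = mi_reserve_gpr(&b, 0);
#         mi_store(&b, REG0, mi_mem64(anv_address_from_u64(buf)));
#         ...
#     }
# i.e. every GRL statement becomes mi_builder commands emitted into the batch
# rather than a compute shader.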

View file

@@ -1,586 +0,0 @@
#!/usr/bin/env python
COPYRIGHT = """\
/*
* Copyright 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
"""
import os
import re
import ply.lex as lex
import ply.yacc as yacc
# Libraries
libraries = {}
# LEXER
keywords = {
'__debugbreak': 'KW_DEBUGBREAK',
'alignas': 'KW_ALIGNAS',
'args': 'KW_ARGS',
'atomic': 'KW_ATOMIC',
'atomic_return': 'KW_ATOMIC_RETURN',
'const': 'KW_CONST',
'control': 'KW_CONTROL',
'define': 'KW_DEFINE',
'dispatch': 'KW_DISPATCH',
'dispatch_indirect': 'KW_DISPATCH_INDIRECT',
'goto': 'KW_GOTO',
'if': 'KW_IF',
'kernel': 'KW_KERNEL',
'kernel_module': 'KW_KERNEL_MODULE',
'import': 'KW_IMPORT',
'library': 'KW_LIBRARY',
'links': 'KW_LINKS',
'load_dword': 'KW_LOAD_DWORD',
'load_qword': 'KW_LOAD_QWORD',
'metakernel': 'KW_METAKERNEL',
'module': 'KW_MODULE',
'not': 'KW_NOT',
'offsetof': 'KW_OFFSETOF',
'postsync': 'KW_POSTSYNC',
'print': 'KW_PRINT',
'semaphore_wait': 'KW_SEMAPHORE_WAIT',
'shiftof': 'KW_SHIFTOF',
'sizeof': 'KW_SIZEOF',
'store_dword': 'KW_STORE_DWORD',
'store_qword': 'KW_STORE_QWORD',
'store_timestamp': 'KW_STORE_TIMESTAMP',
'struct': 'KW_STRUCT',
'unsigned': 'KW_UNSIGNED',
'while': 'KW_WHILE'
}
ops = {
'&&': 'OP_LOGICAL_AND',
'||': 'OP_LOGICAL_OR',
'==': 'OP_EQUALEQUAL',
'!=': 'OP_NOTEQUAL',
'<=': 'OP_LESSEQUAL',
'>=': 'OP_GREATEREQUAL',
'<<': 'OP_LSHIFT',
'>>': 'OP_RSHIFT'
}
tokens = [
'INT_LITERAL',
'STRING_LITERAL',
'OP',
'IDENTIFIER'
] + list(keywords.values()) + list(ops.values())
def t_INT_LITERAL(t):
r'(0x[a-fA-F0-9]+|\d+)'
if t.value.startswith('0x'):
t.value = int(t.value[2:], 16)
else:
t.value = int(t.value)
return t
def t_OP(t):
r'(&&|\|\||==|!=|<=|>=|<<|>>)'
t.type = ops.get(t.value)
return t
def t_IDENTIFIER(t):
r'[a-zA-Z_][a-zA-Z_0-9]*'
t.type = keywords.get(t.value, 'IDENTIFIER')
return t
def t_STRING_LITERAL(t):
r'"(\\.|[^"\\])*"'
t.value = t.value[1:-1]
return t
literals = "+*/(){};:,=&|!~^.%?-<>[]"
t_ignore = ' \t'
def t_newline(t):
r'\n+'
t.lexer.lineno += len(t.value)
def t_error(t):
print("WUT: {}".format(t.value))
t.lexer.skip(1)
LEXER = lex.lex()
# PARSER
precedence = (
('right', '?', ':'),
('left', 'OP_LOGICAL_OR', 'OP_LOGICAL_AND'),
('left', '|'),
('left', '^'),
('left', '&'),
('left', 'OP_EQUALEQUAL', 'OP_NOTEQUAL'),
('left', '<', '>', 'OP_LESSEQUAL', 'OP_GREATEREQUAL'),
('left', 'OP_LSHIFT', 'OP_RSHIFT'),
('left', '+', '-'),
('left', '*', '/', '%'),
('right', '!', '~'),
('left', '[', ']', '.')
)
def p_module(p):
'module : element_list'
p[0] = p[1]
def p_element_list(p):
'''element_list : element_list element
| element'''
if len(p) == 2:
p[0] = [p[1]]
else:
p[0] = p[1] + [p[2]]
def p_element(p):
'''element : kernel_definition
| kernel_module_definition
| library_definition
| metakernel_definition
| module_name
| struct_definition
| const_definition
| import_definition'''
p[0] = p[1]
def p_module_name(p):
'module_name : KW_MODULE IDENTIFIER ";"'
p[0] = ('module-name', p[2])
def p_kernel_module_definition(p):
'kernel_module_definition : KW_KERNEL_MODULE IDENTIFIER "(" STRING_LITERAL ")" "{" kernel_definition_list "}"'
p[0] = ('kernel-module', p[2], p[4], p[7])
def p_kernel_definition(p):
'kernel_definition : KW_KERNEL IDENTIFIER optional_annotation_list'
p[0] = ('kernel', p[2], p[3])
def p_library_definition(p):
'library_definition : KW_LIBRARY IDENTIFIER "{" library_definition_list "}"'
p[0] = ('library', p[2], p[4])
def p_library_definition_list(p):
'''library_definition_list :
| library_definition_list IDENTIFIER STRING_LITERAL ";"'''
if len(p) < 3:
p[0] = []
else:
p[0] = p[1]
p[0].append((p[2], p[3]))
def p_import_definition(p):
'import_definition : KW_IMPORT KW_STRUCT IDENTIFIER STRING_LITERAL ";"'
p[0] = ('import', p[4], 'struct', p[3])
def p_links_definition(p):
'links_definition : KW_LINKS IDENTIFIER'
# Process a library include like a preprocessor
global libraries
if not p[2] in libraries:
        raise Exception("Not able to find library {0}".format(p[2]))
p[0] = libraries[p[2]]
def p_metakernel_definition(p):
'metakernel_definition : KW_METAKERNEL IDENTIFIER "(" optional_parameter_list ")" optional_annotation_list scope'
p[0] = ('meta-kernel', p[2], p[4], p[6], p[7])
def p_kernel_definition_list(p):
'''kernel_definition_list :
| kernel_definition_list kernel_definition ";"
| kernel_definition_list links_definition ";"'''
if len(p) < 3:
p[0] = []
else:
p[0] = p[1]
p[0].append(p[2])
def p_optional_annotation_list(p):
'''optional_annotation_list :
| "<" ">"
| "<" annotation_list ">"'''
if len(p) < 4:
p[0] = {}
else:
p[0] = p[2]
def p_optional_parameter_list(p):
'''optional_parameter_list :
| parameter_list'''
p[0] = p[1]
def p_annotation_list(p):
'''annotation_list : annotation'''
p[0] = p[1]
def p_annotation_list_append(p):
'''annotation_list : annotation_list "," annotation'''
p[0] = {**p[1], **p[3]}
def p_annotation(p):
'''annotation : IDENTIFIER "=" INT_LITERAL
| IDENTIFIER "=" IDENTIFIER
| IDENTIFIER "=" STRING_LITERAL'''
p[0] = {p[1]: p[3]}
def p_parameter_list(p):
'''parameter_list : parameter_definition'''
p[0] = [p[1]]
def p_parameter_list_append(p):
'''parameter_list : parameter_list "," parameter_definition'''
p[0] = p[1]
p[0].append(p[3])
def p_parameter_definition(p):
'parameter_definition : IDENTIFIER IDENTIFIER'
p[0] = (p[1], p[2])
def p_scope(p):
'''scope : "{" optional_statement_list "}"'''
p[0] = p[2]
def p_optional_statement_list(p):
'''optional_statement_list :
| statement_list'''
p[0] = p[1]
def p_statement_list(p):
'''statement_list : statement'''
p[0] = [p[1]]
def p_statement_list_append(p):
'''statement_list : statement_list statement'''
p[0] = p[1]
p[0].append(p[2])
def p_statement(p):
'''statement : definition_statement ";"
| assignment_statement ";"
| load_store_statement ";"
| dispatch_statement ";"
| semaphore_statement ";"
| label
| goto_statement ";"
| scope_statement
| atomic_op_statement ";"
| control_statement ";"
| print_statement ";"
| debug_break_statement ";"'''
p[0] = p[1]
def p_definition_statement(p):
'definition_statement : KW_DEFINE IDENTIFIER value'
p[0] = ('define', p[2], p[3])
def p_assignemt_statement(p):
'assignment_statement : value "=" value'
p[0] = ('assign', p[1], p[3])
def p_load_store_statement_load_dword(p):
'''load_store_statement : value "=" KW_LOAD_DWORD "(" value ")"'''
p[0] = ('load-dword', p[1], p[5])
def p_load_store_statement_load_qword(p):
'''load_store_statement : value "=" KW_LOAD_QWORD "(" value ")"'''
p[0] = ('load-qword', p[1], p[5])
def p_load_store_statement_store_dword(p):
'''load_store_statement : KW_STORE_DWORD "(" value "," value ")"'''
p[0] = ('store-dword', p[3], p[5])
def p_load_store_statement_store_qword(p):
'''load_store_statement : KW_STORE_QWORD "(" value "," value ")"'''
p[0] = ('store-qword', p[3], p[5])
def p_dispatch_statement(p):
'''dispatch_statement : direct_dispatch_statement
| indirect_dispatch_statement'''
p[0] = p[1]
def p_direct_dispatch_statement(p):
'''direct_dispatch_statement : KW_DISPATCH IDENTIFIER "(" value "," value "," value ")" optional_kernel_arg_list optional_postsync'''
p[0] = ('dispatch', p[2], (p[4], p[6], p[8]), p[10], p[11])
def p_indirect_dispatch_statement(p):
'''indirect_dispatch_statement : KW_DISPATCH_INDIRECT IDENTIFIER optional_kernel_arg_list optional_postsync'''
p[0] = ('dispatch', p[2], None, p[3], p[4])
def p_optional_kernel_arg_list(p):
'''optional_kernel_arg_list :
| KW_ARGS "(" value_list ")"'''
    p[0] = p[3] if len(p) > 1 else []
def p_value_list(p):
'''value_list : value'''
p[0] = [p[1]]
def p_value_list_append(p):
'''value_list : value_list "," value'''
p[0] = p[1]
p[0].append(p[3])
def p_optional_postsync(p):
'''optional_postsync :
| postsync_operation'''
if len(p) > 1:
p[0] = p[1]
def p_postsync_operation(p):
'''postsync_operation : postsync_write_dword
| postsync_write_timestamp'''
p[0] = p[1]
def p_postsync_write_dword(p):
'''postsync_write_dword : KW_POSTSYNC KW_STORE_DWORD "(" value "," value ")"'''
p[0] = ('postsync', 'store-dword', p[4], p[6])
def p_postsync_write_timestamp(p):
'''postsync_write_timestamp : KW_POSTSYNC KW_STORE_TIMESTAMP "(" value ")"'''
p[0] = ('postsync', 'timestamp', p[4])
def p_semaphore_statement(p):
'''semaphore_statement : KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value "<" value ")"
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value ">" value ")"
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_LESSEQUAL value ")"
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_GREATEREQUAL value ")"
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_EQUALEQUAL value ")"
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_NOTEQUAL value ")"'''
p[0] = ('sem-wait-while', p[5], p[6], p[7])
def p_atomic_op_statement(p):
'''atomic_op_statement : KW_ATOMIC IDENTIFIER IDENTIFIER "(" value_list ")"'''
p[0] = ('atomic', p[2], p[3], p[5])
def p_atomic_op_statement_return(p):
'''atomic_op_statement : KW_ATOMIC_RETURN IDENTIFIER IDENTIFIER "(" value_list ")"'''
p[0] = ('atomic-return', p[2], p[3], p[5])
def p_label(p):
'''label : IDENTIFIER ":"'''
p[0] = ('label', p[1])
def p_goto_statement(p):
'''goto_statement : KW_GOTO IDENTIFIER'''
p[0] = ('goto', p[2])
def p_goto_statement_if(p):
'''goto_statement : KW_GOTO IDENTIFIER KW_IF "(" value ")"'''
p[0] = ('goto-if', p[2], p[5])
def p_goto_statement_if_not(p):
'''goto_statement : KW_GOTO IDENTIFIER KW_IF KW_NOT "(" value ")"'''
p[0] = ('goto-if-not', p[2], p[6])
def p_scope_statement(p):
'''scope_statement : scope'''
p[0] = (p[1])
def p_control_statement(p):
'''control_statement : KW_CONTROL "(" id_list ")"'''
p[0] = ('control', p[3])
def p_print_statement(p):
'''print_statement : KW_PRINT "(" printable_list ")"'''
p[0] = ('print', p[3])
def p_printable_list(p):
'''printable_list : printable'''
p[0] = [p[1]]
def p_printable_list_append(p):
'''printable_list : printable_list "," printable'''
p[0] = p[1]
p[0].append(p[3])
def p_printable_str_lit(p):
'''printable : STRING_LITERAL'''
p[0] = '"{}"'.format(p[1])
def p_printable_value(p):
'''printable : value'''
p[0] = p[1]
def p_printable_str_lit_value(p):
'''printable : STRING_LITERAL value'''
p[0] = ('"{}"'.format(p[1]), p[2])
def p_debug_break_statement(p):
'''debug_break_statement : KW_DEBUGBREAK'''
    p[0] = ('debug-break',)
def p_id_list(p):
'''id_list : IDENTIFIER'''
    p[0] = [p[1]]
def p_id_list_append(p):
'''id_list : id_list "," IDENTIFIER'''
p[0] = p[1]
p[0].append(p[3])
def p_value(p):
'''value : IDENTIFIER
| INT_LITERAL'''
p[0] = p[1]
def p_value_braces(p):
'''value : "(" value ")"'''
p[0] = (p[2])
def p_value_member(p):
'''value : value "." IDENTIFIER'''
p[0] = ('member', p[1], p[3])
def p_value_idx(p):
'''value : value "[" value "]"'''
p[0] = ('index', p[1], p[3])
def p_value_binop(p):
'''value : value "+" value
| value "-" value
| value "*" value
| value "/" value
| value "%" value
| value "&" value
| value "|" value
| value "<" value
| value ">" value
| value "^" value
| value OP_LESSEQUAL value
| value OP_GREATEREQUAL value
| value OP_EQUALEQUAL value
| value OP_NOTEQUAL value
| value OP_LOGICAL_AND value
| value OP_LOGICAL_OR value
| value OP_LSHIFT value
| value OP_RSHIFT value'''
p[0] = (p[2], p[1], p[3])
def p_value_uniop(p):
'''value : "!" value
| "~" value'''
p[0] = (p[1], p[2])
def p_value_cond(p):
'''value : value "?" value ":" value'''
p[0] = ('?', p[1], p[3], p[5])
def p_value_funcop(p):
'''value : KW_OFFSETOF "(" offset_expression ")"
| KW_SHIFTOF "(" IDENTIFIER ")"
| KW_SIZEOF "(" IDENTIFIER ")"'''
p[0] = (p[1], p[3])
def p_offset_expression(p):
'''offset_expression : IDENTIFIER'''
p[0] = p[1]
def p_offset_expression_member(p):
'''offset_expression : offset_expression "." IDENTIFIER'''
p[0] = ('member', p[1], p[3])
def p_offset_expression_idx(p):
'''offset_expression : offset_expression "[" INT_LITERAL "]"'''
p[0] = ('index', p[1], p[3])
def p_struct_definition(p):
'''struct_definition : KW_STRUCT optional_alignment_specifier IDENTIFIER "{" optional_struct_member_list "}" ";"'''
p[0] = ('struct', p[3], p[5], p[2])
def p_optional_alignment_specifier(p):
'''optional_alignment_specifier :
| KW_ALIGNAS "(" INT_LITERAL ")"'''
if len(p) == 1:
p[0] = 0
else:
p[0] = p[3]
def p_optional_struct_member_list(p):
'''optional_struct_member_list :
| struct_member_list'''
if len(p) == 1:
p[0] = {}
else:
p[0] = p[1]
def p_struct_member_list(p):
'''struct_member_list : struct_member'''
p[0] = [p[1]]
def p_struct_member_list_append(p):
'''struct_member_list : struct_member_list struct_member'''
p[0] = p[1] + [p[2]]
def p_struct_member(p):
'''struct_member : struct_member_typename IDENTIFIER ";"'''
p[0] = (p[1], p[2])
def p_struct_member_array(p):
    '''struct_member : struct_member_typename IDENTIFIER "[" INT_LITERAL "]" ";"
                     | struct_member_typename IDENTIFIER "[" IDENTIFIER "]" ";"'''
p[0] = {p[1]: p[2], 'count': p[4]}
def p_struct_member_typename(p):
'''struct_member_typename : IDENTIFIER'''
p[0] = p[1]
def p_struct_member_typename_unsigned(p):
'''struct_member_typename : KW_UNSIGNED IDENTIFIER'''
p[0] = ('unsigned', p[2])
def p_struct_member_typename_struct(p):
'''struct_member_typename : KW_STRUCT IDENTIFIER'''
p[0] = ('struct', p[2])
def p_const_definition(p):
'''const_definition : KW_CONST IDENTIFIER "=" INT_LITERAL ";"'''
p[0] = ('named-constant', p[2], p[4])
PARSER = yacc.yacc()
# Shamelessly stolen from some StackOverflow answer
def _remove_comments(text):
def replacer(match):
s = match.group(0)
if s.startswith('/'):
return " " # note: a space and not an empty string
else:
return s
pattern = re.compile(
r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
re.DOTALL | re.MULTILINE
)
return re.sub(pattern, replacer, text)
def parse_grl_file(grl_fname, libs):
global libraries
libraries = libs
with open(grl_fname, 'r') as f:
return PARSER.parse(_remove_comments(f.read()))
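# Illustrative example (added): parse_grl_file() returns the module as nested
# tuples, e.g. a file containing
#     module foo;
#     const BAR = 4;
# parses to roughly
#     [('module-name', 'foo'), ('named-constant', 'BAR', 4)]
# which is the IR consumed by the metakernel-to-C and kernel-listing
# generators.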

View file

@@ -1,479 +0,0 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/**
* This file contains a redefinition of structures defined in the GRL library.
 * We need these structures defined in order to allocate & prepare data for
 * the OpenCL kernels that build acceleration structures. Unfortunately,
 * because of C++ & OpenCL assumptions in GRL, it is not possible to include
 * the GRL header files directly, so we have to redefine them here.
*/
#ifndef GRL_STRUCTS_H
#define GRL_STRUCTS_H
#include "GRLStructs.h"
#include "GRLRTASCommon.h"
struct MKBuilderState {
qword geomDesc_buffer;
qword build_primref_buffer;
qword build_globals;
qword bvh_buffer;
dword leaf_type;
dword leaf_size;
};
#define PREFIX_MK_STATE(prefix, obj) \
(struct prefix##_MKBuilderState) { \
.geomDesc_buffer = (obj).geomDesc_buffer, \
.build_primref_buffer = (obj).build_primref_buffer, \
.build_globals = (obj).build_globals, \
.bvh_buffer = (obj).bvh_buffer, \
.leaf_type = (obj).leaf_type, \
.leaf_size = (obj).leaf_size, \
}
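/* Illustrative use (added, not part of the original header): with a "gfx125"
 * prefix the macro above expands to a compound literal of type
 * struct gfx125_MKBuilderState whose fields are copied from a generic
 * struct MKBuilderState, e.g.
 *
 *    struct gfx125_MKBuilderState s = PREFIX_MK_STATE(gfx125, state);
 */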
struct MKSizeEstimate {
dword numTriangles;
dword numProcedurals;
dword numPrimitives;
dword numMeshes;
dword numBuildPrimitives;
dword numPrimitivesToSplit;
dword instance_descs_start;
dword geo_meta_data_start;
dword node_data_start;
dword leaf_data_start;
dword procedural_data_start;
dword back_pointer_start;
dword sizeTotal;
dword updateScratchSizeTotal;
dword fatleaf_table_start;
dword innernode_table_start;
dword max_fatleaves;
size_t max_instance_leafs;
size_t max_inner_nodes;
size_t leaf_data_size;
size_t min_primitives;
size_t max_primitives;
};
#define PREFIX_MK_SIZE(prefix, obj) \
(struct prefix##_MKSizeEstimate) { \
.numTriangles = (obj).numTriangles, \
.numProcedurals = (obj).numProcedurals, \
.numPrimitives = (obj).numPrimitives, \
.numMeshes = (obj).numMeshes, \
.numBuildPrimitives = (obj).numBuildPrimitives, \
.numPrimitivesToSplit = (obj).numPrimitivesToSplit, \
.instance_descs_start = (obj).instance_descs_start, \
.geo_meta_data_start = (obj).geo_meta_data_start, \
.node_data_start = (obj).node_data_start, \
.leaf_data_start = (obj).leaf_data_start, \
.procedural_data_start = (obj).procedural_data_start, \
.back_pointer_start = (obj).back_pointer_start, \
.sizeTotal = (obj).sizeTotal, \
.updateScratchSizeTotal = (obj).updateScratchSizeTotal, \
.fatleaf_table_start = (obj).fatleaf_table_start, \
.innernode_table_start = (obj).innernode_table_start, \
.max_fatleaves = (obj).max_fatleaves, \
}
typedef struct AABB {
float lower[4];
float upper[4];
} AABB;
struct Globals
{
struct AABB centroidBounds;
unsigned int build_record_start;
unsigned int numPrimitives;
unsigned int leafPrimType;
unsigned int leafSize;
unsigned int numSplittedPrimitives;
unsigned int numBuildRecords;
    // spatial split state
unsigned int numOriginalPrimitives;
float presplitPrioritySum;
float probThreshold;
// binned-sah bfs state
unsigned int counter;
unsigned int numBuildRecords_extended;
// sync variable used for global-sync on work groups
unsigned int sync;
/* morton code builder state */
unsigned int shift; // used by adaptive mc-builder
unsigned int shift_mask; // used by adaptive mc-builder
unsigned int binary_hierarchy_root;
unsigned int p0_allocated_num;
unsigned int p0_created_num;
unsigned int morton_sort_in_flight;
unsigned int sort_iterations;
gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
};
typedef struct BVHBase
{
// TODO: Implement the "copy-first-node" trick... duplicate root node here
uint64_t rootNodeOffset;
uint32_t reserved;
uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
uint32_t quadLeafStart;
uint32_t quadLeafCur;
uint32_t proceduralDataStart;
uint32_t proceduralDataCur;
uint32_t instanceLeafStart;
uint32_t instanceLeafEnd;
uint32_t backPointerDataStart; //
uint32_t refitTreeletsDataStart; // refit structs
uint32_t refitStartPointDataStart; //
uint32_t BVHDataEnd;
// number of bottom treelets
// if 1, then the bottom treelet is also tip treelet
uint32_t refitTreeletCnt;
uint32_t refitTreeletCnt2; // always 0, used for atomic updates
// data layout:
// @backPointerDataStart
// 'backpointer' - a dword per inner node.
// The bits are used as follows:
// 2:0 --> Used as a refit counter during BVH refitting. MBZ
// 5:3 --> Number of children
// 31:6 --> Index of the parent node in the internal node array
// The root node has a parent index of all ones
// @refitTreeletsDataStart
// RefitTreelet[], the last treelet is for top treelet all previous are for bottom
// @refitStartPointDataStart
// for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space
// @backPointerDataEnd
uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
uint32_t fatLeafTableStart;
uint32_t innerTableStart;
uint32_t _pad[12];
struct RTASMetaData Meta;
} BVHBase;
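/* Illustrative helpers (added, not part of the original GRL headers) decoding
 * the backpointer dword layout documented above; uint32_t is assumed to be
 * available through the existing includes. */
static inline uint32_t grl_backpointer_refit_count(uint32_t bp)  { return bp & 0x7u; }        /* bits 2:0 */
static inline uint32_t grl_backpointer_num_children(uint32_t bp) { return (bp >> 3) & 0x7u; } /* bits 5:3 */
static inline uint32_t grl_backpointer_parent_index(uint32_t bp) { return bp >> 6; }          /* bits 31:6 */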
struct BatchedInitGlobalsData
{
qword p_build_globals;
qword p_bvh_buffer;
dword numPrimitives;
dword numGeometries;
dword numInstances;
dword instance_descs_start;
dword geo_meta_data_start;
dword node_data_start;
dword leaf_data_start;
dword procedural_data_start;
dword back_pointer_start;
dword sizeTotal;
dword leafType;
dword leafSize;
dword fatleaf_table_start;
dword innernode_table_start;
};
#define BFS_NUM_BINS 16
#define BFS_NUM_VCONTEXTS 256
#define BFS_MAX_DEPTH 32
#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
struct BFS_Split
{
float sah;
int dim;
int pos;
};
struct BFS_BinInfo
{
float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
// The 6 are lower(xyz) and -upper(xyz)
// bins use negated-max so that we can use vectorized mins instead of min/max pairs
uint counts[3 * BFS_NUM_BINS];
};
struct SAHBuildGlobals
{
qword p_primref_index_buffers;
qword p_primrefs_buffer;
qword p_bvh2;
qword p_globals; // TODO: deprecate this
qword p_bvh_base;
gpuva_t p_qnode_root_buffer;
dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
dword num_primrefs;
dword leaf_size;
dword leaf_type;
dword root_buffer_num_produced;
dword root_buffer_num_produced_hi;
dword root_buffer_num_consumed;
dword root_buffer_num_consumed_hi;
dword root_buffer_num_to_consume;
dword root_buffer_num_to_consume_hi;
};
typedef union LRBounds
{
struct
{
struct AABB3f left_centroid_bounds;
struct AABB3f left_geom_bounds;
struct AABB3f right_centroid_bounds;
struct AABB3f right_geom_bounds;
} boxes;
struct
{
float Array[24];
} scalars;
} LRBounds;
struct VContext
{
uint dispatch_primref_begin; // range of primrefs for this task
uint dispatch_primref_end;
uint bvh2_root; // BVH2 root node for this task
uint tree_depth; // depth of this node in the tree
uint num_left; // primref counts
uint num_right;
uint lr_mask; // lower 8b : left mask. upper 8b : right mask
uint batch_index;
// pass1 global working state and output
struct BFS_Split split;
struct BFS_BinInfo global_bin_info;
// pass2 global working state and output
LRBounds lr_bounds;
};
struct BFSDispatchRecord
{
ushort batch_index;
ushort context_id;
};
struct BFSDispatchQueue
{
uint num_dispatches;
uint wg_count[BFS_NUM_VCONTEXTS];
struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
};
struct BFS1SpillStackEntry
{
uint primref_begin;
uint primref_end;
uint bvh2_root;
ushort tree_depth;
ushort batch_index;
};
struct BFS1SpillStack
{
uint size;
struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
};
struct QNodeGlobalRootBufferEntry
{
uint bvh2_node;
uint qnode;
uint build_idx;
uint _pad;
};
struct QNodeGlobalRootBuffer
{
uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
};
struct DFSDispatchRecord
{
uint primref_base;
uint bvh2_base;
uint batch_index;
ushort num_primrefs;
ushort tree_depth;
};
struct DFSDispatchQueue
{
struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
};
#define VCONTEXT_STATE_EXECUTING 0
#define VCONTEXT_STATE_UNALLOCATED 1
union SchedulerUnion
{
struct VContextScheduler
{
/////////////////////////////////////////////////////////////
// State data used for communication with command streamer
// NOTE: This part must match definition in 'new_sah_builder.grl'
/////////////////////////////////////////////////////////////
dword num_bfs_wgs;
dword num_dfs_wgs;
dword scheduler_postsync;
dword _pad1;
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
        dword batched_build_loop_mask;      // 0 if #builds <= #contexts, else 1; the command streamer uses this as a loop condition
/////////////////////////////////////////////////////////////
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
dword vcontext_state[BFS_NUM_VCONTEXTS];
struct BFSDispatchQueue bfs_queue;
struct DFSDispatchQueue dfs_queue;
struct VContext contexts[BFS_NUM_VCONTEXTS];
struct BFS1SpillStack bfs2_spill_stack;
} vContextScheduler;
struct QnodeScheduler
{
dword num_qnode_grb_curr_entries;
dword num_qnode_grb_new_entries;
dword scheduler_postsync;
dword _pad1;
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
dword batched_builds_to_process;
dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
/////////////////////////////////////////////////////////////
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
struct QNodeGlobalRootBuffer qnode_global_root_buffer;
} qnodeScheduler;
};
struct BVH2Node
{
struct AABB3f box;
uint meta_u; // leaf: primref start. inner: offset from node to its first child
uint meta_ss;
//ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
//uchar is_inner; // 1 if inner, 0 if leaf
//uchar mask;
};
struct BVH2
{
uint num_nodes;
uint _pad[7]; // align to 32B
};
struct BatchedBLSDispatchEntry
{
/////////////////////////////////////////////////////////////
// State data used for communication with command streamer
// NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
/////////////////////////////////////////////////////////////
qword p_data_buffer;
qword num_elements; // number of elements in p_data_buffer
};
struct SAHBuildArgsBatchable
{
qword p_globals_ptrs;
qword p_scheduler;
qword p_buffers_info;
qword p_sah_globals;
dword num_max_qnode_global_root_buffer_entries;
dword num_builds;
};
#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj) \
(struct prefix##_SAHBuildArgsBatchable) { \
.p_globals_ptrs = (obj).p_globals_ptrs, \
.p_scheduler = (obj).p_scheduler, \
.p_buffers_info = (obj).p_buffers_info, \
.p_sah_globals = (obj).p_sah_globals, \
.num_max_qnode_global_root_buffer_entries = \
(obj).num_max_qnode_global_root_buffer_entries, \
.num_builds = (obj).num_builds, \
}
struct SAHBuildBuffersInfo
{
gpuva_t p_globals;
gpuva_t p_primref_index_buffers;
gpuva_t p_primrefs_buffer;
gpuva_t p_bvh2;
gpuva_t p_bvh_base;
gpuva_t p_qnode_root_buffer;
dword sah_globals_flags;
dword _pad;
gpuva_t _pad2;
};
#endif /* GRL_STRUCTS_H */

View file

@@ -1,459 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "GRLRTASCommon.h"
#include "affinespace.h"
#ifndef __OPENCL_VERSION__
# include "stdio.h" //for printf
#endif
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
GRL_INLINE void AABB3f_init(struct AABB3f *aabb)
{
aabb->lower[0] = (float)(INFINITY);
aabb->lower[1] = (float)(INFINITY);
aabb->lower[2] = (float)(INFINITY);
aabb->upper[0] = -(float)(INFINITY);
aabb->upper[1] = -(float)(INFINITY);
aabb->upper[2] = -(float)(INFINITY);
}
GRL_INLINE float3 AABB3f_load_lower( const struct AABB3f* aabb )
{
float3 v = { aabb->lower[0], aabb->lower[1], aabb->lower[2] };
return v;
}
GRL_INLINE float3 AABB3f_load_upper( const struct AABB3f* aabb )
{
float3 v = { aabb->upper[0], aabb->upper[1], aabb->upper[2] };
return v;
}
GRL_INLINE void AABB3f_extend(struct AABB3f *aabb, const struct AABB3f *v)
{
aabb->lower[0] = fmin(aabb->lower[0], v->lower[0]);
aabb->lower[1] = fmin(aabb->lower[1], v->lower[1]);
aabb->lower[2] = fmin(aabb->lower[2], v->lower[2]);
aabb->upper[0] = fmax(aabb->upper[0], v->upper[0]);
aabb->upper[1] = fmax(aabb->upper[1], v->upper[1]);
aabb->upper[2] = fmax(aabb->upper[2], v->upper[2]);
}
GRL_INLINE void AABB3f_intersect(struct AABB3f* aabb, struct AABB3f inters)
{
aabb->upper[0] = fmin(inters.upper[0],aabb->upper[0]);
aabb->upper[1] = fmin(inters.upper[1],aabb->upper[1]);
aabb->upper[2] = fmin(inters.upper[2],aabb->upper[2]);
aabb->lower[0] = fmax(inters.lower[0],aabb->lower[0]);
aabb->lower[1] = fmax(inters.lower[1],aabb->lower[1]);
aabb->lower[2] = fmax(inters.lower[2],aabb->lower[2]);
}
GRL_INLINE void AABB3f_trim_upper(struct AABB3f* aabb, const float* upper)
{
aabb->upper[0] = fmin(upper[0], aabb->upper[0]);
aabb->upper[1] = fmin(upper[1], aabb->upper[1]);
aabb->upper[2] = fmin(upper[2], aabb->upper[2]);
}
GRL_INLINE void AABB3f_set( struct AABB3f* aabb, float3 lower, float3 upper )
{
aabb->lower[0] = lower.x ;
aabb->lower[1] = lower.y ;
aabb->lower[2] = lower.z ;
aabb->upper[0] = upper.x ;
aabb->upper[1] = upper.y ;
aabb->upper[2] = upper.z ;
}
inline void AABB3f_extend_point(struct AABB3f *aabb, const float3 p)
{
aabb->lower[0] = fmin(aabb->lower[0], p.x);
aabb->lower[1] = fmin(aabb->lower[1], p.y);
aabb->lower[2] = fmin(aabb->lower[2], p.z);
aabb->upper[0] = fmax(aabb->upper[0], p.x);
aabb->upper[1] = fmax(aabb->upper[1], p.y);
aabb->upper[2] = fmax(aabb->upper[2], p.z);
}
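// Quick sketch of why AABB3f_init uses +/-INFINITY: it makes the box the identity
// element for the extend operations above, so the first extended point becomes the
// box itself (a degenerate box with lower == upper == p).
GRL_INLINE struct AABB3f AABB3f_from_point_example(const float3 p)
{
    struct AABB3f box;
    AABB3f_init(&box);            // lower = +INF, upper = -INF ("empty" box)
    AABB3f_extend_point(&box, p); // fmin/fmax against +/-INF collapse to p
    return box;
}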
GRL_INLINE void AABB3f_extendlu(struct AABB3f *aabb, const float3 lower, const float3 upper)
{
aabb->lower[0] = fmin(aabb->lower[0], lower.x);
aabb->lower[1] = fmin(aabb->lower[1], lower.y);
aabb->lower[2] = fmin(aabb->lower[2], lower.z);
aabb->upper[0] = fmax(aabb->upper[0], upper.x);
aabb->upper[1] = fmax(aabb->upper[1], upper.y);
aabb->upper[2] = fmax(aabb->upper[2], upper.z);
}
GRL_INLINE float3 AABB3f_size(struct AABB3f* aabb)
{
return AABB3f_load_upper(aabb) - AABB3f_load_lower(aabb);
}
GRL_INLINE float AABB3f_halfArea(struct AABB3f *aabb)
{
const float3 d = AABB3f_load_upper( aabb ) - AABB3f_load_lower( aabb );
return d.x * (d.y + d.z) + (d.y * d.z);
}
GRL_INLINE float halfArea_AABB3f(struct AABB3f *aabb) // TODO: Remove me
{
const float3 d = { aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2] };
return fma(d.x, (d.y + d.z), d.y * d.z);
}
GRL_INLINE void AABB3f_set_lower(struct AABB3f* aabb, float3 lower)
{
aabb->lower[0] = lower.x;
aabb->lower[1] = lower.y;
aabb->lower[2] = lower.z;
}
GRL_INLINE void AABB3f_set_upper(struct AABB3f* aabb, float3 upper)
{
aabb->upper[0] = upper.x;
aabb->upper[1] = upper.y;
aabb->upper[2] = upper.z;
}
GRL_INLINE float3 conservativeExtent(float3 extent)
{
const float v = FLT_EPSILON * fmax(extent.x, fmax(extent.y, extent.z));
float3 v3 = { v,v,v };
extent = extent + v3;
return extent;
}
inline struct AABB3f GRL_OVERLOADABLE transform_aabb(float3 lower, float3 upper, const float* Transform)
{
#if 1
// We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area
// New AABB is center +- Extent.
//
// For derivation see:
// https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/
//
float3 Center = (upper + lower) * 0.5f;
float3 Extent = (conservativeExtent(upper) - lower) * 0.5f;
float cx = Center.x * Transform[0] + Center.y * Transform[1] + Center.z * Transform[2] + Transform[3];
float cy = Center.x * Transform[4] + Center.y * Transform[5] + Center.z * Transform[6] + Transform[7];
float cz = Center.x * Transform[8] + Center.y * Transform[9] + Center.z * Transform[10] + Transform[11];
float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]);
float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]);
float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]);
Center.x = cx; Center.y = cy; Center.z = cz;
Extent.x = ex; Extent.y = ey; Extent.z = ez;
struct AABB3f box;
AABB3f_set_lower(&box, Center - Extent);
AABB3f_set_upper(&box, Center + Extent);
return box;
#else
struct AffineSpace3f xfm = AffineSpace3f_load_row_major(Transform);
float3 plll = { lower.x, lower.y, lower.z };
float3 pllu = { lower.x, lower.y, upper.z };
float3 plul = { lower.x, upper.y, lower.z };
float3 pluu = { lower.x, upper.y, upper.z };
float3 pull = { upper.x, lower.y, lower.z };
float3 pulu = { upper.x, lower.y, upper.z };
float3 puul = { upper.x, upper.y, lower.z };
float3 puuu = { upper.x, upper.y, upper.z };
plll = xfmPoint(xfm, plll) ;
pllu = xfmPoint(xfm, pllu) ;
plul = xfmPoint(xfm, plul) ;
pluu = xfmPoint(xfm, pluu) ;
pull = xfmPoint(xfm, pull) ;
pulu = xfmPoint(xfm, pulu) ;
puul = xfmPoint(xfm, puul) ;
puuu = xfmPoint(xfm, puuu) ;
float3 p1_min = fmin(plll, pull);
float3 p2_min = fmin(pllu, pulu);
float3 p3_min = fmin(plul, puul);
float3 p4_min = fmin(pluu, puuu);
float3 p1_max = fmax(plll, pull);
float3 p2_max = fmax(pllu, pulu);
float3 p3_max = fmax(plul, puul);
float3 p4_max = fmax(pluu, puuu);
p1_min = fmin(p1_min, p3_min);
p2_min = fmin(p2_min, p4_min);
p1_max = fmax(p1_max, p3_max);
p2_max = fmax(p2_max, p4_max);
p1_min = fmin(p1_min, p2_min);
p1_max = fmax(p1_max, p2_max);
AABB3f out = {
{p1_min.x,p1_min.y,p1_min.z},
{p1_max.x,p1_max.y,p1_max.z}
};
return out;
#endif
}
GRL_INLINE struct AABB3f GRL_OVERLOADABLE transform_aabb(struct AABB3f box, const float* Transform)
{
float3 lower = { box.lower[0], box.lower[1], box.lower[2] };
float3 upper = { box.upper[0], box.upper[1], box.upper[2] };
return transform_aabb(lower, upper, Transform);
}
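// Usage sketch for the abs-matrix path above, with a row-major 3x4 pure-translation
// transform: the result is the input box shifted by (tx, ty, tz), slightly inflated
// by the epsilon added in conservativeExtent(). The helper name is illustrative.
GRL_INLINE struct AABB3f translate_aabb_example(struct AABB3f box, float tx, float ty, float tz)
{
    const float xfm[12] = {
        1.0f, 0.0f, 0.0f, tx,   // row 0: x' = x + tx
        0.0f, 1.0f, 0.0f, ty,   // row 1: y' = y + ty
        0.0f, 0.0f, 1.0f, tz,   // row 2: z' = z + tz
    };
    return transform_aabb(box, xfm);
}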
GRL_INLINE struct AABB3f AABB3f_transform(struct AffineSpace3f xfm, struct AABB3f in)
{
struct AABB3f out;
float rmTransform[12];
load_row_major_from_AffineSpace3f(xfm, rmTransform);
out = transform_aabb(in, rmTransform);
return out;
}
GRL_INLINE bool AABB3f_isIn(struct AABB3f bigger, float3 contained)
{
bool iscontained =
contained.x >= bigger.lower[0] &&
contained.y >= bigger.lower[1] &&
contained.z >= bigger.lower[2] &&
contained.x <= bigger.upper[0] &&
contained.y <= bigger.upper[1] &&
contained.z <= bigger.upper[2];
return iscontained;
}
GRL_INLINE bool AABB3f_isSubset(struct AABB3f bigger, struct AABB3f contained)
{
bool iscontained =
contained.lower[0] >= bigger.lower[0] &&
contained.lower[1] >= bigger.lower[1] &&
contained.lower[2] >= bigger.lower[2] &&
contained.upper[0] <= bigger.upper[0] &&
contained.upper[1] <= bigger.upper[1] &&
contained.upper[2] <= bigger.upper[2];
return iscontained;
}
GRL_INLINE bool AABB3f_is_degenerate(struct AABB3f* box )
{
return box->lower[0] > box->upper[0] ||
box->lower[1] > box->upper[1] ||
box->lower[2] > box->upper[2];
}
GRL_INLINE void AABB3f_print(struct AABB3f *aabb)
{
printf("AABB {\n");
printf(" lower = %f, %f, %f\n", aabb->lower[0], aabb->lower[1], aabb->lower[2]);
printf(" upper = %f, %f, %f\n", aabb->upper[0], aabb->upper[1], aabb->upper[2]);
printf("}\n");
}
#ifdef __OPENCL_VERSION__
GRL_INLINE struct AABB3f AABB3f_sub_group_shuffle(struct AABB3f *aabb, const uint slotID)
{
struct AABB3f bounds;
bounds.lower[0] = intel_sub_group_shuffle(aabb->lower[0], slotID);
bounds.lower[1] = intel_sub_group_shuffle(aabb->lower[1], slotID);
bounds.lower[2] = intel_sub_group_shuffle(aabb->lower[2], slotID);
bounds.upper[0] = intel_sub_group_shuffle(aabb->upper[0], slotID);
bounds.upper[1] = intel_sub_group_shuffle(aabb->upper[1], slotID);
bounds.upper[2] = intel_sub_group_shuffle(aabb->upper[2], slotID);
return bounds;
}
GRL_INLINE struct AABB3f AABB3f_sub_group_reduce(struct AABB3f *aabb)
{
struct AABB3f bounds;
bounds.lower[0] = sub_group_reduce_min(aabb->lower[0]);
bounds.lower[1] = sub_group_reduce_min(aabb->lower[1]);
bounds.lower[2] = sub_group_reduce_min(aabb->lower[2]);
bounds.upper[0] = sub_group_reduce_max(aabb->upper[0]);
bounds.upper[1] = sub_group_reduce_max(aabb->upper[1]);
bounds.upper[2] = sub_group_reduce_max(aabb->upper[2]);
return bounds;
}
GRL_INLINE struct AABB3f AABB3f_sub_group_scan_exclusive_min_max(struct AABB3f *aabb)
{
struct AABB3f bounds;
bounds.lower[0] = sub_group_scan_exclusive_min(aabb->lower[0]);
bounds.lower[1] = sub_group_scan_exclusive_min(aabb->lower[1]);
bounds.lower[2] = sub_group_scan_exclusive_min(aabb->lower[2]);
bounds.upper[0] = sub_group_scan_exclusive_max(aabb->upper[0]);
bounds.upper[1] = sub_group_scan_exclusive_max(aabb->upper[1]);
bounds.upper[2] = sub_group_scan_exclusive_max(aabb->upper[2]);
return bounds;
}
GRL_INLINE struct AABB3f AABB3f_sub_group_scan_inclusive_min_max(struct AABB3f *aabb)
{
struct AABB3f bounds;
bounds.lower[0] = sub_group_scan_inclusive_min(aabb->lower[0]);
bounds.lower[1] = sub_group_scan_inclusive_min(aabb->lower[1]);
bounds.lower[2] = sub_group_scan_inclusive_min(aabb->lower[2]);
bounds.upper[0] = sub_group_scan_inclusive_max(aabb->upper[0]);
bounds.upper[1] = sub_group_scan_inclusive_max(aabb->upper[1]);
bounds.upper[2] = sub_group_scan_inclusive_max(aabb->upper[2]);
return bounds;
}
GRL_INLINE void AABB3f_atomic_merge_local_nocheck(local struct AABB3f *aabb, const float4 lower, const float4 upper)
{
atomic_min((local float *)&aabb->lower + 0, lower.x);
atomic_min((local float *)&aabb->lower + 1, lower.y);
atomic_min((local float *)&aabb->lower + 2, lower.z);
atomic_max((local float *)&aabb->upper + 0, upper.x);
atomic_max((local float *)&aabb->upper + 1, upper.y);
atomic_max((local float *)&aabb->upper + 2, upper.z);
}
GRL_INLINE void AABB3f_atomic_merge_global_lu( global struct AABB3f* aabb, const float3 lower, const float3 upper )
{
atomic_min( (global float*) & aabb->lower + 0, lower.x );
atomic_min( (global float*) & aabb->lower + 1, lower.y );
atomic_min( (global float*) & aabb->lower + 2, lower.z );
atomic_max( (global float*) & aabb->upper + 0, upper.x );
atomic_max( (global float*) & aabb->upper + 1, upper.y );
atomic_max( (global float*) & aabb->upper + 2, upper.z );
}
GRL_INLINE void AABB3f_atomic_merge_local_lu( local struct AABB3f* aabb, const float3 lower, const float3 upper )
{
atomic_min( (local float*) & aabb->lower + 0, lower.x );
atomic_min( (local float*) & aabb->lower + 1, lower.y );
atomic_min( (local float*) & aabb->lower + 2, lower.z );
atomic_max( (local float*) & aabb->upper + 0, upper.x );
atomic_max( (local float*) & aabb->upper + 1, upper.y );
atomic_max( (local float*) & aabb->upper + 2, upper.z );
}
GRL_INLINE void Uniform_AABB3f_atomic_merge_local_sub_group_lu(uniform local struct AABB3f* aabb, const float3 lower, const float3 upper)
{
float lx = sub_group_reduce_min(lower.x);
float ly = sub_group_reduce_min(lower.y);
float lz = sub_group_reduce_min(lower.z);
float ux = sub_group_reduce_max(upper.x);
float uy = sub_group_reduce_max(upper.y);
float uz = sub_group_reduce_max(upper.z);
if (get_sub_group_local_id() == 0)
{
atomic_min((local float*) & aabb->lower + 0, lx);
atomic_min((local float*) & aabb->lower + 1, ly);
atomic_min((local float*) & aabb->lower + 2, lz);
atomic_max((local float*) & aabb->upper + 0, ux);
atomic_max((local float*) & aabb->upper + 1, uy);
atomic_max((local float*) & aabb->upper + 2, uz);
}
}
GRL_INLINE void AABB3f_atomic_merge_global_sub_group_lu(uniform global struct AABB3f* aabb, const float3 lower, const float3 upper)
{
uint lane = get_sub_group_local_id();
float l[3];
l[0] = sub_group_reduce_min(lower.x);
l[1] = sub_group_reduce_min(lower.y);
l[2] = sub_group_reduce_min(lower.z);
float u[3];
u[0] = sub_group_reduce_max(upper.x);
u[1] = sub_group_reduce_max(upper.y);
u[2] = sub_group_reduce_max(upper.z);
if (lane < 3)
{
atomic_min((global float*)&aabb->lower + lane, l[lane]);
atomic_max((global float*)&aabb->upper + lane, u[lane]);
}
}
GRL_INLINE void AABB3f_atomic_merge_global( global struct AABB3f* aabb, struct AABB3f* other )
{
float3 lower = AABB3f_load_lower( other );
float3 upper = AABB3f_load_upper( other );
atomic_min( (global float*) & aabb->lower + 0, lower.x );
atomic_min( (global float*) & aabb->lower + 1, lower.y );
atomic_min( (global float*) & aabb->lower + 2, lower.z );
atomic_max( (global float*) & aabb->upper + 0, upper.x );
atomic_max( (global float*) & aabb->upper + 1, upper.y );
atomic_max( (global float*) & aabb->upper + 2, upper.z );
}
GRL_INLINE void AABB3f_atomic_merge_localBB_nocheck( local struct AABB3f* aabb, struct AABB3f* bb )
{
atomic_min( (local float*) & aabb->lower + 0, bb->lower[0] );
atomic_min( (local float*) & aabb->lower + 1, bb->lower[1] );
atomic_min( (local float*) & aabb->lower + 2, bb->lower[2] );
atomic_max( (local float*) & aabb->upper + 0, bb->upper[0] );
atomic_max( (local float*) & aabb->upper + 1, bb->upper[1] );
atomic_max( (local float*) & aabb->upper + 2, bb->upper[2] );
}
GRL_INLINE void AABB3f_atomic_merge_local(local struct AABB3f *aabb, const float4 lower, const float4 upper)
{
if (lower.x < aabb->lower[0])
atomic_min((local float *)&aabb->lower + 0, lower.x);
if (lower.y < aabb->lower[1])
atomic_min((local float *)&aabb->lower + 1, lower.y);
if (lower.z < aabb->lower[2])
atomic_min((local float *)&aabb->lower + 2, lower.z);
if (upper.x > aabb->upper[0])
atomic_max((local float *)&aabb->upper + 0, upper.x);
if (upper.y > aabb->upper[1])
atomic_max((local float *)&aabb->upper + 1, upper.y);
if (upper.z > aabb->upper[2])
atomic_max((local float *)&aabb->upper + 2, upper.z);
}
GRL_INLINE void AABB3f_atomic_merge_global_local(global struct AABB3f *dest, local struct AABB3f *source)
{
float3 l = AABB3f_load_lower(source);
float3 u = AABB3f_load_upper(source);
atomic_min((global float *)&dest->lower + 0, l.x );
atomic_min((global float *)&dest->lower + 1, l.y );
atomic_min((global float *)&dest->lower + 2, l.z );
atomic_max((global float *)&dest->upper + 0, u.x );
atomic_max((global float *)&dest->upper + 1, u.y );
atomic_max((global float *)&dest->upper + 2, u.z );
}
struct AABB3f AABB3f_construct( float3 min, float3 max )
{
struct AABB3f bb;
bb.lower[0] = min.x; bb.lower[1] = min.y; bb.lower[2] = min.z;
bb.upper[0] = max.x; bb.upper[1] = max.y; bb.upper[2] = max.z;
return bb;
}
struct AABB3f AABB3f_select( struct AABB3f left, struct AABB3f right, int3 cond )
{
float3 l = select( AABB3f_load_lower(&left), AABB3f_load_lower(&right), cond );
float3 u = select( AABB3f_load_upper(&left), AABB3f_load_upper(&right), cond );
return AABB3f_construct( l, u );
}
#endif
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)

View file

@@ -1,691 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
//
// This file is to contain structure definitions related to the Gen12 QBVH6 acceleration structures
//
//
//********************************************************************************************
// WARNING!!!!!
// This file is shared by OpenCL and C++ source code and must be compatible.
// There should only be C structure definitions and trivial GRL_INLINE functions here
//
//********************************************************************************************
#pragma once
#include "GRLRTASCommon.h"
#include "GRLUtilities.h"
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
GRL_NAMESPACE_BEGIN(GEN12)
enum_uint8(NodeType)
{
NODE_TYPE_MIXED = 0x0, // identifies a mixed internal node where each child can have a different type
NODE_TYPE_INTERNAL = 0x0, // internal BVH node with 6 children
NODE_TYPE_INSTANCE = 0x1, // instance leaf
NODE_TYPE_PROCEDURAL = 0x3, // procedural leaf
NODE_TYPE_QUAD = 0x4, // quad leaf
NODE_TYPE_INVALID = 0x7 // indicates invalid node
};
typedef enum PrimLeafType
{
TYPE_NONE = 0,
TYPE_QUAD = 0,
/* For a node type of NODE_TYPE_PROCEDURAL we support enabling
* and disabling the opaque/non_opaque culling. */
TYPE_OPACITY_CULLING_ENABLED = 0,
TYPE_OPACITY_CULLING_DISABLED = 1
} PrimLeafType;
#define BVH_MAGIC_MACRO "GEN12_RTAS_005" // If serialization-breaking or algorithm-breaking changes are made, increment the digits at the end
static const char BVH_MAGIC[16] = BVH_MAGIC_MACRO;
typedef struct BVHBase
{
// TODO: Implement the "copy-first-node" trick... duplicate root node here
uint64_t rootNodeOffset;
uint32_t reserved;
uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
uint32_t quadLeafStart;
uint32_t quadLeafCur;
uint32_t proceduralDataStart;
uint32_t proceduralDataCur;
uint32_t instanceLeafStart;
uint32_t instanceLeafEnd;
uint32_t backPointerDataStart; //
uint32_t refitTreeletsDataStart; // refit structs
uint32_t refitStartPointDataStart; //
uint32_t BVHDataEnd;
// number of bottom treelets
// if 1, then the bottom treelet is also tip treelet
uint32_t refitTreeletCnt;
uint32_t refitTreeletCnt2; // always 0, used for atomic updates
// data layout:
// @backPointerDataStart
// 'backpointer' - a dword per inner node.
// The bits are used as follows:
// 2:0 --> Used as a refit counter during BVH refitting. MBZ
// 5:3 --> Number of children
// 31:6 --> Index of the parent node in the internal node array
// The root node has a parent index of all ones
// @refitTreeletsDataStart
// RefitTreelet[], the last treelet is the top treelet; all previous ones are bottom treelets
// @refitStartPointDataStart
// for each treelet T there is an interval of startpoints [T.startpoint_offset, T.startpoint_offset + T.numStartpoints) here in this space
// @backPointerDataEnd
uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
uint32_t fatLeafTableStart;
uint32_t innerTableStart;
uint32_t quadLeftoversCountNewAtomicUpdate; // number of quad leftovers for new atomic update
uint32_t quadTableSizeNewAtomicUpdate; // size of quad Table including leftovers, padded to 256
uint32_t quadIndicesDataStart;
uint32_t _pad[9];
struct RTASMetaData Meta;
} BVHBase;
GRL_INLINE struct GeoMetaData* BVHBase_GetGeoMetaData(BVHBase* base)
{
return (struct GeoMetaData*)(((char*)base) + base->Meta.geoDescsStart);
}
#ifdef __OPENCL_VERSION__
#define BVH_ROOT_NODE_OFFSET sizeof(BVHBase)
#else
#define BVH_ROOT_NODE_OFFSET sizeof(GRL::RTAS::GEN12::BVHBase)
#endif
GRL_STATIC_ASSERT( sizeof(BVHBase) == BVH_ROOT_NODE_OFFSET, "Wrong size!");
GRL_STATIC_ASSERT( (sizeof(BVHBase) % 64) == 0 , "Misaligned size!");
typedef struct BackPointers {
} BackPointers;
// Threshold for the size of bottom treelets. Treelets are usually 2-3x smaller than this number,
// which means that no bottom treelet has more paths than this number.
#define TREELET_NUM_STARTPOINTS 1536
// threshold under which only one treelet will be created
#define SINGLE_TREELET_THRESHOLD 3072
typedef struct LeafTableEntry {
uint backpointer;
uint inner_node_index;
uint leaf_index;
} LeafTableEntry;
typedef struct InnerNodeTableEntry {
uint node_index_and_numchildren; // numchildren in 3 lsbs
uint first_child;
} InnerNodeTableEntry;
typedef struct QuadDataIndices
{
uint header_data[4];
uint vert_idx[4];
} QuadDataIndices;
typedef struct RefitTreelet {
uint32_t startpoint_offset;
uint32_t numStartpoints;
uint32_t numNonTrivialStartpoints;
uint8_t maxDepth;
uint8_t depthLess64; // depth from the bottom at which there are fewer than 64 paths
uint8_t depthLess128; // depth from the bottom at which there are fewer than 128 paths
uint8_t depthLess256; // depth from the bottom at which there are fewer than 256 paths
} RefitTreelet;
// if RefitTreelet has number of startpoints == 1
// it should be reinterpreted as:
typedef struct RefitTreeletTrivial {
uint32_t theOnlyNodeIndex;
uint32_t numStartpoints; // have to be 1 or 0
int32_t childrenOffsetOfTheNode; // 0th node based
uint8_t maxDepth;
uint8_t numChildrenOfTheNode;
} RefitTreeletTrivial;
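// The reinterpretation described above amounts to reading the same storage through
// the trivial layout, e.g. (illustrative snippet):
//     RefitTreeletTrivial tt = *(RefitTreeletTrivial*)&treelets[i];
// whenever treelets[i].numStartpoints <= 1.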
// 5:0 - depth after you die
// 31:6 - Index of the inner node
typedef uint32_t StartPoint;
struct HwInstanceLeaf;
struct QuadLeaf;
struct ProceduralLeaf;
struct InternalNode;
typedef struct HwInstanceLeaf HwInstanceLeaf;
typedef struct InternalNode InternalNode;
typedef struct QuadLeaf QuadLeaf;
typedef struct ProceduralLeaf ProceduralLeaf;
GRL_INLINE uint32_t BackPointer_GetParentIndex( uint32_t bp )
{
return bp >> 6;
}
GRL_INLINE uint32_t BackPointer_GetNumChildren( uint32_t bp )
{
return (bp >> 3) & (7);
}
GRL_INLINE uint32_t BackPointer_GetRefitCount( uint32_t bp )
{
return bp & 7;
}
GRL_INLINE bool BackPointer_IsRoot( uint32_t bp )
{
return (bp >> 6) == 0x03FFFFFF;
}
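// Worked example of the backpointer encoding described in BVHBase above
// (bits 2:0 refit counter, 5:3 child count, 31:6 parent index): a parent index of 5
// with 3 children packs to (5 << 6) | (3 << 3) == 0x158, and then
// BackPointer_GetParentIndex(0x158) == 5, BackPointer_GetNumChildren(0x158) == 3,
// BackPointer_GetRefitCount(0x158) == 0.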
GRL_INLINE InternalNode* BVHBase_GetRootNode( const BVHBase* p )
{
return (InternalNode*)( ((char*)p) + BVH_ROOT_NODE_OFFSET);
}
GRL_INLINE AABB3f BVHBase_GetRootAABB(const BVHBase* p)
{
return p->Meta.bounds;
}
GRL_INLINE InternalNode* BVHBase_GetInternalNodes(const BVHBase* p)
{
return (InternalNode*)(((char*)p) + BVH_ROOT_NODE_OFFSET);
}
GRL_INLINE InternalNode* BVHBase_GetInternalNodesEnd(const BVHBase* p)
{
return (InternalNode*)(((char*)p) + (size_t)(64u * p->nodeDataCur));
}
GRL_INLINE uint32_t BVHBase_GetNumInternalNodes(const BVHBase* p)
{
return p->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64;
}
GRL_INLINE QuadLeaf* BVHBase_GetQuadLeaves(const BVHBase* p)
{
return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafStart));
}
GRL_INLINE const QuadLeaf* BVHBase_GetQuadLeaves_End(const BVHBase* p)
{
return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafCur));
}
GRL_INLINE const ProceduralLeaf* BVHBase_GetProceduralLeaves_End(const BVHBase* p)
{
return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataCur));
}
GRL_INLINE ProceduralLeaf* BVHBase_GetProceduralLeaves(const BVHBase* p)
{
return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataStart));
}
GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves(const BVHBase* p )
{
char* pRTASBits = (char*)p;
return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafStart));
}
GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves_End(const BVHBase* p )
{
char* pRTASBits = (char*) p;
return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafEnd));
}
GRL_INLINE uint BVHBase_GetNumHWInstanceLeaves( const BVHBase* p )
{
return (p->instanceLeafEnd - p->instanceLeafStart) / 2;
}
GRL_INLINE uint* BVHBase_GetRefitStartPoints(const BVHBase* p)
{
return (uint32_t*)(((char*)p) + (size_t)(64u * p->refitStartPointDataStart));
}
GRL_INLINE uint BVHBase_GetRefitStartPointsSize(const BVHBase* p)
{
return 64u * (p->fatLeafTableStart - p->refitStartPointDataStart);
}
GRL_INLINE uint StartPoint_GetDepth(StartPoint s)
{
return s & ((1 << 6) - 1);
}
GRL_INLINE uint StartPoint_GetNodeIdx(StartPoint s)
{
return s >> 6;
}
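// Worked example of the StartPoint packing above: node index 42 at path depth 7
// packs to (42 << 6) | 7 == 2695, and StartPoint_GetDepth(2695) == 7,
// StartPoint_GetNodeIdx(2695) == 42.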
GRL_INLINE RefitTreelet* BVHBase_GetRefitTreeletDescs(const BVHBase* p)
{
return (RefitTreelet*)(((char*)p) + (size_t)(64u * p->refitTreeletsDataStart));
}
// this is the treelet count as it should be executed, i.e. the number of bottom treelets if there are both top and bottom treelets.
// to get the real number of all treelets, including the tip, the formula is
// actualNumTreelets = refitTreeletCnt > 1 ? refitTreeletCnt + 1 : 1;
GRL_INLINE uint32_t* BVHBase_GetRefitTreeletCntPtr(BVHBase* p)
{
return &p->refitTreeletCnt;
}
GRL_INLINE uint32_t BVHBase_GetRefitTreeletCnt(const BVHBase* p)
{
return p->refitTreeletCnt;
}
GRL_INLINE uint32_t BVHBase_IsSingleTreelet(const BVHBase* p)
{
return p->refitTreeletCnt == 1;
}
GRL_INLINE BackPointers* BVHBase_GetBackPointers(const BVHBase* p)
{
return (BackPointers*)(((char*)p) + (size_t)(64u * p->backPointerDataStart));
}
GRL_INLINE LeafTableEntry* BVHBase_GetFatLeafTable(const BVHBase* p)
{
return (LeafTableEntry*)(((char*)p) + (size_t)(64u * p->fatLeafTableStart));
}
GRL_INLINE InnerNodeTableEntry* BVHBase_GetInnerNodeTable(const BVHBase* p)
{
return (InnerNodeTableEntry*)(((char*)p) + (size_t)(64u * p->innerTableStart));
}
GRL_INLINE QuadDataIndices* BVHBase_GetQuadDataIndicesTable(const BVHBase* p)
{
return (QuadDataIndices*)(((char*)p) + (size_t)(64u * p->quadIndicesDataStart));
}
GRL_INLINE unsigned* InnerNode_GetBackPointer(
BackPointers* backpointersStruct,
uint32_t inodeOffset /*in 64B units, from the earliest Inner node*/)
{
uint* backpointersArray = (uint*)backpointersStruct;
// BACKPOINTER_LAYOUT
uint new_index = inodeOffset; //<-layout canonical
//uint new_index = inodeOffset*16; //<-layout scattered
// uint new_index = (inodeOffset & (~0xFFFF)) | (((inodeOffset & 0xFF) << 8) | ((inodeOffset & 0xFF00) >> 8)); //<-layout hashed
return backpointersArray + new_index;
}
GRL_INLINE uint32_t BVHBase_GetRefitStructsDataSize(const BVHBase* p)
{
return 64u * (p->BVHDataEnd - p->backPointerDataStart);
}
GRL_INLINE uint32_t BVHBase_GetBackpointersDataSize(const BVHBase* p)
{
return 64u * (p->refitTreeletsDataStart - p->backPointerDataStart);
}
GRL_INLINE uint32_t* BVHBase_GetBVHDataEnd( const BVHBase* p )
{
return (uint32_t*)(((char*)p) + (size_t)(64u * p->BVHDataEnd));
}
GRL_INLINE bool BVHBase_HasBackPointers( const BVHBase* p )
{
return p->refitTreeletsDataStart > p->backPointerDataStart;
}
GRL_INLINE const size_t BVHBase_GetNumQuads(const BVHBase* p)
{
return p->quadLeafCur - p->quadLeafStart;
}
GRL_INLINE const size_t BVHBase_GetNumProcedurals(const BVHBase* p)
{
return p->proceduralDataCur - p->proceduralDataStart;
}
GRL_INLINE const size_t BVHBase_GetNumInstances(const BVHBase* p)
{
return (p->instanceLeafEnd - p->instanceLeafStart) / 2;
}
GRL_INLINE const size_t BVHBase_totalBytes(const BVHBase* p)
{
return p->BVHDataEnd * 64u;
}
struct HwInstanceLeaf
{
/* first 64 bytes accessed during traversal */
struct Part0
{
//uint32_t shaderIndex : 24;
//uint32_t geomMask : 8;
uint32_t DW0;
// uint32_t instanceContributionToHitGroupIndex : 24;
// uint32_t pad0 : 8
//
// NOTE: Traversal shaders are implemented by aliasing instance leaves as procedural and sending them through the procedural path
// For a procedural instance, bit 29 should be set to 1, to disable "opaque culling"
// and bits 30 and 31 must be zero. See also the definition of the 'PrimLeafDesc' structure
uint32_t DW1;
// uint64_t rootNodePtr : 48;
// uint64_t instFlags : 8;
// uint64_t pad1 : 8;
uint64_t DW2_DW3;
// Vec3f world2obj_vx; // 1st row of World2Obj transform
float world2obj_vx_x;
float world2obj_vx_y;
float world2obj_vx_z;
// Vec3f world2obj_vy; // 2nd row of World2Obj transform
float world2obj_vy_x;
float world2obj_vy_y;
float world2obj_vy_z;
// Vec3f world2obj_vz; // 3rd row of World2Obj transform
float world2obj_vz_x;
float world2obj_vz_y;
float world2obj_vz_z;
// Vec3f obj2world_p; // translation of Obj2World transform (on purpose in first 64 bytes)
float obj2world_p_x;
float obj2world_p_y;
float obj2world_p_z;
} part0;
/* second 64 bytes accessed during shading */
// NOTE: Everything in this block is under SW control
struct Part1
{
// uint64_t bvhPtr : 48;
// uint64_t pad : 16;
uint64_t DW0_DW1;
uint32_t instanceID;
uint32_t instanceIndex;
// Vec3f obj2world_vx; // 1st row of Obj2World transform
float obj2world_vx_x;
float obj2world_vx_y;
float obj2world_vx_z;
// Vec3f obj2world_vy; // 2nd row of Obj2World transform
float obj2world_vy_x;
float obj2world_vy_y;
float obj2world_vy_z;
// Vec3f obj2world_vz; // 3rd row of Obj2World transform
float obj2world_vz_x;
float obj2world_vz_y;
float obj2world_vz_z;
// Vec3f world2obj_p; // translation of World2Obj transform (stored here, in the second 64 bytes)
float world2obj_p_x;
float world2obj_p_y;
float world2obj_p_z;
} part1;
};
__constant const uint64_t c_one = 1ul;
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceMask( const HwInstanceLeaf* p )
{
return p->part0.DW0 >> 24;
}
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceContributionToHitGroupIndex( const HwInstanceLeaf* p )
{
return p->part0.DW1 & 0x00ffffff;
}
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceFlags( const HwInstanceLeaf* p )
{
return (p->part0.DW2_DW3 >> 48) & 0xff;
}
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceID( const HwInstanceLeaf* p )
{
return p->part1.instanceID;
}
GRL_INLINE gpuva_t HwInstanceLeaf_GetBVH( const HwInstanceLeaf* p ) { return p->part1.DW0_DW1 & ((c_one << 48) - 1); }
GRL_INLINE gpuva_t HwInstanceLeaf_GetStartNode( const HwInstanceLeaf* p ) { return p->part0.DW2_DW3 & ((c_one << 48) - 1); }
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceIndex( const HwInstanceLeaf* p ) { return p->part1.instanceIndex; }
GRL_INLINE void HwInstanceLeaf_GetTransform(struct HwInstanceLeaf* p, float* transform)
{
transform[0] = p->part1.obj2world_vx_x;
transform[1] = p->part1.obj2world_vy_x;
transform[2] = p->part1.obj2world_vz_x;
transform[3] = p->part0.obj2world_p_x;
transform[4] = p->part1.obj2world_vx_y;
transform[5] = p->part1.obj2world_vy_y;
transform[6] = p->part1.obj2world_vz_y;
transform[7] = p->part0.obj2world_p_y;
transform[8] = p->part1.obj2world_vx_z;
transform[9] = p->part1.obj2world_vy_z;
transform[10] = p->part1.obj2world_vz_z;
transform[11] = p->part0.obj2world_p_z;
}
GRL_INLINE void HwInstanceLeaf_SetBVH( HwInstanceLeaf* p, gpuva_t b ) {
uint64_t mask = ((c_one << 48) - 1);
uint64_t v = p->part1.DW0_DW1;
v = (b & mask) | (v & ~mask);
p->part1.DW0_DW1 = v;
}
GRL_INLINE void HwInstanceLeaf_SetStartNode( HwInstanceLeaf* p, gpuva_t b ) {
uint64_t mask = ((c_one << 48) - 1);
uint64_t v = p->part0.DW2_DW3;
v = (b & mask) | (v & ~mask);
p->part0.DW2_DW3 = v;
}
GRL_INLINE void HwInstanceLeaf_SetStartNodeAndInstanceFlags( HwInstanceLeaf* p,
gpuva_t root,
uint8_t flags ) {
uint64_t mask = ((1ull << 48) - 1);
uint64_t v = (root & mask) | ((uint64_t)(flags)<<48);
p->part1.DW0_DW1 = v;
}
struct InternalNode
{
float lower[3]; // world space origin of quantization grid
int32_t childOffset; // offset to all children in 64B multiples
uint8_t nodeType; // the type of the node
uint8_t pad; // unused byte
int8_t exp_x; // 2^exp_x is the size of the grid in x dimension
int8_t exp_y; // 2^exp_y is the size of the grid in y dimension
int8_t exp_z; // 2^exp_z is the size of the grid in z dimension
uint8_t nodeMask; // mask used for ray filtering
struct ChildData
{
//uint8_t blockIncr : 2; // size of child in 64 byte blocks. Must be ==2 for instance leaves, <=2 for quad leaves.
//uint8_t startPrim : 4; // start primitive in fat leaf mode or child type in mixed mode
//uint8_t pad : 2; // unused bits
uint8_t bits;
} childData[6];
uint8_t lower_x[6]; // the quantized lower bounds in x-dimension
uint8_t upper_x[6]; // the quantized upper bounds in x-dimension
uint8_t lower_y[6]; // the quantized lower bounds in y-dimension
uint8_t upper_y[6]; // the quantized upper bounds in y-dimension
uint8_t lower_z[6]; // the quantized lower bounds in z-dimension
uint8_t upper_z[6]; // the quantized upper bounds in z-dimension
};
GRL_INLINE uint InternalNode_GetChildBlockIncr( const InternalNode* p, uint idx )
{
return p->childData[idx].bits & 3;
}
GRL_INLINE uint InternalNode_GetChildStartPrim( const InternalNode* p, uint idx )
{
return (p->childData[idx].bits>>2) & 0xf;
}
GRL_INLINE uint8_t InternalNode_GetChildType( const InternalNode* p, uint idx )
{
return (p->childData[idx].bits >> 2) & 0xF;
}
GRL_INLINE void InternalNode_SetChildType( InternalNode* p, uint idx, uint type )
{
uint bits = p->childData[idx].bits;
const uint mask = (0xF << 2);
bits = ((type << 2) & mask) | (bits & ~mask);
p->childData[idx].bits = (uint8_t)bits;
}
GRL_INLINE bool InternalNode_IsChildValid( const InternalNode* p, size_t child )
{
bool lower = p->lower_x[child] & 0x80; // invalid nodes are indicated by setting lower_msb = 1 and upper_msb=0
bool upper = p->upper_x[child] & 0x80;
return !lower || upper;
}
GRL_INLINE AABB3f InternalNode_GetChildAABB(const InternalNode* node, size_t i)
{
float4 lower, upper;
const float4 base = { node->lower[0], node->lower[1], node->lower[2], 0.0f };
const int4 lower_i = { node->lower_x[i], node->lower_y[i], node->lower_z[i], 0 };
const int4 upper_i = { node->upper_x[i], node->upper_y[i], node->upper_z[i], 0 };
const int4 exp_i = { node->exp_x, node->exp_y, node->exp_z, 0 };
lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
AABB3f aabb3f = {
{ lower.x, lower.y, lower.z },
{ upper.x, upper.y, upper.z } };
return aabb3f;
}
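// Sketch of the dequantization used above: each axis stores 8-bit grid coordinates,
// the grid spans 2^exp units, so one grid step is 2^(exp - 8) and a quantized value
// q decodes to node->lower[axis] + q * 2^(exp - 8). The helper name is illustrative.
GRL_INLINE float InternalNode_DequantizeLowerX_example(const InternalNode* node, uint i)
{
    // e.g. exp_x = 10 and lower_x[i] = 128 give an offset of 128 * 2^2 = 512.0
    return node->lower[0] + bitShiftLdexp((float)node->lower_x[i], node->exp_x - 8);
}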
GRL_INLINE void* InternalNode_GetChildren( InternalNode* node)
{
return (void*)(((char*)node) + node->childOffset * 64);
}
typedef struct PrimLeafDesc
{
//uint32_t shaderIndex : 24; // shader index used for shader record calculations
//uint32_t geomMask : 8; // geometry mask used for ray masking
uint32_t shaderIndex_geomMask;
//uint32_t geomIndex : 29; // the geometry index specifies the n'th geometry of the scene
//PrimLeafType type : 1; // see above
//GeometryFlags geomFlags : 2; // geometry flags of this geometry
uint32_t geomIndex_flags;
} PrimLeafDesc;
GRL_INLINE uint32_t PrimLeaf_GetShaderIndex( const PrimLeafDesc* p )
{
return p->shaderIndex_geomMask & ((1 << 24) - 1);
}
GRL_INLINE uint32_t PrimLeaf_GetGeoIndex( const PrimLeafDesc* p )
{
return p->geomIndex_flags & ((1<<29)-1);
}
GRL_INLINE uint32_t PrimLeaf_GetGeomFlags( const PrimLeafDesc* p )
{
return (p->geomIndex_flags >> 30);
}
GRL_INLINE uint32_t PrimLeaf_GetType(const PrimLeafDesc* p)
{
return (p->geomIndex_flags >> 29) & 1;
}
struct QuadLeaf
{
PrimLeafDesc leafDesc;
uint32_t primIndex0;
//uint32_t primIndex1Delta : 16;
//uint32_t j0 : 2;
//uint32_t j1 : 2;
//uint32_t j2 : 2;
//uint32_t last : 1; // last quad in list
//uint32_t pad : 9;
uint32_t DW1;
float v[4][3];
};
GRL_INLINE uint32_t QuadLeaf_GetPrimIndexDelta( const QuadLeaf* p )
{
return p->DW1 & 0x0000ffff;
}
GRL_INLINE uint32_t QuadLeaf_GetPrimIndex0( const QuadLeaf* p )
{
return p->primIndex0;
}
GRL_INLINE uint32_t QuadLeaf_GetPrimIndex1( const QuadLeaf* p )
{
return p->primIndex0 + QuadLeaf_GetPrimIndexDelta(p);
}
GRL_INLINE bool QuadLeaf_IsSingleTriangle( const QuadLeaf* p )
{
return QuadLeaf_GetPrimIndexDelta(p) == 0;
}
GRL_INLINE uint32_t QuadLeaf_GetSecondTriangleIndices( const QuadLeaf* p )
{
return (p->DW1>>16) & 0x3f;
}
GRL_INLINE void QuadLeaf_SetVertices( QuadLeaf* quad, float3 v0, float3 v1, float3 v2, float3 v3 )
{
quad->v[0][0] = v0.x;
quad->v[0][1] = v0.y;
quad->v[0][2] = v0.z;
quad->v[1][0] = v1.x;
quad->v[1][1] = v1.y;
quad->v[1][2] = v1.z;
quad->v[2][0] = v2.x;
quad->v[2][1] = v2.y;
quad->v[2][2] = v2.z;
quad->v[3][0] = v3.x;
quad->v[3][1] = v3.y;
quad->v[3][2] = v3.z;
}
struct ProceduralLeaf {
PrimLeafDesc leafDesc;
// Number of primitives + "last" bits.
// The meaning of this section is SW-defined and flexible
uint32_t DW1 ;
uint32_t _primIndex[13];
} ;
GRL_NAMESPACE_END(GEN12)
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)

View file

@@ -1,152 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
//********************************************************************************************
// WARNING!!!!!
//
// This file is shared by OpenCL and C++ source code and must be a pure C header
// There should only be C structure definitions and trivial inline functions here
//
//********************************************************************************************
#pragma once
#include "GRLOCLCompatibility.h"
GRL_NAMESPACE_BEGIN(GRL)
typedef uint32_t dword;
typedef uint64_t qword;
typedef qword gpuva_t;
enum_uint8( InstanceFlags )
{
INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1,
INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2,
INSTANCE_FLAG_FORCE_OPAQUE = 0x4,
INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8,
};
enum_uint8( GeometryFlags )
{
GEOMETRY_FLAG_NONE = 0x0,
GEOMETRY_FLAG_OPAQUE = 0x1,
GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2,
};
enum_uint8( GeometryType )
{
GEOMETRY_TYPE_TRIANGLES = 0,
GEOMETRY_TYPE_PROCEDURAL = 1,
NUM_GEOMETRY_TYPES = 2
};
// NOTE: Does NOT match DXR
enum_uint8( IndexFormat )
{
INDEX_FORMAT_NONE = 0, // indicates non-indexed geometry
INDEX_FORMAT_R16_UINT = 2,
INDEX_FORMAT_R32_UINT = 4,
INDEX_FORMAT_END = INDEX_FORMAT_R32_UINT + 1
};
// NOTE: Does NOT match DXR
enum_uint8( VertexFormat )
{
VERTEX_FORMAT_R32G32_FLOAT = 0,
VERTEX_FORMAT_R32G32B32_FLOAT = 1,
VERTEX_FORMAT_R16G16_FLOAT = 2,
VERTEX_FORMAT_R16G16B16A16_FLOAT = 3,
VERTEX_FORMAT_R16G16_SNORM = 4,
VERTEX_FORMAT_R16G16B16A16_SNORM = 5,
VERTEX_FORMAT_R16G16B16A16_UNORM = 6,
VERTEX_FORMAT_R16G16_UNORM = 7,
VERTEX_FORMAT_R10G10B10A2_UNORM = 8,
VERTEX_FORMAT_R8G8B8A8_UNORM = 9,
VERTEX_FORMAT_R8G8_UNORM = 10,
VERTEX_FORMAT_R8G8B8A8_SNORM = 11,
VERTEX_FORMAT_R8G8_SNORM = 12,
VERTEX_FORMAT_END = VERTEX_FORMAT_R8G8_SNORM + 1
};
enum_uint32(RTASFlags)
{
// These flags match DXR
BUILD_FLAG_ALLOW_UPDATE = 1<<0,
BUILD_FLAG_ALLOW_COMPACTION = 1<<1,
BUILD_FLAG_PREFER_FAST_TRACE = 1<<2,
BUILD_FLAG_PREFER_FAST_BUILD = 1<<3,
BUILD_FLAG_MINIMIZE_MEMORY = 1<<4,
BUILD_FLAG_PERFORM_UPDATE = 1<<5,
// internal flags start here
BUILD_FLAG_DISALLOW_REBRAID = 1<<16,
BUILD_FLAG_ALL = 0x0001003f
};
enum_uint8(BVHType)
{
BVH_TYPE_NONE, // This is a sentinel for drivers to use when compiling out GRL on non-RT devices
BVH_TYPE_GEN12,
};
enum_uint8(PostBuildInfoType)
{
PBI_CURRENT_SIZE,
PBI_COMPACTED_SIZE,
PBI_DXR_TOOLS_VISUALIZATION_DESC,
PBI_DXR_SERIALIZATION_DESC,
};
enum_uint32(HazardTypes)
{
HAZARD_RTAS_READ = 1 << 0,
HAZARD_RTAS_WRITE = 1 << 1,
HAZARD_READ = 1 << 2,
HAZARD_WRITE = 1 << 3,
HAZARD_ALL = 0xf
};
enum_uint32(RaytracingAccelerationStructureType)
{
TOP_LEVEL = 0x0,
BOTTOM_LEVEL = 0x1,
};
typedef struct PostbuildInfoCurrentSize
{
uint64_t CurrentSizeInBytes;
} PostbuildInfoCurrentSize;
typedef struct PostbuildInfoCompactedSize
{
uint64_t CompactedSizeInBytes;
} PostbuildInfoCompactedSize;
typedef struct PostbuildInfoToolsVisualizationDesc
{
uint64_t DecodedSizeInBytes;
} PostbuildInfoToolsVisualizationDesc;
typedef struct PostbuildInfoSerializationDesc
{
uint64_t SerializedSizeInBytes;
uint64_t NumBottomLevelAccelerationStructurePointers;
} PostbuildInfoSerializationDesc;
typedef struct DecodeHeader
{
RaytracingAccelerationStructureType Type;
uint32_t NumDesc;
} DecodeHeader;
GRL_NAMESPACE_END(GRL)

View file

@@ -1,210 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#ifdef __OPENCL_VERSION__
typedef uchar uint8_t;
typedef ushort uint16_t;
typedef uint uint32_t;
typedef ulong uint64_t;
typedef char int8_t;
typedef short int16_t;
typedef int int32_t;
typedef long int64_t;
#else
#include <stdint.h>
typedef uint8_t uchar;
typedef uint16_t ushort;
typedef uint32_t uint;
typedef uint64_t ulong;
#define __constant
#define __global
typedef struct uint2
{
#ifdef __cplusplus
uint2() {};
uint2( uint ix, uint iy ) : x( ix ), y( iy ) {};
#endif
uint x;
uint y;
} uint2;
typedef struct uint3
{
#ifdef __cplusplus
uint3() {};
uint3( uint ix, uint iy, uint iz ) : x( ix ), y( iy ), z( iz ) {};
#endif
uint x;
uint y;
uint z;
} uint3;
typedef struct int3
{
int32_t x;
int32_t y;
int32_t z;
#ifdef __cplusplus
int3() {};
int3(int32_t ix, int32_t iy, int32_t iz) : x(ix), y(iy), z(iz) {};
int3 operator+(const int32_t i) const { return int3(this->x + i, this->y + i, this->z + i); }
int3 operator<<(const int32_t i) const { return int3(this->x << i, this->y << i, this->z << i); }
#endif
} int3;
typedef struct int4
{
int32_t x;
int32_t y;
int32_t z;
int32_t w;
#ifdef __cplusplus
int4() {};
int4(int32_t ix, int32_t iy, int32_t iz, int32_t iw) : x(ix), y(iy), z(iz), w(iw) {};
int4 operator+(const int32_t i) const { return int4(this->x + i, this->y + i, this->z + i, this->w + i); }
int4 operator-(const int32_t i) const { return int4(this->x - i, this->y - i, this->z - i, this->w - i); }
int4 operator<<(const int32_t i) const { return int4(this->x << i, this->y << i, this->z << i, this->w << i); }
#endif
} int4;
typedef struct float3
{
float x;
float y;
float z;
#ifdef __cplusplus
float3(){};
float3( float ix, float iy, float iz ) : x(ix), y(iy), z(iz){};
float3 operator+( const float3& f3 ) { return float3( this->x + f3.x, this->y + f3.y, this->z + f3.z ); }
float3 operator*( const float& f ) { return float3( this->x * f, this->y * f, this->z * f ); }
float3 operator*( const float3& f3 ) const { return float3(this->x * f3.x, this->y * f3.y, this->z * f3.z); }
float3 operator-() { return float3(-this->x, -this->y, -this->z); }
float3 operator-( const float3& f3) { return float3(this->x - f3.x, this->y - f3.y, this->z - f3.z); }
#endif
} float3;
typedef struct float4
{
float x;
float y;
float z;
float w;
#ifdef __cplusplus
float4() {};
float4( float ix, float iy, float iz, float iw ) : x( ix ), y( iy ), z( iz ), w( iw ) {};
float4 operator+(const float4& f4) const { return float4(this->x + f4.x, this->y + f4.y, this->z + f4.z, this->w + f4.w); }
float4 operator*(const float4& f4) const { return float4(this->x * f4.x, this->y * f4.y, this->z * f4.z, this->w * f4.w); }
#endif
} float4;
#endif /* ! __OPENCL_VERSION__ */
#ifndef __cplusplus
#define GRL_NAMESPACE_BEGIN(x)
#define GRL_NAMESPACE_END(x)
#define GRL_OVERLOADABLE __attribute((overloadable))
#define GRL_INLINE __attribute__((always_inline)) inline static
# define enum_uint8(name) \
typedef uint8_t name; \
enum name##_uint32
# define enum_uint16(name) \
typedef uint16_t name; \
enum name##_uint32
# define enum_uint32(name) \
typedef uint32_t name; \
enum name##_uint32
#define OCL_BYTE_ALIGN(n) __attribute__ ((aligned (n)))
#define GRL_STATIC_ASSERT(condition,desc)
#else /* C++ */
#ifdef __OPENCL_VERSION__
#error "OpenCL C++ not supported by this header"
#endif
#define GRL_NAMESPACE_BEGIN(x) namespace x {
#define GRL_NAMESPACE_END(x) }
#define GRL_OVERLOADABLE
#define GRL_INLINE inline
#define enum_uint8(N) enum N : uint8_t
#define enum_uint16(N) enum N : uint16_t
#define enum_uint32(N) enum N : uint32_t
#define OCL_BYTE_ALIGN(n)
#define GRL_STATIC_ASSERT(condition,desc) static_assert( condition, desc )
#include <cmath>
inline float3 fmin(float3 a, float3 b)
{
float3 o = { std::fmin(a.x, b.x), std::fmin(a.y, b.y), std::fmin(a.z, b.z) };
return o;
}
inline float3 fmax(float3 a, float3 b)
{
float3 o = { std::fmax(a.x, b.x), std::fmax(a.y, b.y), std::fmax(a.z, b.z) };
return o;
}
inline float3 operator/(const float3& f3, const float& f) { return float3(f3.x / f, f3.y / f, f3.z / f); }
inline float dot(const float3& a, const float3& b) {
return a.x * b.x + a.y * b.y + a.z * b.z;
}
inline float as_float(uint32_t i)
{
union { float f; uint32_t i; } fi;
fi.i = i;
return fi.f;
}
inline float3 as_float3(int3 i3)
{
float3 o = { as_float(i3.x), as_float(i3.y), as_float(i3.z) };
return o;
}
inline float4 as_float4(int4 i4)
{
float4 o = { as_float(i4.x), as_float(i4.y), as_float(i4.z), as_float(i4.w) };
return o;
}
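// Host-side approximation: the explicitly-rounded conversions below fall back to the
// default static_cast (round-to-nearest), which is presumably precise enough for
// CPU-side tooling.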
inline float4 convert_float4_rtn(int4 i4)
{
return float4(static_cast<float>(i4.x), static_cast<float>(i4.y), static_cast<float>(i4.z), static_cast<float>(i4.w));
}
inline float4 convert_float4_rtp(int4 i4)
{
return convert_float4_rtn(i4);
}
#endif
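// Expansion sketch for the enum_* helpers above, using enum_uint8(NodeType) as an
// example:
//
//   OpenCL C path:
//       typedef uint8_t NodeType;
//       enum NodeType_uint32 { ... };
//     (constants are ordinary enum values; the typedef fixes the storage size to 1 byte)
//
//   C++ path:
//       enum NodeType : uint8_t { ... };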

View file

@ -1,142 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
//
// This file is to contain structure definitions for RTAS-related meta-data.
// The structures here should be generic enough to apply to any acceleration structure.
// If we ever move to KD-Trees or Octrees, this file should not need to change.
//
//********************************************************************************************
// WARNING!!!!!
//
// This file is shared by OpenCL and C++ source code and must be a pure C header
// There should only be C structure definitions and trivial inline functions here
//
//********************************************************************************************
#pragma once
#include "GRLIntTypes.h"
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
typedef struct SerializationIdentifier
{
uint8_t Bytes[16];
} SerializationIdentifier;
GRL_STATIC_ASSERT(sizeof(SerializationIdentifier) == 16, "Wrong size!");
// Header structure for RTAS serialization.
// This structure is binary-compatible with the DXR and Vulkan API definitions
typedef struct SerializationHeader
{
SerializationIdentifier DriverID; // DXR 'DriverOpaqueGUID'. Vulkan: 'driverUUID'
SerializationIdentifier GRLID; // DXR 'DriverOpaqueVersioningData'. Vulkan: 'accelerationStructureUUID'
uint64_t SerializedSizeInBytesIncludingHeader;
uint64_t DeserializedSizeInBytes;
uint64_t InstanceHandleCount;
} SerializationHeader;
GRL_STATIC_ASSERT(sizeof(SerializationHeader) == 56, "Wrong size!");
// This structure is binary-compatible with DXR and Vulkan 'InstanceDesc' structures
typedef struct InstanceDesc {
float Transform[3][4];
uint32_t InstanceIDAndMask; // mask in 8 msbs
uint32_t InstanceContributionToHitGroupIndexAndFlags; // flags in 8 msbs
gpuva_t AccelerationStructureGPUVA; // NOTE: In GRL this is always a VA. Vulkan CPU builds use handles here, and these may need to be translated
} InstanceDesc;
GRL_STATIC_ASSERT(sizeof(InstanceDesc) == 64, "Wrong size!");
typedef struct GeoMetaData{
uint32_t PrimitiveCount;
uint16_t Type;
uint16_t Flags;
} GeoMetaData;
GRL_STATIC_ASSERT(sizeof(GeoMetaData) == 8, "Wrong size!");
typedef struct AABB3f {
float lower[3];
float upper[3];
} AABB3f;
GRL_STATIC_ASSERT(sizeof(AABB3f) == 24, "Wrong size!");
enum_uint32(error_t_) {
error_t_no_error = 0x0,
error_t_internal_node_child_OOB = 0x1,
error_t_leaf_node_child_OOB = 0x2,
error_t_unrecognised_node_t = 0x4,
error_t_mixed_node_unsupported = 0x8,
error_t_instance_pointers_inconsistent = 0x10,
error_t_instance_pointed_root_not_internal = 0x20,
error_t_leaf_node_instance_child_missed_by_64B = 0x40,
error_t_internal_node_child_cycle = 0x80,
error_t_input_geo_insane = 0x100,
error_t_quad_leaf_broken = 0x200,
error_t_backpointer_not_reset = 0x400,
error_t_backpointer_wrong_children_num = 0x500,
error_t_backpointer_inconsitent_parent_child = 0x600,
error_t_backpointer_root_not_root_error = 0x700,
error_t_backpointer_OOB = 0x800,
error_t_backpointers_buffer_too_small = 0x900,
error_t_atomic_update_struct_fatleaf_count_oob = 0x1000, // for this and following:
error_t_atomic_update_struct_fatleaf_node_idx_oob = 0x2000, // offset_in_BVH is just index in fatleaf or inner node arrays
error_t_atomic_update_struct_fatleaf_backpointer_mismatch = 0x3000,
error_t_atomic_update_struct_fatleaf_num_children_error = 0x4000,
error_t_atomic_update_struct_fatleaf_children_non_leaf = 0x5000,
error_t_atomic_update_struct_inner_count_oob = 0x6000,
error_t_atomic_update_struct_inner_node_idx_oob = 0x7000,
error_t_atomic_update_struct_inner_node_child_idx_error = 0x8000,
error_t_atomic_update_struct_inner_num_children_error = 0x9000,
error_t_atomic_update_struct_inner_children_non_internal = 0xA000,
error_t_unknown = 1u << 31,
};
enum_uint32(error_phase_t) {
error_phase_t_unknown = 0,
error_phase_t_post_build_Morton = 1,
error_phase_t_post_build_Trivial = 2,
error_phase_t_post_build_NewSAH = 3,
error_phase_t_post_update = 4,
error_phase_t_pre_update = 5,
error_phase_t_post_copy_op = 6,
};
typedef struct ERROR_INFO {
error_t_ type;
uint offset_in_BVH; //in 64B units
error_phase_t when;
uint reserved;
} ERROR_INFO;
// Meta-data common to all acceleration structures, which is needed to implement required functionality
// All RTAS structures must contain a struct of this type named 'Meta'
typedef struct RTASMetaData {
struct AABB3f bounds;
uint32_t instanceDescsStart; // byte offset to array of original instance_descs used for build. Required for DXR visualization and serialization
uint32_t instanceCount;
uint32_t geoDescsStart; // byte offset to array of 'GeoMetaData' matching input geos. Required for DXR visualization
uint32_t geoCount;
uint64_t allocationSize; // Size of the memory allocation containing this RTAS
// This is the size given to the app in the prebuild info when the RTAS was first created
// If RTAS was compacted, this will be the compacted size
ERROR_INFO errors; // only used in debug mode
} RTASMetaData;
GRL_STATIC_ASSERT( sizeof(RTASMetaData) == 64, "Wrong size!");
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)

View file

@ -1,60 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "GRLIntTypes.h"
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(_INTERNAL)
struct GeometryTriangles
{
gpuva_t pTransformBuffer;
gpuva_t pIndexBuffer;
gpuva_t pVertexBuffer;
qword VertexBufferByteStride;
dword IndexCount;
dword VertexCount;
IndexFormat IndexFormat;
VertexFormat VertexFormat;
};
struct GeometryProcedural
{
gpuva_t pAABBs_GPUVA; ///< elements of pAABBs_GPUVA are in gpuAABB format.
qword AABBByteStride;
dword AABBCount;
};
// TODO: we are missing an 'unsigned int ShaderIndex_Mask; // extension' field
struct Geo
{
union
{
struct GeometryTriangles Triangles;
struct GeometryProcedural Procedural;
} Desc;
GeometryType Type;
uint8_t Flags;
};
// Matches the Vulkan VkAccelerationStructureBuildRangeInfoKHR structure
// See Vulkan spec for data access rules:
// https://registry.khronos.org/vulkan/specs/latest/man/html/VkAccelerationStructureBuildRangeInfoKHR.html
//
struct IndirectBuildRangeInfo
{
dword primitiveCount; // Number of primitives
dword primitiveOffset; // Byte offset to primitive data
dword firstVertex; // Index of first vertex
dword transformOffset; // Byte offset to transform data (for triangle Geo with non-null transform)
};
GRL_NAMESPACE_END(_INTERNAL)
GRL_NAMESPACE_END(GRL)

View file

@@ -1,32 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "GRLOCLCompatibility.h"
GRL_NAMESPACE_BEGIN(GRL)
GRL_INLINE float4 bitShiftLdexp4(float4 x, int4 y)
{
y = (y + 127) << 23;
return x * as_float4(y);
}
GRL_INLINE float3 bitShiftLdexp3(float3 x, int3 y)
{
y = (y + 127) << 23;
return x * as_float3(y);
}
GRL_INLINE float bitShiftLdexp(float x, int y)
{
y = (y + 127) << 23;
return x * as_float(y);
}
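// Note on the helpers above: they build the constant 2^y directly from its IEEE-754
// bit pattern, since (y + 127) << 23 places the biased exponent in bits 30:23 with a
// zero mantissa, so as_float() of it is exactly 2^y (valid for normal exponents,
// roughly y in [-126, 127], which covers the quantization exponents used here).
// Worked example: y = 3 gives (3 + 127) << 23 = 0x41000000, as_float = 8.0f,
// so bitShiftLdexp(x, 3) == x * 8.0f.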
GRL_NAMESPACE_END(GRL)

View file

@@ -1,192 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "GRLRTASCommon.h"
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
inline float3 GRL_OVERLOADABLE cross(const float3 a, const float3 b)
{
float3 res = { a.y * b.z - a.z * b.y,
a.z * b.x - a.x * b.z,
a.x * b.y - a.y * b.x };
return res;
}
struct LinearSpace3f
{
float3 vx;
float3 vy;
float3 vz;
};
/* compute the determinant of the matrix */
GRL_INLINE struct LinearSpace3f LinearSpace3f_Constructor(const float3 vx, const float3 vy, const float3 vz)
{
struct LinearSpace3f xfm;
xfm.vx = vx;
xfm.vy = vy;
xfm.vz = vz;
return xfm;
}
/* compute the determinant of the matrix */
GRL_INLINE float LinearSpace3f_det(struct LinearSpace3f xfm)
{
return dot(xfm.vx, cross(xfm.vy, xfm.vz));
}
/* compute transposed matrix */
GRL_INLINE struct LinearSpace3f LinearSpace3f_transpose(struct LinearSpace3f in)
{
float3 x = { in.vx.x, in.vy.x, in.vz.x };
float3 y = { in.vx.y, in.vy.y, in.vz.y };
float3 z = { in.vx.z, in.vy.z, in.vz.z };
return LinearSpace3f_Constructor(x,
y,
z);
}
/* compute adjoint matrix */
GRL_INLINE const struct LinearSpace3f LinearSpace3f_adjoint(struct LinearSpace3f in)
{
return LinearSpace3f_transpose(LinearSpace3f_Constructor(cross(in.vy, in.vz),
cross(in.vz, in.vx),
cross(in.vx, in.vy)));
}
/* compute inverse matrix */
GRL_INLINE struct LinearSpace3f LinearSpace3f_invert(struct LinearSpace3f in)
{
const float det = LinearSpace3f_det(in);
const struct LinearSpace3f adj = LinearSpace3f_adjoint(in);
return LinearSpace3f_Constructor(adj.vx / det, adj.vy / det, adj.vz / det);
}
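// Note on the construction above: the adjoint used here is the transpose of the
// cross-product (cofactor) matrix, so the inverse is adj(M) / det(M). Sanity check
// with the identity basis: cross(vy, vz) = vx, cross(vz, vx) = vy, cross(vx, vy) = vz
// and det = 1, which reproduces the identity matrix.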
GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct LinearSpace3f xfm, float3 p)
{
return xfm.vx * p.x + xfm.vy * p.y + xfm.vz * p.z;
}
struct AffineSpace3f
{
struct LinearSpace3f l;
float3 p;
};
GRL_INLINE struct AffineSpace3f AffineSpace3f_Constructor(struct LinearSpace3f l, float3 p)
{
struct AffineSpace3f out;
out.l = l;
out.p = p;
return out;
}
GRL_INLINE struct AffineSpace3f AffineSpace3f_load_row_major(const float *in)
{
struct AffineSpace3f out;
out.l.vx.x = in[0];
out.l.vx.y = in[4];
out.l.vx.z = in[8];
out.l.vy.x = in[1];
out.l.vy.y = in[5];
out.l.vy.z = in[9];
out.l.vz.x = in[2];
out.l.vz.y = in[6];
out.l.vz.z = in[10];
out.p.x = in[3];
out.p.y = in[7];
out.p.z = in[11];
return out;
}
// Squared proportion of the oriented (transformed) cube to the axis-aligned box that would contain it.
// The smaller it is, the more overhead the transformation produces.
GRL_INLINE
float transformation_bbox_surf_overhead(const float* Transform)
{
// We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area
// New AABB is center +- Extent.
//
// For derivation see:
// https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/
//
// take the cube of side 1 and compare the AABB that contains its transformed version against the surface of the transformed (oriented) box itself
float ex = fabs(Transform[0]) + fabs(Transform[1]) + fabs(Transform[2]);
float ey = fabs(Transform[4]) + fabs(Transform[5]) + fabs(Transform[6]);
float ez = fabs(Transform[8]) + fabs(Transform[9]) + fabs(Transform[10]);
// we will compare squared sizes
ex = ex * ex;
ey = ey * ey;
ez = ez * ez;
// surface of aabb containing oriented box;
float aabb_sq_half_surf = ex * ey + ey * ez + ez * ex;
// ^2 lengths of transformed <1,0,0>, <0,1,0>, <0,0,1>
float obx = Transform[0] * Transform[0] + Transform[4] * Transform[4] + Transform[8] * Transform[8];
float oby = Transform[1] * Transform[1] + Transform[5] * Transform[5] + Transform[9] * Transform[9];
float obz = Transform[2] * Transform[2] + Transform[6] * Transform[6] + Transform[10] * Transform[10];
float obb_sq_half_surf = obx * oby + oby * obz + obz * obx;
return obb_sq_half_surf / aabb_sq_half_surf;
// Worked example, uniform scale-by-2 transform (rows (2,0,0,t), (0,2,0,t), (0,0,2,t)):
//   ex = ey = ez = 2.0, squared: 4.0 each
//   aabb_sq_half_surf = 4*4 + 4*4 + 4*4 = 48
//   obx = oby = obz = 4.0 (squared column lengths)
//   obb_sq_half_surf = 4*4 + 4*4 + 4*4 = 48
//   ratio = 48 / 48 = 1.0, i.e. an axis-aligned transform adds no box overhead
}
GRL_INLINE void load_row_major_from_AffineSpace3f(struct AffineSpace3f in, float* out)
{
out[0] = in.l.vx.x;
out[4] = in.l.vx.y;
out[8] = in.l.vx.z;
out[1] = in.l.vy.x;
out[5] = in.l.vy.y;
out[9] = in.l.vy.z;
out[2] = in.l.vz.x;
out[6] = in.l.vz.y;
out[10] = in.l.vz.z;
out[3] = in.p.x;
out[7] = in.p.y;
out[11] = in.p.z;
}
GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct AffineSpace3f xfm, float3 p)
{
return xfmPoint(xfm.l, p) + xfm.p;
}
/* compute inverse matrix */
GRL_INLINE struct AffineSpace3f AffineSpace3f_invert(struct AffineSpace3f in)
{
const struct LinearSpace3f il = LinearSpace3f_invert(in.l);
float3 ip = -xfmPoint(il, in.p);
return AffineSpace3f_Constructor(il, ip);
}
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)

View file

@@ -1,186 +0,0 @@
# Copyright © 2021 Intel Corporation
# SPDX-License-Identifier: MIT
grl_lib_files = [
'gpu/libs/libraries.grl',
]
grl_grl_files = [
'gpu/build_leaf.grl',
'gpu/build_primref.grl',
# 'gpu/build_refit.grl',
'gpu/copy.grl',
# 'gpu/grl_api_interface_verify.grl',
'gpu/misc.grl',
# 'gpu/morton_builder.grl',
# 'gpu/msb_radix_bitonic_sort.grl',
'gpu/new_sah_builder.grl',
'gpu/postbuild_info.grl',
# 'gpu/presplit.grl',
# 'gpu/radix_sort.grl',
# 'gpu/rebraid.grl',
# 'gpu/traversal_shader.grl',
]
grl_lib_args = []
foreach libfile : grl_lib_files
grl_lib_args += '--library'
grl_lib_args += files(libfile)
endforeach
grl_genX_files = [
'genX_grl_dispatch.c',
'genX_grl_uuid.cpp',
]
grl_lib_args = []
foreach libfile : grl_lib_files
grl_lib_args += '--library'
grl_lib_args += files(libfile)
endforeach
grl_cl_kernel_h = custom_target(
'grl_cl_kernel.h',
input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
output : 'grl_cl_kernel.h',
command : [
prog_python, '@INPUT0@', '--out-h', '@OUTPUT@',
grl_lib_args, files(grl_grl_files),
],
)
has_ply = run_command(
prog_python, '-c',
'''
import ply
''', check : false)
if has_ply.returncode() != 0
error('Python (3.x) ply module required to build GRL kernels.')
endif
r = run_command(prog_python, 'grl_cl_kernel_gen.py',
grl_lib_args, '--ls-kernels', grl_grl_files, check : false)
assert(r.returncode() == 0, 'Failed to fetch GRL CL kernels')
grl_kernels = r.stdout().strip().split()
grl_metakernel_c = []
grl_metakernel_h = []
foreach grl_file : grl_grl_files
base_outfile = 'grl_metakernel_' + fs.replace_suffix(fs.name(grl_file), '')
outfiles = custom_target(
base_outfile,
input : ['grl_metakernel_gen.py', grl_file, grl_lib_files],
output : [base_outfile + '.h', base_outfile + '.c'],
command : [
prog_python, '@INPUT0@', '--out-h', '@OUTPUT0@',
'--out-c', '@OUTPUT1@', grl_lib_args, '@INPUT1@',
],
)
grl_metakernel_h += outfiles[0]
grl_metakernel_c += outfiles[1]
endforeach
grl_genX_libs = []
foreach t : [['125', 'gfx125', 'dg2'], ['200', 'gfx20', 'lnl'],
['300', 'gfx30', 'ptl'], ]
verX10 = t[0]
genX_prefix = t[1]
platform = t[2]
grl_compiled_cl_kernels = []
foreach k : grl_kernels
# get_cl_files dumps out filename:entrypoint:libfile1,libfile2,libfile3
cl_file = k.split(':')[0]
entrypoint = k.split(':')[1]
library_files = k.split(':')[2]
kernel_prefix = '_'.join([
genX_prefix,
fs.replace_suffix(cl_file, '').replace('gpu/', '').replace('/', '_'),
entrypoint
])
input_args = [ files(cl_file), ]
if library_files != ''
foreach lib_file : library_files.split(',')
input_args += [ lib_file ]
endforeach
endif
prepended_input_args = []
foreach input_arg : input_args
prepended_input_args += ['--in', input_arg]
endforeach
outfile = kernel_prefix + '.h'
grl_compiled_cl_kernels += custom_target(
outfile,
input : cl_file,
output : outfile,
command : [
prog_intel_clc, '-p', platform, '--prefix', kernel_prefix,
'-e', entrypoint, prepended_input_args, '-o', '@OUTPUT@', '--',
'-cl-std=cl2.0', '-D__OPENCL_VERSION__=200',
'-DMAX_HW_SIMD_WIDTH=16', '-DMAX_WORKGROUP_SIZE=16',
'-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
'-I' + join_paths(meson.current_source_dir(), 'gpu'),
'-I' + join_paths(meson.current_source_dir(), 'include'),
],
env: ['MESA_SHADER_CACHE_DISABLE=true',
'MESA_SPIRV_LOG_LEVEL=error'],
depends : dep_prog_intel_clc
)
endforeach
grl_cl_kernel_c = custom_target(
'grl_@0@_cl_kernel.c'.format(genX_prefix),
input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
output : 'grl_@0@_cl_kernel.c'.format(genX_prefix),
command : [
prog_python, '@INPUT0@', '--out-c', '@OUTPUT@',
grl_lib_args, '--prefix', genX_prefix, files(grl_grl_files),
],
)
grl_genX_libs += static_library(
'grl_@0@'.format(genX_prefix),
[grl_cl_kernel_h, grl_compiled_cl_kernels, grl_cl_kernel_c,
grl_genX_files, grl_metakernel_c, grl_metakernel_h],
include_directories : [
inc_include, inc_src,
inc_intel,
],
c_args : [
no_override_init_args, sse2_args,
'-DGFX_VERx10=@0@'.format(verX10),
],
cpp_args : [
sse2_args,
'-DGFX_VERx10=@0@'.format(verX10),
],
dependencies : [
dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers,
idep_vulkan_runtime_headers, idep_anv_headers, idep_genxml,
],
gnu_symbol_visibility : 'hidden',
)
endforeach
libgrl_deps = [
dep_valgrind,
idep_nir_headers,
idep_vulkan_util_headers,
idep_vulkan_wsi_headers,
]
libgrl = static_library(
'grl',
[grl_cl_kernel_h],
include_directories : [
inc_include, inc_src, inc_intel,
],
link_whole : [grl_genX_libs],
dependencies : [libgrl_deps, idep_anv_headers],
)
idep_grl = declare_dependency(
link_with : libgrl,
dependencies : libgrl_deps,
sources : [grl_metakernel_h, grl_cl_kernel_h],
include_directories : include_directories('include', 'gpu'),
)
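Aside (not part of the removed build file): the per-kernel loop above hinges on the entry format printed by grl_cl_kernel_gen.py --ls-kernels, described by the get_cl_files comment as filename:entrypoint:libfile1,libfile2,... The short Python sketch below mirrors that splitting and the kernel_prefix construction; the entry string and names in it are hypothetical, chosen only to illustrate the format.

# Hypothetical --ls-kernels entry; real entries were derived from the .grl files.
entry = "gpu/bvh_build_example.cl:example_entry:gpu/libs/libraries.grl"

cl_file, entrypoint, library_files = entry.split(":")

# --in arguments, same shape as prepended_input_args in the meson loop above.
clc_inputs = []
for f in [cl_file] + (library_files.split(",") if library_files else []):
    clc_inputs += ["--in", f]

# Per-generation kernel prefix, mirroring fs.replace_suffix() plus the replace() calls.
gen_prefix = "gfx125"
stem = cl_file.rsplit(".", 1)[0].replace("gpu/", "").replace("/", "_")
kernel_prefix = "_".join([gen_prefix, stem, entrypoint])

print(kernel_prefix)  # gfx125_bvh_build_example_example_entry

In the build rules above, that prefix also names the generated header (kernel_prefix + '.h') that intel_clc writes via -o.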

View file

@@ -39,22 +39,10 @@ idep_anv_headers = declare_dependency(
 bvh_spv = []

 if with_intel_vk_rt
-  if with_intel_bvh_grl
-    subdir('grl')
-    optional_libgrl = [libgrl]
-    anv_flags += '-DANV_SUPPORT_RT_GRL=1'
-  else
-    subdir('bvh')
-    idep_grl = null_dep
-    optional_libgrl = []
-    anv_flags += '-DANV_SUPPORT_RT_GRL=0'
-  endif
+  subdir('bvh')
   anv_flags += '-DANV_SUPPORT_RT=1'
 else
-  idep_grl = null_dep
-  optional_libgrl = []
   anv_flags += '-DANV_SUPPORT_RT=0'
-  anv_flags += '-DANV_SUPPORT_RT_GRL=0'
 endif

 intel_icd = custom_target(
@@ -111,15 +99,9 @@ anv_per_hw_ver_files = files(
   'genX_simple_shader.c',
 )
 if with_intel_vk_rt
-  if with_intel_bvh_grl
-    anv_per_hw_ver_files += files(
-      'genX_acceleration_structure_grl.c',
-    )
-  else
-    anv_per_hw_ver_files += files(
-      'genX_acceleration_structure.c',
-    )
-  endif
+  anv_per_hw_ver_files += files(
+    'genX_acceleration_structure.c',
+  )
 endif

 foreach _gfx_ver : ['90', '110', '120', '125', '200', '300']
@@ -135,7 +117,7 @@ foreach _gfx_ver : ['90', '110', '120', '125', '200', '300']
       dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml,
       idep_vulkan_util_headers, idep_vulkan_wsi_headers,
       idep_vulkan_runtime_headers, idep_mesautil,
-      idep_intel_driver_ds_headers, idep_grl,
+      idep_intel_driver_ds_headers,
       idep_intel_shaders, idep_intel_blorp,
     ],
   )
@@ -271,7 +253,7 @@ libvulkan_intel = shared_library(
   include_directories : [
     inc_include, inc_src, inc_intel,
   ],
-  link_whole : [libanv_common, libanv_per_hw_ver_libs] + optional_libgrl,
+  link_whole : [libanv_common, libanv_per_hw_ver_libs],
   link_with : [
     libisl, libintel_perf,
   ],
@@ -313,7 +295,7 @@ if with_tests
     link_with : [
       libanv_per_hw_ver_libs, libintel_common,
       libisl, libintel_perf,
-    ] + optional_libgrl,
+    ],
     dependencies : [
       dep_thread, dep_dl, dep_m, anv_deps,
       idep_nir, idep_vulkan_util, idep_vulkan_wsi, idep_vulkan_runtime,