diff --git a/meson.build b/meson.build index 277a7842c46..99145b3d530 100644 --- a/meson.build +++ b/meson.build @@ -307,29 +307,12 @@ with_any_broadcom = [ with_intel_vk_rt = get_option('intel-rt') \ .disable_auto_if(not with_intel_vk) \ - .disable_if(get_option('intel-bvh-grl') and \ - host_machine.cpu_family() != 'x86_64', \ - error_message : 'Intel Ray Tracing is only supported on x86_64') \ .allowed() -with_intel_bvh_grl = get_option('intel-bvh-grl') - -if get_option('intel-clc') != 'system' and \ - get_option('precomp-compiler') != 'system' and \ - with_intel_bvh_grl - # Require intel-clc with Anv & Iris (for internal shaders) - with_intel_clc = get_option('intel-clc') == 'enabled' or \ - get_option('precomp-compiler') == 'enabled' or \ - with_intel_bvh_grl -else - with_intel_clc = false -endif - with_any_intel = [ with_gallium_crocus, with_gallium_i915, with_gallium_iris, - with_intel_clc, with_intel_hasvk, with_intel_tools, with_intel_vk, diff --git a/meson.options b/meson.options index 5fd5102d3fb..2e61c4c544d 100644 --- a/meson.options +++ b/meson.options @@ -294,7 +294,7 @@ option( type : 'array', value : [], choices : [ - 'device-select', 'intel-nullhw', 'overlay', 'screenshot', + 'device-select', 'intel-nullhw', 'overlay', 'screenshot', 'vram-report-limit', ], description : 'List of vulkan layers to build' @@ -693,13 +693,6 @@ option( description : 'Build the intel-clc compiler or use a system version.' ) -option( - 'intel-bvh-grl', - type : 'boolean', - value : false, - description : 'Build the BVH structure using GRL.' -) - option( 'install-intel-clc', type : 'boolean', diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build index 7a9ee0ba58b..50acc601eca 100644 --- a/src/compiler/nir/meson.build +++ b/src/compiler/nir/meson.build @@ -87,7 +87,6 @@ with_nir_headers_only = ( with_gallium_rusticl, with_microsoft_clc, with_spirv_to_dxil, - with_intel_clc, with_clc, with_drivers_clc, get_option('intel-elk'), diff --git a/src/intel/compiler/intel_clc.c b/src/intel/compiler/intel_clc.c deleted file mode 100644 index 4fd1ed6ce56..00000000000 --- a/src/intel/compiler/intel_clc.c +++ /dev/null @@ -1,632 +0,0 @@ -/* - * Copyright © 2021 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "brw_compiler.h" -#include "brw_kernel.h" -#include "brw_nir.h" -#include "elk/elk_nir.h" -#include "compiler/brw_disasm.h" -#include "compiler/clc/clc.h" -#include "compiler/glsl_types.h" -#include "compiler/nir/nir_serialize.h" -#include "compiler/spirv/spirv_info.h" -#include "dev/intel_debug.h" -#include "util/build_id.h" -#include "util/disk_cache.h" -#include "util/macros.h" -#include "util/mesa-sha1.h" -#include "util/u_dynarray.h" - -#include -#include -#include -#include -#include -#include -#include - -/* Shader functions */ -#define SPIR_V_MAGIC_NUMBER 0x07230203 - -static struct disk_cache * -get_disk_cache(struct brw_compiler *compiler) -{ -#ifdef ENABLE_SHADER_CACHE - char renderer[14]; - ASSERTED int len = snprintf(renderer, sizeof(renderer), "brw_clc_%04x", - compiler->devinfo->pci_device_id); - assert(len == sizeof(renderer) - 2); - - const struct build_id_note *note = - build_id_find_nhdr_for_addr(get_disk_cache); - if (note == NULL) { - fprintf(stderr, "Failed to find build-id\n"); - abort(); - } - - unsigned build_id_len = build_id_length(note); - if (build_id_len < 20) { - fprintf(stderr, "build-id too short. It needs to be a SHA\n"); - abort(); - } - - struct mesa_sha1 sha1_ctx; - uint8_t sha1[20]; - _mesa_sha1_init(&sha1_ctx); - _mesa_sha1_update(&sha1_ctx, build_id_data(note), build_id_len); - _mesa_sha1_final(&sha1_ctx, sha1); - - char timestamp[41]; - _mesa_sha1_format(timestamp, sha1); - - const uint64_t driver_flags = brw_get_compiler_config_value(compiler); - - return disk_cache_create(renderer, timestamp, driver_flags); -#endif - return NULL; -} - -static void -compiler_log(void *data, unsigned *id, const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - if (INTEL_DEBUG(DEBUG_CS)) - vfprintf(stderr, fmt, args); - va_end(args); -} - -static void -msg_callback(void *priv, const char *msg) -{ - (void)priv; - fprintf(stderr, "%s", msg); -} - -static void -print_u32_data(FILE *fp, const char *prefix, const char *arr_name, - const uint32_t *data, size_t len) -{ - assert(len % 4 == 0); - fprintf(fp, "static const uint32_t %s_%s[] = {", prefix, arr_name); - for (unsigned i = 0; i < (len / 4); i++) { - if (i % 4 == 0) - fprintf(fp,"\n "); - - fprintf(fp, " 0x%08" PRIx32 ",", data[i]); - } - fprintf(fp, "\n};\n"); -} - -static void -print_u8_data(FILE *fp, const char *prefix, const char *arr_name, - const uint8_t *data, size_t len) -{ - fprintf(fp, "static const uint8_t %s_%s[] = {", prefix, arr_name); - for (unsigned i = 0; i < len; i++) { - if (i % 16 == 0) - fprintf(fp,"\n "); - - fprintf(fp, " 0x%02" PRIx8 ",", data[i]); - } - fprintf(fp, "\n};\n"); -} - -static const char * -reloc_type_str(enum brw_shader_reloc_type type) -{ - switch (type) { -#define CASE(e) case e: return #e; - CASE(BRW_SHADER_RELOC_TYPE_U32) - CASE(BRW_SHADER_RELOC_TYPE_MOV_IMM) -#undef CASE - default: - unreachable("Unknown relocation type"); - } -} - -static void -print_cs_prog_data_fields(FILE *fp, const char *prefix, const char *pad, - const struct brw_cs_prog_data *cs_prog_data) -{ -#define PROG_DATA_FIELD(fmt, field) \ - fprintf(fp, "%s." #field " = " fmt ",\n", pad, cs_prog_data->field) - -#define PROG_DATA_BOOL_FIELD(field) \ - fprintf(fp, "%s." #field " = %s,\n", pad, \ - cs_prog_data->field ? 
"true" : "false") - - PROG_DATA_FIELD("%u", base.nr_params); - assert(cs_prog_data->base.stage == MESA_SHADER_COMPUTE); - fprintf(fp, "%s.base.stage = MESA_SHADER_COMPUTE,\n", pad); - assert(cs_prog_data->base.zero_push_reg == 0); - assert(cs_prog_data->base.push_reg_mask_param == 0); - PROG_DATA_FIELD("%u", base.curb_read_length); - PROG_DATA_FIELD("%u", base.total_scratch); - PROG_DATA_FIELD("%u", base.total_shared); - PROG_DATA_FIELD("%u", base.program_size); - PROG_DATA_FIELD("%u", base.const_data_size); - PROG_DATA_FIELD("%u", base.const_data_offset); - PROG_DATA_FIELD("%u", base.num_relocs); - fprintf(fp, "%s.base.relocs = %s_relocs,\n", pad, prefix); - PROG_DATA_FIELD("%u", base.grf_used); - PROG_DATA_FIELD("%u", base.printf_info_count); - fprintf(fp, "%s.base.printf_info = (u_printf_info *)%s_printfs,\n", pad, prefix); - assert(!cs_prog_data->base.has_ubo_pull); - assert(cs_prog_data->base.dispatch_grf_start_reg == 0); - assert(!cs_prog_data->base.use_alt_mode); - assert(cs_prog_data->base.param == 0); - PROG_DATA_BOOL_FIELD(base.uses_atomic_load_store); - fprintf(fp, "%s.local_size = { %u, %u, %u },\n", pad, - cs_prog_data->local_size[0], - cs_prog_data->local_size[1], - cs_prog_data->local_size[2]); - fprintf(fp, "%s.prog_offset = { %u, %u, %u },\n", pad, - cs_prog_data->prog_offset[0], - cs_prog_data->prog_offset[1], - cs_prog_data->prog_offset[2]); - PROG_DATA_FIELD("%u", prog_mask); - PROG_DATA_FIELD("%u", prog_spilled); - PROG_DATA_BOOL_FIELD(uses_barrier); - PROG_DATA_BOOL_FIELD(uses_num_work_groups); - assert(!cs_prog_data->uses_inline_data); - assert(!cs_prog_data->uses_btd_stack_ids); - PROG_DATA_FIELD("%u", push.per_thread.dwords); - PROG_DATA_FIELD("%u", push.per_thread.regs); - PROG_DATA_FIELD("%u", push.per_thread.size); - PROG_DATA_FIELD("%u", push.cross_thread.dwords); - PROG_DATA_FIELD("%u", push.cross_thread.regs); - PROG_DATA_FIELD("%u", push.cross_thread.size); - -#undef PROG_DATA_FIELD -#undef PROG_DATA_BOOL_FIELD -} - -static void -print_kernel(FILE *fp, const char *prefix, - const struct brw_kernel *kernel, - const struct brw_isa_info *isa) -{ - struct mesa_sha1 sha1_ctx; - _mesa_sha1_init(&sha1_ctx); - -#define SHA1_UPDATE_VALUE(val) \ - _mesa_sha1_update(&sha1_ctx, &val, sizeof(val)) - - fprintf(fp, "#include \"intel/compiler/brw_kernel.h\"\n"); - fprintf(fp, "\n"); - - fprintf(fp, "static const struct brw_shader_reloc %s_relocs[] = {\n", - prefix); - for (unsigned i = 0; i < kernel->prog_data.base.num_relocs; i++) { - const struct brw_shader_reloc *reloc = &kernel->prog_data.base.relocs[i]; - fprintf(fp, " { %"PRIu32", %s, %"PRIu32", %"PRIu32" },\n", - reloc->id, reloc_type_str(reloc->type), - reloc->offset, reloc->delta); - } - fprintf(fp, "};\n"); - _mesa_sha1_update(&sha1_ctx, kernel->prog_data.base.relocs, - kernel->prog_data.base.num_relocs * - sizeof(kernel->prog_data.base.relocs[0])); - - fprintf(fp, "static const u_printf_info %s_printfs[] = {\n", - prefix); - for (unsigned i = 0; i < kernel->prog_data.base.printf_info_count; i++) { - const u_printf_info *printf_info = &kernel->prog_data.base.printf_info[i]; - fprintf(fp, " {\n"); - fprintf(fp, " .num_args = %"PRIu32",\n", printf_info->num_args); - fprintf(fp, " .arg_sizes = (unsigned []) {\n"); - for (unsigned a = 0; a < printf_info->num_args; a++) - fprintf(fp, " %"PRIu32",\n", printf_info->arg_sizes[a]); - fprintf(fp, " },\n"); - fprintf(fp, " .string_size = %"PRIu32",\n", printf_info->string_size); - fprintf(fp, " .strings = (char []) {"); - for (unsigned c = 0; c < printf_info->string_size; 
c++) { - if (c % 8 == 0 ) - fprintf(fp, "\n "); - fprintf(fp, "0x%02hhx, ", printf_info->strings[c]); - } - fprintf(fp, "\n },\n"); - fprintf(fp, " },\n"); - } - fprintf(fp, "};\n"); - - /* Get rid of the pointers before we hash */ - struct brw_cs_prog_data cs_prog_data = kernel->prog_data; - cs_prog_data.base.relocs = NULL; - assert(cs_prog_data.base.param == NULL); - _mesa_sha1_update(&sha1_ctx, &cs_prog_data, sizeof(cs_prog_data)); - - SHA1_UPDATE_VALUE(kernel->args_size); - SHA1_UPDATE_VALUE(kernel->arg_count); - _mesa_sha1_update(&sha1_ctx, kernel->args, - kernel->arg_count * sizeof(kernel->args[0])); - - fprintf(fp, "static const struct brw_kernel_arg_desc %s_args[] = {\n", - prefix); - for (unsigned i = 0; i < kernel->arg_count; i++) { - fprintf(fp, " { %d, %d },\n", - kernel->args[i].offset, kernel->args[i].size); - } - fprintf(fp, "};\n\n"); - - _mesa_sha1_update(&sha1_ctx, kernel->code, - kernel->prog_data.base.program_size); - - fprintf(fp, "#if 0 /* BEGIN KERNEL ASSEMBLY */\n"); - fprintf(fp, "\n"); - brw_disassemble_with_errors(isa, kernel->code, 0, NULL, fp); - fprintf(fp, "\n"); - fprintf(fp, "#endif /* END KERNEL ASSEMBLY */\n"); - print_u32_data(fp, prefix, "code", kernel->code, - kernel->prog_data.base.program_size); - - fprintf(fp, "static const struct brw_kernel %s = {\n", prefix); - fprintf(fp, " .prog_data = {\n"); - print_cs_prog_data_fields(fp, prefix, " ", &kernel->prog_data); - fprintf(fp, " },\n"); - fprintf(fp, " .args_size = %d,\n", (int)kernel->args_size); - fprintf(fp, " .arg_count = %d,\n", (int)kernel->arg_count); - fprintf(fp, " .args = %s_args,\n", prefix); - fprintf(fp, " .code = %s_code,\n", prefix); - fprintf(fp, "};\n"); - - unsigned char sha1[20]; - _mesa_sha1_final(&sha1_ctx, sha1); - char sha1_str[41]; - _mesa_sha1_format(sha1_str, sha1); - fprintf(fp, "const char *%s_sha1 = \"%s\";\n", prefix, sha1_str); -} - -static void -print_usage(char *exec_name, FILE *f) -{ - fprintf(f, -"Usage: %s [options] -- [clang args]\n" -"Options:\n" -" -h --help Print this help.\n" -" -e, --entrypoint Specify the entry-point name.\n" -" -L, --llvm17-wa Enable LLVM 17 workarounds for opaque pointers" -" -p, --platform Specify the target platform name.\n" -" --prefix Prefix for variable names in generated C code.\n" -" -o, --out Specify the output filename.\n" -" -i, --in Specify one input filename. 
Accepted multiple times.\n" -" -s, --spv Specify the output filename for spirv.\n" -" -n, --nir Specify whether to output serialized NIR instead of ISA.\n" -" -g, --gfx-version Specify the Gfx version used for NIR output.\n" -" -t, --text Specify the output filename for the parsed text\n" -" -v, --verbose Print more information during compilation.\n" -" -M, --llvm-version Print LLVM version.\n" - , exec_name); -} - -#define OPT_PREFIX 1000 - -struct intel_clc_params { - char *entry_point; - char *platform; - char *outfile; - char *spv_outfile; - char *txt_outfile; - char *prefix; - - unsigned gfx_version; - - bool output_nir; - bool print_info; - bool llvm17_wa; - - void *mem_ctx; - - struct intel_device_info devinfo; -}; - -#include "compiler/spirv/nir_spirv.h" - -static int -output_isa(const struct intel_clc_params *params, struct clc_binary *binary) -{ - struct brw_kernel kernel = {}; - char *error_str; - int ret = 0; - - struct brw_isa_info _isa, *isa = &_isa; - brw_init_isa_info(isa, ¶ms->devinfo); - - struct brw_compiler *compiler = brw_compiler_create(params->mem_ctx, - ¶ms->devinfo); - compiler->spilling_rate = 11; - compiler->shader_debug_log = compiler_log; - compiler->shader_perf_log = compiler_log; - struct disk_cache *disk_cache = get_disk_cache(compiler); - - if (!brw_kernel_from_spirv(compiler, disk_cache, &kernel, NULL, params->mem_ctx, - binary->data, binary->size, - params->entry_point, &error_str)) { - fprintf(stderr, "Compile failed: %s\n", error_str); - ret = -1; - goto exit; - } - - if (params->print_info) { - fprintf(stdout, "kernel info:\n"); - fprintf(stdout, " uses_barrier : %u\n", kernel.prog_data.uses_barrier); - fprintf(stdout, " uses_num_work_groups : %u\n", kernel.prog_data.uses_num_work_groups); - fprintf(stdout, " uses_inline_data : %u\n", kernel.prog_data.uses_inline_data); - fprintf(stdout, " local_size : %ux%ux%u\n", - kernel.prog_data.local_size[0], - kernel.prog_data.local_size[1], - kernel.prog_data.local_size[2]); - fprintf(stdout, " curb_read_length : %u\n", kernel.prog_data.base.curb_read_length); - fprintf(stdout, " total_scratch : %u\n", kernel.prog_data.base.total_scratch); - fprintf(stdout, " total_shared : %u\n", kernel.prog_data.base.total_shared); - fprintf(stdout, " program_size : %u\n", kernel.prog_data.base.program_size); - fprintf(stdout, " const_data_size : %u\n", kernel.prog_data.base.const_data_size); - fprintf(stdout, " uses_atomic_load_store : %u\n", kernel.prog_data.base.uses_atomic_load_store); - fprintf(stdout, " dispatch_grf_start_reg : %u\n", kernel.prog_data.base.dispatch_grf_start_reg); - } - - char *prefix = params->prefix; - char prefix_tmp[256]; - if (prefix == NULL) { - bool is_pt_5 = (params->devinfo.verx10 % 10) == 5; - snprintf(prefix_tmp, sizeof(prefix_tmp), "gfx%d%s_clc_%s", - params->devinfo.ver, is_pt_5 ? 
"5" : "", params->entry_point); - prefix = prefix_tmp; - } - - if (params->outfile != NULL) { - FILE *fp = fopen(params->outfile, "w"); - print_kernel(fp, prefix, &kernel, isa); - fclose(fp); - } else { - print_kernel(stdout, prefix, &kernel, isa); - } - -exit: - disk_cache_destroy(disk_cache); - return ret; -} - -static void -print_llvm_version(FILE *out) -{ - fprintf(out, "%s\n", MESA_LLVM_VERSION_STRING); -} - -int main(int argc, char **argv) -{ - int exit_code = 0; - - process_intel_debug_variable(); - - static struct option long_options[] ={ - {"help", no_argument, 0, 'h'}, - {"entrypoint", required_argument, 0, 'e'}, - {"platform", required_argument, 0, 'p'}, - {"prefix", required_argument, 0, OPT_PREFIX}, - {"in", required_argument, 0, 'i'}, - {"out", required_argument, 0, 'o'}, - {"spv", required_argument, 0, 's'}, - {"text", required_argument, 0, 't'}, - {"llvm-version", no_argument, 0, 'M'}, - {"verbose", no_argument, 0, 'v'}, - {0, 0, 0, 0} - }; - - struct intel_clc_params params = {}; - - struct util_dynarray clang_args; - struct util_dynarray input_files; - - struct clc_binary spirv_obj = {0}; - struct clc_parsed_spirv parsed_spirv_data = {0}; - struct disk_cache *disk_cache = NULL; - - params.mem_ctx = ralloc_context(NULL); - - util_dynarray_init(&clang_args, params.mem_ctx); - util_dynarray_init(&input_files, params.mem_ctx); - - int ch; - while ((ch = getopt_long(argc, argv, "he:p:s:t:i:o:Mv", long_options, NULL)) != -1) - { - switch (ch) - { - case 'h': - print_usage(argv[0], stdout); - goto end; - case 'e': - params.entry_point = optarg; - break; - case 'p': - params.platform = optarg; - break; - case 'o': - params.outfile = optarg; - break; - case 'i': - util_dynarray_append(&input_files, char *, optarg); - break; - case 's': - params.spv_outfile = optarg; - break; - case 't': - params.txt_outfile = optarg; - break; - case 'v': - params.print_info = true; - break; - case 'M': - print_llvm_version(stdout); - return EXIT_SUCCESS; - case OPT_PREFIX: - params.prefix = optarg; - break; - default: - fprintf(stderr, "Unrecognized option \"%s\".\n", optarg); - print_usage(argv[0], stderr); - goto fail; - } - } - - for (int i = optind; i < argc; i++) { - util_dynarray_append(&clang_args, char *, argv[i]); - } - - if (util_dynarray_num_elements(&input_files, char *) == 0) { - fprintf(stderr, "No input file(s).\n"); - print_usage(argv[0], stderr); - goto fail; - } - - struct clc_logger logger = { - .error = msg_callback, - .warning = msg_callback, - }; - - size_t total_size = 0; - char *all_inputs = NULL; - util_dynarray_foreach(&input_files, char *, infile) { - int fd = open(*infile, O_RDONLY); - if (fd < 0) { - fprintf(stderr, "Failed to open %s\n", *infile); - goto fail; - } - - off_t len = lseek(fd, 0, SEEK_END); - size_t new_size = total_size + len; - all_inputs = reralloc_size(params.mem_ctx, all_inputs, new_size + 1); - if (!all_inputs) { - fprintf(stderr, "Failed to allocate memory\n"); - goto fail; - } - lseek(fd, 0, SEEK_SET); - read(fd, all_inputs + total_size, len); - close(fd); - total_size = new_size; - all_inputs[total_size] = '\0'; - } - - if (params.txt_outfile) { - FILE *fp = fopen(params.txt_outfile, "w"); - fwrite(all_inputs, total_size, 1, fp); - fclose(fp); - } - - const char *allowed_spirv_extensions[] = { - "SPV_EXT_shader_atomic_float_add", - "SPV_EXT_shader_atomic_float_min_max", - "SPV_KHR_float_controls", - "SPV_INTEL_subgroups", - NULL, - }; - - struct clc_compile_args clc_args = { - .source = { - .name = "intel_clc_files", - .value = all_inputs, - }, - 
.features = { - .fp16 = true, - .intel_subgroups = true, - .subgroups = true, - .subgroups_ifp = true, - }, - .args = util_dynarray_begin(&clang_args), - .num_args = util_dynarray_num_elements(&clang_args, char *), - .allowed_spirv_extensions = allowed_spirv_extensions, - }; - - if (!clc_compile_c_to_spirv(&clc_args, &logger, &spirv_obj, NULL)) { - goto fail; - } - - if (params.spv_outfile) { - FILE *fp = fopen(params.spv_outfile, "w"); - fwrite(spirv_obj.data, spirv_obj.size, 1, fp); - fclose(fp); - } - - glsl_type_singleton_init_or_ref(); - - if (params.platform == NULL) { - fprintf(stderr, "No target platform name specified.\n"); - print_usage(argv[0], stderr); - goto fail; - } - - int pci_id = intel_device_name_to_pci_device_id(params.platform); - if (pci_id < 0) { - fprintf(stderr, "Invalid target platform name: %s\n", params.platform); - goto fail; - } - - if (!intel_get_device_info_for_build(pci_id, ¶ms.devinfo)) { - fprintf(stderr, "Failed to get device information.\n"); - goto fail; - } - - if (params.devinfo.verx10 < 125) { - fprintf(stderr, "Platform currently not supported.\n"); - goto fail; - } - - if (params.entry_point == NULL) { - fprintf(stderr, "No entry-point name specified.\n"); - print_usage(argv[0], stderr); - goto fail; - } - - if (!clc_parse_spirv(&spirv_obj, &logger, &parsed_spirv_data)) - goto fail; - - const struct clc_kernel_info *kernel_info = NULL; - for (unsigned i = 0; i < parsed_spirv_data.num_kernels; i++) { - if (strcmp(parsed_spirv_data.kernels[i].name, params.entry_point) == 0) { - kernel_info = &parsed_spirv_data.kernels[i]; - break; - } - } - if (kernel_info == NULL) { - fprintf(stderr, "Kernel entrypoint %s not found\n", params.entry_point); - goto fail; - } - - exit_code = output_isa(¶ms, &spirv_obj); - - glsl_type_singleton_decref(); - - goto end; - -fail: - exit_code = 1; - -end: - disk_cache_destroy(disk_cache); - clc_free_parsed_spirv(&parsed_spirv_data); - clc_free_spirv(&spirv_obj); - ralloc_free(params.mem_ctx); - - return exit_code; -} diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index 84751415d5e..4448a76f3f5 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -169,42 +169,6 @@ idep_intel_compiler_brw = declare_dependency( ], ) -# For now this tool is only going to be used by Anv -if with_intel_bvh_grl - if get_option('intel-clc') == 'system' or get_option('precomp-compiler') == 'system' - prog_intel_clc = find_program('intel_clc', native : true) - dep_prog_intel_clc = [] - elif with_intel_clc - prog_intel_clc = executable( - 'intel_clc', - [ - 'intel_clc.c', - 'brw_kernel.c', - - # Use just the nir_options part of ELK instead of fully linking. - 'elk/elk_nir_options.h', - 'elk/elk_nir_options.c', - 'elk/elk_spirv.c', - ], - link_with : [libisl], - include_directories : [inc_include, inc_src, inc_intel], - c_args : [pre_args, no_override_init_args], - cpp_args : ['-Werror=vla'], - link_args : [ld_args_build_id], - dependencies : [idep_nir, idep_mesaclc, idep_mesautil, idep_intel_dev, - idep_intel_compiler_brw], - # If we can run host binaries directly, just build intel_clc for the host. 
- # Most commonly this happens when doing a cross compile from an x86_64 build - # machine to an x86 host - native : not meson.can_run_host_binaries(), - install : get_option('install-intel-clc') or get_option('install-precomp-compiler'), - ) - dep_prog_intel_clc = [prog_intel_clc] - endif -else - dep_prog_intel_clc = [] -endif - if with_tests test( 'intel_compiler_brw_tests', diff --git a/src/intel/vulkan/anv_formats.c b/src/intel/vulkan/anv_formats.c index 7ebe0fb7cba..f651073a29b 100644 --- a/src/intel/vulkan/anv_formats.c +++ b/src/intel/vulkan/anv_formats.c @@ -1016,29 +1016,8 @@ get_buffer_format_features2(const struct intel_device_info *devinfo, flags |= VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT; if (devinfo->has_ray_tracing) { -#if ANV_SUPPORT_RT_GRL - switch (vk_format) { - case VK_FORMAT_R32G32_SFLOAT: - case VK_FORMAT_R32G32B32_SFLOAT: - case VK_FORMAT_R16G16_SFLOAT: - case VK_FORMAT_R16G16B16A16_SFLOAT: - case VK_FORMAT_R16G16_SNORM: - case VK_FORMAT_R16G16B16A16_SNORM: - case VK_FORMAT_R16G16B16A16_UNORM: - case VK_FORMAT_R16G16_UNORM: - case VK_FORMAT_R8G8B8A8_UNORM: - case VK_FORMAT_R8G8_UNORM: - case VK_FORMAT_R8G8B8A8_SNORM: - case VK_FORMAT_R8G8_SNORM: - flags |= VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR; - break; - default: - break; - } -#else if (vk_acceleration_struct_vtx_format_supported(vk_format)) flags |= VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR; -#endif } } } diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 75f7bfb7dea..84abe9ab286 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2415,7 +2415,7 @@ anv_queue_post_submit(struct anv_queue *queue, VkResult submit_result) result = vk_queue_set_lost(&queue->vk, "sync wait failed"); } -#if ANV_SUPPORT_RT && !ANV_SUPPORT_RT_GRL +#if ANV_SUPPORT_RT /* The recorded bvh is dumped to files upon command buffer completion */ if (INTEL_DEBUG_BVH_ANY) anv_dump_bvh_to_files(queue->device); diff --git a/src/intel/vulkan/genX_acceleration_structure_grl.c b/src/intel/vulkan/genX_acceleration_structure_grl.c deleted file mode 100644 index 33d8e2c6a5d..00000000000 --- a/src/intel/vulkan/genX_acceleration_structure_grl.c +++ /dev/null @@ -1,1274 +0,0 @@ -/* - * Copyright © 2020 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "anv_private.h" - -#include - -#include "util/u_debug.h" -#include "util/half_float.h" -#include "util/u_atomic.h" - -#include "genxml/gen_macros.h" -#include "genxml/genX_pack.h" -#include "genxml/genX_rt_pack.h" - -#include "ds/intel_tracepoints.h" - -#if GFX_VERx10 >= 125 -#include "grl/grl_structs.h" - -/* Wait for the previous dispatches to finish and flush their data port - * writes. - */ -#define ANV_GRL_FLUSH_FLAGS (ANV_PIPE_END_OF_PIPE_SYNC_BIT | \ - ANV_PIPE_DATA_CACHE_FLUSH_BIT | \ - ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) - -static const VkAccelerationStructureGeometryKHR * -get_geometry(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo, - uint32_t index) -{ - return pInfo->pGeometries ? &pInfo->pGeometries[index] : - pInfo->ppGeometries[index]; -} - -static size_t align_transient_size(size_t bytes) -{ - return align_uintptr(bytes, 64); -} - -static size_t align_private_size(size_t bytes) -{ - return align_uintptr(bytes, 64); -} - -static size_t get_scheduler_size(size_t num_builds) -{ - size_t scheduler_size = sizeof(union SchedulerUnion); - /* add more memory for qnode creation stage if needed */ - if (num_builds > QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) { - scheduler_size += (num_builds - QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) * 2 * - sizeof(struct QNodeGlobalRootBufferEntry); - } - - return align_private_size(scheduler_size); -} - -static size_t -get_batched_binnedsah_transient_mem_size(size_t num_builds) -{ - if (num_builds == 0) - return 0; - return num_builds * (sizeof(struct SAHBuildBuffersInfo) + sizeof(gpuva_t)); -} - -static size_t -get_batched_binnedsah_private_mem_size(size_t num_builds) -{ - if (num_builds == 0) - return 0; - - size_t globals_size = align_private_size(num_builds * sizeof(struct SAHBuildGlobals)); - return globals_size + get_scheduler_size(num_builds); -} - -static uint32_t -estimate_qbvh6_nodes(const uint32_t N) -{ - const uint32_t W = 6; - const uint32_t N0 = N / 2 + N % 2; // lowest level with 2 leaves per QBVH6 node - const uint32_t N1 = N0 / W + (N0 % W ? 1 : 0); // filled level - const uint32_t N2 = N0 / W + (N1 % W ? 1 : 0); // filled level - const uint32_t N3 = N0 / W + (N2 % W ? 1 : 0); // filled level - const uint32_t N4 = N3; // overestimate remaining nodes - return N0 + N1 + N2 + N3 + N4; -} - -/* Estimates the worst case number of QBVH6 nodes for a top-down BVH - * build that guarantees to produce subtree with N >= K primitives - * from which a single QBVH6 node is created. - */ -static uint32_t -estimate_qbvh6_nodes_minK(const uint32_t N, uint32_t K) -{ - const uint32_t N0 = N / K + (N % K ? 1 : 0); // lowest level of nodes with K leaves minimally - return N0 + estimate_qbvh6_nodes(N0); -} - -static size_t -estimate_qbvh6_fatleafs(const size_t P) -{ - return P; -} - -static size_t -estimate_qbvh6_nodes_worstcase(const size_t P) -{ - const size_t F = estimate_qbvh6_fatleafs(P); - - // worst-case each inner node having 5 fat-leaf children. 
- // number of inner nodes is F/5 and number of fat-leaves is F - return F + ceil(F/5.0); -} - -#define sizeof_PrimRef 32 -#define sizeof_HwInstanceLeaf (GENX(RT_BVH_INSTANCE_LEAF_length) * 4) -#define sizeof_InternalNode (GENX(RT_BVH_INTERNAL_NODE_length) * 4) -#define sizeof_Procedural (GENX(RT_BVH_PROCEDURAL_LEAF_length) * 4) -#define sizeof_Quad (GENX(RT_BVH_QUAD_LEAF_length) * 4) - -static struct MKSizeEstimate -get_gpu_size_estimate(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo, - const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos, - const uint32_t *pMaxPrimitiveCounts) -{ - uint32_t num_triangles = 0, num_aabbs = 0, num_instances = 0; - for (unsigned g = 0; g < pInfo->geometryCount; g++) { - const VkAccelerationStructureGeometryKHR *pGeometry = - get_geometry(pInfo, g); - uint32_t prim_count = pBuildRangeInfos != NULL ? - pBuildRangeInfos[g].primitiveCount : pMaxPrimitiveCounts[g]; - - switch (pGeometry->geometryType) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - num_triangles += prim_count; - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - num_aabbs += prim_count; - break; - case VK_GEOMETRY_TYPE_INSTANCES_KHR: - num_instances += prim_count; - break; - default: - unreachable("Unsupported geometry type"); - } - } - const uint32_t num_primitives = num_triangles + num_aabbs + num_instances; - - struct MKSizeEstimate est = {}; - - uint64_t size = sizeof(BVHBase); - size = align64(size, 64); - - /* Must immediately follow BVHBase because we use fixed offset to nodes. */ - est.node_data_start = size; - - switch (pInfo->type) { - case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: { - assert(num_triangles == 0 && num_aabbs == 0); - - est.numPrimitives = num_instances; - est.numPrimitivesToSplit = 0; - est.numBuildPrimitives = est.numPrimitives + est.numPrimitivesToSplit; - - est.min_primitives = est.numPrimitives; - est.max_primitives = est.numPrimitives + est.numPrimitivesToSplit; - - unsigned int sizeInnerNodes = - (unsigned int) estimate_qbvh6_nodes_worstcase(est.numBuildPrimitives) * - sizeof_InternalNode; - if (sizeInnerNodes == 0) - sizeInnerNodes = sizeof_InternalNode; - - est.max_inner_nodes = sizeInnerNodes / sizeof_InternalNode; - - size += sizeInnerNodes; - STATIC_ASSERT(sizeof_InternalNode % 64 == 0); - - est.leaf_data_start = size; - size += est.numBuildPrimitives * sizeof_HwInstanceLeaf; - STATIC_ASSERT(sizeof_HwInstanceLeaf % 64 == 0); - - est.leaf_data_size = est.numBuildPrimitives * sizeof_HwInstanceLeaf; - - break; - } - - case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: { - assert(num_instances == 0); - - /* RT: TODO */ - const float split_factor = 0.0f; - uint32_t num_prims_to_split = 0; - if (false) - num_prims_to_split = num_triangles + (double)split_factor; - - const uint32_t num_build_triangles = num_triangles + num_prims_to_split; - const uint32_t num_build_primitives = num_build_triangles + num_aabbs; - - est.numPrimitives = num_primitives; - est.numTriangles = num_triangles; - est.numProcedurals = num_aabbs; - est.numMeshes = pInfo->geometryCount; - est.numBuildPrimitives = num_build_primitives; - est.numPrimitivesToSplit = num_prims_to_split; - est.max_instance_leafs = 0; - - est.min_primitives = (size_t)(num_build_triangles * 0.5f + num_aabbs); - est.max_primitives = num_build_triangles + num_aabbs; - - size_t nodeBytes = 0; - nodeBytes += estimate_qbvh6_nodes_worstcase(num_build_triangles) * sizeof_InternalNode; - nodeBytes += estimate_qbvh6_nodes_worstcase(num_aabbs) * sizeof_InternalNode; - if (nodeBytes == 0) // for case with 0 
primitives - nodeBytes = sizeof_InternalNode; - nodeBytes = MAX2(nodeBytes, 8 * (size_t)num_build_primitives); // for primref_index0/1 buffers - - est.max_inner_nodes = nodeBytes / sizeof_InternalNode; - - size += nodeBytes; - STATIC_ASSERT(sizeof_InternalNode % 64 == 0); - - est.leaf_data_start = size; - size += num_build_triangles * sizeof_Quad; - STATIC_ASSERT(sizeof_Quad % 64 == 0); - - est.procedural_data_start = size; - size += num_aabbs * sizeof_Procedural; - STATIC_ASSERT(sizeof_Procedural % 64 == 0); - - est.leaf_data_size = num_build_triangles * sizeof_Quad + - num_aabbs * sizeof_Procedural; - - if (num_build_primitives == 0) - size += MAX2(sizeof_Quad, sizeof_Procedural); - break; - } - - default: - unreachable("Unsupported acceleration structure type"); - } - - size = align64(size, 64); - est.instance_descs_start = size; - size += sizeof(struct InstanceDesc) * num_instances; - - est.geo_meta_data_start = size; - size += sizeof(struct GeoMetaData) * pInfo->geometryCount; - size = align64(size, 64); - - assert(size == align64(size, 64)); - est.back_pointer_start = size; - - const bool alloc_backpointers = false; /* RT TODO */ - if (alloc_backpointers) { - size += est.max_inner_nodes * sizeof(uint32_t); - size = align64(size, 64); - } - - assert(size < UINT32_MAX); - est.sizeTotal = align64(size, 64); - - return est; -} - -struct scratch_layout { - gpuva_t base; - uint32_t total_size; - - gpuva_t primrefs; - gpuva_t globals; - gpuva_t leaf_index_buffers; - uint32_t leaf_index_buffer_stride; - - /* new_sah */ - gpuva_t qnode_buffer; - gpuva_t bvh2_buffer; -}; - -static size_t -get_bvh2_size(uint32_t num_primitivies) -{ - if (num_primitivies == 0) - return 0; - return sizeof(struct BVH2) + - (2 * num_primitivies - 1) * sizeof(struct BVH2Node); -} - -static struct scratch_layout -get_gpu_scratch_layout(struct anv_address base, - struct MKSizeEstimate est, - enum anv_rt_bvh_build_method build_method) -{ - struct scratch_layout scratch = { - .base = anv_address_physical(base), - }; - gpuva_t current = anv_address_physical(base); - - scratch.globals = current; - current += sizeof(struct Globals); - - scratch.primrefs = intel_canonical_address(current); - current += est.numBuildPrimitives * sizeof_PrimRef; - - scratch.leaf_index_buffers = intel_canonical_address(current); - current += est.numBuildPrimitives * sizeof(uint32_t) * 2; - scratch.leaf_index_buffer_stride = sizeof(uint32_t); - - switch (build_method) { - case ANV_BVH_BUILD_METHOD_TRIVIAL: - break; - - case ANV_BVH_BUILD_METHOD_NEW_SAH: { - size_t bvh2_size = get_bvh2_size(est.numBuildPrimitives); - if (est.leaf_data_size < bvh2_size) { - scratch.bvh2_buffer = intel_canonical_address(current); - current += bvh2_size; - } - - scratch.qnode_buffer = intel_canonical_address(current); - current += 2 * sizeof(dword) * est.max_inner_nodes; - break; - } - - default: - unreachable("invalid build"); - } - - assert((current - scratch.base) < UINT32_MAX); - scratch.total_size = current - scratch.base; - - return scratch; -} - -static void -anv_get_gpu_acceleration_structure_size( - UNUSED struct anv_device *device, - VkAccelerationStructureBuildTypeKHR buildType, - const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo, - const uint32_t* pMaxPrimitiveCounts, - VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo) -{ - - struct MKSizeEstimate est = get_gpu_size_estimate(pBuildInfo, NULL, - pMaxPrimitiveCounts); - struct scratch_layout scratch = get_gpu_scratch_layout(ANV_NULL_ADDRESS, est, - device->bvh_build_method); - - 
pSizeInfo->accelerationStructureSize = est.sizeTotal; - pSizeInfo->buildScratchSize = scratch.total_size; - pSizeInfo->updateScratchSize = scratch.total_size; /* TODO */ -} - -void -genX(GetAccelerationStructureBuildSizesKHR)( - VkDevice _device, - VkAccelerationStructureBuildTypeKHR buildType, - const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo, - const uint32_t* pMaxPrimitiveCounts, - VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo) -{ - ANV_FROM_HANDLE(anv_device, device, _device); - assert(pSizeInfo->sType == - VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR); - - VkAccelerationStructureBuildSizesInfoKHR gpu_size_info; - anv_get_gpu_acceleration_structure_size(device, buildType, pBuildInfo, - pMaxPrimitiveCounts, - &gpu_size_info); - - pSizeInfo->accelerationStructureSize = - gpu_size_info.accelerationStructureSize; - pSizeInfo->buildScratchSize = gpu_size_info.buildScratchSize; - pSizeInfo->updateScratchSize = gpu_size_info.updateScratchSize; -} - -void -genX(GetDeviceAccelerationStructureCompatibilityKHR)( - VkDevice _device, - const VkAccelerationStructureVersionInfoKHR* pVersionInfo, - VkAccelerationStructureCompatibilityKHR* pCompatibility) -{ - ANV_FROM_HANDLE(anv_device, device, _device); - - if (memcmp(pVersionInfo->pVersionData, - device->physical->rt_uuid, - sizeof(device->physical->rt_uuid)) == 0) { - *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR; - } else { - *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR; - } -} - -static inline uint8_t -vk_to_grl_GeometryFlags(VkGeometryFlagsKHR flags) -{ - uint8_t grl_flags = GEOMETRY_FLAG_NONE; - unsigned mask = flags; - while (mask) { - int i = u_bit_scan(&mask); - switch ((VkGeometryFlagBitsKHR)(1u << i)) { - case VK_GEOMETRY_OPAQUE_BIT_KHR: - grl_flags |= GEOMETRY_FLAG_OPAQUE; - break; - case VK_GEOMETRY_NO_DUPLICATE_ANY_HIT_INVOCATION_BIT_KHR: - grl_flags |= GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION; - break; - default: - unreachable("Unsupported acceleration structure build flag"); - } - } - return grl_flags; -} - -static inline IndexFormat -vk_to_grl_IndexFormat(VkIndexType type) -{ - switch (type) { - case VK_INDEX_TYPE_NONE_KHR: return INDEX_FORMAT_NONE; - case VK_INDEX_TYPE_UINT8_KHR: unreachable("No UINT8 support yet"); - case VK_INDEX_TYPE_UINT16: return INDEX_FORMAT_R16_UINT; - case VK_INDEX_TYPE_UINT32: return INDEX_FORMAT_R32_UINT; - default: - unreachable("Unsupported index type"); - } -} - -static inline VertexFormat -vk_to_grl_VertexFormat(VkFormat format) -{ - switch (format) { - case VK_FORMAT_R32G32_SFLOAT: return VERTEX_FORMAT_R32G32_FLOAT; - case VK_FORMAT_R32G32B32_SFLOAT: return VERTEX_FORMAT_R32G32B32_FLOAT; - case VK_FORMAT_R16G16_SFLOAT: return VERTEX_FORMAT_R16G16_FLOAT; - case VK_FORMAT_R16G16B16A16_SFLOAT: return VERTEX_FORMAT_R16G16B16A16_FLOAT; - case VK_FORMAT_R16G16_SNORM: return VERTEX_FORMAT_R16G16_SNORM; - case VK_FORMAT_R16G16B16A16_SNORM: return VERTEX_FORMAT_R16G16B16A16_SNORM; - case VK_FORMAT_R16G16B16A16_UNORM: return VERTEX_FORMAT_R16G16B16A16_UNORM; - case VK_FORMAT_R16G16_UNORM: return VERTEX_FORMAT_R16G16_UNORM; - /* case VK_FORMAT_R10G10B10A2_UNORM: return VERTEX_FORMAT_R10G10B10A2_UNORM; */ - case VK_FORMAT_R8G8B8A8_UNORM: return VERTEX_FORMAT_R8G8B8A8_UNORM; - case VK_FORMAT_R8G8_UNORM: return VERTEX_FORMAT_R8G8_UNORM; - case VK_FORMAT_R8G8B8A8_SNORM: return VERTEX_FORMAT_R8G8B8A8_SNORM; - case VK_FORMAT_R8G8_SNORM: return VERTEX_FORMAT_R8G8_SNORM; - default: - unreachable("Unsupported vertex 
format"); - } -} - -static struct Geo -vk_to_grl_Geo(const VkAccelerationStructureGeometryKHR *pGeometry, - uint32_t prim_count, - uint32_t transform_offset, - uint32_t primitive_offset, - uint32_t first_vertex) -{ - struct Geo geo = { - .Flags = vk_to_grl_GeometryFlags(pGeometry->flags), - }; - - switch (pGeometry->geometryType) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: { - const VkAccelerationStructureGeometryTrianglesDataKHR *vk_tri = - &pGeometry->geometry.triangles; - - geo.Type = GEOMETRY_TYPE_TRIANGLES; - - geo.Desc.Triangles.pTransformBuffer = - vk_tri->transformData.deviceAddress; - geo.Desc.Triangles.pIndexBuffer = - vk_tri->indexData.deviceAddress; - geo.Desc.Triangles.pVertexBuffer = - vk_tri->vertexData.deviceAddress; - geo.Desc.Triangles.VertexBufferByteStride = vk_tri->vertexStride; - - if (geo.Desc.Triangles.pTransformBuffer) - geo.Desc.Triangles.pTransformBuffer += transform_offset; - - if (vk_tri->indexType == VK_INDEX_TYPE_NONE_KHR) { - geo.Desc.Triangles.IndexCount = 0; - geo.Desc.Triangles.VertexCount = prim_count * 3; - geo.Desc.Triangles.IndexFormat = INDEX_FORMAT_NONE; - geo.Desc.Triangles.pVertexBuffer += primitive_offset; - } else { - geo.Desc.Triangles.IndexCount = prim_count * 3; - geo.Desc.Triangles.VertexCount = vk_tri->maxVertex; - geo.Desc.Triangles.IndexFormat = - vk_to_grl_IndexFormat(vk_tri->indexType); - geo.Desc.Triangles.pIndexBuffer += primitive_offset; - } - - geo.Desc.Triangles.VertexFormat = - vk_to_grl_VertexFormat(vk_tri->vertexFormat); - geo.Desc.Triangles.pVertexBuffer += vk_tri->vertexStride * first_vertex; - break; - } - - case VK_GEOMETRY_TYPE_AABBS_KHR: { - const VkAccelerationStructureGeometryAabbsDataKHR *vk_aabbs = - &pGeometry->geometry.aabbs; - geo.Type = GEOMETRY_TYPE_PROCEDURAL; - geo.Desc.Procedural.pAABBs_GPUVA = - vk_aabbs->data.deviceAddress + primitive_offset; - geo.Desc.Procedural.AABBByteStride = vk_aabbs->stride; - geo.Desc.Procedural.AABBCount = prim_count; - break; - } - - default: - unreachable("Invalid geometry type"); - } - - return geo; -} - -#include "grl/grl_metakernel_copy.h" -#include "grl/grl_metakernel_misc.h" -#include "grl/grl_metakernel_build_primref.h" -#include "grl/grl_metakernel_new_sah_builder.h" -#include "grl/grl_metakernel_build_leaf.h" - -struct build_state { - enum anv_rt_bvh_build_method build_method; - - struct MKSizeEstimate estimate; - struct scratch_layout scratch; - struct MKBuilderState state; - - struct anv_address bvh_addr; - - size_t geom_size_prefix_sum_buffer; - size_t transient_size; - - uint32_t leaf_type; - uint32_t leaf_size; - - uint32_t num_geometries; - uint32_t num_instances; - - uint64_t instances_addr; - bool array_of_instances_ptr; - - const VkAccelerationStructureGeometryKHR *vk_geoms; -}; - -static void -get_binnedsah_scratch_buffers(struct build_state *bs, - uint64_t *p_qnode_buffer, - uint64_t *p_primref_indices, - uint64_t *p_bvh2) -{ - if (bs->estimate.numBuildPrimitives == 0) - { - *p_bvh2 = 0; - *p_qnode_buffer = 0; - *p_primref_indices = 0; - return; - } - - size_t bvh2_size = get_bvh2_size(bs->estimate.numBuildPrimitives); - if (bs->estimate.leaf_data_size < bvh2_size) { - assert(bs->scratch.bvh2_buffer != 0); - *p_bvh2 = bs->scratch.bvh2_buffer; - } else { - *p_bvh2 = intel_canonical_address(bs->state.bvh_buffer + - bs->estimate.leaf_data_start); - } - - assert(bs->scratch.qnode_buffer != 0); - *p_qnode_buffer = bs->scratch.qnode_buffer; - - assert(bs->scratch.leaf_index_buffers != 0); - *p_primref_indices = bs->scratch.leaf_index_buffers; -} - -static void 
-write_memory(struct anv_cmd_alloc alloc, size_t offset, const void *data, size_t data_len) -{ - assert((offset + data_len) < alloc.size); - memcpy(alloc.map + offset, data, data_len); -} - -static void -cmd_build_acceleration_structures( - struct anv_cmd_buffer *cmd_buffer, - uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, - const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, - const VkDeviceAddress *pIndirectDeviceAddresses, - const uint32_t *pIndirectStrides, - const uint32_t *const *ppMaxPrimitiveCounts) -{ - struct anv_device *device = cmd_buffer->device; - VK_MULTIALLOC(ma); - - struct build_state *builds; - vk_multialloc_add(&ma, &builds, struct build_state, infoCount); - - if (!vk_multialloc_zalloc(&ma, - &cmd_buffer->device->vk.alloc, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)) { - anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); - return; - } - - trace_intel_begin_as_build(&cmd_buffer->trace); - - /* TODO: Indirect */ - assert(ppBuildRangeInfos != NULL); - - size_t transient_mem_init_globals_size = 0; - size_t transient_mem_init_globals_offset = 0; - - size_t transient_total = 0; - - size_t private_mem_total = 0; - - size_t num_trivial_builds = 0; - size_t num_new_sah_builds = 0; - - /* Prepare a bunch of data for the kernels we have to run. */ - for (uint32_t i = 0; i < infoCount; i++) { - struct build_state *bs = &builds[i]; - - const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i]; - struct anv_address scratch_addr = - anv_address_from_u64(pInfo->scratchData.deviceAddress); - - const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos = - ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL; - const uint32_t *pMaxPrimitiveCounts = - ppMaxPrimitiveCounts ? 
ppMaxPrimitiveCounts[i] : NULL; - - ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, - pInfo->dstAccelerationStructure); - - bs->build_method = device->bvh_build_method; - - bs->bvh_addr = anv_address_from_u64(vk_acceleration_structure_get_va(dst_accel)); - - bs->estimate = get_gpu_size_estimate(pInfo, pBuildRangeInfos, - pMaxPrimitiveCounts); - bs->scratch = get_gpu_scratch_layout(scratch_addr, bs->estimate, - bs->build_method); - - uint32_t leaf_size, leaf_type; - - switch (pInfo->type) { - case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: { - assert(pInfo->geometryCount == 1); - - const VkAccelerationStructureGeometryKHR *pGeometry = - get_geometry(pInfo, 0); - assert(pGeometry->geometryType == VK_GEOMETRY_TYPE_INSTANCES_KHR); - - const VkAccelerationStructureGeometryInstancesDataKHR *instances = - &pGeometry->geometry.instances; - - bs->num_instances = pBuildRangeInfos[0].primitiveCount; - bs->instances_addr = instances->data.deviceAddress; - bs->array_of_instances_ptr = instances->arrayOfPointers; - leaf_type = NODE_TYPE_INSTANCE; - leaf_size = GENX(RT_BVH_INSTANCE_LEAF_length) * 4; - break; - } - - case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: { - bs->num_geometries = pInfo->geometryCount; - leaf_type = NODE_TYPE_QUAD; - leaf_size = GENX(RT_BVH_QUAD_LEAF_length) * 4; - break; - } - - default: - unreachable("Unsupported acceleration structure type"); - } - - size_t geom_struct_size = bs->num_geometries * sizeof(struct Geo); - size_t geom_prefix_sum_size = align_uintptr(sizeof(uint32_t) * (bs->num_geometries + 1), 64); - - bs->transient_size = geom_prefix_sum_size + geom_struct_size; - - bs->geom_size_prefix_sum_buffer = transient_total + 0; - - bs->state = (struct MKBuilderState) { - .geomDesc_buffer = bs->geom_size_prefix_sum_buffer + - geom_prefix_sum_size, - .build_primref_buffer = bs->scratch.primrefs, - .build_globals = bs->scratch.globals, - .bvh_buffer = anv_address_physical(bs->bvh_addr), - .leaf_type = leaf_type, - .leaf_size = leaf_size, - }; - - transient_total += bs->transient_size; - - switch (device->bvh_build_method) { - case ANV_BVH_BUILD_METHOD_TRIVIAL: - num_trivial_builds++; - break; - case ANV_BVH_BUILD_METHOD_NEW_SAH: - num_new_sah_builds++; - break; - default: - unreachable("invalid BVH build method"); - } - - transient_mem_init_globals_size += sizeof(struct BatchedInitGlobalsData); - } - - transient_total = align_transient_size(transient_total); - transient_mem_init_globals_offset = transient_total; - transient_total += align_transient_size(transient_mem_init_globals_size); - - size_t transient_mem_binnedsah_size = 0; - size_t transient_mem_binnedsah_offset = 0; - size_t private_mem_binnedsah_size = 0; - size_t private_mem_binnedsah_offset = 0; - - transient_mem_binnedsah_size = get_batched_binnedsah_transient_mem_size(num_new_sah_builds); - transient_mem_binnedsah_offset = transient_total; - transient_total += align_transient_size(transient_mem_binnedsah_size); - - private_mem_binnedsah_size = get_batched_binnedsah_private_mem_size(num_new_sah_builds); - private_mem_binnedsah_offset = private_mem_total; - private_mem_total += align_private_size(private_mem_binnedsah_size); - - /* Allocate required memory, unless we already have a suiteable buffer */ - struct anv_cmd_alloc private_mem_alloc; - if (private_mem_total > cmd_buffer->state.rt.build_priv_mem_size) { - private_mem_alloc = - anv_cmd_buffer_alloc_space(cmd_buffer, private_mem_total, 64, - false /* mapped */); - if (anv_cmd_alloc_is_empty(private_mem_alloc)) { - 
anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY); - goto error; - } - - cmd_buffer->state.rt.build_priv_mem_addr = private_mem_alloc.address; - cmd_buffer->state.rt.build_priv_mem_size = private_mem_alloc.size; - } else { - private_mem_alloc = (struct anv_cmd_alloc) { - .address = cmd_buffer->state.rt.build_priv_mem_addr, - .map = anv_address_map(cmd_buffer->state.rt.build_priv_mem_addr), - .size = cmd_buffer->state.rt.build_priv_mem_size, - }; - } - - struct anv_cmd_alloc transient_mem_alloc = - anv_cmd_buffer_alloc_space(cmd_buffer, transient_total, 64, - true /* mapped */); - if (transient_total > 0 && anv_cmd_alloc_is_empty(transient_mem_alloc)) { - anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY); - goto error; - } - - uint64_t private_base = anv_address_physical(private_mem_alloc.address); - uint64_t transient_base = anv_address_physical(transient_mem_alloc.address); - - /* Prepare transient memory */ - for (uint32_t i = 0; i < infoCount; i++) { - struct build_state *bs = &builds[i]; - - const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i]; - - const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos = - ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL; - - struct Geo *geos = transient_mem_alloc.map + bs->state.geomDesc_buffer; - uint32_t *prefixes = transient_mem_alloc.map + bs->geom_size_prefix_sum_buffer; - uint32_t prefix_sum = 0; - for (unsigned g = 0; g < bs->num_geometries; g++) { - const VkAccelerationStructureGeometryKHR *pGeometry = get_geometry(pInfo, g); - uint32_t prim_count = pBuildRangeInfos[g].primitiveCount; - geos[g] = vk_to_grl_Geo(pGeometry, prim_count, - pBuildRangeInfos[g].transformOffset, - pBuildRangeInfos[g].primitiveOffset, - pBuildRangeInfos[g].firstVertex); - - prefixes[g] = prefix_sum; - prefix_sum += prim_count; - } - - prefixes[bs->num_geometries] = prefix_sum; - - bs->geom_size_prefix_sum_buffer = - intel_canonical_address(bs->geom_size_prefix_sum_buffer + - transient_base); - bs->state.geomDesc_buffer = - intel_canonical_address(bs->state.geomDesc_buffer + - transient_base); - - struct BatchedInitGlobalsData data = { - .p_build_globals = bs->scratch.globals, - .p_bvh_buffer = anv_address_physical(bs->bvh_addr), - - .numPrimitives = 0, - .numGeometries = bs->num_geometries, - .numInstances = bs->num_instances, - - .instance_descs_start = bs->estimate.instance_descs_start, - .geo_meta_data_start = bs->estimate.geo_meta_data_start, - .node_data_start = bs->estimate.node_data_start, - .leaf_data_start = bs->estimate.leaf_data_start, - .procedural_data_start = bs->estimate.procedural_data_start, - .back_pointer_start = bs->estimate.back_pointer_start, - .sizeTotal = bs->estimate.sizeTotal, - - .leafType = bs->state.leaf_type, - .leafSize = bs->state.leaf_size, - }; - - write_memory(transient_mem_alloc, - transient_mem_init_globals_offset + i * sizeof(data), - &data, sizeof(data)); - } - - genX(flush_pipeline_select_gpgpu)(cmd_buffer); - - /* Due to the nature of GRL and its heavy use of jumps/predication, we - * cannot tell exactly in what order the CFE_STATE we insert are going to - * be executed. So always use the largest possible size. 
- */ - genX(cmd_buffer_ensure_cfe_state)( - cmd_buffer, - cmd_buffer->device->physical->max_grl_scratch_size); - - /* Round 1 : init_globals kernel */ - genX(grl_misc_batched_init_globals)( - cmd_buffer, - intel_canonical_address(transient_base + - transient_mem_init_globals_offset), - infoCount); - - anv_add_pending_pipe_bits(cmd_buffer, - ANV_GRL_FLUSH_FLAGS, - "building accel struct"); - - /* Round 2 : Copy instance/geometry data from the application provided - * buffers into the acceleration structures. - */ - for (uint32_t i = 0; i < infoCount; i++) { - struct build_state *bs = &builds[i]; - - /* Metadata */ - if (bs->num_instances) { - assert(bs->num_geometries == 0); - - const uint64_t copy_size = bs->num_instances * sizeof(InstanceDesc); - /* This must be calculated in same way as - * groupCountForGeoMetaDataCopySize - */ - const uint32_t num_threads = (copy_size >> 8) + 3; - - if (bs->array_of_instances_ptr) { - genX(grl_misc_copy_instance_ptrs)( - cmd_buffer, - anv_address_physical(anv_address_add(bs->bvh_addr, - bs->estimate.instance_descs_start)), - bs->instances_addr, - copy_size, num_threads); - } else { - genX(grl_misc_copy_instances)( - cmd_buffer, - anv_address_physical(anv_address_add(bs->bvh_addr, - bs->estimate.instance_descs_start)), - bs->instances_addr, - copy_size, num_threads); - } - } - - if (bs->num_geometries) { - assert(bs->num_instances == 0); - const uint64_t copy_size = bs->num_geometries * sizeof(struct GeoMetaData); - - /* This must be calculated in same way as - * groupCountForGeoMetaDataCopySize - */ - const uint32_t num_threads = (copy_size >> 6) + 1; - - genX(grl_misc_copy_geo_meta_data)( - cmd_buffer, - anv_address_physical(anv_address_add(bs->bvh_addr, - bs->estimate.geo_meta_data_start)), - bs->state.geomDesc_buffer, - copy_size, - num_threads); - } - - /* Primrefs */ - if (bs->num_instances) { - if (bs->array_of_instances_ptr) { - genX(grl_build_primref_buildPrimirefsFromInstancesArrOfPtrs)( - cmd_buffer, - bs->instances_addr, - PREFIX_MK_SIZE(grl_build_primref, bs->estimate), - PREFIX_MK_STATE(grl_build_primref, bs->state), - false /* allowUpdate */); - } else { - genX(grl_build_primref_buildPrimirefsFromInstances)( - cmd_buffer, - bs->instances_addr, - PREFIX_MK_SIZE(grl_build_primref, bs->estimate), - PREFIX_MK_STATE(grl_build_primref, bs->state), - false /* allowUpdate */); - } - } - - if (bs->num_geometries) { - const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i]; - const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos = - ppBuildRangeInfos ? 
ppBuildRangeInfos[i] : NULL; - - assert(pInfo->geometryCount == bs->num_geometries); - for (unsigned g = 0; g < pInfo->geometryCount; g++) { - const VkAccelerationStructureGeometryKHR *pGeometry = - get_geometry(pInfo, g); - - switch (pGeometry->geometryType) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - genX(grl_build_primref_primrefs_from_tris)( - cmd_buffer, - PREFIX_MK_STATE(grl_build_primref, bs->state), - PREFIX_MK_SIZE(grl_build_primref, bs->estimate), - bs->state.geomDesc_buffer + g * sizeof(struct Geo), - g, - vk_to_grl_GeometryFlags(pGeometry->flags), - /* TODO: Indirect */ - pBuildRangeInfos[g].primitiveCount); - break; - - case VK_GEOMETRY_TYPE_AABBS_KHR: - genX(grl_build_primref_primrefs_from_proc)( - cmd_buffer, - PREFIX_MK_STATE(grl_build_primref, bs->state), - PREFIX_MK_SIZE(grl_build_primref, bs->estimate), - bs->state.geomDesc_buffer + g * sizeof(struct Geo), - g, - vk_to_grl_GeometryFlags(pGeometry->flags), - /* TODO: Indirect */ - pBuildRangeInfos[g].primitiveCount); - break; - - default: - unreachable("Invalid geometry type"); - } - } - } - } - - anv_add_pending_pipe_bits(cmd_buffer, - ANV_GRL_FLUSH_FLAGS, - "building accel struct"); - - /* Dispatch trivial builds */ - if (num_trivial_builds) { - for (uint32_t i = 0; i < infoCount; i++) { - struct build_state *bs = &builds[i]; - - if (bs->build_method != ANV_BVH_BUILD_METHOD_TRIVIAL) - continue; - - genX(grl_new_sah_builder_single_pass_binsah)( - cmd_buffer, - bs->scratch.globals, - bs->state.bvh_buffer, - bs->state.build_primref_buffer, - bs->scratch.leaf_index_buffers, - false /* alloc_backpointers */); - } - } - - /* Dispatch new SAH builds */ - if (num_new_sah_builds) { - size_t global_ptrs_offset = transient_mem_binnedsah_offset; - size_t buffers_info_offset = transient_mem_binnedsah_offset + sizeof(gpuva_t) * num_new_sah_builds; - - size_t scheduler_offset = private_mem_binnedsah_offset; - size_t sah_globals_offset = private_mem_binnedsah_offset + get_scheduler_size(num_new_sah_builds); - - struct SAHBuildArgsBatchable args = { - .num_builds = infoCount, - .p_globals_ptrs = intel_canonical_address(transient_base + global_ptrs_offset), - .p_buffers_info = intel_canonical_address(transient_base + buffers_info_offset), - .p_scheduler = intel_canonical_address(private_base + scheduler_offset), - .p_sah_globals = intel_canonical_address(private_base + sah_globals_offset), - .num_max_qnode_global_root_buffer_entries = MAX2(num_new_sah_builds, QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM), - }; - - for (uint32_t i = 0; i < infoCount; i++) { - struct build_state *bs = &builds[i]; - - if (bs->build_method != ANV_BVH_BUILD_METHOD_NEW_SAH) - continue; - - uint64_t p_build_primref_index_buffers; - uint64_t p_bvh2; - uint64_t p_qnode_child_buffer; - - get_binnedsah_scratch_buffers(bs, - &p_qnode_child_buffer, - &p_build_primref_index_buffers, - &p_bvh2); - - struct SAHBuildBuffersInfo buffers = { - .p_primref_index_buffers = bs->scratch.leaf_index_buffers, - .p_bvh_base = bs->state.bvh_buffer, - .p_primrefs_buffer = bs->state.build_primref_buffer, - .p_bvh2 = p_bvh2, - .p_qnode_root_buffer = p_qnode_child_buffer, - .sah_globals_flags = 0, - }; - - write_memory(transient_mem_alloc, buffers_info_offset, &buffers, sizeof(buffers)); - buffers_info_offset += sizeof(buffers); - - write_memory(transient_mem_alloc, global_ptrs_offset, &bs->state.build_globals, - sizeof(bs->state.build_globals)); - global_ptrs_offset += sizeof(bs->state.build_globals); - } - - genX(grl_new_sah_builder_new_sah_build_batchable)( - cmd_buffer, 
PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(grl_new_sah_builder, args)); - } - - if (num_new_sah_builds == 0) - anv_add_pending_pipe_bits(cmd_buffer, - ANV_GRL_FLUSH_FLAGS, - "building accel struct"); - - /* Finally write the leaves. */ - for (uint32_t i = 0; i < infoCount; i++) { - struct build_state *bs = &builds[i]; - - if (bs->num_instances) { - assert(bs->num_geometries == 0); - if (bs->array_of_instances_ptr) { - genX(grl_leaf_builder_buildLeafDXR_instances_pointers)(cmd_buffer, - PREFIX_MK_STATE(grl_leaf_builder, bs->state), - bs->scratch.leaf_index_buffers, - bs->instances_addr, - bs->scratch.leaf_index_buffer_stride, - 0 /* offset */, - bs->estimate.numBuildPrimitives); - } else { - genX(grl_leaf_builder_buildLeafDXR_instances)(cmd_buffer, - PREFIX_MK_STATE(grl_leaf_builder, bs->state), - bs->scratch.leaf_index_buffers, - bs->instances_addr, - bs->scratch.leaf_index_buffer_stride, - 0 /* offset */, - bs->estimate.numBuildPrimitives); - } - } - - if (bs->num_geometries) { - assert(bs->num_instances == 0); - const uint64_t p_numPrimitives = - bs->state.build_globals + offsetof(struct Globals, numPrimitives); - - assert(bs->estimate.numProcedurals == 0 || - bs->estimate.numTriangles == 0); - if (bs->estimate.numProcedurals) { - genX(grl_leaf_builder_buildLeafDXR_procedurals)( - cmd_buffer, - PREFIX_MK_STATE(grl_leaf_builder, bs->state), - bs->scratch.leaf_index_buffers, - bs->scratch.leaf_index_buffer_stride, - 0 /* offset */, - p_numPrimitives); - } else { - genX(grl_leaf_builder_buildLeafDXR_quads)( - cmd_buffer, - PREFIX_MK_STATE(grl_leaf_builder, bs->state), - bs->scratch.leaf_index_buffers, - bs->scratch.leaf_index_buffer_stride, - 0 /* offset */, - p_numPrimitives, - false /* allow_updates */); - } - } - } - - anv_add_pending_pipe_bits(cmd_buffer, - ANV_GRL_FLUSH_FLAGS, - "building accel struct"); - - trace_intel_end_as_build(&cmd_buffer->trace); - - error: - vk_free(&cmd_buffer->device->vk.alloc, builds); -} - -void -genX(CmdBuildAccelerationStructuresKHR)( - VkCommandBuffer commandBuffer, - uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, - const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) -{ - ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - - if (anv_batch_has_error(&cmd_buffer->batch)) - return; - - cmd_build_acceleration_structures(cmd_buffer, infoCount, pInfos, - ppBuildRangeInfos, NULL, NULL, NULL); -} - -void -genX(CmdCopyAccelerationStructureKHR)( - VkCommandBuffer commandBuffer, - const VkCopyAccelerationStructureInfoKHR* pInfo) -{ - ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src); - ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst); - - assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR || - pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR); - - if (pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR) { - uint64_t src_size_addr = - vk_acceleration_structure_get_va(src_accel) + - offsetof(struct BVHBase, Meta.allocationSize); - genX(grl_copy_clone_indirect)( - cmd_buffer, - vk_acceleration_structure_get_va(dst_accel), - vk_acceleration_structure_get_va(src_accel), - src_size_addr); - } else { - genX(grl_copy_compact)( - cmd_buffer, - vk_acceleration_structure_get_va(dst_accel), - vk_acceleration_structure_get_va(src_accel)); - } - - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_END_OF_PIPE_SYNC_BIT, - "after copy acceleration struct"); -} - -void 
-genX(CmdCopyAccelerationStructureToMemoryKHR)( - VkCommandBuffer commandBuffer, - const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) -{ - ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src); - struct anv_device *device = cmd_buffer->device; - uint64_t src_size_addr = - vk_acceleration_structure_get_va(src_accel) + - offsetof(struct BVHBase, Meta.allocationSize); - - assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR); - - genX(grl_copy_serialize_indirect)( - cmd_buffer, - pInfo->dst.deviceAddress, - vk_acceleration_structure_get_va(src_accel), - anv_address_physical(device->rt_uuid_addr), - src_size_addr); - - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_END_OF_PIPE_SYNC_BIT, - "after copy acceleration struct"); -} - -void -genX(CmdCopyMemoryToAccelerationStructureKHR)( - VkCommandBuffer commandBuffer, - const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) -{ - ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst); - - assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR); - - uint64_t src_size_addr = pInfo->src.deviceAddress + - offsetof(struct SerializationHeader, DeserializedSizeInBytes); - genX(grl_copy_deserialize_indirect)( - cmd_buffer, - vk_acceleration_structure_get_va(dst_accel), - pInfo->src.deviceAddress, - src_size_addr); - - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_END_OF_PIPE_SYNC_BIT, - "after copy acceleration struct"); -} - -/* TODO: Host commands */ - -VkResult -genX(BuildAccelerationStructuresKHR)( - VkDevice _device, - VkDeferredOperationKHR deferredOperation, - uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, - const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) -{ - ANV_FROM_HANDLE(anv_device, device, _device); - unreachable("Unimplemented"); - return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); -} - -VkResult -genX(CopyAccelerationStructureKHR)( - VkDevice _device, - VkDeferredOperationKHR deferredOperation, - const VkCopyAccelerationStructureInfoKHR* pInfo) -{ - ANV_FROM_HANDLE(anv_device, device, _device); - unreachable("Unimplemented"); - return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); -} - -VkResult -genX(CopyAccelerationStructureToMemoryKHR)( - VkDevice _device, - VkDeferredOperationKHR deferredOperation, - const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) -{ - ANV_FROM_HANDLE(anv_device, device, _device); - unreachable("Unimplemented"); - return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); -} - -VkResult -genX(CopyMemoryToAccelerationStructureKHR)( - VkDevice _device, - VkDeferredOperationKHR deferredOperation, - const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) -{ - ANV_FROM_HANDLE(anv_device, device, _device); - unreachable("Unimplemented"); - return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); -} - -VkResult -genX(WriteAccelerationStructuresPropertiesKHR)( - VkDevice _device, - uint32_t accelerationStructureCount, - const VkAccelerationStructureKHR* pAccelerationStructures, - VkQueryType queryType, - size_t dataSize, - void* pData, - size_t stride) -{ - ANV_FROM_HANDLE(anv_device, device, _device); - unreachable("Unimplemented"); - return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); -} -#endif /* GFX_VERx10 >= 125 */ diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c index ea935c9f8c9..3cf9563c45f 100644 --- 
a/src/intel/vulkan/genX_init_state.c +++ b/src/intel/vulkan/genX_init_state.c @@ -31,10 +31,6 @@ #include "vk_standard_sample_locations.h" -#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT_GRL -#include "grl/genX_grl.h" -#endif - #include "genX_mi_builder.h" #include "vk_util.h" @@ -895,13 +891,8 @@ genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice) assert(pdevice->info.verx10 == GFX_VERx10); #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT -#if ANV_SUPPORT_RT_GRL - genX(grl_load_rt_uuid)(pdevice->rt_uuid); - pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)(); -#else STATIC_ASSERT(sizeof(ANV_RT_UUID_MACRO) == VK_UUID_SIZE); memcpy(pdevice->rt_uuid, ANV_RT_UUID_MACRO, VK_UUID_SIZE); -#endif #endif pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp); diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 1652c491b4e..a90481adc88 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -2040,12 +2040,7 @@ void genX(CmdCopyQueryPoolResults)( #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT -#if ANV_SUPPORT_RT_GRL -#include "grl/include/GRLRTASCommon.h" -#include "grl/grl_metakernel_postbuild_info.h" -#else #include "bvh/anv_bvh.h" -#endif void genX(CmdWriteAccelerationStructuresPropertiesKHR)( @@ -2064,66 +2059,19 @@ genX(CmdWriteAccelerationStructuresPropertiesKHR)( ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); -#if !ANV_SUPPORT_RT_GRL anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_END_OF_PIPE_SYNC_BIT | ANV_PIPE_DATA_CACHE_FLUSH_BIT, "read BVH data using CS"); -#endif if (append_query_clear_flush( cmd_buffer, pool, - "CmdWriteAccelerationStructuresPropertiesKHR flush query clears") || - !ANV_SUPPORT_RT_GRL) + "CmdWriteAccelerationStructuresPropertiesKHR flush query clears")) genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); struct mi_builder b; mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); -#if ANV_SUPPORT_RT_GRL - for (uint32_t i = 0; i < accelerationStructureCount; i++) { - ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]); - struct anv_address query_addr = - anv_address_add(anv_query_address(pool, firstQuery + i), 8); - - switch (queryType) { - case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: - genX(grl_postbuild_info_compacted_size)(cmd_buffer, - vk_acceleration_structure_get_va(accel), - anv_address_physical(query_addr)); - break; - - case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: - genX(grl_postbuild_info_current_size)(cmd_buffer, - vk_acceleration_structure_get_va(accel), - anv_address_physical(query_addr)); - break; - - case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: - case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: - genX(grl_postbuild_info_serialized_size)(cmd_buffer, - vk_acceleration_structure_get_va(accel), - anv_address_physical(query_addr)); - break; - - default: - unreachable("unhandled query type"); - } - } - - /* TODO: Figure out why MTL needs ANV_PIPE_DATA_CACHE_FLUSH_BIT in order - * to not lose the availability bit. 
- */ - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_END_OF_PIPE_SYNC_BIT | - ANV_PIPE_DATA_CACHE_FLUSH_BIT, - "after write acceleration struct props"); - genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); - - for (uint32_t i = 0; i < accelerationStructureCount; i++) - emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true); - -#else for (uint32_t i = 0; i < accelerationStructureCount; i++) { ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]); struct anv_address query_addr = @@ -2163,6 +2111,5 @@ genX(CmdWriteAccelerationStructuresPropertiesKHR)( mi_builder_set_write_check(&b1, (i == (accelerationStructureCount - 1))); emit_query_mi_availability(&b1, anv_query_address(pool, firstQuery + i), true); } -#endif /* ANV_SUPPORT_RT_GRL */ } #endif /* GFX_VERx10 >= 125 && ANV_SUPPORT_RT */ diff --git a/src/intel/vulkan/grl/.gitignore b/src/intel/vulkan/grl/.gitignore deleted file mode 100644 index e2850ca03b1..00000000000 --- a/src/intel/vulkan/grl/.gitignore +++ /dev/null @@ -1 +0,0 @@ -parsetab.py diff --git a/src/intel/vulkan/grl/genX_grl.h b/src/intel/vulkan/grl/genX_grl.h deleted file mode 100644 index 57aefa72de0..00000000000 --- a/src/intel/vulkan/grl/genX_grl.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright © 2021 Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#ifndef ANV_GRL_H -#define ANV_GRL_H - -#include "grl/grl_cl_kernel.h" -#include "genxml/gen_macros.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct anv_cmd_buffer; -struct anv_kernel_arg; - -void -genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer, - enum grl_cl_kernel kernel, - const uint32_t *global_size, - uint32_t arg_count, - const struct anv_kernel_arg *args); - -void -genX(grl_load_rt_uuid)(uint8_t *out_uuid); - -uint32_t -genX(grl_max_scratch_size)(void); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* ANV_GRL_H */ diff --git a/src/intel/vulkan/grl/genX_grl_dispatch.c b/src/intel/vulkan/grl/genX_grl_dispatch.c deleted file mode 100644 index aeb76b79bd0..00000000000 --- a/src/intel/vulkan/grl/genX_grl_dispatch.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright © 2021 Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "anv_private.h" -#include "genX_grl.h" - -static struct anv_shader_bin * -get_shader_bin(struct anv_device *device, - enum grl_cl_kernel kernel) -{ - const char *key = genX(grl_get_cl_kernel_sha1)(kernel); - int key_len = strlen(key); - - bool cache_hit = false; - struct anv_shader_bin *bin = - anv_device_search_for_kernel(device, device->internal_cache, - key, key_len, &cache_hit); - if (bin != NULL) - return bin; - - uint32_t dummy_param[32]; - struct brw_kernel kernel_data; - genX(grl_get_cl_kernel)(&kernel_data, kernel); - - assert(kernel_data.prog_data.base.nr_params <= ARRAY_SIZE(dummy_param)); - kernel_data.prog_data.base.param = dummy_param; - - struct anv_push_descriptor_info empty_push_desc_info = {}; - struct anv_pipeline_bind_map bind_map = { - .kernel_args_size = kernel_data.args_size, - .kernel_arg_count = kernel_data.arg_count, - .kernel_args = (struct brw_kernel_arg_desc *)kernel_data.args, - }; - - struct anv_shader_upload_params upload_params = { - .stage = MESA_SHADER_KERNEL, - .key_data = key, - .key_size = key_len, - .kernel_data = kernel_data.code, - .kernel_size = kernel_data.prog_data.base.program_size, - .prog_data = &kernel_data.prog_data.base, - .prog_data_size = sizeof(kernel_data.prog_data), - .bind_map = &bind_map, - .push_desc_info = &empty_push_desc_info, - }; - - bin = anv_device_upload_kernel(device, device->internal_cache, - &upload_params); - - /* The cache already has a reference and it's not going anywhere so there - * is no need to hold a second reference. 
- */ - anv_shader_bin_unref(device, bin); - - return bin; -} - -void -genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer, - enum grl_cl_kernel kernel, - const uint32_t *global_size, - uint32_t arg_count, - const struct anv_kernel_arg *args) -{ - struct anv_device *device = cmd_buffer->device; - - const struct intel_l3_weights w = - intel_get_default_l3_weights(device->info, true, true); - - struct anv_kernel ak = { - .bin = get_shader_bin(device, kernel), - .l3_config = intel_get_l3_config(device->info, w), - }; - - genX(cmd_buffer_dispatch_kernel)(cmd_buffer, &ak, global_size, - arg_count, args); -} - -uint32_t -genX(grl_max_scratch_size)(void) -{ - uint32_t scratch_size = 0; - - for (uint32_t i = 0; i < GRL_CL_KERNEL_MAX; i++) { - struct brw_kernel kernel_data; - genX(grl_get_cl_kernel)(&kernel_data, i); - - scratch_size = MAX2(kernel_data.prog_data.base.total_scratch, - scratch_size); - } - - return scratch_size; -} diff --git a/src/intel/vulkan/grl/genX_grl_uuid.cpp b/src/intel/vulkan/grl/genX_grl_uuid.cpp deleted file mode 100644 index cf6b425fe2b..00000000000 --- a/src/intel/vulkan/grl/genX_grl_uuid.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright © 2021 Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include -#include - -#include "genX_grl.h" -#include "include/GRLGen12.h" - -#include "vulkan/vulkan_core.h" - -extern "C" void -genX(grl_load_rt_uuid)(uint8_t *out_uuid); - -extern "C" void -genX(grl_load_rt_uuid)(uint8_t *out_uuid) -{ - assert(sizeof(GRL::RTAS::GEN12::BVH_MAGIC) == VK_UUID_SIZE); - memcpy(out_uuid, GRL::RTAS::GEN12::BVH_MAGIC, VK_UUID_SIZE); -} diff --git a/src/intel/vulkan/grl/gpu/AABB.h b/src/intel/vulkan/grl/gpu/AABB.h deleted file mode 100644 index 11d848e3c09..00000000000 --- a/src/intel/vulkan/grl/gpu/AABB.h +++ /dev/null @@ -1,450 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "shared.h" -#include "intrinsics.h" -#ifndef __OPENCL_VERSION__ -#include "stdio.h" -#endif - -GRL_NAMESPACE_BEGIN(GRL) -GRL_NAMESPACE_BEGIN(RTAS) -/* ====== QUAD ENCODING config ====== */ - -#define QUAD_GEOMID_BITS 27 // dxr limit is 2^24 geos... 
we have headroom -#define QUAD_PRIMID_DIFF_BITS (32 - QUAD_GEOMID_BITS) -#define QUAD_GEOMID_MASK ((1<lower = (float4)(INFINITY, INFINITY, INFINITY, 0); - aabb->upper = -(float4)(INFINITY, INFINITY, INFINITY, 0); -} - -GRL_INLINE uint PRIMREF_geomID( PrimRef* aabb) -{ - const uint v = as_uint(aabb->lower.w); - return v & QUAD_GEOMID_MASK; -} - -GRL_INLINE uint PRIMREF_primID0( PrimRef* aabb) -{ - return as_uint( aabb->upper.w ) & QUAD_PRIMID_MASK; -} - -GRL_INLINE uint PRIMREF_primID1( PrimRef* aabb) -{ - const uint v = as_uint(aabb->lower.w); - const uint primID0 = as_uint(aabb->upper.w) & QUAD_PRIMID_MASK; - const uint deltaID = v >> QUAD_GEOMID_BITS; - const uint primID1 = primID0 + deltaID; - return primID1; -} - -GRL_INLINE uint PRIMREF_geomFlags( PrimRef* aabb ) -{ - const uint v = as_uint( aabb->upper.w ); - return (v >> QUAD_PRIMID_BITS) ; -} - -GRL_INLINE uint PRIMREF_instanceIndex( PrimRef* aabb ) -{ - return as_uint(aabb->lower.w) & INSTANCE_ID_MASK; -} - -GRL_INLINE uchar PRIMREF_instanceMask( PrimRef* aabb ) -{ - return as_uint(aabb->lower.w) >> INSTANCE_ID_BITS; -} - -GRL_INLINE void PRIMREF_setProceduralMetaData( PrimRef* primref, uint geomID, uint primID, uint geomFlags ) -{ - /* encode geomID, primID */ - uint flags = (geomFlags << QUAD_PRIMID_BITS); - primref->lower.w = as_float( geomID ); - primref->upper.w = as_float( primID | flags ); -} - -GRL_INLINE void PRIMREF_setQuadMetaData( PrimRef* primref, uint primID0, uint primID1, uint geomID, uint geomFlags ) -{ - const uint primID_diff = primID1 - primID0; - uint flags = geomFlags << QUAD_PRIMID_BITS; - - primref->lower.w = as_float( geomID | (primID_diff << QUAD_GEOMID_BITS) ); - primref->upper.w = as_float( (primID0 | flags) ); -} - -GRL_INLINE void PRIMREF_setAABB( PrimRef* primref, float3 lower, float3 upper ) -{ - primref->lower.xyz = lower.xyz; - primref->upper.xyz = upper.xyz; -} - -GRL_INLINE PrimRef PRIMREF_set_instance( float3 lower, float3 upper, uint instanceIndex, uint instanceMask, uint rootOffset, bool is_procedural ) -{ - PrimRef new_ref; - new_ref.lower.xyz = lower; - new_ref.lower.w = as_float(instanceIndex | (instanceMask << 24)); - new_ref.upper.xyz = upper; - new_ref.upper.w = as_float(rootOffset + (is_procedural? 
0x80000000 : 0)); - return new_ref; -} - -GRL_INLINE bool PRIMREF_isProceduralInstance( PrimRef* primref ) -{ - return (as_uint(primref->upper.w) & 0x80000000) != 0; -} - -GRL_INLINE uint PRIMREF_instanceRootNodeOffset(PrimRef* primref) -{ - return (as_uint(primref->upper.w) & 0x7fffffff); -} - -GRL_INLINE float3 PRIMREF_lower( PrimRef* primref ) -{ - return primref->lower.xyz; -} -GRL_INLINE float3 PRIMREF_upper( PrimRef* primref ) -{ - return primref->upper.xyz; -} - -GRL_INLINE void AABB_extend(struct AABB *aabb, struct AABB *v) -{ - aabb->lower = min(aabb->lower, v->lower); - aabb->upper = max(aabb->upper, v->upper); -} - -GRL_INLINE void AABB_extend_point(struct AABB *aabb, const float4 p) -{ - aabb->lower = min(aabb->lower, p); - aabb->upper = max(aabb->upper, p); -} - -GRL_INLINE void AABB_extendlu(struct AABB *aabb, const float4 lower, const float4 upper) -{ - aabb->lower = min(aabb->lower, lower); - aabb->upper = max(aabb->upper, upper); -} - -GRL_INLINE struct AABB AABB_enlarge(struct AABB *aabb, const float v) -{ - struct AABB box; - box.lower = aabb->lower - (float4)v; - box.upper = aabb->upper + (float4)v; - return box; -} - -GRL_INLINE void AABB_intersect(struct AABB *aabb, struct AABB *v) -{ - aabb->lower = max(aabb->lower, v->lower); - aabb->upper = min(aabb->upper, v->upper); -} - -GRL_INLINE float4 AABB_size(struct AABB *aabb) -{ - return aabb->upper - aabb->lower; -} - -GRL_INLINE float4 AABB_centroid2(struct AABB *aabb) -{ - return aabb->lower + aabb->upper; -} - -GRL_INLINE float AABB_halfArea(struct AABB *aabb) -{ - const float4 d = AABB_size(aabb); - return halfarea(d.xyz); -} - -GRL_INLINE float AABB_intersecion_size(struct AABB* aabb, struct AABB* v) -{ - struct AABB temp = *aabb; - AABB_intersect(&temp, v); - float4 len = AABB_size(&temp); - float ret = 0.0f; - if (len.x >= 0.0f && len.y >= 0.0f && len.z >= 0.0f) { - float3 v = { len.x, len.y, len.z }; - ret = halfarea(v); - } - return ret; -} - -GRL_INLINE bool AABB_subset(struct AABB* small, struct AABB* big) -{ - const int4 b0 = small->lower >= big->lower; - const int4 b1 = small->upper <= big->upper; - const int4 b = b0 & b1; - return b.x & b.y & b.z; -} - -GRL_INLINE struct AABB AABBfromAABB3f(const struct AABB3f box) -{ - struct AABB box4d = { - {box.lower[0], box.lower[1], box.lower[2], 0.0f}, - {box.upper[0], box.upper[1], box.upper[2], 0.0f} - }; - return box4d; -} - -GRL_INLINE struct AABB3f AABB3fFromAABB(const struct AABB box) -{ - struct AABB3f box3d = { - {box.lower[0], box.lower[1], box.lower[2]}, - {box.upper[0], box.upper[1], box.upper[2]} - }; - return box3d; -} - -GRL_INLINE bool AABB_verify(struct AABB* aabb) -{ - bool error = false; - if (aabb->lower.x > aabb->upper.x) - error = true; - if (aabb->lower.y > aabb->upper.y) - error = true; - if (aabb->lower.z > aabb->upper.z) - error = true; - if (!isfinite(aabb->lower.x)) - error = true; - if (!isfinite(aabb->lower.y)) - error = true; - if (!isfinite(aabb->lower.z)) - error = true; - if (!isfinite(aabb->upper.x)) - error = true; - if (!isfinite(aabb->upper.y)) - error = true; - if (!isfinite(aabb->upper.z)) - error = true; - return error; -} - -GRL_INLINE void AABB_print(struct AABB* aabb) -{ - printf("AABB {\n area = %f\n lower = %f\n upper = %f\n geomID = %i primID0 = %i primID1 = %i\n aabb->lower.w = %x aabb->upper.w = %x }\n", - AABB_halfArea(aabb), - aabb->lower.xyz, - aabb->upper.xyz, - PRIMREF_geomID(aabb), - PRIMREF_primID0(aabb), - PRIMREF_primID1(aabb), - as_uint(aabb->lower.w), - as_uint(aabb->upper.w)); -} - -#ifdef 
__OPENCL_VERSION__ - -GRL_INLINE PrimRef PrimRef_sub_group_shuffle(PrimRef* primRef, const uint slotID) -{ - PrimRef shuffledPrimref; - shuffledPrimref.lower.x = intel_sub_group_shuffle(primRef->lower.x, slotID); - shuffledPrimref.lower.y = intel_sub_group_shuffle(primRef->lower.y, slotID); - shuffledPrimref.lower.z = intel_sub_group_shuffle(primRef->lower.z, slotID); - shuffledPrimref.lower.w = intel_sub_group_shuffle(primRef->lower.w, slotID); - shuffledPrimref.upper.x = intel_sub_group_shuffle(primRef->upper.x, slotID); - shuffledPrimref.upper.y = intel_sub_group_shuffle(primRef->upper.y, slotID); - shuffledPrimref.upper.z = intel_sub_group_shuffle(primRef->upper.z, slotID); - shuffledPrimref.upper.w = intel_sub_group_shuffle(primRef->upper.w, slotID); - return shuffledPrimref; -} - -GRL_INLINE struct AABB AABB_sub_group_broadcast(struct AABB *aabb, const uint slotID) -{ - struct AABB bounds; - bounds.lower.x = sub_group_broadcast(aabb->lower.x, slotID); - bounds.lower.y = sub_group_broadcast(aabb->lower.y, slotID); - bounds.lower.z = sub_group_broadcast(aabb->lower.z, slotID); - bounds.lower.w = 0; - bounds.upper.x = sub_group_broadcast(aabb->upper.x, slotID); - bounds.upper.y = sub_group_broadcast(aabb->upper.y, slotID); - bounds.upper.z = sub_group_broadcast(aabb->upper.z, slotID); - bounds.upper.w = 0; - return bounds; -} -GRL_INLINE struct AABB AABB_sub_group_shuffle(struct AABB* aabb, const uint slotID) -{ - struct AABB bounds; - bounds.lower.x = intel_sub_group_shuffle(aabb->lower.x, slotID); - bounds.lower.y = intel_sub_group_shuffle(aabb->lower.y, slotID); - bounds.lower.z = intel_sub_group_shuffle(aabb->lower.z, slotID); - bounds.lower.w = 0; - bounds.upper.x = intel_sub_group_shuffle(aabb->upper.x, slotID); - bounds.upper.y = intel_sub_group_shuffle(aabb->upper.y, slotID); - bounds.upper.z = intel_sub_group_shuffle(aabb->upper.z, slotID); - bounds.upper.w = 0; - return bounds; -} - -GRL_INLINE uint AABB_sub_group_shuffle_coordPerLane(struct AABB* aabb, const uint slotID) -{ - float coordData[8] = { - sub_group_broadcast(aabb->lower.x, slotID), - sub_group_broadcast(aabb->lower.y, slotID), - sub_group_broadcast(aabb->lower.z, slotID), - sub_group_broadcast(aabb->lower.w, slotID), - sub_group_broadcast(aabb->upper.x, slotID), - sub_group_broadcast(aabb->upper.y, slotID), - sub_group_broadcast(aabb->upper.z, slotID), - sub_group_broadcast(aabb->upper.w, slotID) }; - - uint coordDataFiltered; - const uint lane = get_sub_group_local_id(); - if (lane < 8) coordDataFiltered = as_uint(coordData[lane]); - return coordDataFiltered; -} - -GRL_INLINE struct AABB AABB_sub_group_reduce(struct AABB *aabb) -{ - struct AABB bounds; - bounds.lower.x = sub_group_reduce_min(aabb->lower.x); - bounds.lower.y = sub_group_reduce_min(aabb->lower.y); - bounds.lower.z = sub_group_reduce_min(aabb->lower.z); - bounds.lower.w = 0; - bounds.upper.x = sub_group_reduce_max(aabb->upper.x); - bounds.upper.y = sub_group_reduce_max(aabb->upper.y); - bounds.upper.z = sub_group_reduce_max(aabb->upper.z); - bounds.upper.w = 0; - return bounds; -} - - -GRL_INLINE struct AABB AABB_sub_group_reduce_N6( struct AABB* aabb ) -{ - float3 l = aabb->lower.xyz; - float3 u = aabb->upper.xyz; - l = min( l, intel_sub_group_shuffle_down( l, l, 4 ) ); - l = min( l, intel_sub_group_shuffle_down( l, l, 2 ) ); - l = min( l, intel_sub_group_shuffle_down( l, l, 1 ) ); - u = max( u, intel_sub_group_shuffle_down( u, u, 4 ) ); - u = max( u, intel_sub_group_shuffle_down( u, u, 2 ) ); - u = max( u, intel_sub_group_shuffle_down( u, u, 1 ) 
); - - struct AABB bounds; - bounds.lower.x = l.x; - bounds.lower.y = l.y; - bounds.lower.z = l.z; - bounds.lower.w = 0; - bounds.upper.x = u.x; - bounds.upper.y = u.y; - bounds.upper.z = u.z; - bounds.upper.w = 0; - return bounds; -} - - -GRL_INLINE struct AABB AABB_work_group_reduce(struct AABB *aabb) -{ - struct AABB bounds; - bounds.lower.x = work_group_reduce_min(aabb->lower.x); - bounds.lower.y = work_group_reduce_min(aabb->lower.y); - bounds.lower.z = work_group_reduce_min(aabb->lower.z); - bounds.upper.x = work_group_reduce_max(aabb->upper.x); - bounds.upper.y = work_group_reduce_max(aabb->upper.y); - bounds.upper.z = work_group_reduce_max(aabb->upper.z); - return bounds; -} - -GRL_INLINE struct AABB AABB_sub_group_scan_exclusive_min_max(struct AABB *aabb) -{ - struct AABB bounds; - bounds.lower.x = sub_group_scan_exclusive_min(aabb->lower.x); - bounds.lower.y = sub_group_scan_exclusive_min(aabb->lower.y); - bounds.lower.z = sub_group_scan_exclusive_min(aabb->lower.z); - bounds.lower.w = 0; - bounds.upper.x = sub_group_scan_exclusive_max(aabb->upper.x); - bounds.upper.y = sub_group_scan_exclusive_max(aabb->upper.y); - bounds.upper.z = sub_group_scan_exclusive_max(aabb->upper.z); - bounds.upper.w = 0; - return bounds; -} - -GRL_INLINE struct AABB AABB_sub_group_scan_inclusive_min_max(struct AABB *aabb) -{ - struct AABB bounds; - bounds.lower.x = sub_group_scan_inclusive_min(aabb->lower.x); - bounds.lower.y = sub_group_scan_inclusive_min(aabb->lower.y); - bounds.lower.z = sub_group_scan_inclusive_min(aabb->lower.z); - bounds.lower.w = 0; - bounds.upper.x = sub_group_scan_inclusive_max(aabb->upper.x); - bounds.upper.y = sub_group_scan_inclusive_max(aabb->upper.y); - bounds.upper.z = sub_group_scan_inclusive_max(aabb->upper.z); - bounds.upper.w = 0; - return bounds; -} - -GRL_INLINE void AABB_global_atomic_merge(global struct AABB *global_aabb, struct AABB *aabb) -{ - atomic_min((volatile __global float *)&global_aabb->lower + 0, aabb->lower.x); - atomic_min((volatile __global float *)&global_aabb->lower + 1, aabb->lower.y); - atomic_min((volatile __global float *)&global_aabb->lower + 2, aabb->lower.z); - atomic_max((volatile __global float *)&global_aabb->upper + 0, aabb->upper.x); - atomic_max((volatile __global float *)&global_aabb->upper + 1, aabb->upper.y); - atomic_max((volatile __global float *)&global_aabb->upper + 2, aabb->upper.z); -} - -GRL_INLINE void AABB_global_atomic_merge_lu(global struct AABB* global_aabb, float3 lower, float3 upper ) -{ - atomic_min((volatile __global float*) & global_aabb->lower + 0, lower.x); - atomic_min((volatile __global float*) & global_aabb->lower + 1, lower.y); - atomic_min((volatile __global float*) & global_aabb->lower + 2, lower.z); - atomic_max((volatile __global float*) & global_aabb->upper + 0, upper.x); - atomic_max((volatile __global float*) & global_aabb->upper + 1, upper.y); - atomic_max((volatile __global float*) & global_aabb->upper + 2, upper.z); -} - -GRL_INLINE void AABB_global_atomic_merge_sub_group_lu(uniform global struct AABB* aabb, float3 lower, float3 upper) -{ - uint lane = get_sub_group_local_id(); - float l[3]; - l[0] = sub_group_reduce_min(lower.x); - l[1] = sub_group_reduce_min(lower.y); - l[2] = sub_group_reduce_min(lower.z); - float u[3]; - u[0] = sub_group_reduce_max(upper.x); - u[1] = sub_group_reduce_max(upper.y); - u[2] = sub_group_reduce_max(upper.z); - - if (lane < 3) - { - atomic_min((global float*)&aabb->lower + lane, l[lane]); - atomic_max((global float*)&aabb->upper + lane, u[lane]); - } -} - - 
-GRL_INLINE void AABB_local_atomic_merge(local struct AABB *aabb, const float4 lower, const float4 upper) -{ - if (lower.x < aabb->lower.x) - atomic_min((local float *)&aabb->lower + 0, lower.x); - if (lower.y < aabb->lower.y) - atomic_min((local float *)&aabb->lower + 1, lower.y); - if (lower.z < aabb->lower.z) - atomic_min((local float *)&aabb->lower + 2, lower.z); - if (upper.x > aabb->upper.x) - atomic_max((local float *)&aabb->upper + 0, upper.x); - if (upper.y > aabb->upper.y) - atomic_max((local float *)&aabb->upper + 1, upper.y); - if (upper.z > aabb->upper.z) - atomic_max((local float *)&aabb->upper + 2, upper.z); -} -#endif - -GRL_NAMESPACE_END(RTAS) -GRL_NAMESPACE_END(GRL) \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/api_interface.h b/src/intel/vulkan/grl/gpu/api_interface.h deleted file mode 100644 index 71a1fff6327..00000000000 --- a/src/intel/vulkan/grl/gpu/api_interface.h +++ /dev/null @@ -1,840 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once -#include "GRLStructs.h" -#include "shared.h" -#include "libs/lsc_intrinsics.h" - -typedef struct Geo GRL_RAYTRACING_GEOMETRY_DESC; - -typedef struct GRL_RAYTRACING_AABB -{ - float MinX; - float MinY; - float MinZ; - float MaxX; - float MaxY; - float MaxZ; -} GRL_RAYTRACING_AABB; - -GRL_INLINE void GLR_set_raytracing_aabb(GRL_RAYTRACING_AABB* dest, struct AABB* source) -{ - dest->MinX = source->lower.x; - dest->MinY = source->lower.y; - dest->MinZ = source->lower.z; - dest->MaxX = source->upper.x; - dest->MaxY = source->upper.y; - dest->MaxZ = source->upper.z; -} - -GRL_INLINE uint3 GRL_load_triangle(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint triID) -{ - global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer; - uint index_format = geomDesc->Desc.Triangles.IndexFormat; - - if (index_format == INDEX_FORMAT_R32_UINT) - { - const uint* data = (const uint*)(indices + triID * 3 * 4); - return (uint3)(data[0], data[1], data[2]); - } - else if (index_format == INDEX_FORMAT_NONE) - { - return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2); - } - else - { - const ushort* data = (const ushort*)(indices + triID * 3 * 2); - return (uint3)(data[0], data[1], data[2]); - } -} - -GRL_INLINE uint3 GRL_load_indices_from_buffer(global char* indices, const uint index_format, const uint triID) -{ - if (index_format == INDEX_FORMAT_R32_UINT) - { - return load_uint3_L1C_L3C((global uint3*)(indices + triID * 3 * 4), 0); - } - else if (index_format == INDEX_FORMAT_NONE) - { - return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2); - } - else - { - const ushort* data = (const ushort*)(indices + triID * 3 * 2); - return (uint3)(data[0], data[1], data[2]); - } -} - -// Load all 3 indices from one triangle, and a single index from another -GRL_INLINE uint4 GRL_load_quad_indices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint triID, uint triID_1, ushort fourth_vert) -{ - global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer; - uint index_format = geomDesc->Desc.Triangles.IndexFormat; - - if (index_format == INDEX_FORMAT_R32_UINT) - { - const uint* data0 = (const uint*)(indices + triID * 3 * 4); - const uint* data1 = (const uint*)(indices + triID_1 * 3 * 4); - return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]); - } - else if (index_format == INDEX_FORMAT_NONE) - { - return (uint4)(triID * 3, triID * 3 + 1, triID * 3 + 2, triID_1 * 3 + fourth_vert); - } - else - { - const ushort* data0 = (const 
ushort*)(indices + triID * 3 * 2); - const ushort* data1 = (const ushort*)(indices + triID_1 * 3 * 2); - return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]); - } -} - -GRL_INLINE void GRL_set_Type(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, GeometryType type) -{ - geomDesc->Type = type; -} - -GRL_INLINE GeometryType GRL_get_Type(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Type; -} - -GRL_INLINE void GRL_set_Flags(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint8_t flags) -{ - geomDesc->Flags = flags; -} - -GRL_INLINE uint8_t GRL_get_Flags(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Flags; -} - -GRL_INLINE void GRL_set_triangles_Transform(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t transform) -{ - geomDesc->Desc.Triangles.pTransformBuffer = transform; -} - -GRL_INLINE gpuva_t GRL_get_triangles_Transform(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Triangles.pTransformBuffer; -} - -GRL_INLINE void GRL_set_triangles_IndexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, IndexFormat format) -{ - geomDesc->Desc.Triangles.IndexFormat = format; -} - -GRL_INLINE IndexFormat GRL_get_triangles_IndexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Triangles.IndexFormat; -} - -GRL_INLINE void GRL_set_triangles_VertexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, VertexFormat format) -{ - geomDesc->Desc.Triangles.VertexFormat = format; -} - -GRL_INLINE VertexFormat GRL_get_triangles_VertexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Triangles.VertexFormat; -} - -GRL_INLINE void GRL_set_triangles_IndexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count) -{ - geomDesc->Desc.Triangles.IndexCount = count; -} - -GRL_INLINE dword GRL_get_triangles_IndexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Triangles.IndexCount; -} - -GRL_INLINE void GRL_set_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count) -{ - geomDesc->Desc.Triangles.VertexCount = count; -} - -GRL_INLINE dword GRL_get_triangles_VertexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Triangles.VertexCount; -} - -GRL_INLINE void GRL_set_triangles_IndexBuffer(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t buffer) -{ - geomDesc->Desc.Triangles.pIndexBuffer = buffer; -} - -GRL_INLINE gpuva_t GRL_get_triangles_IndexBuffer(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Triangles.pIndexBuffer; -} - -GRL_INLINE void GRL_set_triangles_VertexBuffer_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address) -{ - geomDesc->Desc.Triangles.pVertexBuffer = address; -} - -GRL_INLINE gpuva_t GRL_get_triangles_VertexBuffer_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Triangles.pVertexBuffer; -} - -GRL_INLINE void GRL_set_triangles_VertexBuffer_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, unsigned long stride) -{ - geomDesc->Desc.Triangles.VertexBufferByteStride = stride; -} - -GRL_INLINE unsigned long GRL_get_triangles_VertexBuffer_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Triangles.VertexBufferByteStride; -} - -GRL_INLINE unsigned long GRL_get_triangles_IndexFormatSizeInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return (unsigned long)(geomDesc->Desc.Triangles.IndexFormat); -} - -GRL_INLINE void GRL_set_procedurals_AABBCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count) -{ - 
geomDesc->Desc.Procedural.AABBCount = count; -} - -GRL_INLINE dword GRL_get_procedurals_AABBCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Procedural.AABBCount; -} - -GRL_INLINE void GRL_set_procedurals_AABBs_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address) -{ - geomDesc->Desc.Procedural.pAABBs_GPUVA = address; -} - -GRL_INLINE gpuva_t GRL_get_procedurals_AABBs_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Procedural.pAABBs_GPUVA; -} - -GRL_INLINE void GRL_set_procedurals_AABBs_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, qword stride) -{ - geomDesc->Desc.Procedural.AABBByteStride = stride; -} - -GRL_INLINE qword GRL_get_procedurals_AABBs_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc) -{ - return geomDesc->Desc.Procedural.AABBByteStride; -} - -GRL_INLINE uint GRL_is_procedural(GRL_RAYTRACING_GEOMETRY_DESC* desc) -{ - return desc->Type == (unsigned char)GEOMETRY_TYPE_PROCEDURAL; -} - -GRL_INLINE uint GRL_is_triangle(GRL_RAYTRACING_GEOMETRY_DESC* desc) -{ - return desc->Type != (unsigned char)GEOMETRY_TYPE_PROCEDURAL; -} - -GRL_INLINE unsigned int GRL_get_ShaderIndex_Mask(GRL_RAYTRACING_GEOMETRY_DESC* desc) -{ - return 0x00FFFFFF; -} - -GRL_INLINE dword GRL_atomic_add_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* desc, dword value) -{ - return atomic_add((global uint*) & desc->Desc.Triangles.VertexCount, value); -} - -GRL_INLINE unsigned int GRL_get_primitive_count(GRL_RAYTRACING_GEOMETRY_DESC* desc) -{ - if (GRL_is_triangle(desc)) - { - if (desc->Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE) - { - return desc->Desc.Triangles.VertexCount / 3; - } - else - { - return desc->Desc.Triangles.IndexCount / 3; - } - } - else - { - return desc->Desc.Procedural.AABBCount; - } -} - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable // to leaf half values - -GRL_INLINE float snorm_to_float(short v) -{ - return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 32767.0f))); // FIXME: do we have intrinsic for this? -} - -GRL_INLINE float snorm8_to_float(signed char v) -{ - return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 127.0f))); // FIXME: do we have intrinsic for this? -} - -GRL_INLINE float unorm_to_float(unsigned short v) -{ - return min(1.0f, max(0.0f, ((float)v) * (1.0f / 65535.0f))); // FIXME: do we have intrinsic for this? -} - -//only lower 10 bits of v are used -GRL_INLINE float unorm10_to_float(unsigned v) -{ - const unsigned short mask = (unsigned short)((1u << 10u) - 1u); - const unsigned short v10 = (unsigned short)v & mask; - return min(1.0f, max(0.0f, ((float)v10) * (1.0f / 1023.0f))); // FIXME: do we have intrinsic for this? -} - -GRL_INLINE float unorm8_to_float(unsigned char v) -{ - return min(1.0f, max(0.0f, ((float)v) * (1.0f / 255.0f))); // FIXME: do we have intrinsic for this? 
-} - -GRL_INLINE float4 GRL_load_vertex(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint vtxID) -{ - float4 v = (float4)(0, 0, 0, 0); - global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer; - uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride; - uint vertex_format = geomDesc->Desc.Triangles.VertexFormat; - - if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) - { - const float* data = (const float*)(vertices + vtxID * vertex_stride); - v = (float4)(data[0], data[1], data[2], 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT) - { - const float* data = (const float*)(vertices + vtxID * vertex_stride); - v = (float4)(data[0], data[1], 0.0f, 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) - { - const half* data = (const half*)(vertices + vtxID * vertex_stride); - v = (float4)(data[0], data[1], data[2], 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT) - { - const half* data = (const half*)(vertices + vtxID * vertex_stride); - v = (float4)(data[0], data[1], 0.0f, 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) - { - const short* data = (const short*)(vertices + vtxID * vertex_stride); - v = (float4)(snorm_to_float(data[0]), - snorm_to_float(data[1]), - snorm_to_float(data[2]), - 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM) - { - const short* data = (const short*)(vertices + vtxID * vertex_stride); - v = (float4)(snorm_to_float(data[0]), - snorm_to_float(data[1]), - 0.0f, - 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) - { - const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride); - v = (float4)(unorm_to_float(data[0]), - unorm_to_float(data[1]), - unorm_to_float(data[2]), - 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM) - { - const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride); - v = (float4)(unorm_to_float(data[0]), - unorm_to_float(data[1]), - 0.0f, - 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) - { - const unsigned data = *(const unsigned*)(vertices + vtxID * vertex_stride); - v = (float4)(unorm10_to_float(data), - unorm10_to_float((data >> 10)), - unorm10_to_float((data >> 20)), - 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) - { - const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); - v = (float4)(unorm8_to_float(data[0]), - unorm8_to_float(data[1]), - unorm8_to_float(data[2]), - 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM) - { - const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); - v = (float4)(unorm8_to_float(data[0]), - unorm8_to_float(data[1]), - 0.0f, - 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) - { - const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); - v = (float4)(snorm8_to_float(data[0]), - snorm8_to_float(data[1]), - snorm8_to_float(data[2]), - 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM) - { - const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride); - v = (float4)(snorm8_to_float(data[0]), - snorm8_to_float(data[1]), - 0.0f, - 0.0f); - } - - /* perform vertex transformation */ - if (geomDesc->Desc.Triangles.pTransformBuffer) - { - global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer; - const float x = xfm[0] * v.x + xfm[1] * v.y + xfm[2] * v.z + 
xfm[3]; - const float y = xfm[4] * v.x + xfm[5] * v.y + xfm[6] * v.z + xfm[7]; - const float z = xfm[8] * v.x + xfm[9] * v.y + xfm[10] * v.z + xfm[11]; - v = (float4)(x, y, z, 0.0f); - } - - return v; -} - -GRL_INLINE void GRL_load_triangle_vertices(global char* vertices, const uint vertex_format, const uint vertex_stride, global float* transform_buffer, const uint vtx0ID, const uint vtx1ID, const uint vtx2ID, float4* out) -{ - if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) - { - const float3 data0 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx0ID * vertex_stride), 0)); - const float3 data1 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx1ID * vertex_stride), 0)); - const float3 data2 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx2ID * vertex_stride), 0)); - out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f); - out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f); - out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT) - { - const float* data0 = (const float*)(vertices + vtx0ID * vertex_stride); - const float* data1 = (const float*)(vertices + vtx1ID * vertex_stride); - const float* data2 = (const float*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f); - out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f); - out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) - { - const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride); - const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride); - const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f); - out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f); - out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT) - { - const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride); - const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride); - const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f); - out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f); - out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) - { - const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride); - const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride); - const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]), 0.0f); - out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]), 0.0f); - out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]), 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM) - { - const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride); - const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride); - const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f, 0.0f); - out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f, 0.0f); - out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f, 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) - { - const 
unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride); - const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride); - const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]), 0.0f); - out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]), 0.0f); - out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]), 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM) - { - const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride); - const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride); - const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f, 0.0f); - out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f, 0.0f); - out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f, 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) - { - const unsigned data0 = *(const unsigned*)(vertices + vtx0ID * vertex_stride); - const unsigned data1 = *(const unsigned*)(vertices + vtx1ID * vertex_stride); - const unsigned data2 = *(const unsigned*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(unorm10_to_float(data0), unorm10_to_float(data0 >> 10), unorm10_to_float(data0 >> 20), 0.0f); - out[1] = (float4)(unorm10_to_float(data1), unorm10_to_float(data1 >> 10), unorm10_to_float(data1 >> 20), 0.0f); - out[2] = (float4)(unorm10_to_float(data2), unorm10_to_float(data2 >> 10), unorm10_to_float(data2 >> 20), 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) - { - const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); - const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); - const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]), 0.0f); - out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]), 0.0f); - out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]), 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM) - { - const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); - const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); - const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f, 0.0f); - out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f, 0.0f); - out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f, 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) - { - const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); - const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); - const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]), 0.0f); - out[1] = 
(float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]), 0.0f); - out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]), 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM) - { - const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride); - const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride); - const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride); - out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f, 0.0f); - out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f, 0.0f); - out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f, 0.0f); - } - - /* perform vertex transformation */ - if (transform_buffer) - { - global float* xfm = (global float*)transform_buffer; - for (uint i = 0; i < 3; ++i) - { - const float x = xfm[0] * out[i].x + xfm[1] * out[i].y + xfm[2] * out[i].z + xfm[3]; - const float y = xfm[4] * out[i].x + xfm[5] * out[i].y + xfm[6] * out[i].z + xfm[7]; - const float z = xfm[8] * out[i].x + xfm[9] * out[i].y + xfm[10] * out[i].z + xfm[11]; - out[i] = (float4)(x, y, z, 0.0f); - } - } -} - -GRL_INLINE void GRL_load_quad_vertices_no_stride(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - float3* out0, float3* out1, float3* out2, float3* out3, - const uint4 vtxID, const uint vertex_format, global char* vertices) -{ - float3 v0, v1, v2, v3; - - if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) - { - const float* data0 = (const float*)(vertices + vtxID.x); - const float* data1 = (const float*)(vertices + vtxID.y); - const float* data2 = (const float*)(vertices + vtxID.z); - const float* data3 = (const float*)(vertices + vtxID.w); - v0 = (float3)(data0[0], data0[1], data0[2]); - v1 = (float3)(data1[0], data1[1], data1[2]); - v2 = (float3)(data2[0], data2[1], data2[2]); - v3 = (float3)(data3[0], data3[1], data3[2]); - } - else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT) - { - const float* data0 = (const float*)(vertices + vtxID.x); - const float* data1 = (const float*)(vertices + vtxID.y); - const float* data2 = (const float*)(vertices + vtxID.z); - const float* data3 = (const float*)(vertices + vtxID.w); - v0 = (float3)(data0[0], data0[1], 0.0f); - v1 = (float3)(data1[0], data1[1], 0.0f); - v2 = (float3)(data2[0], data2[1], 0.0f); - v3 = (float3)(data3[0], data3[1], 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) - { - const half* data0 = (const half*)(vertices + vtxID.x); - const half* data1 = (const half*)(vertices + vtxID.y); - const half* data2 = (const half*)(vertices + vtxID.z); - const half* data3 = (const half*)(vertices + vtxID.w); - v0 = (float3)(data0[0], data0[1], data0[2]); - v1 = (float3)(data1[0], data1[1], data1[2]); - v2 = (float3)(data2[0], data2[1], data2[2]); - v3 = (float3)(data3[0], data3[1], data3[2]); - } - else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT) - { - const half* data0 = (const half*)(vertices + vtxID.x); - const half* data1 = (const half*)(vertices + vtxID.y); - const half* data2 = (const half*)(vertices + vtxID.z); - const half* data3 = (const half*)(vertices + vtxID.w); - v0 = (float3)(data0[0], data0[1], 0.0f); - v1 = (float3)(data1[0], data1[1], 0.0f); - v2 = (float3)(data2[0], data2[1], 0.0f); - v3 = (float3)(data3[0], data3[1], 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) - { - const short* data0 = (const short*)(vertices + 
vtxID.x); - const short* data1 = (const short*)(vertices + vtxID.y); - const short* data2 = (const short*)(vertices + vtxID.z); - const short* data3 = (const short*)(vertices + vtxID.w); - v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2])); - v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2])); - v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2])); - v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), snorm_to_float(data3[2])); - } - else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM) - { - const short* data0 = (const short*)(vertices + vtxID.x); - const short* data1 = (const short*)(vertices + vtxID.y); - const short* data2 = (const short*)(vertices + vtxID.z); - const short* data3 = (const short*)(vertices + vtxID.w); - v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f); - v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f); - v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f); - v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) - { - const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x); - const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y); - const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z); - const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w); - v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2])); - v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2])); - v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2])); - v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), unorm_to_float(data3[2])); - } - else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM) - { - const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x); - const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y); - const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z); - const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w); - v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f); - v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f); - v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f); - v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) - { - const unsigned data0 = *(const unsigned*)(vertices + vtxID.x); - const unsigned data1 = *(const unsigned*)(vertices + vtxID.y); - const unsigned data2 = *(const unsigned*)(vertices + vtxID.z); - const unsigned data3 = *(const unsigned*)(vertices + vtxID.w); - v0 = (float3)(unorm10_to_float(data0), unorm10_to_float((data0 >> 10)), unorm10_to_float((data0 >> 20))); - v1 = (float3)(unorm10_to_float(data1), unorm10_to_float((data1 >> 10)), unorm10_to_float((data1 >> 20))); - v2 = (float3)(unorm10_to_float(data2), unorm10_to_float((data2 >> 10)), unorm10_to_float((data2 >> 20))); - v3 = (float3)(unorm10_to_float(data3), unorm10_to_float((data3 >> 10)), unorm10_to_float((data3 >> 20))); - } - else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) - { - const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x); - const unsigned char* data1 = (const unsigned 
char*)(vertices + vtxID.y); - const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z); - const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w); - v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2])); - v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2])); - v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2])); - v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), unorm8_to_float(data3[2])); - } - else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM) - { - const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x); - const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y); - const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z); - const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w); - v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f); - v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f); - v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f); - v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), 0.0f); - } - else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) - { - const signed char* data0 = (const signed char*)(vertices + vtxID.x); - const signed char* data1 = (const signed char*)(vertices + vtxID.y); - const signed char* data2 = (const signed char*)(vertices + vtxID.z); - const signed char* data3 = (const signed char*)(vertices + vtxID.w); - v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2])); - v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2])); - v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2])); - v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), snorm8_to_float(data3[2])); - } - else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM) - { - const signed char* data0 = (const signed char*)(vertices + vtxID.x); - const signed char* data1 = (const signed char*)(vertices + vtxID.y); - const signed char* data2 = (const signed char*)(vertices + vtxID.z); - const signed char* data3 = (const signed char*)(vertices + vtxID.w); - v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f); - v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f); - v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f); - v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), 0.0f); - } - else - { - v0 = (float3)(0.0f, 0.0f, 0.0f); - v1 = (float3)(0.0f, 0.0f, 0.0f); - v2 = (float3)(0.0f, 0.0f, 0.0f); - v3 = (float3)(0.0f, 0.0f, 0.0f); - } - - - /* perform vertex transformation */ - if (geomDesc->Desc.Triangles.pTransformBuffer) - { - global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer; - - v0.xyz = (float3)( - xfm[0] * v0.x + xfm[1] * v0.y + xfm[2] * v0.z + xfm[3], - xfm[4] * v0.x + xfm[5] * v0.y + xfm[6] * v0.z + xfm[7], - xfm[8] * v0.x + xfm[9] * v0.y + xfm[10] * v0.z + xfm[11] - ); - - v1.xyz = (float3)( - xfm[0] * v1.x + xfm[1] * v1.y + xfm[2] * v1.z + xfm[3], - xfm[4] * v1.x + xfm[5] * v1.y + xfm[6] * v1.z + xfm[7], - xfm[8] * v1.x + xfm[9] * v1.y + xfm[10] * v1.z + xfm[11] - ); - - v2.xyz = (float3)( - xfm[0] * v2.x + xfm[1] * v2.y + xfm[2] * v2.z + xfm[3], - xfm[4] * v2.x + xfm[5] * v2.y + xfm[6] * v2.z + xfm[7], - xfm[8] * v2.x + xfm[9] * v2.y + 
xfm[10] * v2.z + xfm[11] - ); - - v3.xyz = (float3)( - xfm[0] * v3.x + xfm[1] * v3.y + xfm[2] * v3.z + xfm[3], - xfm[4] * v3.x + xfm[5] * v3.y + xfm[6] * v3.z + xfm[7], - xfm[8] * v3.x + xfm[9] * v3.y + xfm[10] * v3.z + xfm[11] - ); - } - - *out0 = v0; - *out1 = v1; - *out2 = v2; - *out3 = v3; -} - - -GRL_INLINE void GRL_load_quad_vertices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - float3* out0, float3* out1, float3* out2, float3* out3, - uint4 vtxID) -{ - global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer; - uint vertex_format = geomDesc->Desc.Triangles.VertexFormat; - uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride; - - vtxID *= vertex_stride; - - GRL_load_quad_vertices_no_stride(geomDesc, out0, out1, out2, out3, - vtxID, vertex_format, vertices); -} - - -GRL_INLINE GRL_RAYTRACING_AABB GRL_load_aabb(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint primID) -{ - global char* aabb0 = (global char*)geomDesc->Desc.Procedural.pAABBs_GPUVA; - global char* aabb = aabb0 + (primID * geomDesc->Desc.Procedural.AABBByteStride); - return *(global GRL_RAYTRACING_AABB*)aabb; -} - -// same as for d3d12 -typedef struct GRL_RAYTRACING_INSTANCE_DESC -{ - float Transform[12]; - // unsigned int InstanceID : 24; - // unsigned int InstanceMask : 8; - uint32_t DW0; - // unsigned int InstanceContributionToHitGroupIndex : 24; - // unsigned int Flags : 8; - uint32_t DW1; - global char* AccelerationStructure; -} GRL_RAYTRACING_INSTANCE_DESC; - -GRL_INLINE float GRL_get_transform(const GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column) -{ - return d->Transform[row * 4 + column]; -} - -GRL_INLINE uint32_t GRL_get_instanceID(const GRL_RAYTRACING_INSTANCE_DESC* d) -{ - return d->DW0 & ((1 << 24) - 1); -} - -GRL_INLINE uint32_t GRL_get_InstanceMask(const GRL_RAYTRACING_INSTANCE_DESC* d) -{ - return d->DW0 >> 24; -} - -GRL_INLINE uint32_t GRL_get_InstanceContributionToHitGroupIndex(const GRL_RAYTRACING_INSTANCE_DESC* d) -{ - return d->DW1 & ((1 << 24) - 1); -} - -GRL_INLINE uint32_t GRL_get_InstanceFlags(const GRL_RAYTRACING_INSTANCE_DESC* d) -{ - return d->DW1 >> 24; -} - -GRL_INLINE gpuva_t GRL_get_AccelerationStructure(const GRL_RAYTRACING_INSTANCE_DESC* d) -{ - return (gpuva_t)d->AccelerationStructure; -} - -GRL_INLINE void GRL_set_transform(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column, float value) -{ - d->Transform[row * 4 + column] = value; -} - -GRL_INLINE void GRL_set_instanceID(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t id) -{ - d->DW0 &= 255 << 24; - d->DW0 |= id & ((1 << 24) - 1); -} - -GRL_INLINE void GRL_set_InstanceMask(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t mask) -{ - d->DW0 &= ((1 << 24) - 1); - d->DW0 |= mask << 24; -} - -GRL_INLINE void GRL_set_InstanceContributionToHitGroupIndex(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t contribution) -{ - d->DW1 &= 255 << 24; - d->DW1 |= contribution & ((1 << 24) - 1); -} - -GRL_INLINE void GRL_set_InstanceFlags(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t flags) -{ - d->DW1 &= ((1 << 24) - 1); - d->DW1 |= flags << 24; -} - -GRL_INLINE void GRL_set_AccelerationStructure(GRL_RAYTRACING_INSTANCE_DESC* d, gpuva_t address) -{ - d->AccelerationStructure = (global char*)address; -} diff --git a/src/intel/vulkan/grl/gpu/atomic_update.cl b/src/intel/vulkan/grl/gpu/atomic_update.cl deleted file mode 100644 index 5171a122dc1..00000000000 --- a/src/intel/vulkan/grl/gpu/atomic_update.cl +++ /dev/null @@ -1,1112 +0,0 @@ -// -// Copyright (C) 
2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "GRLGen12.h" - -#include "bvh_build_refit.h" -#include "bvh_build_treelet_refit.h" - - -struct RefitScratch -{ - float lower[3]; - uint mask; - float upper[3]; - uint _pad; - -}; - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(64, 1, 1))) void kernel -init_refit_scratch( - global struct BVHBase* bvh, - global struct RefitScratch* scratch ) -{ - uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); - - if ( tid < BVHBase_GetNumInternalNodes(bvh) ) - { - float4 v = (float4) (FLT_MAX,FLT_MAX,FLT_MAX,0); - store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 0, as_uint4(v) ); - store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 1, as_uint4(v) ); - } -} - -bool is_fat_leaf( InternalNode* curNode ) -{ - return curNode->nodeType != BVH_INTERNAL_NODE; // TODO: Not enough for traversal shaders!! if ts enabled need to check child types -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(64, 1, 1))) void kernel -build_fatleaf_table( - global struct BVHBase* bvh ) -{ - uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); - - if ( tid < BVHBase_GetNumInternalNodes(bvh) ) - { - InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid; - - if ( is_fat_leaf(curNode) ) - { - uint offs = atomic_inc_global( &bvh->fatLeafCount ); - - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - uint bp = *InnerNode_GetBackPointer(backPointers, tid); - - LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+offs; - leaf->backpointer = bp; - leaf->inner_node_index = tid; - leaf->leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart; - } - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(64, 1, 1))) void kernel -build_fatleaf_table_new_update( - global struct Globals *globals, - global struct BVHBase* bvh ) -{ - uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); - - if ( tid < BVHBase_GetNumInternalNodes(bvh) ) - { - InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid; - - if ( is_fat_leaf(curNode) ) - { - // This implementation uses fatleaf table structure but it is actually quad table - // Also tested implementation that process 2 fatleafs per SIMD line as we iterate over the children - // but performance was worse - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - uint bp = *InnerNode_GetBackPointer(backPointers, tid); - uint fatLeafTableStart = bvh->fatLeafTableStart; - - uint leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart; - uint numChildren = (bp >> 3) & 0x7; - - uint quad_leaf_table_index = leaf_index; - - // Check if num children is outside of the % 256 work group - // If so, move these cases to the offset after numQuads and push them to the leftovers part - // where fatleaves are stored every 8th pos with additional padding - // This way we will not have the case in leftovers table where single fatleaf has children in 2 separate work groups - - uint prev_group = leaf_index & 255; - uint next_group = (leaf_index + (numChildren - 1)) & 255; - uint slm_pos = prev_group; - bool is_leftover = prev_group > next_group; - - if(is_leftover) - { - LeafTableEntry* leafBase = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index)); - uint numQuads_aligned_256 = (globals->numPrimitives + 255) & ~255; - - uint leftovers_offset = atomic_add_global( &bvh->quadLeftoversCountNewAtomicUpdate, 8 ); - - for(uint 
i = 0; i < BVH_NODE_N6; i++) - { - uint pos = (i < numChildren) ? i : 0; - LeafTableEntry* leaf_null = &leafBase[pos]; - leaf_null->leaf_index = -1 << 3; - } - - quad_leaf_table_index = numQuads_aligned_256 + leftovers_offset; - slm_pos = leftovers_offset & 255; - } - - LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index)); - - for(uint i = 0; i < BVH_NODE_N6; i++) - { - uint pos = (i < numChildren) ? i : 0; - LeafTableEntry* leafCur = &leaf[pos]; - leafCur->backpointer = bp; - leafCur->inner_node_index = (tid << 8) | slm_pos; - leafCur->leaf_index = (leaf_index << 3) | pos; - } - - // Need to clean the unused area where we pad to 8 for leftovers - if(is_leftover) - { - for(uint i = 1; i < 8; i++) - { - uint pos = (i >= numChildren) ? i : 7; - LeafTableEntry* leafCur = &leaf[pos]; - leafCur->leaf_index = -1 << 3; - } - } - } - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(64, 1, 1))) void kernel -build_innernode_table( - global struct BVHBase* bvh ) -{ - uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0); - - if ( tid < BVHBase_GetNumInternalNodes(bvh) ) - { - InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid; - - if ( !is_fat_leaf( curNode ) ) - { - uint offs = atomic_inc_global( &bvh->innerCount ); - - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - uint bp = *InnerNode_GetBackPointer(backPointers, tid); - - InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh)+offs; - inner->node_index_and_numchildren = (tid<<3) | ((bp>>3) &7); - inner->first_child = tid + curNode->childOffset; - } - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(256, 1, 1))) void kernel -fixup_quad_table( - global struct BVHBase* bvh ) -{ - // This kernel has 2 work groups that set the magic number for unused data in - // fatleaf table. One work group for thelast group of the first part where quads are packed, - // second one for the last group of the part where quads are stored padded - - uint numQuads = BVHBase_GetNumQuads(bvh); - uint numQuadLeftovers = bvh->quadLeftoversCountNewAtomicUpdate; - uint numQuadLeftovers_aligned_256 = (numQuadLeftovers + 255) & ~255; - - uint numQuads_aligned_256 = (numQuads + 255) & ~255; - uint quadOffsetEnd = numQuads_aligned_256 + get_group_id(0) * numQuadLeftovers_aligned_256; - uint quadOffsetStart = quadOffsetEnd - 256; - - uint quads_number_last_group = (get_group_id(0) == 0) ? numQuads : numQuads_aligned_256 + numQuadLeftovers; - - uint leftovers = quadOffsetEnd - quads_number_last_group; - - uint tid = get_local_id(0) > (255 - leftovers) ? 
get_local_id(0) : 256 - leftovers; - - if(leftovers != 0) - { - LeafTableEntry* leafBvh = BVHBase_GetFatLeafTable(bvh); - - LeafTableEntry* leaf = &leafBvh[quadOffsetStart + tid]; - leaf->leaf_index = -1 << 3; - } - - if(get_group_id(0) == 1 && get_local_id(0) == 0) - bvh->quadTableSizeNewAtomicUpdate = quadOffsetEnd; -} - - -// updates one quad leaf and gets BBOX contatining it -GRL_INLINE void refit_bottom_child_quad_WB( - global struct QuadLeaf* quad, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - struct AABB* childAABB) -{ - /* get the geomID and primID0/1 for both quad triangles */ - const uint geomID = PrimLeaf_GetGeoIndex(&quad->leafDesc); - const uint primID0 = quad->primIndex0; - const uint primID1 = primID0 + QuadLeaf_GetPrimIndexDelta(quad); - ushort fourth_vert = 0; - - if (primID1 != primID0) - { - ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(quad); - fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert; - fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert; - } - - global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc + geomID; - - uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert); - - // read the indices of the 4 verts we want - float3 vtx0, vtx1, vtx2, vtx3; - GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices); - - childAABB->lower.xyz = min( min( vtx0, vtx1 ), min(vtx2,vtx3) ); - childAABB->upper.xyz = max( max( vtx0, vtx1 ), max(vtx2,vtx3) ); - - float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x ); - float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y ); - float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z ); - - global uint4* dst_verts = (global uint4*) &(quad->v[0][0]); - store_uint4_L1WB_L3WB( dst_verts, 0, as_uint4(pack0) ); - store_uint4_L1WB_L3WB( dst_verts, 1, as_uint4(pack1) ); - store_uint4_L1WB_L3WB( dst_verts, 2, as_uint4(pack2) ); -} - -inline uchar4 uchar4_shuffle_down( uchar4 v, uint offs ) -{ - uint vi = as_uint(v); - return as_uchar4(intel_sub_group_shuffle_down(vi,vi,offs)); -} -inline uchar4 uchar4_broadcast( uchar4 v, uint offs ) -{ - uint vi = as_uint(v); - return as_uchar4(sub_group_broadcast(vi,offs)); -} - -GRL_INLINE void sg_InternalNode_setFields( - struct InternalNode* node, - struct AABB reduced_aabb, - const int offset, const uint nodeType, struct AABB* input_aabb, - const uint numChildren, const uchar nodeMask ) -{ - const float up = 1.0f + ulp; - const float down = 1.0f - ulp; - - struct AABB conservative_aabb = conservativeAABB(&reduced_aabb); - const float3 org = conservative_aabb.lower.xyz; - - const float3 len = AABB_size(&conservative_aabb).xyz * up; - int3 exp; - const float3 mant = frexp_vec3(len, &exp); - exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); - - uchar4 lower_uchar = 0x80; - uchar4 upper_uchar = 0; - - ushort lane = get_sub_group_local_id(); - ushort simd8_id = lane/8; - ushort logical_lane = lane%8; - - if( logical_lane < numChildren ) - { - struct AABB child_aabb = conservativeAABB( input_aabb ); // conservative ??? 
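// Annotation (not from the original source): the lines that follow are the usual
// 8-bit quantized child-bounds encoding for an InternalNode. The parent box supplies
// an origin (org) and a per-axis power-of-two scale (exp, taken via frexp from the
// parent extent and biased so the extent maps into roughly [0, 256)); each child bound
// is then snapped outward onto that grid, floor/rtn for lower and ceil/rtp for upper,
// so the encoded box always contains the exact one. Scalar sketch of one axis, assuming
// QUANT_MIN == 0 and QUANT_MAX == 255 and ignoring the ulp-based conservative scaling
// (helper name is hypothetical):
//
//   uchar quantize_lower(float child_lo, float org, int exp)
//   {
//       float q = floor(ldexp(child_lo - org, 8 - exp));   // bitShiftLdexp3, one axis
//       return convert_uchar_rtn(clamp(q, 0.0f, 255.0f));  // QUANT_MIN .. QUANT_MAX
//   }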
- - float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) ); - lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) ); - float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) ); - upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) ); - lower_uchar.xyz = convert_uchar3_rtn( lower ); - upper_uchar.xyz = convert_uchar3_rtp( upper ); - } - - uchar4 lo0 = lower_uchar; - uchar4 lo1 = uchar4_shuffle_down( lower_uchar, 1 ); - uchar4 lo2 = uchar4_shuffle_down( lower_uchar, 2 ); - uchar4 lo3 = uchar4_shuffle_down( lower_uchar, 3 ); - uchar4 lo4 = uchar4_shuffle_down( lower_uchar, 4 ); - uchar4 lo5 = uchar4_shuffle_down( lower_uchar, 5 ); - - uchar4 hi0 = upper_uchar; - uchar4 hi1 = uchar4_shuffle_down( upper_uchar,1 ); - uchar4 hi2 = uchar4_shuffle_down( upper_uchar,2 ); - uchar4 hi3 = uchar4_shuffle_down( upper_uchar,3 ); - uchar4 hi4 = uchar4_shuffle_down( upper_uchar,4 ); - uchar4 hi5 = uchar4_shuffle_down( upper_uchar,5 ); - - if( logical_lane == 0 ) - { - uchar childBlockStride = 0x01 + (uint)(nodeType == NODE_TYPE_INSTANCE); - - uint4 block0 = (uint4)(as_uint(org.x), as_uint(org.y), as_uint(org.z), offset); - - char3 exp_char = (char3)(exp.x,exp.y,exp.z); - - uint4 block1 = (uint4)( - as_uint((uchar4)(nodeType, 0 /* padding */, exp_char.x, exp_char.y)), - as_uint((uchar4)(exp_char.z, nodeMask, childBlockStride, childBlockStride)) , - as_uint((uchar4)(childBlockStride, childBlockStride, childBlockStride, childBlockStride)) , - as_uint((uchar4)(lo0.x,lo1.x,lo2.x,lo3.x)) - ); - - uint4 block2 = (uint4)( - as_uint((uchar4)(lo4.x,lo5.x,hi0.x,hi1.x)) , - as_uint((uchar4)(hi2.x,hi3.x,hi4.x,hi5.x)) , - as_uint((uchar4)(lo0.y,lo1.y,lo2.y,lo3.y)) , - as_uint((uchar4)(lo4.y,lo5.y,hi0.y,hi1.y)) - ); - - uint4 block3 = (uint4)( - as_uint((uchar4)(hi2.y,hi3.y,hi4.y,hi5.y)), - as_uint((uchar4)(lo0.z,lo1.z,lo2.z,lo3.z)), - as_uint((uchar4)(lo4.z,lo5.z,hi0.z,hi1.z)), - as_uint((uchar4)(hi2.z,hi3.z,hi4.z,hi5.z)) - ); - - global uint4* pNode = (global uint4*)node; - -#if 0 - printf( - "block0 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" - "block1 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" - "block2 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" - "block3 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" , - block0.x,block0.y,block0.z,block0.w, - pNode[0].x, pNode[0].y, pNode[0].z, pNode[0].w, - block1.x,block1.y,block1.z,block1.w, - pNode[1].x, pNode[1].y, pNode[1].z, pNode[1].w, - block2.x,block2.y,block2.z,block2.w, - pNode[2].x, pNode[2].y, pNode[2].z, pNode[2].w , - block3.x,block3.y,block3.z,block3.w, - pNode[3].x, pNode[3].y, pNode[3].z, pNode[3].w ); -#endif - - store_uint4_L1WB_L3WB( pNode, 0, block0 ); - store_uint4_L1WB_L3WB( pNode, 1, block1 ); - store_uint4_L1WB_L3WB( pNode, 2, block2 ); - store_uint4_L1WB_L3WB( pNode, 3, block3 ); - } - -} - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(256, 1, 1))) -void kernel -traverse_aabbs_quad( - global struct BVHBase* bvh, - global struct RefitScratch* scratch, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc - ) -{ - - uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh); - varying ushort lane = get_sub_group_local_id(); - - uniform uint num_leaves = bvh->fatLeafCount; - - local struct RefitScratch local_scratch[256]; - if( get_local_id(0) < min(num_nodes,256u) ) - { - for( uint i=0; i<3; i++ ){ - local_scratch[get_local_id(0)].lower[i] = FLT_MAX; - local_scratch[get_local_id(0)].upper[i] = FLT_MAX; - } - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - - 
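// Annotation (not from the original source): local_scratch and the global RefitScratch
// keep each box as { lower.xyz, pad, -upper.xyz, pad }. Storing the upper bound negated
// lets both halves be folded with the same min() reduction, which is why the init loop
// above writes +FLT_MAX into every slot and why the per-lane atomics below only ever
// need atomic_min (no atomic_max path). Simplified sketch, assuming a packed 6-float
// layout without the two padding lanes (the real struct pads to 8, lanes 3 and 7 unused):
//
//   void fold_bounds(float box[6], const float lo[3], const float hi[3])
//   {
//       for (uint i = 0; i < 3; ++i) {
//           box[i]     = fmin(box[i],     lo[i]);   // min of lower bounds
//           box[3 + i] = fmin(box[3 + i], -hi[i]);  // min of negated uppers == max of uppers
//       }
//   }
//
// Consumers (write_inner_nodes, the bvh->Meta.bounds write) negate the upper components back.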
ushort SIMD8_PER_SG = get_sub_group_size()/8; - ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG; - ushort simd8_local_id = get_sub_group_local_id()/8; - ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id; - ushort logical_lane = lane%8; - - uniform uint fatleaf_index = simd8_id + get_group_id(0)*SIMD8_PER_WG; - - - if ( fatleaf_index < num_leaves ) - { - LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+fatleaf_index; - uint innerNodeIdx = leaf->inner_node_index; - uint bp = leaf->backpointer; - uint leaf_index = leaf->leaf_index; - - varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx; - varying QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index; - - uint childOffs = (((char*)quad) - ((char*)curNode))/64; - - varying struct AABB childrenBox; - AABB_init(&childrenBox); - - uint numChildren = (bp >> 3) & 0x7; - if (logical_lane < numChildren) - { - refit_bottom_child_quad_WB( - (global struct QuadLeaf*) &quad[logical_lane], - geomDesc, - &childrenBox ); - } - - struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox); - struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); - for (uint i = 1; i < SIMD8_PER_SG; i++) - { - struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); - int3 is_upper_lane = ((uint3)(i)) == simd8_local_id; - reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); - reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); - } - - sg_InternalNode_setFields( - curNode, - reduce_bounds, - childOffs, - NODE_TYPE_QUAD, - &childrenBox, - numChildren, - 0xff ); - - // atomic min operation vectorized across 6 lanes - // [ lower.xyz ][-][upper.xyz][-] - // - // Lanes 3 and 7 are inactive. 'upper' is negated - bool atomic_mask = (1<> 6); - - // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= 256 - if(atomic_mask && parent != 0x03FFFFFF) - { - while( parent >= 256 ) - { - innerNodeIdx = parent; - bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); - atomic_min( ((global float*) &(scratch[innerNodeIdx]))+logical_lane, v ); - parent = bp >> 6; - } - while( parent != 0x03FFFFFF ) - { - innerNodeIdx = parent; - bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); - atomic_min( ((local float*) &(local_scratch[innerNodeIdx]))+logical_lane, v ); - parent = bp >> 6; - } - } - - } - - - barrier( CLK_LOCAL_MEM_FENCE ); - num_nodes = min(num_nodes,256u); - - local float* in = (local float*)&local_scratch[0]; - global float* out = (global float*)&scratch[0]; - - for (uint i = get_local_id(0); i < num_nodes*6; i += 256 ) - { - // since we want to save [ lower.xyz ][-][upper.xyz][-] i.e 0,1,2, 4,5,6 etc. 
we need to offset +1 for every triplet - uint idx = i + (i/3); - - float v = in[idx]; - if( v != FLT_MAX ) - atomic_min( out + idx , v ); - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(64, 1, 1))) -void kernel -write_inner_nodes( - global struct BVHBase* bvh, - global struct RefitScratch* scratch - ) -{ - uint SIMD8_PER_SG = get_sub_group_size()/8; - uniform uint node_id = SIMD8_PER_SG * get_sub_group_global_id() + (get_sub_group_local_id()/8); - varying ushort lane = get_sub_group_local_id() % 8; - varying uint num_inners = bvh->innerCount; - - if ( node_id < num_inners ) - { - InnerNodeTableEntry* entry = BVHBase_GetInnerNodeTable(bvh) + node_id; - uint node_index = entry->node_index_and_numchildren>>3; - uint numChildren = entry->node_index_and_numchildren & 7; - uint first_child = entry->first_child; - - varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+node_index; - - varying struct AABB childAABB; - AABB_init(&childAABB); - - if( lane < numChildren ) - { - uint child = first_child + lane; - childAABB.lower.x = scratch[child].lower[0]; - childAABB.lower.y = scratch[child].lower[1]; - childAABB.lower.z = scratch[child].lower[2]; - childAABB.upper.x = -scratch[child].upper[0]; - childAABB.upper.y = -scratch[child].upper[1]; - childAABB.upper.z = -scratch[child].upper[2]; - } - - varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB); - struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); - for (uint i = 1; i < SIMD8_PER_SG; i++) - { - struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); - int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8); - reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); - reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); - } - - sg_InternalNode_setFields( - curNode, - reduce_bounds, - first_child - node_index, - NODE_TYPE_INTERNAL, - &childAABB, - numChildren, - 0xff ); - - } - - if (node_id == 0 && lane == 0 ) - { - bvh->Meta.bounds.lower[0] = scratch[0].lower[0]; - bvh->Meta.bounds.lower[1] = scratch[0].lower[1]; - bvh->Meta.bounds.lower[2] = scratch[0].lower[2]; - bvh->Meta.bounds.upper[0] = -scratch[0].upper[0]; - bvh->Meta.bounds.upper[1] = -scratch[0].upper[1]; - bvh->Meta.bounds.upper[2] = -scratch[0].upper[2]; - } - -} - - - -#if 1 -#define SLM_BOX_COUNT 1024 - -struct AABB load_box( uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes ) -{ - if( place < SLM_BOX_COUNT ) - return local_boxes[place]; - else - return extra_boxes[place-SLM_BOX_COUNT]; -} - -void store_box( struct AABB box, uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes ) -{ - if (place < SLM_BOX_COUNT) - { - local_boxes[place] = box; - } - else - { - global uint4* ptr = (global uint4*)&extra_boxes[place-SLM_BOX_COUNT]; - store_uint4_L1WB_L3WB( ptr, 0, as_uint4(box.lower) ); - store_uint4_L1WB_L3WB( ptr+1, 0, as_uint4(box.upper) ); - } -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(512, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel -update_single_group_quads( - global struct BVHBase* bvh, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - global struct AABB* extra_boxes -) -{ - uniform uint tid = get_sub_group_global_id(); - uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh); - uniform uint num_leaves = bvh->fatLeafCount; - uniform uint num_inners = bvh->innerCount; - 
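// Annotation (not from the original source): this single-work-group path runs the whole
// refit in three phases separated by the barriers below: (1) reset every node box,
// (2) refit the fat leaves and push their bounds up the backpointer chain with atomic_min,
// (3) requantize the remaining inner nodes and write the root bounds. Node boxes live in
// SLM for indices below SLM_BOX_COUNT (1024 boxes == 32KB) and spill to the caller-provided
// extra_boxes buffer above that, which is what load_box()/store_box() select on; the
// mem_fence_workgroup_default() calls are only needed when such a spill actually exists
// (num_nodes > SLM_BOX_COUNT).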
- varying ushort lane = get_sub_group_local_id(); - - local struct AABB local_boxes[SLM_BOX_COUNT]; // == 32KB - - // initialize nodes - for (uint i = get_local_id( 0 ); i < num_nodes; i+= get_local_size(0)) - { - struct AABB tmp; - AABB_init(&tmp); - tmp.upper = -tmp.upper; - store_box( tmp, i, local_boxes, extra_boxes ); - } - - - if( num_nodes > SLM_BOX_COUNT ) - mem_fence_workgroup_default(); - - barrier( CLK_LOCAL_MEM_FENCE ); - - - ushort SIMD8_PER_SG = get_sub_group_size()/8; - ushort NUM_SIMD8 = get_num_sub_groups()*SIMD8_PER_SG; - ushort simd8_local_id = get_sub_group_local_id()/8; - ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id; - ushort logical_lane = lane%8; - - - for ( uint i = simd8_id; i < num_leaves; i+= NUM_SIMD8 ) - { - LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+i; - uint innerNodeIdx = leaf->inner_node_index; - uint bp = leaf->backpointer; - uint leaf_index = leaf->leaf_index; - - varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx; - QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index; - - uint childOffs = (((char*)quad) - ((char*)curNode))/64; - - varying struct AABB childrenBox; - AABB_init(&childrenBox); - - uint numChildren = (bp >> 3) & 0x7; - if (logical_lane < numChildren) - { - - refit_bottom_child_quad_WB( - (global struct QuadLeaf*) &quad[logical_lane], - geomDesc, - &childrenBox ); - } - - struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox); - struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); - for (uint i = 1; i < SIMD8_PER_SG; i++) - { - struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); - int3 is_upper_lane = ((uint3)(i)) == simd8_local_id; - reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); - reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); - } - - - if( logical_lane == 0 ) - { - struct AABB negated = reduce_bounds; - negated.upper = -negated.upper; - store_box( negated, innerNodeIdx, local_boxes, extra_boxes ); - } - - sg_InternalNode_setFields( - curNode, - reduce_bounds, - childOffs, - NODE_TYPE_QUAD, - &childrenBox, - numChildren, - 0xff ); - - - // atomic min operation vectorized across 6 lanes - // [ lower.xyz ][-][upper.xyz][-] - // - // Lanes 3 and 7 are inactive. 'upper' is negated - uint lmod = logical_lane % 4; - uint ldiv = logical_lane / 4; - float vlo = reduce_bounds.lower.x; - float vhi = reduce_bounds.upper.x; - vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo; - vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi; - vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo; - vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi; - float v = (ldiv == 0) ? 
vlo : -vhi; - bool atomic_mask = (1<> 6); - - // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= SLM_BOX_COUNT - if(atomic_mask && parent != 0x03FFFFFF) - { - while( parent >= SLM_BOX_COUNT ) - { - innerNodeIdx = parent; - bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); - atomic_min( ((global float*) &(extra_boxes[innerNodeIdx-SLM_BOX_COUNT]))+logical_lane, v ); - parent = bp >> 6; - } - while( parent != 0x03FFFFFF ) - { - innerNodeIdx = parent; - bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); - atomic_min( ((local float*) &(local_boxes[innerNodeIdx]))+logical_lane, v ); - parent = bp >> 6; - } - } - - } - - if( num_nodes > SLM_BOX_COUNT ) - mem_fence_workgroup_default(); - - barrier( CLK_LOCAL_MEM_FENCE ); - - for ( uint i = simd8_id; i < num_inners; i+= NUM_SIMD8 ) - { - InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh) + i; - uint node_index = inner->node_index_and_numchildren>>3; - uint numChildren = inner->node_index_and_numchildren & 7; - uint first_child = inner->first_child; - - varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+ node_index; - - //if (curNode->nodeType == BVH_INTERNAL_NODE) // TODO: Needs updating for traversal shaders - { // TODO: Consider using an inner node table or UC load to avoid polluting LSC with these reads - uint child = first_child + logical_lane; - - bool child_valid = (logical_lane < numChildren); - - struct AABB childAABB; - AABB_init(&childAABB); - if (child_valid) - { - childAABB = load_box( child, local_boxes, extra_boxes ); - childAABB.upper = -childAABB.upper; - } - - varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB); - struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); - for (uint i = 1; i < SIMD8_PER_SG; i++) - { - struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); - int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8); - reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); - reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); - } - - sg_InternalNode_setFields( - curNode, - reduce_bounds, - first_child - node_index, - NODE_TYPE_INTERNAL, - &childAABB, - numChildren, - 0xff ); - } - } - - - if (get_sub_group_id() == 0 && lane == 0 ) - { - bvh->Meta.bounds.lower[0] = local_boxes[0].lower.x; - bvh->Meta.bounds.lower[1] = local_boxes[0].lower.y; - bvh->Meta.bounds.lower[2] = local_boxes[0].lower.z; - bvh->Meta.bounds.upper[0] = -local_boxes[0].upper.x; - bvh->Meta.bounds.upper[1] = -local_boxes[0].upper.y; - bvh->Meta.bounds.upper[2] = -local_boxes[0].upper.z; - } - -} -#endif - -GRL_INLINE void traverse_aabbs_new_update_func( - global struct BVHBase* bvh, - global char* vertices, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - global struct RefitScratch* scratch, - uint vertex_format, - local struct AABB3f* children_AABBs, - local uint* num_fat_leaves, - local struct LeafTableEntry* leafTable_local, - const bool single_geo - ) -{ - // The first part of the kernel with vertices loads/stores is executed with quad per work item, - // using previously prepared QuadDataIndices to get the quad data and vert indices - // Second part of the kernel that does the reduction, update fatleaf ain bvh and bottom up is - // executed per simd. 
- // For bottom up tested also with local part (using local scratch) but since there is not enough SLM additional - // barriers were needed to clean and reuse SLM, which curretnly kills performance. Could be worth to revisit - // on future gens. - - varying uint lid = get_local_id(0); - varying uint tid = lid + get_group_id(0)*get_local_size(0); - - num_fat_leaves[0] = 0; - leafTable_local[lid].leaf_index = -1 << 3; - - LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * bvh->fatLeafTableStart + 12 * tid)); - uint innerNodeIdx_mem = leaf->inner_node_index; - uint bp = leaf->backpointer; - uint leaf_index_mem = leaf->leaf_index; - - uint numChildren = (bp >> 3) & 0x7; - - uint leaf_index = leaf_index_mem >> 3; - uint slm_child_offset = leaf_index_mem & 0x7; - - uint innerNodeIdx = innerNodeIdx_mem >> 8; - uint slm_pos_main = innerNodeIdx_mem & 0xFF; - - uint first_el_of_group = get_group_id(0)*get_local_size(0); - uint quadsNum = BVHBase_GetNumQuads(bvh); - uint expected_tid = first_el_of_group < quadsNum ? first_el_of_group : quadsNum - 1; - - // Skip writes when not all children for single fatleaf are present in this work group - bool skip_tid = leaf_index == 0x1FFFFFFF; - leaf_index = skip_tid ? expected_tid : leaf_index; - - // Compute bounding box for quads - varying struct AABB3f childrenBox; - - tid = leaf_index + slm_child_offset; - - // Read vertex indices and quad header from separate buffer - uint quadIndicesStart = bvh->quadIndicesDataStart; - varying struct QuadDataIndices* vertex_indice_ptr = (QuadDataIndices*)(((char*)bvh) + (64u * quadIndicesStart + 32 * tid)); - QuadDataIndices vertexMap = vertex_indice_ptr[0]; - - varying global uint4* bounds = (global uint4*)((char*)bvh + (64*bvh->quadLeafStart + 64*tid) ); - uint4 quad_data = (uint4)(vertexMap.header_data[0], vertexMap.header_data[1], vertexMap.header_data[2], vertexMap.header_data[3]); - uint4 indices = (uint4)(vertexMap.vert_idx[0], vertexMap.vert_idx[1], vertexMap.vert_idx[2], vertexMap.vert_idx[3]); - - global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc; - - if(!single_geo) - { - uint geomID = vertexMap.header_data[0] & 0xFFFFFF; - desc += geomID; - vertices = (global char*)desc->Desc.Triangles.pVertexBuffer; - vertex_format = desc->Desc.Triangles.VertexFormat; - } - - float3 vtx0, vtx1, vtx2, vtx3; - GRL_load_quad_vertices_no_stride(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices, vertex_format, vertices); - - for(uint i = 0; i < 3; i++) - childrenBox.lower[i] = min( min( vtx0[i], vtx1[i] ), min(vtx2[i],vtx3[i]) ); - - for(uint i = 0; i < 3; i++) - childrenBox.upper[i] = max( max( vtx0[i], vtx1[i] ), max(vtx2[i],vtx3[i]) ); - - float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x ); - float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y ); - float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z ); - - // Store quad data in bvh - // Make sure this goes without partial writes to get best perf - store_uint4_L1WB_L3WB( bounds, 0, quad_data ); - store_uint4_L1WB_L3WB( bounds, 1, as_uint4(pack0) ); - store_uint4_L1WB_L3WB( bounds, 2, as_uint4(pack1) ); - store_uint4_L1WB_L3WB( bounds, 3, as_uint4(pack2) ); - - barrier( CLK_LOCAL_MEM_FENCE ); - - struct AABB reduce_bounds; - - if(!skip_tid) - { - // Store AABB in SLM, to be used later for children quantization in fatleaf - children_AABBs[slm_pos_main + slm_child_offset] = childrenBox; - - if(slm_child_offset == 0) - { - uint offset = atomic_inc_local(&num_fat_leaves[0]); - leafTable_local[offset].inner_node_index = innerNodeIdx_mem; - 
leafTable_local[offset].backpointer = bp; - leafTable_local[offset].leaf_index = leaf_index_mem; - } - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - varying ushort lane = get_sub_group_local_id(); - ushort SIMD8_PER_SG = get_sub_group_size()/8; - ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG; - ushort simd8_local_id = get_sub_group_local_id()/8; - ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id; - ushort logical_lane = lane%8; - - uint fatleaves_aligned_32 = (num_fat_leaves[0] + 31) & ~31; - - for(uint offset = 0; offset < fatleaves_aligned_32; offset += 32) - { - uniform uint fatleaf_index = simd8_id + offset; - uint innerNodeIdx_mem = leafTable_local[fatleaf_index].inner_node_index; - uint bp = leafTable_local[fatleaf_index].backpointer; - uint leaf_index_mem = leafTable_local[fatleaf_index].leaf_index; - - uint numChildren = (bp >> 3) & 0x7; - - uint leaf_index = leaf_index_mem >> 3; - uint slm_child_offset = leaf_index_mem & 0x7; - - uint innerNodeIdx = innerNodeIdx_mem >> 8; - uint slm_pos_main = innerNodeIdx_mem & 0xFF; - - bool skip_tid = leaf_index == 0x1FFFFFFF; - bool active_lane = (logical_lane < numChildren); - uint lane_children = active_lane ? logical_lane : 0; - - fatleaf_index = leaf_index; - - varying InternalNode* curNode = (InternalNode*)(((char*)bvh) + (BVH_ROOT_NODE_OFFSET + 64 * innerNodeIdx)); - - global struct Quad *quads = (global struct Quad *)((char*)bvh + 64*bvh->quadLeafStart ); - - varying struct AABB childrenBox_bu; - AABB_init(&childrenBox_bu); - - if(!skip_tid) - childrenBox_bu = AABBfromAABB3f(children_AABBs[slm_pos_main + lane_children]); - - struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox_bu); - struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0); - - for (uint i = 1; i < SIMD8_PER_SG; i++) - { - struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i); - int3 is_upper_lane = ((uint3)(i)) == simd8_local_id; - reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane ); - reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane ); - } - - if(!skip_tid) - { - uint quad_offset = 64u * bvh->quadLeafStart + 64 * fatleaf_index; - varying QuadLeaf* quad = (QuadLeaf*)(((char*)bvh) + quad_offset); - uint childOffs = (((char*)quad) - ((char*)curNode))/64; - - sg_InternalNode_setFields( - curNode, - reduce_bounds, - childOffs, - NODE_TYPE_QUAD, - &childrenBox_bu, - numChildren, - 0xff ); - - bool atomic_mask = (1<> 6); - - global float* parent_v = (global float*) &(scratch[parent]) + logical_lane; - - if(atomic_mask && (*parent_v >= v) && (parent != 0x03FFFFFF)) - { - innerNodeIdx = parent; - bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); - atomic_min( parent_v, v ); - parent = bp >> 6; - - if(parent != 0x03FFFFFF) - { - while( parent != 0x03FFFFFF ) - { - innerNodeIdx = parent; - bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); - - global float* parent_v_global = (global float*) &(scratch[innerNodeIdx]) + logical_lane; - if(*parent_v_global >= v) - atomic_min( parent_v_global, v ); - else - break; - - parent = bp >> 6; - } - } - } - } - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(256, 1, 1))) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -void kernel -traverse_aabbs_new_update( - global struct BVHBase* bvh, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - global struct RefitScratch* scratch - ) -{ - varying uint lid = get_local_id(0); - 
varying uint tid = lid + get_group_id(0)*get_local_size(0); - - local struct AABB3f children_AABBs[256]; - local struct LeafTableEntry leafTable_local[256]; - local uint num_fat_leaves; - - traverse_aabbs_new_update_func(bvh, (global char*)geomDesc /* not used */, geomDesc, scratch, (uint)-1 /* not used */, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], false); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(256, 1, 1))) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -void kernel -traverse_aabbs_new_update_single_geo( - global struct BVHBase* bvh, - global char* vertices, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - global struct RefitScratch* scratch, - const uint vertex_format - ) -{ - varying uint lid = get_local_id(0); - varying uint tid = lid + get_group_id(0)*get_local_size(0); - - local struct AABB3f children_AABBs[256]; - local struct LeafTableEntry leafTable_local[256]; - local uint num_fat_leaves; - - if(vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32B32_FLOAT, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R32G32_FLOAT) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32_FLOAT, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_FLOAT, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R16G16_FLOAT) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_FLOAT, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_SNORM, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R16G16_SNORM) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_SNORM, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_UNORM, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R16G16_UNORM) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_UNORM, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R10G10B10A2_UNORM, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_UNORM, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R8G8_UNORM) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_UNORM, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, 
VERTEX_FORMAT_R8G8B8A8_SNORM, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else if(vertex_format == VERTEX_FORMAT_R8G8_SNORM) - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_SNORM, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); - else - traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, (uint)-1, - &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true); -} diff --git a/src/intel/vulkan/grl/gpu/atomic_update.grl b/src/intel/vulkan/grl/gpu/atomic_update.grl deleted file mode 100644 index 9e1d6923d4a..00000000000 --- a/src/intel/vulkan/grl/gpu/atomic_update.grl +++ /dev/null @@ -1,198 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module atomic_update; - -kernel_module atomic_update ("atomic_update.cl") -{ - links lsc_intrinsics; - kernel init_refit_scratch < kernelFunction = "init_refit_scratch" >; - kernel traverse_aabbs_quad < kernelFunction = "traverse_aabbs_quad" >; - kernel write_inner_nodes < kernelFunction = "write_inner_nodes" >; - kernel build_fatleaf_table < kernelFunction = "build_fatleaf_table" >; - kernel build_innernode_table < kernelFunction = "build_innernode_table" >; - - kernel update_single_group_quads < kernelFunction = "update_single_group_quads" >; - - kernel build_fatleaf_table_new_update < kernelFunction = "build_fatleaf_table_new_update" >; - kernel fixup_quad_table < kernelFunction = "fixup_quad_table" >; - kernel traverse_aabbs_new_update < kernelFunction = "traverse_aabbs_new_update" >; - kernel traverse_aabbs_new_update_single_geo < kernelFunction = "traverse_aabbs_new_update_single_geo" >; -} - -import struct MKBuilderState "structs.grl"; - -// this metakernel only initializes registers for use in a batching loop by "init_refit_scratch" -metakernel init_refit_scratch_metakernel_registers() -{ - REG0.hi = 0; - REG1 = 3; - REG2 = 63; - REG3 = 4; - REG4 = 2; - - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; -} - -metakernel init_refit_scratch( qword bvh_base, qword scratch)//, dword max_inner_nodes ) -{ - REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!! - define C_3 REG1; - define C_63 REG2; - define C_4 REG3; - define C_2 REG4; - - REG0 = REG0 - C_3; // nodedataCurr - fixed offset - REG0 = REG0 + C_63; // + 63 - REG0 = REG0 >> C_4; // >> 4 - REG0 = REG0 >> C_2; // >> 2 == >> 6 == /64 - - DISPATCHDIM_X = REG0.lo; - - dispatch_indirect init_refit_scratch//( (max_inner_nodes+63)/64, 1, 1 ) - args(bvh_base,scratch); - -} - -metakernel build_node_tables( qword bvh_base ) -{ - REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!! - REG1 = 2; - REG2 = 63; - REG3 = 4; - REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!! - - REG0 = REG0 - REG4; // nodedataCurr - fixed offset - REG0 = REG0 + REG2; // + 63 - REG0 = REG0 >> REG3; // >> 4 - REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64 - - DISPATCHDIM_X = REG0.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect build_fatleaf_table//( (max_inner_nodes+63)/64, 1, 1 ) - args(bvh_base); - dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 ) - args(bvh_base); -} - -metakernel build_node_tables_new_update( MKBuilderState state, qword bvh_base ) -{ - REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!! - REG1 = 2; - REG2 = 63; - REG3 = 4; - REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!! 
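// Annotation (not from the original source): the register sequence below is a ceiling
// divide written with shifts, since these metakernels cannot use a divide. Assuming
// bvh_base+12 holds nodeDataCur and the fixed offset 3 is the root-node offset in
// 64-byte units, it computes
//
//   DISPATCHDIM_X = (num_internal_nodes + 63) >> 6;   // == ceil(n / 64)
//
// i.e. one 64-wide work-group per 64 internal nodes, matching the
// reqd_work_group_size(64, 1, 1) on build_fatleaf_table_new_update and build_innernode_table.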
- - REG0 = REG0 - REG4; // nodedataCurr - fixed offset - REG0 = REG0 + REG2; // + 63 - REG0 = REG0 >> REG3; // >> 4 - REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64 - - DISPATCHDIM_X = REG0.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect build_fatleaf_table_new_update//( (max_inner_nodes+63)/64, 1, 1 ) - args(state.build_globals, bvh_base); - dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 ) - args(bvh_base); -} - -metakernel fixup_quad_table( qword bvh_base ) -{ - dispatch fixup_quad_table(2,1,1) - args(bvh_base); -} - -// this metakernel only initializes registers for use in a batching loop by "traverse_aabbs_quad" and "write_inner_nodes" -metakernel init_traverse_aabbs_quad_and_write_inner_nodes() -{ - REG0.hi = 0; - REG1 = 1; - REG2 = 31; - REG3 = 4; - REG4 = 2; - REG5 = 7; - REG6 = 255; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; -} - -metakernel traverse_aabbs_quad( qword bvh_base, qword scratch, qword geos)//, dword max_inner_nodes ) -{ - - REG0.lo = load_dword( bvh_base + 64 ); // TODO: DOn't hardcode! - define C_1 REG1; - define C_31 REG2; - define C_4 REG3; - - REG0 = REG0 + C_31; // + 31 - REG0 = REG0 >> C_4; // >> 4 - REG0 = REG0 >> C_1; // >> 1 == >> 5 == /32 - - DISPATCHDIM_X = REG0.lo; - - dispatch_indirect traverse_aabbs_quad//( (max_inner_nodes+32)/32, 1, 1 ) - args(bvh_base,scratch,geos); -} - -metakernel write_inner_nodes( qword bvh_base, qword scratch )//, dword max_inner_nodes ) -{ - REG0.lo = load_dword( bvh_base + 68 ); // TODO: DOn't hardcode! - define C_1 REG1; - define C_2 REG4; - define C_7 REG5; - - REG0 = REG0 + C_7; // + 7 - REG0 = REG0 >> C_2; // >> 2 - REG0 = REG0 >> C_1; // >> 1 ==> >> 3 (/8) - DISPATCHDIM_X = REG0.lo; - - dispatch_indirect write_inner_nodes//( (max_inner_nodes+7)/8, 1, 1 ) - args(bvh_base,scratch); -} - -metakernel update_single_group_quads( qword bvh_base, qword geos, qword aabbs ) -{ - dispatch update_single_group_quads(1,1,1) //( (max_inner_nodes+1)/2, 1, 1 ) - args(bvh_base,geos,aabbs); -} - -metakernel traverse_aabbs_new_update( qword bvh_base, qword geos, qword scratch ) -{ - REG0.lo = load_dword( bvh_base + 84 ); // TODO: DOn't hardcode! - define C_255 REG6; - define C_4 REG3; - - REG0 = REG0 + C_255; // + 255 - REG0 = REG0 >> C_4; // >> 4 - REG0 = REG0 >> C_4; // >> 4 == >> 8 == /32 - - DISPATCHDIM_X = REG0.lo; - - dispatch_indirect traverse_aabbs_new_update//( (max_inner_nodes+255)/256, 1, 1 ) - args(bvh_base, geos, scratch); -} - -metakernel traverse_aabbs_new_update_single_geo( qword bvh_base, qword vertices, qword geos, qword scratch, dword vertex_format ) -{ - REG0.lo = load_dword( bvh_base + 84 ); // TODO: DOn't hardcode! 
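// Annotation (not from the original source): as in traverse_aabbs_new_update just above,
// "+ C_255" followed by two ">> 4" shifts is a ceiling divide by 256 (>> 8), i.e. one
// 256-wide work-group (reqd_work_group_size(256, 1, 1)) per 256 entries of the count
// loaded above; the new-update kernels process one quad per work item. The inline
// "== /32" remark appears to be a stale copy of the 32-wide comment in the
// traverse_aabbs_quad metakernel, since >> 8 divides by 256, not 32.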
- define C_255 REG6; - define C_4 REG3; - - REG0 = REG0 + C_255; // + 255 - REG0 = REG0 >> C_4; // >> 4 - REG0 = REG0 >> C_4; // >> 4 == >> 8 == /32 - - DISPATCHDIM_X = REG0.lo; - - dispatch_indirect traverse_aabbs_new_update_single_geo//( (max_inner_nodes+255)/256, 1, 1 ) - args(bvh_base, vertices, geos, scratch, vertex_format); -} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/binned_sah_shared.h b/src/intel/vulkan/grl/gpu/binned_sah_shared.h deleted file mode 100644 index 8b22f6612cd..00000000000 --- a/src/intel/vulkan/grl/gpu/binned_sah_shared.h +++ /dev/null @@ -1,265 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// -// This file contains structure definitions shared by GRL OCL kernels and host code -// - -#include "GRLGen12.h" -#pragma once - -#define BFS_NUM_BINS 16 -#define BFS_NUM_VCONTEXTS 256 -#define BFS_MAX_DEPTH 32 - -#define TRIVIAL_BUILD_THRESHOLD 6 -#define SINGLE_WG_BUILD_THRESHOLD 256 - -#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384 - - -typedef uchar vcontext_id_t; - -GRL_NAMESPACE_BEGIN(GRL) -GRL_NAMESPACE_BEGIN(RTAS) -GRL_NAMESPACE_BEGIN(GPUBVHBuilder) - -struct BFS_Split -{ - float sah; - int dim; - int pos; -}; - - -struct BFS_BinInfo -{ - float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6] - // The 6 are lower(xyz) and -upper(xyz) - // bins use negated-max so that we can use vectorized mins instead of min/max pairs - uint counts[3 * BFS_NUM_BINS]; -}; - -enum_uint8(SAHBuildFlags) -{ - SAH_FLAG_NEED_BACKPOINTERS = 1, // identifies a mixed internal node where each child can have a different type - SAH_FLAG_NEED_MASKS = 2 -}; - -struct SAHBuildGlobals -{ - qword p_primref_index_buffers; - qword p_primrefs_buffer; - qword p_bvh2; - qword p_globals; // TODO: deprecate this - qword p_bvh_base; - gpuva_t p_qnode_root_buffer; - - dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks' - dword num_primrefs; - dword leaf_size; - dword leaf_type; - - dword root_buffer_num_produced; - dword root_buffer_num_produced_hi; - dword root_buffer_num_consumed; - dword root_buffer_num_consumed_hi; - dword root_buffer_num_to_consume; - dword root_buffer_num_to_consume_hi; -}; - -struct SAHBuildBuffersInfo -{ - gpuva_t p_globals; - gpuva_t p_primref_index_buffers; - gpuva_t p_primrefs_buffer; - gpuva_t p_bvh2; - gpuva_t p_bvh_base; - gpuva_t p_qnode_root_buffer; - dword sah_globals_flags; - dword _pad; - gpuva_t _pad2; -}; - -typedef union LRBounds -{ - struct - { - struct AABB3f left_centroid_bounds; - struct AABB3f left_geom_bounds; - struct AABB3f right_centroid_bounds; - struct AABB3f right_geom_bounds; - } boxes; - struct - { - float Array[24]; - } scalars; -} LRBounds; - - -struct VContext -{ - uint dispatch_primref_begin; // range of primrefs for this task - uint dispatch_primref_end; - uint bvh2_root; // BVH2 root node for this task - uint tree_depth; // depth of this node in the tree - uint num_left; // primref counts - uint num_right; - uint lr_mask; // lower 8b : left mask. 
upper 8b : right mask - uint batch_index; - - // pass1 global working state and output - struct BFS_Split split; - struct BFS_BinInfo global_bin_info; - - // pass2 global working state and output - LRBounds lr_bounds; -}; - - - -struct BFSDispatchRecord -{ - ushort batch_index; - ushort context_id; -}; - - -struct BFSDispatchQueue -{ - uint num_dispatches; - uint wg_count[BFS_NUM_VCONTEXTS]; - struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS]; -}; - -struct BFS1SpillStackEntry -{ - uint primref_begin; - uint primref_end; - uint bvh2_root; - ushort tree_depth; - ushort batch_index; -}; - -struct BFS1SpillStack -{ - uint size; - struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH]; -}; - -struct QNodeGlobalRootBufferEntry -{ - uint bvh2_node; - uint qnode; - uint build_idx; - uint _pad; -}; - -struct QNodeGlobalRootBuffer -{ - uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM - struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2]; -}; - -struct DFSDispatchRecord -{ - uint primref_base; - uint bvh2_base; - uint batch_index; - ushort num_primrefs; - ushort tree_depth; -}; - - -struct DFSDispatchQueue -{ - struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2]; -}; - -#define VCONTEXT_STATE_EXECUTING 0 -#define VCONTEXT_STATE_UNALLOCATED 1 - -union SchedulerUnion -{ - struct VContextScheduler - { - ///////////////////////////////////////////////////////////// - // State data used for communication with command streamer - // NOTE: This part must match definition in 'new_sah_builder.grl' - ///////////////////////////////////////////////////////////// - - dword num_bfs_wgs; - dword num_dfs_wgs; - - dword scheduler_postsync; - dword _pad1; - - dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). - dword num_single_builds; // number of single-wg builds (#primrefs < threshold) - - dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass - dword batched_build_loop_mask; // value is 0 if #builds <= #contexts. else 1 command streamer uses this as a loop condition - - ///////////////////////////////////////////////////////////// - - dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer - dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer - - dword vcontext_state[BFS_NUM_VCONTEXTS]; - - struct BFSDispatchQueue bfs_queue; - struct DFSDispatchQueue dfs_queue; - - struct VContext contexts[BFS_NUM_VCONTEXTS]; - - struct BFS1SpillStack bfs2_spill_stack; - } vContextScheduler; - - struct QnodeScheduler - { - dword num_qnode_grb_curr_entries; - dword num_qnode_grb_new_entries; - - dword scheduler_postsync; - dword _pad1; - - dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). - dword num_single_builds; // number of single-wg builds (#primrefs < threshold) - - dword batched_builds_to_process; - dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer - - ///////////////////////////////////////////////////////////// - - dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer - dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer - - struct QNodeGlobalRootBuffer qnode_global_root_buffer; - } qnodeScheduler; -}; - - -struct BVH2Node -{ - struct AABB3f box; - uint meta_u; // leaf: primref start. 
inner: offset from node to its first child - uint meta_ss; - //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes - //uchar is_inner; // 1 if inner, 0 if leaf - //uchar mask; -}; - -struct BVH2 -{ - uint num_nodes; - uint _pad[7]; // align to 32B -}; - - -GRL_NAMESPACE_END(GPUBVHBuilder) -GRL_NAMESPACE_END(RTAS) -GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/build_leaf.grl b/src/intel/vulkan/grl/gpu/build_leaf.grl deleted file mode 100644 index 7b154d03b43..00000000000 --- a/src/intel/vulkan/grl/gpu/build_leaf.grl +++ /dev/null @@ -1,206 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module leaf_builder; - -kernel_module leaf_kernels ("bvh_build_leaf.cl") -{ - links lsc_intrinsics; - - kernel opencl_kernel_primref_to_quads < kernelFunction="primref_to_quads" >; - kernel opencl_kernel_primref_to_procedurals < kernelFunction="primref_to_procedurals" >; - kernel opencl_kernel_create_HW_instance_nodes < kernelFunction="create_HW_instance_nodes" >; - kernel opencl_kernel_create_HW_instance_nodes_pointers < kernelFunction="create_HW_instance_nodes_pointers" >; -} - -import struct MKBuilderState "structs.grl"; -import struct MKSizeEstimate "structs.grl"; - -const Instances_GROUPSIZE = 16; - -metakernel buildLeafDXR_instances( - MKBuilderState state, - qword build_primref_index_buffers, - qword srcInstanceDescrArray, - dword stride, - dword offset, - dword numPrims) -{ - define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE; - dispatch opencl_kernel_create_HW_instance_nodes(num_groups,1,1) args( - state.build_globals, - build_primref_index_buffers, - state.build_primref_buffer, - state.bvh_buffer, - srcInstanceDescrArray, - stride, - offset); -} - -metakernel buildLeafDXR_instances_indirect( - MKBuilderState state, - qword build_primref_index_buffers, - qword srcInstanceDescrArray, - qword indirectBuildRangeInfo, - dword stride, - dword offset) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // Instances_GROUPSIZE - 1 - C_4 = 4; // log_2(Instances_GROUPSIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_create_HW_instance_nodes args( - state.build_globals, - build_primref_index_buffers, - state.build_primref_buffer, - state.bvh_buffer, - srcInstanceDescrArray, - stride, - offset); -} - -metakernel buildLeafDXR_instances_pointers( - MKBuilderState state, - qword build_primref_index_buffers, - qword srcInstanceDescrArrayPtr, - dword stride, - dword offset, - dword numPrims) -{ - define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE; - dispatch opencl_kernel_create_HW_instance_nodes_pointers(num_groups,1,1) args( - state.build_globals, - build_primref_index_buffers, - state.build_primref_buffer, - state.bvh_buffer, - srcInstanceDescrArrayPtr, - stride, - offset); -} - -metakernel buildLeafDXR_instances_pointers_indirect( - MKBuilderState state, - qword build_primref_index_buffers, - qword srcInstanceDescrArrayPtr, - qword indirectBuildRangeInfo, - dword stride, - dword offset) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = 
load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // Instances_GROUPSIZE - 1 - C_4 = 4; // log_2(Instances_GROUPSIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_create_HW_instance_nodes_pointers args( - state.build_globals, - build_primref_index_buffers, - state.build_primref_buffer, - state.bvh_buffer, - srcInstanceDescrArrayPtr, - stride, - offset); -} - -metakernel buildLeafDXR_procedurals( - MKBuilderState state, - qword build_primref_index_buffers, - dword stride, - dword offset, - qword p_numPrimitives) -{ - define C_1 REG0; - define REG_PRIMS_PER_WG REG1; - define REG_PRIMS_PER_WG_SHR REG2; - - C_1 = 1; - REG_PRIMS_PER_WG = 16; - REG_PRIMS_PER_WG_SHR = 4;// We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements) - - define reg_numPrimitives REG3; - define reg_num_wgs REG4; - - reg_numPrimitives = load_dword(p_numPrimitives); - reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG; - reg_num_wgs = reg_num_wgs - C_1; - reg_num_wgs = reg_num_wgs >> REG_PRIMS_PER_WG_SHR; - - DISPATCHDIM_X = reg_num_wgs; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_primref_to_procedurals args( - state.build_globals, - state.build_primref_buffer, - build_primref_index_buffers, - state.bvh_buffer, - state.geomDesc_buffer, - stride, - offset); -} - -metakernel buildLeafDXR_quads( - MKBuilderState state, - qword build_primref_index_buffers, - dword stride, - dword offset, - qword p_numPrimitives, - dword allow_update) -{ - define C_1 REG0; - define REG_PRIMS_PER_WG REG1; - define SHIFT REG2; - - C_1 = 1; - REG_PRIMS_PER_WG = 32; - SHIFT = 4;// We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements) - - define reg_numPrimitives REG3; - define reg_num_wgs REG4; - - reg_numPrimitives = load_dword(p_numPrimitives); - reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG; - reg_num_wgs = reg_num_wgs - C_1; - reg_num_wgs = reg_num_wgs >> SHIFT; - reg_num_wgs = reg_num_wgs >> C_1; - - DISPATCHDIM_X = reg_num_wgs; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_primref_to_quads args( - state.build_globals, - state.build_primref_buffer, - build_primref_index_buffers, - state.bvh_buffer, - state.geomDesc_buffer, - stride, - offset, - allow_update); -} diff --git a/src/intel/vulkan/grl/gpu/build_primref.grl b/src/intel/vulkan/grl/gpu/build_primref.grl deleted file mode 100644 index 33728bd01f6..00000000000 --- a/src/intel/vulkan/grl/gpu/build_primref.grl +++ /dev/null @@ -1,229 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module build_primref; - -kernel_module primref_kernels ("bvh_build_primref.cl") -{ - links lsc_intrinsics; - - kernel opencl_kernel_primrefs_from_DXR_instances < kernelFunction="primrefs_from_DXR_instances" >; - kernel opencl_kernel_primrefs_from_DXR_instances_indirect < kernelFunction="primrefs_from_DXR_instances_indirect" >; - kernel opencl_kernel_primrefs_from_DXR_instances_pointers < kernelFunction="primrefs_from_DXR_instances_pointers" >; - kernel opencl_kernel_primrefs_from_DXR_instances_pointers_indirect < kernelFunction="primrefs_from_DXR_instances_pointers_indirect" >; - - kernel opencl_kernel_triangles_to_primrefs < kernelFunction="triangles_to_primrefs" >; - kernel opencl_kernel_triangles_to_primrefs_indirect < 
kernelFunction="triangles_to_primrefs_indirect" >; - kernel opencl_kernel_procedurals_to_primrefs < kernelFunction="procedurals_to_primrefs" >; - kernel opencl_kernel_procedurals_to_primrefs_indirect < kernelFunction="procedurals_to_primrefs_indirect" >; -} - -import struct MKBuilderState "structs.grl"; -import struct MKSizeEstimate "structs.grl"; - - -const PrimirefsFromInstances_GROUPSIZE = 16; - -metakernel buildPrimirefsFromInstances( - qword instanceDescBuff, - MKSizeEstimate estimate, - MKBuilderState build_state, - dword allowUpdate) -{ - define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE); - dispatch opencl_kernel_primrefs_from_DXR_instances(num_groups,1,1) args( - build_state.build_globals, - build_state.bvh_buffer, - instanceDescBuff, - estimate.numPrimitives, - build_state.build_primref_buffer, - allowUpdate); -} - -metakernel buildPrimirefsFromInstancesIndirect( - qword instanceDescBuff, - qword indirectBuildRangeInfo, - MKBuilderState build_state, - dword allowUpdate) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 - C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_indirect args( - build_state.build_globals, - build_state.bvh_buffer, - instanceDescBuff, - indirectBuildRangeInfo, - build_state.build_primref_buffer, - allowUpdate); -} - -metakernel buildPrimirefsFromInstancesArrOfPtrs( - qword instanceDescPtrArrayBuff, - MKSizeEstimate estimate, - MKBuilderState build_state, - dword allowUpdate) -{ - define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE); - dispatch opencl_kernel_primrefs_from_DXR_instances_pointers(num_groups,1,1) args( - build_state.build_globals, - build_state.bvh_buffer, - instanceDescPtrArrayBuff, - estimate.numPrimitives, - build_state.build_primref_buffer, - allowUpdate); -} - -metakernel buildPrimirefsFromInstancesArrOfPtrsIndirect( - qword instanceDescPtrArrayBuff, - qword indirectBuildRangeInfo, - MKSizeEstimate estimate, - MKBuilderState build_state, - dword allowUpdate) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 - C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_pointers_indirect args( - build_state.build_globals, - build_state.bvh_buffer, - instanceDescPtrArrayBuff, - build_state.build_primref_buffer, - indirectBuildRangeInfo, - allowUpdate); -} - - - - -metakernel primrefs_from_tris( - MKBuilderState build_state, - MKSizeEstimate estimate, - qword geo_ptr, - dword geom_id, - dword geom_flags, - dword num_prims) -{ - define num_threads ((num_prims+15)/16); - dispatch opencl_kernel_triangles_to_primrefs(num_threads,1,1) args( 
- build_state.build_globals, - build_state.bvh_buffer, - build_state.build_primref_buffer, - geo_ptr, - (geom_id & 0x00ffffff) + (geom_flags<<24), - num_prims); -} - -metakernel primrefs_from_tris_indirect( - MKBuilderState build_state, - MKSizeEstimate estimate, - qword geo_ptr, - qword indirectBuildRangeInfo, - dword geom_id, - dword geom_flags) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 - C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_triangles_to_primrefs_indirect args( - build_state.build_globals, - build_state.bvh_buffer, - build_state.build_primref_buffer, - geo_ptr, - indirectBuildRangeInfo, - (geom_id & 0x00ffffff) + (geom_flags << 24)); -} - -metakernel primrefs_from_proc( - MKBuilderState build_state, - MKSizeEstimate estimate, - qword geo_ptr, - dword geom_id, - dword geom_flags, - dword num_prims) -{ - define num_threads ((num_prims+15)/16); - dispatch opencl_kernel_procedurals_to_primrefs(num_threads,1,1) args( - build_state.build_globals, - build_state.bvh_buffer, - build_state.build_primref_buffer, - geo_ptr, - (geom_id & 0x00ffffff) + (geom_flags<<24), - num_prims); -} - -metakernel primrefs_from_proc_indirect( - MKBuilderState build_state, - MKSizeEstimate estimate, - qword geo_ptr, - qword indirectBuildRangeInfo, - dword geom_id, - dword geom_flags) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1 - C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_procedurals_to_primrefs_indirect args( - build_state.build_globals, - build_state.bvh_buffer, - build_state.build_primref_buffer, - geo_ptr, - indirectBuildRangeInfo, - (geom_id & 0x00ffffff) + (geom_flags<<24)); -} diff --git a/src/intel/vulkan/grl/gpu/build_refit.grl b/src/intel/vulkan/grl/gpu/build_refit.grl deleted file mode 100644 index 46d6e76add2..00000000000 --- a/src/intel/vulkan/grl/gpu/build_refit.grl +++ /dev/null @@ -1,324 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module build_refit; - -kernel_module morton_kernels ("bvh_build_refit.cl") -{ - links lsc_intrinsics; - - kernel update_instance_leaves < kernelFunction="update_instance_leaves" >; - kernel refit_indirect_sg < kernelFunction="Refit_indirect_sg" >; - kernel update_instance_leaves_indirect < kernelFunction="update_instance_leaves_indirect" >; - - -} - -const INSTANCE_LEAF_GROUP_SIZE = 16; -const REFIT_GROUP_SIZE = 8; - -metakernel update_instance_leaves( - qword bvh, - qword dxrInstancesArray, - qword dxrInstancesPtrArray, - qword instance_leaf_aabbs, - dword num_instances ) -{ - define num_groups (num_instances + INSTANCE_LEAF_GROUP_SIZE - 1) / INSTANCE_LEAF_GROUP_SIZE; - - dispatch update_instance_leaves(num_groups, 1, 1) args( - bvh, - 
dxrInstancesArray, - dxrInstancesPtrArray, - instance_leaf_aabbs); -} - -metakernel update_instance_leaves_indirect( - qword bvh, - qword dxrInstancesArray, - qword dxrInstancesPtrArray, - qword instance_leaf_aabbs, - qword indirectBuildRangeInfo) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // INSTANCE_LEAF_GROUP_SIZE - 1 - C_4 = 4; // log_2(INSTANCE_LEAF_GROUP_SIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / INSTANCE_LEAF_GROUP_SIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect update_instance_leaves_indirect args( - bvh, - dxrInstancesArray, - dxrInstancesPtrArray, - instance_leaf_aabbs, - indirectBuildRangeInfo); -} - -/* -metakernel refit( - qword bvh, - qword geomDesc, - qword instance_aabbs, - dword dispatchSize ) -{ - define num_groups (dispatchSize + REFIT_GROUP_SIZE - 1) / REFIT_GROUP_SIZE; - - dispatch refit(num_groups, 1, 1) args( - bvh, - geomDesc, - instance_aabbs); -} - -const REFIT_SIMD_SIZE = 8; -const REFIT_SIMD_SIZE_SHIFT = 3; - -metakernel refit_indirect( - qword bvh, - qword bvh_inner_nodes_start_value, - qword bvh_inner_nodes_end, - qword geomDesc, - qword instance_aabbs ) -{ - define cRoundingSIMD REG4; - define TWO REG3; - define ONE REG5; - cRoundingSIMD = (REFIT_SIMD_SIZE - 1); - - TWO = 2; - ONE = 1; - - REG0 = bvh_inner_nodes_start_value; - REG1 = load_dword(bvh_inner_nodes_end); - REG1.hi = 0; - REG2 = REG1 - REG0; - REG2 = REG2 + cRoundingSIMD; - REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer - REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. - - DISPATCHDIM_X = REG2.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect refit_indirect args( - bvh, - geomDesc, - instance_aabbs); - -} -*/ - -metakernel refit_indirect_sg( - qword bvh, - qword bvh_inner_nodes_start_value, - qword bvh_inner_nodes_end, - qword geomDesc, - qword instance_aabbs ) -{ - - REG0 = bvh_inner_nodes_start_value; - REG1.lo = load_dword(bvh_inner_nodes_end); - REG1.hi = 0; - REG2 = REG1 - REG0; - - DISPATCHDIM_X = REG2.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect refit_indirect_sg args( - bvh, - geomDesc, - instance_aabbs); - -} -/* -//////////////////////////////////////////////////////////////// -// constructing treelets -// phase 1: mark nodes that will be roots of bottom treelets -// also for each node leave a number of startpoints that are under it and max depth of the path from the node -metakernel find_refit_treelets( - qword bvh, - qword treelet_node_data, - qword scratch_startpoints, - qword startpointAlloc, - qword bvh_inner_nodes_start_value, - qword bvh_inner_nodes_end ) -{ - define cRoundingSIMD REG4; - define TWO REG3; - define ONE REG5; - cRoundingSIMD = (REFIT_SIMD_SIZE - 1); - - TWO = 2; - ONE = 1; - - REG0 = bvh_inner_nodes_start_value; - REG1.lo = load_dword(bvh_inner_nodes_end); - REG1.hi = 0; - REG2 = REG1 - REG0; - REG2 = REG2 + cRoundingSIMD; - REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer - REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. 
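(Editorial sketch, not part of the deleted sources.) The indirect metakernels in these files all compute their workgroup count the same way: the command-streamer ALU that executes GRL metakernels has no integer divide and only accepts power-of-two shift amounts, so the round-up division by the group size is written as an add of (group_size - 1) followed by right shifts, with a shift by 3 decomposed into >>2 then >>1, exactly as the comment above notes. A minimal host-side C illustration of the same arithmetic, using our own names (div_round_up_pow2, log2_group_size):

#include <stdio.h>

/* Hypothetical helper mirroring the register math in the metakernels:
 * groups = (prim_count + group_size - 1) / group_size, with the divide
 * replaced by a right shift because group_size is a power of two. */
static unsigned div_round_up_pow2(unsigned prim_count, unsigned log2_group_size)
{
    unsigned group_size = 1u << log2_group_size;
    return (prim_count + (group_size - 1)) >> log2_group_size;
}

int main(void)
{
    /* REFIT_SIMD_SIZE is 8, so log2 is 3; on the command streamer this
     * shift has to be issued as >>2 followed by >>1, since only
     * power-of-two shift amounts are supported. */
    printf("%u\n", div_round_up_pow2(100, 3)); /* prints 13 */
    return 0;
}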
- - DISPATCHDIM_X = REG2.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect find_refit_treelets args( - bvh, - treelet_node_data, - scratch_startpoints, - startpointAlloc); -} - - -//////////////////////////////////////////////////////////////// -// constructing treelets -// phase 2 totally parallel, run threads up to assign startpoints to given treelet -// -metakernel assign_refit_startpoints_to_treelets( - qword bvh, - qword treelet_node_data, - qword scratch_startpoints, - qword bvh_inner_nodes_start_value, - qword bvh_inner_nodes_end ) -{ - define cRoundingSIMD REG4; - define TWO REG3; - define ONE REG5; - cRoundingSIMD = (REFIT_SIMD_SIZE - 1); - - TWO = 2; - ONE = 1; - - REG0 = bvh_inner_nodes_start_value; - REG1.lo = load_dword(bvh_inner_nodes_end); - REG1.hi = 0; - REG2 = REG1 - REG0; - REG2 = REG2 + cRoundingSIMD; - REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer - REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. - - DISPATCHDIM_X = REG2.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect assign_refit_startpoints_to_treelets args( - bvh, - treelet_node_data, - scratch_startpoints); -} - - -//////////////////////////////////////////////////////////////// -// constructing treelets -// phase 3 local work: group per treelet, sort the startpoints in treelets ?// by length of the path -metakernel finalize_treelets_in_groups( - qword bvh, - qword scratch_startpoints, - qword ptrNumTreelets ) -{ - REG0 = load_qword(ptrNumTreelets); - - DISPATCHDIM_X = REG0.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect finalize_treelets_in_groups args( - bvh, - scratch_startpoints); -} - - -//////////////////////////////////////////////////////////////// -// Updating treelets -// phase 1 update vertex and generate boxes for vertices -// - -const PER_GROUP_ELEMENTS_ROUNDING = 15; -const PER_GROUP_ELEMENTS_SHIFT = 4; - -metakernel init_treelets_refit(qword pSquashGroupsCountToReset) -{ - REG1 = 0; - store_qword(pSquashGroupsCountToReset, REG1); - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - //REG4 = PER_GROUP_ELEMENTS_SHIFT; - //REG5.hi = PER_GROUP_ELEMENTS_ROUNDING; - //REG5.lo = 0; -} - -metakernel update_quads( - qword scratch_box, - qword bvh, - qword input, - dword numPrimsDividedBy32, - qword bigSquashInput) -{ - //REG0 = load_qword(quads_nodes_begin_end_pair); - //REG1.hi = REG0.lo; // this holds inner nodes begin - //REG2 = REG0 - REG1; - //REG2 = REG2 + REG5; - //REG2 = REG2 >> REG4; - //DISPATCHDIM_X = REG2.hi; - - dispatch refit_quads(numPrimsDividedBy32, 1, 1) args( - bvh, - input, - scratch_box, - numPrimsDividedBy32, - bigSquashInput ); -} - -// -//////////////////////////////////////////////////////////////// - - -//////////////////////////////////////////////////////////////// -// -// phase 1 or 2 - update primitives as well as bottom up refit internal nodes -// in single dispatch (in single group per tree) -metakernel refit_tree_by_group_including_quads( - qword squashed_inputs, - dword numBuilds -) -{ - dispatch refit_tree_per_group(numBuilds, 1, 1) args( - squashed_inputs); -} -// -//////////////////////////////////////////////////////////////// - - -//////////////////////////////////////////////////////////////// -// -// phase 2 bottom up refit internal nodes -// -metakernel refit_treelet_per_group( - qword bigSquashInput, - qword ptrNumTreelets) -{ - DISPATCHDIM_X = load_dword(ptrNumTreelets); - - dispatch_indirect refit_treelet_per_group args( - 
bigSquashInput); -} -// -//////////////////////////////////////////////////////////////// - -#endif -*/ diff --git a/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl deleted file mode 100644 index d72f192056e..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl +++ /dev/null @@ -1,4823 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "binned_sah_shared.h" - -#include "libs/lsc_intrinsics.h" -#include "intrinsics.h" -#include "AABB.h" -#include "AABB3f.h" - -#include "qbvh6.h" -#include "common.h" - -#include "libs/lsc_intrinsics.h" - -#define SGPRINT_16x(prefix,fmt,type,val) {\ - type v0 = sub_group_broadcast( val, 0 );\ - type v1 = sub_group_broadcast( val, 1 );\ - type v2 = sub_group_broadcast( val, 2 );\ - type v3 = sub_group_broadcast( val, 3 );\ - type v4 = sub_group_broadcast( val, 4 );\ - type v5 = sub_group_broadcast( val, 5 );\ - type v6 = sub_group_broadcast( val, 6 );\ - type v7 = sub_group_broadcast( val, 7 );\ - type v8 = sub_group_broadcast( val, 8 );\ - type v9 = sub_group_broadcast( val, 9 );\ - type v10 = sub_group_broadcast( val, 10 );\ - type v11 = sub_group_broadcast( val, 11 );\ - type v12 = sub_group_broadcast( val, 12 );\ - type v13 = sub_group_broadcast( val, 13 );\ - type v14 = sub_group_broadcast( val, 14 );\ - type v15 = sub_group_broadcast( val, 15 );\ - sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ - if( get_sub_group_local_id() == 0 ) { \ - printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \ - fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \ - v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}} - - -#define SGPRINT_6x(prefix,fmt,type,val) {\ - type v0 = sub_group_broadcast( val, 0 );\ - type v1 = sub_group_broadcast( val, 1 );\ - type v2 = sub_group_broadcast( val, 2 );\ - type v3 = sub_group_broadcast( val, 3 );\ - type v4 = sub_group_broadcast( val, 4 );\ - type v5 = sub_group_broadcast( val, 5 );\ - sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ - if( get_sub_group_local_id() == 0 ) { \ - printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \ - v0,v1,v2,v3,v4,v5);}} - -#define BFS_WG_SIZE 512 - -#define BFS_NUM_VCONTEXTS 256 // must be multiple of 64 - -#define TREE_ARITY 6 - -#define DFS_WG_SIZE 256 -#define DFS_THRESHOLD 256 - - -void BFSDispatchQueue_print(struct BFSDispatchQueue* q, uint n) -{ - for (uint i = 0; i < q->num_dispatches; i++) - printf(" %u,ctx=%u,batch=%u\n", q->wg_count[i], q->records[i].context_id, q->records[i].batch_index); -} - -void VContextScheduler_print(struct VContextScheduler* scheduler) -{ - if (get_local_id(0) == 0) - { - printf("SCHEDULER:\n"); - printf(" bfs=%u dfs=%u\n", scheduler->num_bfs_wgs, scheduler->num_dfs_wgs); - - printf("BFS QUEUE:\n"); - BFSDispatchQueue_print(&scheduler->bfs_queue, scheduler->num_bfs_wgs); - - - printf("DFS QUEUE\n"); - for (uint i = 0; i < scheduler->num_dfs_wgs; i++) - { - struct DFSDispatchRecord* r = &scheduler->dfs_queue.records[i]; - printf(" (%u-%u) root=%u depth=%u batch_index=%u\n", - r->primref_base, r->primref_base + r->num_primrefs, - r->bvh2_base, r->tree_depth, r->batch_index); - } - - printf("CONTEXTS:\n"); - for (uint i = 0; i < BFS_NUM_VCONTEXTS; i++) - { - if (scheduler->vcontext_state[i] != VCONTEXT_STATE_UNALLOCATED) - { - printf(" context: %u state=%u\n", i, scheduler->vcontext_state[i]); - printf(" prims: %u-%u\n", scheduler->contexts[i].dispatch_primref_begin, scheduler->contexts[i].dispatch_primref_end); - printf(" depth: %u\n", scheduler->contexts[i].tree_depth); - printf(" root: 
%u\n", scheduler->contexts[i].bvh2_root); - printf(" batch: %u\n", scheduler->contexts[i].batch_index); - } - } - - - - } - -} - - -inline float3 select_min(float3 v, bool mask) -{ - return (float3)(mask ? v.x : (float)(INFINITY), - mask ? v.y : (float)(INFINITY), - mask ? v.z : (float)(INFINITY)); -} -inline float3 select_max(float3 v, bool mask) -{ - return (float3)(mask ? v.x : -(float)(INFINITY), - mask ? v.y : -(float)(INFINITY), - mask ? v.z : -(float)(INFINITY)); -} - -/////////////////////////////////////////////////////////////////////////// - -// The 'LRBounds' structure uses negated-max to allow -// both atomic_min and atomic_max to be issued fused into one message - -struct AABB3f LRBounds_get_left_centroid( LRBounds* b ) -{ - struct AABB3f* pbox = &b->boxes.left_centroid_bounds; - return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); -} -struct AABB3f LRBounds_get_right_centroid( LRBounds* b ) -{ - struct AABB3f* pbox = &b->boxes.right_centroid_bounds; - return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); -} -struct AABB3f LRBounds_get_left_geom( LRBounds* b ) -{ - struct AABB3f* pbox = &b->boxes.left_geom_bounds; - return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); -} -struct AABB3f LRBounds_get_right_geom( LRBounds* b ) -{ - struct AABB3f* pbox = &b->boxes.right_geom_bounds; - return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) ); -} - - -void LRBounds_merge_left( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax ) -{ - // All of the input vectors have come from sub-group reductions and are thus uniform - // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs - // The code below should result in 1 atomic_min message and a simularly large stack of movs - - float mergeVal0 = INFINITY; - float mergeVal1 = INFINITY; - uint i = get_sub_group_local_id(); - - // insert the various merge values into one register - // We use two parallel variables here to enable some ILP - - uint imod = (i>=6) ? (i-6) : i; - mergeVal0 = (imod==0) ? CMin.x : mergeVal0; - mergeVal1 = (imod==0) ? GMin.x : mergeVal1; - - mergeVal0 = (imod==1) ? CMin.y : mergeVal0; - mergeVal1 = (imod==1) ? GMin.y : mergeVal1; - - mergeVal0 = (imod==2) ? CMin.z : mergeVal0; - mergeVal1 = (imod==2) ? GMin.z : mergeVal1; - - mergeVal0 = (imod==3) ? -CMax.x : mergeVal0; - mergeVal1 = (imod==3) ? -GMax.x : mergeVal1; - - mergeVal0 = (imod==4) ? -CMax.y : mergeVal0; - mergeVal1 = (imod==4) ? -GMax.y : mergeVal1; - - mergeVal0 = (imod==5) ? -CMax.z : mergeVal0; - mergeVal1 = (imod==5) ? -GMax.z : mergeVal1; - - float merge = (i<6) ? 
mergeVal0 : mergeVal1; - if( i < 12 ) - atomic_min( &b->scalars.Array[i], merge ); - - //atomic_min( &b->boxes.left_centroid_bounds.lower[0], CMin.x ); - //atomic_min( &b->boxes.left_centroid_bounds.lower[1], CMin.y ); - //atomic_min( &b->boxes.left_centroid_bounds.lower[2], CMin.z ); - //atomic_min( &b->boxes.left_centroid_bounds.upper[0], -CMax.x ); - //atomic_min( &b->boxes.left_centroid_bounds.upper[1], -CMax.y ); - //atomic_min( &b->boxes.left_centroid_bounds.upper[2], -CMax.z ); - //atomic_min( &b->boxes.left_geom_bounds.lower[0], GMin.x ); - //atomic_min( &b->boxes.left_geom_bounds.lower[1], GMin.y ); - //atomic_min( &b->boxes.left_geom_bounds.lower[2], GMin.z ); - //atomic_min( &b->boxes.left_geom_bounds.upper[0], -GMax.x ); - //atomic_min( &b->boxes.left_geom_bounds.upper[1], -GMax.y ); - //atomic_min( &b->boxes.left_geom_bounds.upper[2], -GMax.z ); -} - -void LRBounds_merge_right( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax ) -{ - // All of the input vectors have come from sub-group reductions and are thus uniform - // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs - // The code below should result in 1 atomic_min message and a simularly large stack of movs - - float mergeVal0 = INFINITY; - float mergeVal1 = INFINITY; - uint i = get_sub_group_local_id(); - - // insert the various merge values into one register - // We use two parallel variables here to enable some ILP - - uint imod = (i>=6) ? (i-6) : i; - mergeVal0 = (imod==0) ? CMin.x : mergeVal0; - mergeVal1 = (imod==0) ? GMin.x : mergeVal1; - - mergeVal0 = (imod==1) ? CMin.y : mergeVal0; - mergeVal1 = (imod==1) ? GMin.y : mergeVal1; - - mergeVal0 = (imod==2) ? CMin.z : mergeVal0; - mergeVal1 = (imod==2) ? GMin.z : mergeVal1; - - mergeVal0 = (imod==3) ? -CMax.x : mergeVal0; - mergeVal1 = (imod==3) ? -GMax.x : mergeVal1; - - mergeVal0 = (imod==4) ? -CMax.y : mergeVal0; - mergeVal1 = (imod==4) ? -GMax.y : mergeVal1; - - mergeVal0 = (imod==5) ? -CMax.z : mergeVal0; - mergeVal1 = (imod==5) ? -GMax.z : mergeVal1; - - float merge = (i<6) ? 
mergeVal0 : mergeVal1; - if( i < 12 ) - atomic_min( &b->scalars.Array[i+12], merge ); - - //atomic_min( &b->boxes.right_centroid_bounds.lower[0], CMin.x ); - //atomic_min( &b->boxes.right_centroid_bounds.lower[1], CMin.y ); - //atomic_min( &b->boxes.right_centroid_bounds.lower[2], CMin.z ); - //atomic_min( &b->boxes.right_centroid_bounds.upper[0], -CMax.x ); - //atomic_min( &b->boxes.right_centroid_bounds.upper[1], -CMax.y ); - //atomic_min( &b->boxes.right_centroid_bounds.upper[2], -CMax.z ); - //atomic_min( &b->boxes.right_geom_bounds.lower[0], GMin.x ); - //atomic_min( &b->boxes.right_geom_bounds.lower[1], GMin.y ); - //atomic_min( &b->boxes.right_geom_bounds.lower[2], GMin.z ); - //atomic_min( &b->boxes.right_geom_bounds.upper[0], -GMax.x ); - //atomic_min( &b->boxes.right_geom_bounds.upper[1], -GMax.y ); - //atomic_min( &b->boxes.right_geom_bounds.upper[2], -GMax.z ); -} - -void LRBounds_merge( global LRBounds* globalBounds, local LRBounds* localBounds ) -{ - uint i = get_local_id(0); - if( i < 24 ) - atomic_min(&globalBounds->scalars.Array[i], localBounds->scalars.Array[i] ); -} - - -void LRBounds_init( LRBounds* bounds ) -{ - uint i = get_local_id(0) * 4; - if( i < 24 ) - { - // compiler should merge it into a 4xdword send - bounds->scalars.Array[i+0] = INFINITY; - bounds->scalars.Array[i+1] = INFINITY; - bounds->scalars.Array[i+2] = INFINITY; - bounds->scalars.Array[i+3] = INFINITY; - } - -} - - -inline void LRBounds_init_subgroup( LRBounds* bounds) -{ - uint sg_size = get_sub_group_size(); - uint lane = get_sub_group_local_id(); - - for (uint i = lane * 4; i < 24; i += sg_size * 4) - { - // compiler should merge it into a 4xdword send - bounds->scalars.Array[i+0] = INFINITY; - bounds->scalars.Array[i+1] = INFINITY; - bounds->scalars.Array[i+2] = INFINITY; - bounds->scalars.Array[i+3] = INFINITY; - } - -} - -/////////////////////////////////////////////////////////////////////////// - -inline void BinInfo_init(struct BFS_BinInfo* bin_info) -{ - for (uint id = get_local_id(0) * 4; id < 18 * BFS_NUM_BINS; id += get_local_size(0) * 4) - { - float inf = INFINITY; - // compiler should merge it into a 4xdword send - bin_info->min_max[id+0] = inf; - bin_info->min_max[id+1] = inf; - bin_info->min_max[id+2] = inf; - bin_info->min_max[id+3] = inf; - } - for (uint id = get_local_id(0) * 4; id < 3 * BFS_NUM_BINS; id += get_local_size(0) * 4) - { - // compiler should merge it into a 4xdword send - bin_info->counts[id+0] = 0; - bin_info->counts[id+1] = 0; - bin_info->counts[id+2] = 0; - bin_info->counts[id+3] = 0; - } -} - - -// copy global to local -inline void BinInfo_copy( local struct BFS_BinInfo* local_bin_info, global struct BFS_BinInfo* global_bin_info ) -{ - for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0)) - { - float inf = INFINITY ; - float f = global_bin_info->min_max[id]; - local_bin_info->min_max[id] = f; - } - for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0)) - { - local_bin_info->counts[id] = global_bin_info->counts[id]; - } -} - -inline void BinInfo_init_subgroup(struct BFS_BinInfo* bin_info) -{ - uint sg_size = get_sub_group_size(); - uint lane = get_sub_group_local_id(); - - for (uint i = lane * 4; i < 3 * BFS_NUM_BINS; i += sg_size * 4) - { - // compiler should merge it into a 4xdword send - bin_info->counts[i+0] = 0; - bin_info->counts[i+1] = 0; - bin_info->counts[i+2] = 0; - bin_info->counts[i+3] = 0; - } - - - for (uint i = lane * 4; i < 18 * BFS_NUM_BINS; i += sg_size * 4) - { - // compiler should merge it into 
a 4xdword send - bin_info->min_max[i+0] = INFINITY; - bin_info->min_max[i+1] = INFINITY; - bin_info->min_max[i+2] = INFINITY; - bin_info->min_max[i+3] = INFINITY; - } - -} - -float3 shuffle_down_float3( float3 a, float3 b, uint delta ) -{ - return (float3)( - intel_sub_group_shuffle_down( a.x, b.x, delta ), - intel_sub_group_shuffle_down( a.y, b.y, delta ), - intel_sub_group_shuffle_down( a.z, b.z, delta ) - ); -} - - - - -void BinInfo_primref_ballot_loop( local struct BFS_BinInfo* bin_info, uint axis, uint bin, float3 lower, float3 upper, bool active_lane ) -{ - local float* bins_min = &bin_info->min_max[0]; - local float* bins_max = &bin_info->min_max[3]; - - varying uint place = (bin + axis*BFS_NUM_BINS); - varying uint lane = get_sub_group_local_id(); - - uniform uint active_mask = intel_sub_group_ballot(active_lane); - - while( active_mask ) - { - uniform uint leader = ctz( active_mask ); - uniform uint lead_place = intel_sub_group_shuffle( place, leader ); - varying bool matching_bin = lead_place == place && active_lane; - - varying float3 lo = (float3)(INFINITY,INFINITY,INFINITY); - varying float3 hi = (float3)(-INFINITY,-INFINITY,-INFINITY); - if (matching_bin) - { - lo = lower.xyz; - hi = upper.xyz; - } - - lo = sub_group_reduce_min_float3( lo ); - hi = sub_group_reduce_max_float3( hi ); - - { - // atomic min operation vectorized across 6 lanes - // [ lower.xyz ][-][upper.xyz][-] - // - // Lanes 3 and 7 are inactive - - uint lmod = lane % 4; - uint ldiv = lane / 4; - float vlo = lo.x; - float vhi = hi.x; - vlo = (lmod == 1) ? lo.y : vlo; - vhi = (lmod == 1) ? hi.y : vhi; - vlo = (lmod == 2) ? lo.z : vlo; - vhi = (lmod == 2) ? hi.z : vhi; - - float v = (ldiv == 0) ? vlo : -vhi; - - if( (1<min_max[ 6*lead_place + lmod + 3*ldiv ], v ); - } - - //if( lane == 0 ) - // atomic_add_local(&bin_info->counts[lead_place], popcount(active_mask & intel_sub_group_ballot(matching_bin)) ); - - active_mask = active_mask & intel_sub_group_ballot(!matching_bin); - } -} - -inline void BinInfo_add_primref(struct BinMapping* binMapping, local struct BFS_BinInfo* bin_info, PrimRef* primref, bool active_lane ) -{ - - const float4 lower = primref->lower; - const float4 upper = primref->upper; - const float4 p = lower + upper; - const uint4 i = convert_uint4( (p - binMapping->ofs) * binMapping->scale ); - - BinInfo_primref_ballot_loop( bin_info, 0, i.x, lower.xyz, upper.xyz, active_lane ); - BinInfo_primref_ballot_loop( bin_info, 1, i.y, lower.xyz, upper.xyz, active_lane ); - BinInfo_primref_ballot_loop( bin_info, 2, i.z, lower.xyz, upper.xyz, active_lane ); - - if (active_lane) - { - atomic_inc_local( &bin_info->counts[i.x + 0 * BFS_NUM_BINS] ); - atomic_inc_local( &bin_info->counts[i.y + 1 * BFS_NUM_BINS] ); - atomic_inc_local( &bin_info->counts[i.z + 2 * BFS_NUM_BINS] ); - } -} - -inline void BinInfo_merge(global struct BFS_BinInfo* global_info, local struct BFS_BinInfo* local_info) -{ - uint id = get_local_id(0); - for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0)) - { - float v = local_info->min_max[id]; - if( v != INFINITY ) - atomic_min(&global_info->min_max[id], v); - } - for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0)) - { - uint c = local_info->counts[id]; - if( c ) - atomic_add_global(&global_info->counts[id], c); - } -} - -inline struct AABB3f BinInfo_get_AABB(struct BFS_BinInfo* bin_info, ushort bin, ushort axis) -{ - float* min = &bin_info->min_max[6*(bin + axis*BFS_NUM_BINS)]; - float* max = min + 3; - struct AABB3f box; - for (uint 
i = 0; i < 3; i++) - { - box.lower[i] = min[i]; - box.upper[i] = -max[i]; - } - - return box; -} - -inline uint3 BinInfo_get_counts(struct BFS_BinInfo* bin_info, ushort bin) -{ - uint3 counts; - counts.x = bin_info->counts[bin + 0 * BFS_NUM_BINS]; // TODO: block load these - counts.y = bin_info->counts[bin + 1 * BFS_NUM_BINS]; - counts.z = bin_info->counts[bin + 2 * BFS_NUM_BINS]; - return counts; -} -inline uint BinInfo_get_count(struct BFS_BinInfo* bin_info, ushort bin, ushort axis) -{ - return bin_info->counts[bin + axis * BFS_NUM_BINS]; -} - - -void BVH2_Initialize( struct BVH2* bvh ) -{ - bvh->num_nodes = 1; -} - -inline bool BVH2_IsInnerNode( global struct BVH2* bvh, uint node_index ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - return (n->meta_ss & 0x10000) != 0; -} -inline uint BVH2_GetRoot( struct BVH2* bvh ) -{ - return 0; -} - -////////////////////////////////////////////// -// BVH2NodeMetaData funcs -////////////////////////////////////////////// -struct BVH2NodeMetaData -{ - uint meta_u; // leaf: primref start. inner: offset from node to its first child - uint meta_ss; -}; - -inline struct BVH2NodeMetaData BVH2_GetNodeMetaData( global struct BVH2* bvh, uint node_index ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - struct BVH2NodeMetaData meta; - meta.meta_u = n->meta_u; - meta.meta_ss = n->meta_ss; - return meta; -} - -inline bool BVH2NodeMetaData_IsInnerNode( struct BVH2NodeMetaData* meta ) -{ - return (meta->meta_ss & 0x10000) != 0; -} - -inline ushort BVH2NodeMetaData_GetLeafPrimCount( struct BVH2NodeMetaData* meta ) -{ - return meta->meta_ss & 0xffff; -} - -inline uint BVH2NodeMetaData_GetLeafPrimStart( struct BVH2NodeMetaData* meta ) -{ - return meta->meta_u; -} - -inline uint BVH2NodeMetaData_GetMask( struct BVH2NodeMetaData* meta ) -{ - return (meta->meta_ss>>24); -} - -////////////////////////////////////////////// - -inline ushort BVH2_GetLeafPrimCount( struct BVH2* bvh, uint node_index ) -{ - struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; - return n->meta_ss & 0xffff; -} -inline uint BVH2_GetLeafPrimStart( struct BVH2* bvh, uint node_index ) -{ - struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; - return n->meta_u; -} -inline uint2 BVH2_GetChildIndices( struct BVH2* bvh, uint node_index ) -{ - struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; - uint2 idx; - idx.x = n->meta_u; - idx.y = idx.x + (n->meta_ss & 0xffff); - return idx; -} - -inline float BVH2_GetNodeArea( global struct BVH2* bvh, uint node_index ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - return AABB3f_halfArea( &n->box ); -} - - -inline struct AABB3f BVH2_GetNodeBox( global struct BVH2* bvh, uint node_index ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - return n->box; -} -inline void BVH2_SetNodeBox( global struct BVH2* bvh, uint node_index, struct AABB3f* box ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - n->box = *box; -} - -inline void BVH2_SetNodeBox_lu( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - AABB3f_set( &n->box, lower, upper ); -} - -inline void BVH2_InitNodeBox( struct BVH2* bvh, uint node_index ) -{ - struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index; - AABB3f_init( &n->box ); -} - -inline struct AABB BVH2_GetAABB( 
global struct BVH2* bvh, uint node_index ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - struct AABB r; - r.lower.xyz = AABB3f_load_lower( &n->box ); - r.upper.xyz = AABB3f_load_upper( &n->box ); - return r; -} - -inline void BVH2_WriteInnerNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint2 child_offsets, uint mask ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - n->box = *box; - n->meta_u = child_offsets.x; - n->meta_ss = 0x10000 + (child_offsets.y - child_offsets.x) + (mask<<24); - // n->is_inner = true; -} - -inline void BVH2_WriteLeafNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint prim_start, uint prim_count, uint mask ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - n->box = *box; - n->meta_u = prim_start; - n->meta_ss = prim_count + (mask<<24); - // n->is_inner = true; -} - -inline uint BVH2_GetMask( global struct BVH2* bvh, uint node_index ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - return (n->meta_ss>>24); -} - - -uint BVH2_AllocateNodes( global struct BVH2* bvh, uint num_nodes ) -{ - return atomic_add_global( &bvh->num_nodes, num_nodes ); -} - -inline void BVH2_AtomicMergeNodeBox( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper ) -{ - global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index; - AABB3f_atomic_merge_global_lu( &n->box, lower, upper ); -} - - -void BVH2_print( global struct BVH2* bvh, uint start_node ) -{ - if ( get_local_id( 0 ) == 0 && get_sub_group_id() == 0 ) - { - uint num_nodes = bvh->num_nodes; - - uint2 stack[BFS_MAX_DEPTH * 2]; - uint sp = 0; - - printf( "allocated_nodes=%u\n", num_nodes ); - - stack[sp++] = (uint2)(start_node, 0); - while ( sp > 0 ) - { - uint2 data = stack[--sp]; - uint node = data.x; - uint depth = data.y; - - for ( uint i = 0; i < depth; i++ ) - printf( " " ); - - if ( BVH2_IsInnerNode( bvh, node ) ) - { - uint2 kids = BVH2_GetChildIndices( bvh, node ); - printf( " %5u: inner: %u %u \n", node, kids.x, kids.y ); - stack[sp++] = (uint2)(kids.y, depth + 1); - stack[sp++] = (uint2)(kids.x, depth + 1); - - struct AABB3f l = BVH2_GetNodeBox( bvh, kids.x ); - struct AABB3f r = BVH2_GetNodeBox( bvh, kids.y ); - struct AABB3f p = BVH2_GetNodeBox( bvh, node ); - - float3 pl = AABB3f_load_lower( &p ); - float3 pu = AABB3f_load_upper( &p ); - float3 ll = AABB3f_load_lower( &l ); - float3 lu = AABB3f_load_upper( &l ); - float3 rl = AABB3f_load_lower( &r ); - float3 ru = AABB3f_load_upper( &r ); - if ( any( ll < pl ) || any( rl < pl ) || - any( lu > pu ) || any( ru > pu ) ) - { - for ( uint i = 0; i < depth; i++ ) - printf( " " ); - - printf( "BAD_BOUNDS!!!!!!!! %u\n", node ); - } - - - } - else - { - - uint start = BVH2_GetLeafPrimStart( bvh, node ); - uint count = BVH2_GetLeafPrimCount( bvh, node ); - printf( " %5u: leaf: start=%u count=%u\n ",node,start,count ); - - } - } - } - barrier( CLK_LOCAL_MEM_FENCE ); -} - - -global uint* SAHBuildGlobals_GetPrimrefIndices_In( struct SAHBuildGlobals* globals, bool odd_pass ) -{ - uint num_refs = globals->num_primrefs; - global uint* ib = (global uint*) globals->p_primref_index_buffers; - return ib + (odd_pass ? 
num_refs : 0); -} - -global uint* SAHBuildGlobals_GetPrimrefIndices_Out( struct SAHBuildGlobals* globals, bool odd_pass ) -{ - uint num_refs = globals->num_primrefs; - global uint* ib = (global uint*) globals->p_primref_index_buffers; - return ib + (odd_pass ? 0 : num_refs); -} - -global PrimRef* SAHBuildGlobals_GetPrimrefs( struct SAHBuildGlobals* globals ) -{ - return (global PrimRef*) globals->p_primrefs_buffer; -} - -global struct BVH2* SAHBuildGlobals_GetBVH2( struct SAHBuildGlobals* globals ) -{ - return (global struct BVH2*)globals->p_bvh2; -} - -uint SAHBuildGlobals_GetLeafSizeInBytes( struct SAHBuildGlobals* globals ) -{ - return globals->leaf_size; -} - -uint SAHBuildGlobals_GetLeafType( struct SAHBuildGlobals* globals ) -{ - return globals->leaf_type; -} - -uint SAHBuildGlobals_GetInternalNodeType( struct SAHBuildGlobals* globals ) -{ - return NODE_TYPE_INTERNAL; -} - -global struct BVHBase* SAHBuildGlobals_GetBVHBase( struct SAHBuildGlobals* globals ) -{ - return (global struct BVHBase*) globals->p_bvh_base; -} - -uint SAHBuildGlobals_GetTotalPrimRefs( struct SAHBuildGlobals* globals ) -{ - return globals->num_primrefs; -} - -inline bool SAHBuildGlobals_NeedBackPointers( struct SAHBuildGlobals* globals ) -{ - return globals->flags & SAH_FLAG_NEED_BACKPOINTERS; -} -inline bool SAHBuildGlobals_NeedMasks( struct SAHBuildGlobals* globals ) -{ - return globals->flags & SAH_FLAG_NEED_MASKS; -} - - -void SAHBuildGlobals_print( struct SAHBuildGlobals* globals ) -{ - if ( get_local_id( 0 ) == 0 ) - { - printf( "SAHBuildGlobals: %p\n", globals ); - printf( " p_primref_index_buffers =%p\n", globals->p_primref_index_buffers ); - printf( " p_primrefs_buffer =%p\n", globals->p_primrefs_buffer ); - printf( " p_bvh2 =%p\n", globals->p_bvh2 ); - printf( " p_globals =%p\n", globals->p_globals ); - printf( " p_bvh_base =%p\n", globals->p_bvh_base ); - printf( " num_primrefs = %u\n", globals->num_primrefs ); - printf( " leaf_size = %u\n", globals->leaf_size ); - printf( " leaf_type = %u\n", globals->leaf_type ); - printf( " p_qnode_buffer = %p\n", globals->p_qnode_root_buffer); - } - - barrier( CLK_LOCAL_MEM_FENCE ); -} - - -uint get_num_wgs(uint thread_count, uint WG_SIZE) -{ - return (thread_count + WG_SIZE - 1) / WG_SIZE; -} - - - - - -struct BFSDispatchArgs -{ - global struct VContextScheduler* scheduler; - global struct VContext* context; - global struct BVH2* bvh2; - global uint* primref_index_in; - global uint* primref_index_out; - global PrimRef* primref_buffer; - - uint wg_primref_begin; - uint wg_primref_end; - uint dispatch_primref_begin; - uint dispatch_primref_end; - uint context_id; - uint num_wgs; - uint bvh2_root; - uint global_num_primrefs; - bool do_mask_processing; -}; - - - - -// TODO_OPT: Enable larger WGs -// We need a way to do this in a portable fashion. 
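(Editorial sketch, not part of the deleted sources.) The SAHBuildGlobals_GetPrimrefIndices_In/Out helpers above implement a ping-pong scheme: p_primref_index_buffers holds two num_primrefs-sized halves of primref indices, and each partitioning pass reads from one half while writing the reordered indices into the other, with odd_pass selecting which half plays which role. A self-contained C sketch of that selection, with our own function names:

#include <stdbool.h>
#include <stdint.h>

/* buf points at 2 * num_refs indices; even passes read half 0 and write
 * half 1, odd passes swap the roles (mirrors the deleted accessors). */
static uint32_t *primref_indices_in(uint32_t *buf, uint32_t num_refs, bool odd_pass)
{
    return buf + (odd_pass ? num_refs : 0);
}

static uint32_t *primref_indices_out(uint32_t *buf, uint32_t num_refs, bool odd_pass)
{
    return buf + (odd_pass ? 0 : num_refs);
}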
-// Gen12 can support larger WGs than Gen9 can -// -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) -kernel void -begin( global struct VContextScheduler* scheduler, - dword leaf_size, - dword leaf_type, - global uint* primref_index_buffers, - global PrimRef* primref_buffer, - global struct BVH2* bvh2, - global struct BVHBase* bvh_base, - global struct Globals* globals, - global struct SAHBuildGlobals* sah_globals, - global uint2* qnode_root_buffer, - dword sah_globals_flags - ) -{ - dword num_primrefs = globals->numPrimitives; - if ( get_local_id( 0 ) == 0 ) - { - sah_globals->p_primrefs_buffer = (qword) primref_buffer; - sah_globals->p_primref_index_buffers = (qword)primref_index_buffers; - sah_globals->p_bvh2 = (qword) bvh2; - sah_globals->p_bvh_base = (qword) bvh_base; - sah_globals->leaf_size = leaf_size; - sah_globals->leaf_type = leaf_type; - sah_globals->num_primrefs = num_primrefs; - sah_globals->p_globals = (qword) globals; - sah_globals->p_qnode_root_buffer = (gpuva_t) qnode_root_buffer; - sah_globals->flags = sah_globals_flags; - - // initialize the spill stack - scheduler->bfs2_spill_stack.size = 0; - - // initialize BVH2 node counter - BVH2_Initialize( bvh2 ); - - // configure first vcontext for first build - scheduler->contexts[0].dispatch_primref_begin = 0; - scheduler->contexts[0].dispatch_primref_end = num_primrefs; - scheduler->contexts[0].bvh2_root = BVH2_GetRoot( bvh2 ); - scheduler->contexts[0].tree_depth = 0; - scheduler->contexts[0].batch_index = 0; - - scheduler->bfs_queue.records[0].context_id = 0; - - scheduler->contexts[0].num_left = 0; - scheduler->contexts[0].num_right = 0; - scheduler->contexts[0].lr_mask = 0; - - // copy centroid bounds into the BVH2 root node' - BVH2_SetNodeBox_lu( bvh2, BVH2_GetRoot( bvh2 ), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz ); - - // zero the trivial build counters.. these are only used by the batch-build path - // but single-wg QNode path (if used) depends on them - scheduler->num_trivial_builds = 0; - scheduler->num_single_builds = 0; - - // initialize the root-buffer counters - sah_globals->root_buffer_num_produced = 0; - sah_globals->root_buffer_num_produced_hi = 0; - sah_globals->root_buffer_num_consumed = 0; - sah_globals->root_buffer_num_consumed_hi = 0; - } - - // initialize vcontext states - for ( uint i = get_local_id( 0 ); i < BFS_NUM_VCONTEXTS; i += get_local_size( 0 ) ) - scheduler->vcontext_state[i] = (i==0) ? VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED; - - // initialize global bin info in vcontext - only context[0] will be used in first iteration - BinInfo_init( &scheduler->contexts[0].global_bin_info ); - LRBounds_init( &scheduler->contexts[0].lr_bounds ); - - // barrier( CLK_GLOBAL_MEM_FENCE ); // lsc flush ... driver now does these as part of COMPUTE_WALKER -} - -// TODO_OPT: Enable larger WGs -// We need a way to do this in a portable fashion. -// Gen12 can support larger WGs than Gen9 can -// - - -// TODO_OPT: Enable larger WGs -// We need a way to do this in a portable fashion. -// Gen12 can support larger WGs than Gen9 can -// -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(512, 1, 1))) -kernel void -categorize_builds_and_init_scheduler( - global struct VContextScheduler* scheduler, - global gpuva_t* globals_ptrs, // OCL-C does not allow kernel parameters to be pointer-to-pointer, so we trick it... 
- global struct SAHBuildBuffersInfo* buffers_info, - global struct SAHBuildGlobals* builds_out, - dword num_builds -) -{ - local uint num_trivial; - local uint num_single; - local uint num_full; - - if (get_group_id(0) == 0) // first workgroup performs build categorization - { - if (get_local_id(0) == 0) - { - num_trivial = 0; - num_single = 0; - num_full = 0; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // first pass, count builds of each type - uint triv = 0; - uint single = 0; - uint full = 0; - for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0)) - { - global struct Globals* globals = (global struct Globals*) globals_ptrs[i]; - dword num_refs = globals->numPrimitives; - - if (num_refs <= TRIVIAL_BUILD_THRESHOLD) - triv++; - else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD) - single++; - else - full++; - } - - // merge counts across work-group. These variables are now offsets into this thread's ranges - triv = atomic_add_local(&num_trivial, triv); - single = atomic_add_local(&num_single, single); - full = atomic_add_local(&num_full, full); - - barrier(CLK_LOCAL_MEM_FENCE); - - global struct SAHBuildGlobals* trivial_builds_out = builds_out; - global struct SAHBuildGlobals* single_builds_out = builds_out + num_trivial; - global struct SAHBuildGlobals* full_builds_out = builds_out + num_trivial + num_single; - - for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0)) - { - global struct Globals* globals = (global struct Globals*) globals_ptrs[i]; - global struct SAHBuildBuffersInfo* buffers = &buffers_info[i]; - - dword num_refs = globals->numPrimitives; - dword leaf_type = globals->leafPrimType; - dword leaf_size = globals->leafSize; - - global struct SAHBuildGlobals* place; - if (num_refs <= TRIVIAL_BUILD_THRESHOLD) - place = trivial_builds_out + (triv++); - else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD) - place = single_builds_out + (single++); - else - place = full_builds_out + (full++); - - place->p_primref_index_buffers = buffers->p_primref_index_buffers; - place->p_primrefs_buffer = buffers->p_primrefs_buffer; - place->p_bvh2 = buffers->p_bvh2; - place->p_bvh_base = buffers->p_bvh_base; - place->p_globals = (gpuva_t)globals; - place->num_primrefs = num_refs; - place->leaf_size = leaf_size; - place->leaf_type = leaf_type; - place->flags = buffers->sah_globals_flags; - place->p_qnode_root_buffer = buffers->p_qnode_root_buffer; - - // only initialize BVH2 if it will actually be used by the build - // trivial passes will not use it - if( num_refs > SINGLE_WG_BUILD_THRESHOLD ) - { - // initialize BVH2 node counter - global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(place); - BVH2_Initialize(bvh2); - - // copy centroid bounds into the BVH2 root node' - BVH2_SetNodeBox_lu(bvh2, BVH2_GetRoot(bvh2), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz); - } - } - - if (get_local_id(0) == 0) - { - scheduler->num_trivial_builds = num_trivial; - scheduler->num_single_builds = num_single; - scheduler->batched_build_offset = num_trivial + num_single; - scheduler->batched_build_count = num_full; - } - } - else // second workgroup initializes the scheduler - { - // initialize vcontext states - for (uint i = get_local_id(0); i < BFS_NUM_VCONTEXTS; i += get_local_size(0)) - scheduler->vcontext_state[i] = (i == 0) ? 
VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED; - - // initialize global bin info in vcontexts - for (uint i = get_sub_group_id(); i < BFS_NUM_VCONTEXTS; i += get_num_sub_groups()) - BinInfo_init_subgroup(&scheduler->contexts[i].global_bin_info); - - // initialize the spill stack - if (get_local_id(0) == 0) - scheduler->bfs2_spill_stack.size = 0; - } - - //barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE );// lsc flush ... driver now does these as part of COMPUTE_WALKER -} - - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(BFS_NUM_VCONTEXTS, 1, 1))) -kernel void -begin_batchable( - global struct VContextScheduler* scheduler, - global struct SAHBuildGlobals* sah_globals -) -{ - ushort scheduler_build_offset = scheduler->batched_build_offset; - ushort scheduler_num_builds = scheduler->batched_build_count; - - ushort num_builds = min( scheduler_num_builds, (ushort)BFS_NUM_VCONTEXTS ); - - uint num_wgs = 0; - - ushort tid = get_local_id(0); - if ( tid < num_builds ) - { - ushort batch_index = scheduler_build_offset + tid; - - uint num_primrefs = sah_globals[batch_index].num_primrefs; - - // configure first vcontext for first build - scheduler->contexts[tid].dispatch_primref_begin = 0; - scheduler->contexts[tid].dispatch_primref_end = num_primrefs; - scheduler->contexts[tid].bvh2_root = BVH2_GetRoot( SAHBuildGlobals_GetBVH2(&sah_globals[batch_index]) ); - scheduler->contexts[tid].tree_depth = 0; - scheduler->contexts[tid].batch_index = batch_index; - scheduler->vcontext_state[tid] = VCONTEXT_STATE_EXECUTING; - - scheduler->contexts[tid].num_left = 0; - scheduler->contexts[tid].num_right = 0; - scheduler->contexts[tid].lr_mask = 0; - - num_wgs = get_num_wgs( num_primrefs, BFS_WG_SIZE ); - - scheduler->bfs_queue.wg_count[tid] = num_wgs; - scheduler->bfs_queue.records[tid].batch_index = batch_index; - scheduler->bfs_queue.records[tid].context_id = tid; - } - - num_wgs = work_group_reduce_add(num_wgs); - - if (tid == 0) - { - // write out build count and offset for next BFS iteration - scheduler->batched_build_offset = scheduler_build_offset + num_builds; - scheduler->batched_build_count = scheduler_num_builds - num_builds; - - // write out initial WG count and loop termination mask for command streamer to consume - scheduler->batched_build_wg_count = num_wgs; - scheduler->batched_build_loop_mask = (scheduler_num_builds > num_builds) ? 
1 : 0; - - scheduler->bfs_queue.num_dispatches = num_builds; - } - - for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() ) - BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info ); - - for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() ) - LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds ); -} - - - -bool is_leaf( uint num_refs ) -{ - return num_refs <= TREE_ARITY; -} - -bool is_dfs( uint num_refs ) -{ - return num_refs > TREE_ARITY&& num_refs <= DFS_THRESHOLD; -} - -bool is_bfs( uint num_refs ) -{ - return num_refs > DFS_THRESHOLD; -} - -int2 is_leaf_2( uint2 num_refs ) -{ - return num_refs.xy <= TREE_ARITY; -} -int2 is_bfs_2( uint2 num_refs ) -{ - return num_refs.xy > DFS_THRESHOLD; -} - -int2 is_dfs_2( uint2 num_refs ) -{ - return num_refs.xy > TREE_ARITY && num_refs.xy <= DFS_THRESHOLD; -} - -#if 0 -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -kernel void -sg_scheduler( global struct VContextScheduler* scheduler ) -{ - local struct BFS1SpillStackEntry SLM_local_spill_stack[BFS_NUM_VCONTEXTS]; - local uchar SLM_context_state[BFS_NUM_VCONTEXTS]; - local vcontext_id_t SLM_free_list[BFS_NUM_VCONTEXTS]; - local vcontext_id_t SLM_exec_list[BFS_NUM_VCONTEXTS]; - - - varying ushort lane = get_sub_group_local_id(); - - uniform uint free_list_size = 0; - uniform uint exec_list_size = 0; - - // read context states, build lists of free and executing contexts - for (varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size()) - { - uchar state = scheduler->vcontext_state[i]; - SLM_context_state[i] = state; - - uniform ushort exec_mask = intel_sub_group_ballot(state == VCONTEXT_STATE_EXECUTING); - - varying ushort prefix_exec = subgroup_bit_prefix_exclusive(exec_mask); - varying ushort prefix_free = lane - prefix_exec; - varying ushort exec_list_pos = exec_list_size + prefix_exec; - varying ushort free_list_pos = free_list_size + prefix_free; - - if (state == VCONTEXT_STATE_EXECUTING) - SLM_exec_list[exec_list_pos] = i; - else - SLM_free_list[free_list_pos] = i; - - uniform ushort num_exec = popcount(exec_mask); - exec_list_size += num_exec; - free_list_size += get_sub_group_size() - num_exec; - } - - uniform uint total_bfs_dispatches = 0; - uniform uint total_dfs_dispatches = 0; - uniform uint bfs_spill_stack_size = 0; - uniform uint total_bfs_wgs = 0; - - // process executing context. accumulate bfs/dfs dispatches and free-list entries - for (uint i = 0; i < exec_list_size; i+= get_sub_group_size() ) - { - varying ushort num_dfs_dispatches = 0; - varying ushort num_bfs_spills = 0; - - varying ushort num_bfs_children; - varying ushort context_id; - struct VContext* context; - varying uint num_left ; - varying uint num_right ; - varying uint primref_begin ; - varying uint primref_end ; - varying uint depth ; - - bool active_lane = (i + lane) < exec_list_size; - if ( active_lane ) - { - context_id = SLM_exec_list[i + lane]; - context = &scheduler->contexts[context_id]; - - num_left = context->num_left; - num_right = context->num_right; - primref_begin = context->dispatch_primref_begin; - primref_end = context->dispatch_primref_end; - depth = context->tree_depth; - - // get dispatch counts - - num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right); - num_bfs_children = is_bfs(num_left) + is_bfs(num_right); - num_bfs_spills = (num_bfs_children == 2) ? 
1 : 0; - } - - // allocate space for DFS, BFS dispatches, and BFS spills - varying uint dfs_pos = total_dfs_dispatches + sub_group_scan_exclusive_add(num_dfs_dispatches); - varying ushort mask_bfs_spills = intel_sub_group_ballot(num_bfs_children & 2); // spill if #children == 2 - varying ushort mask_bfs_dispatches = intel_sub_group_ballot(num_bfs_children & 3); // dispatch if #children == 1 or 2 - varying uint bfs_spill_pos = bfs_spill_stack_size + subgroup_bit_prefix_exclusive(mask_bfs_spills); - varying uint bfs_dispatch_pos = total_bfs_dispatches + subgroup_bit_prefix_exclusive(mask_bfs_dispatches); - - total_dfs_dispatches += sub_group_reduce_add(num_dfs_dispatches); - bfs_spill_stack_size += popcount(mask_bfs_spills); - total_bfs_dispatches += popcount(mask_bfs_dispatches); - - varying uint num_bfs_wgs = 0; - if (active_lane) - { - if (num_dfs_dispatches) - { - if (is_dfs(num_left)) - { - scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin; - scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left; - scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->left_bvh2_root; - scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; - dfs_pos++; - } - if (is_dfs(num_right)) - { - scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left; - scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right; - scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->right_bvh2_root; - scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; - } - } - - uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right); - if (num_bfs_children == 2) - { - // spill the right child.. push an entry onto local spill stack - SLM_local_spill_stack[bfs_spill_pos].primref_begin = primref_begin + num_left; - SLM_local_spill_stack[bfs_spill_pos].primref_end = primref_end; - SLM_local_spill_stack[bfs_spill_pos].bvh2_root = context->right_bvh2_root; - SLM_local_spill_stack[bfs_spill_pos].tree_depth = depth + 1; - - // setup BFS1 dispatch for left child - context->dispatch_primref_end = primref_begin + num_left; - context->bvh2_root = context->left_bvh2_root; - context->tree_depth = depth + 1; - num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE); - - scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs; - scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id; - } - else if (num_bfs_children == 1) - { - // setup BFS1 dispatch for whichever child wants it - if (is_bfs(num_left)) - { - // bfs on left child - context->dispatch_primref_end = context->dispatch_primref_begin + num_left; - context->bvh2_root = context->left_bvh2_root; - context->tree_depth = depth + 1; - num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE); - } - else - { - // bfs on right child - context->dispatch_primref_begin = context->dispatch_primref_begin + num_left; - context->bvh2_root = context->right_bvh2_root; - context->tree_depth = depth + 1; - num_bfs_wgs = get_num_wgs(num_right, BFS_WG_SIZE); - } - - scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs; - scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id; - } - else - { - // no bfs dispatch.. 
this context is now free - SLM_context_state[context_id] = VCONTEXT_STATE_UNALLOCATED; - } - } - - // count bfs work groups - total_bfs_wgs += sub_group_reduce_add(num_bfs_wgs); - - // add newly deallocated contexts to the free list - uniform uint free_mask = intel_sub_group_ballot( active_lane && num_bfs_children == 0); - varying uint free_list_pos = free_list_size + subgroup_bit_prefix_exclusive(free_mask); - free_list_size += popcount(free_mask); - - if ( free_mask & (1<bfs2_spill_stack.size; - - if(bfs_spill_stack_size < free_list_size && memory_spill_stack_size > 0 ) - { - uniform uint read_count = min(free_list_size - bfs_spill_stack_size, memory_spill_stack_size); - - for (varying uint i = lane; i < read_count; i+= get_sub_group_size()) - SLM_local_spill_stack[bfs_spill_stack_size + i] = scheduler->bfs2_spill_stack.entries[memory_spill_stack_size - 1 - i]; - - bfs_spill_stack_size += read_count; - memory_spill_stack_size -= read_count; - } - - // steal pending BFS work and assign it to free contexts - uniform uint num_steals = min(bfs_spill_stack_size, free_list_size); - - for (uniform uint i = 0; i < num_steals; i += get_sub_group_size()) - { - varying uint num_bfs_wgs = 0; - - if (i + lane < num_steals) - { - uint context_id = SLM_free_list[i+lane]; - struct VContext* context = &scheduler->contexts[context_id]; - struct BFS1SpillStackEntry entry = SLM_local_spill_stack[i+lane]; - - context->dispatch_primref_begin = entry.primref_begin; - context->dispatch_primref_end = entry.primref_end; - context->bvh2_root = entry.bvh2_root; - context->tree_depth = entry.tree_depth; - - num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE); - - scheduler->bfs_queue.wg_count[total_bfs_dispatches + i + lane] = num_bfs_wgs; - scheduler->bfs_queue.records[total_bfs_dispatches + i + lane].context_id = context_id; - - SLM_context_state[context_id] = VCONTEXT_STATE_EXECUTING; - } - - total_bfs_wgs += sub_group_reduce_add( num_bfs_wgs ); - } - - total_bfs_dispatches += num_steals; - - // write out excess spills to global spill stack - uniform uint extra_spills = bfs_spill_stack_size - num_steals; - for (varying uint i = lane; i < extra_spills; i += get_sub_group_size()) - { - scheduler->bfs2_spill_stack.entries[memory_spill_stack_size + i] = SLM_local_spill_stack[num_steals+i]; - } - - - // write out modified context states - for ( varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size()) - scheduler->vcontext_state[i] = SLM_context_state[i]; - - - if (get_local_id(0) == 0) - { - // write out new memory stack size - scheduler->bfs2_spill_stack.size = memory_spill_stack_size + extra_spills; - - // store workgroup counters - scheduler->bfs_queue.num_dispatches = total_bfs_dispatches; - scheduler->num_bfs_wgs = total_bfs_wgs; - scheduler->num_dfs_wgs = total_dfs_dispatches; - } - - // barrier(CLK_GLOBAL_MEM_FENCE); // make memory writes globally visible// lsc flush ... 
driver now does these as part of COMPUTE_WALKER -} -#endif - -#define SCHEDULER_SG_SIZE 16 -#define SCHEDULER_WG_SIZE BFS_NUM_VCONTEXTS -#define SCHEDULER_NUM_SGS (SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE) - - -struct BFSDispatchArgs get_bfs_args_from_record_batchable( - struct BFSDispatchRecord* record, - global struct VContextScheduler* scheduler, - global struct SAHBuildGlobals* globals_buffer ); - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(SCHEDULER_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(SCHEDULER_SG_SIZE))) -kernel void -scheduler(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) -{ - local struct BFS1SpillStackEntry SLM_local_spill_stack[2 * BFS_NUM_VCONTEXTS]; - local uint SLM_local_spill_stack_size; - local uint SLM_dfs_dispatch_count; - - if (get_local_id(0) == 0) - { - SLM_local_spill_stack_size = 0; - SLM_dfs_dispatch_count = 0; - } - - uint context_id = get_local_id(0); - uint state = scheduler->vcontext_state[context_id]; - uint initial_state = state; - - uint batch_index = 0; - global struct VContext* context = &scheduler->contexts[context_id]; - - barrier(CLK_LOCAL_MEM_FENCE); - - - uint global_spill_stack_size = scheduler->bfs2_spill_stack.size; - - - if (state == VCONTEXT_STATE_EXECUTING) - { - uint left_bvh2_root; - uint right_bvh2_root; - - uint num_left = context->num_left; - uint num_right = context->num_right; - - uint primref_begin = context->dispatch_primref_begin; - uint primref_end = context->dispatch_primref_end; - - uint depth = context->tree_depth; - uint batch_index = context->batch_index; - - struct BFSDispatchRecord record; - record.context_id = context_id; - record.batch_index = context->batch_index; - - struct BFSDispatchArgs args = get_bfs_args_from_record_batchable( &record, scheduler, sah_globals); - - // do cleanup of bfs_pass2 - { - // compute geom bounds - struct AABB3f left_geom_bounds; - struct AABB3f right_geom_bounds; - struct AABB3f left_centroid_bounds; - struct AABB3f right_centroid_bounds; - uint2 lr_counts = (uint2)(num_left, num_right); - - { - left_centroid_bounds = LRBounds_get_left_centroid( &context->lr_bounds ); - left_geom_bounds = LRBounds_get_left_geom( &context->lr_bounds ); - right_centroid_bounds = LRBounds_get_right_centroid( &context->lr_bounds ); - right_geom_bounds = LRBounds_get_right_geom( &context->lr_bounds ); - } - - int2 v_is_leaf = is_leaf_2( lr_counts ); - int2 v_is_dfs = is_dfs_2( lr_counts ); - int2 v_is_bfs = is_bfs_2( lr_counts ); - uint left_mask = args.do_mask_processing ? context->lr_mask & 0xff : 0xff; - uint right_mask = args.do_mask_processing ? (context->lr_mask & 0xff00) >> 8 : 0xff; - - // how many BVH2 nodes do we need to allocate? For DFS, we need to pre-allocate full subtree - uint2 lr_node_counts = select( (uint2)(1,1), (2*lr_counts-1), v_is_dfs ); - uint left_node_count = lr_node_counts.x; - uint right_node_count = lr_node_counts.y; - - // allocate the nodes - uint first_node = BVH2_AllocateNodes( args.bvh2, left_node_count + right_node_count ); - - // point our root node at its children - left_bvh2_root = first_node; - right_bvh2_root = first_node + left_node_count; - - // store combined geom bounds in the root node's AABB.. 
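The 2*lr_counts-1 reservation above is the usual node-count bound for a binary tree: a subtree that may be refined all the way down to one primref per leaf needs at most N leaves plus N-1 inner nodes. A quick plain-C check of that identity (illustrative only; the builder actually stops splitting at TREE_ARITY primrefs per leaf, so 2N-1 is an upper bound on what it reserves):

#include <assert.h>

/* number of nodes in a binary tree split down to single-primref leaves */
static unsigned bvh2_subtree_nodes(unsigned num_leaves)
{
    if (num_leaves <= 1)
        return 1;
    unsigned left = num_leaves / 2;
    return 1 + bvh2_subtree_nodes(left) + bvh2_subtree_nodes(num_leaves - left);
}

static void check_node_bound(void)
{
    for (unsigned n = 1; n < 512; n++)
        assert(bvh2_subtree_nodes(n) == 2 * n - 1);
}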
we previously stored centroid bounds there - // but node creation requires geom bounds - struct AABB3f geom_bounds = left_geom_bounds; - AABB3f_extend(&geom_bounds, &right_geom_bounds); - BVH2_WriteInnerNode( args.bvh2, args.bvh2_root, &geom_bounds, (uint2)(left_bvh2_root,right_bvh2_root), left_mask | right_mask ); - -// printf(" node: %u mask: %x\n", args.bvh2_root, left_mask|right_mask ); - - // store the appropriate AABBs in the child nodes - // - BFS passes need centroid bounds - // - DFS passes need geom bounds - // Here we also write leaf connectivity information (prim start+count) - // this will be overwritten later if we are creating an inner node - struct AABB3f left_box, right_box; - left_box = AABB3f_select( left_geom_bounds, left_centroid_bounds, v_is_bfs.xxx ); - right_box = AABB3f_select( right_geom_bounds, right_centroid_bounds, v_is_bfs.yyy ); - - uint left_start = primref_begin; - uint right_start = primref_begin + num_left; - BVH2_WriteLeafNode( args.bvh2, left_bvh2_root, &left_box, left_start, num_left, left_mask ); - BVH2_WriteLeafNode( args.bvh2, right_bvh2_root, &right_box, right_start, num_right, right_mask ); - - // make input and output primref index buffers consistent in the event we're creating a leaf - // There should only ever be one leaf created, otherwise we'd have done a DFS pass sooner - if (any( v_is_leaf.xy )) - { - uint start = v_is_leaf.x ? left_start : right_start; - uint num_refs = v_is_leaf.x ? num_left : num_right; - - for(uint i = 0; i < num_refs; i++) - { - args.primref_index_in[start + i] = args.primref_index_out[start + i]; - } - } - } - - // when BFS2 finishes, we need to dispatch two child tasks. - // DFS dispatches can run free and do not need a context - // BFS dispatches need a context. - // In the case where both of the child nodes are BFS, the current context can immediately run one of the child dispatches - // and the other is spilled for an unallocated context to pick up - - uint num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right); - if (num_dfs_dispatches) - { - uint dfs_pos = atomic_add_local(&SLM_dfs_dispatch_count, num_dfs_dispatches); - if (is_dfs(num_left)) - { - scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin; - scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left; - scheduler->dfs_queue.records[dfs_pos].bvh2_base = left_bvh2_root; - scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; - scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index; - dfs_pos++; - } - if (is_dfs(num_right)) - { - scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left; - scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right; - scheduler->dfs_queue.records[dfs_pos].bvh2_base = right_bvh2_root; - scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1; - scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index; - } - } - - uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right); - if (num_bfs_children) - { - uint place = atomic_add_local(&SLM_local_spill_stack_size, num_bfs_children); - if (is_bfs(num_left)) - { - SLM_local_spill_stack[place].primref_begin = primref_begin; - SLM_local_spill_stack[place].primref_end = primref_begin + num_left; - SLM_local_spill_stack[place].bvh2_root = left_bvh2_root; - SLM_local_spill_stack[place].tree_depth = depth + 1; - SLM_local_spill_stack[place].batch_index = batch_index; - place++; - } - if (is_bfs(num_right)) - { - SLM_local_spill_stack[place].primref_begin = primref_begin + num_left; - 
SLM_local_spill_stack[place].primref_end = primref_end; - SLM_local_spill_stack[place].bvh2_root = right_bvh2_root; - SLM_local_spill_stack[place].tree_depth = depth + 1; - SLM_local_spill_stack[place].batch_index = batch_index; - place++; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - uint local_spill_stack_size = SLM_local_spill_stack_size; - - struct BFS1SpillStackEntry entry; - state = VCONTEXT_STATE_UNALLOCATED; - if (context_id < local_spill_stack_size) - { - // pull BFS work from the local spill stack if there's enough work there - entry = SLM_local_spill_stack[context_id]; - state = VCONTEXT_STATE_EXECUTING; - } - else if ((context_id - local_spill_stack_size) < (global_spill_stack_size)) - { - // if there isn't enough work on the local stack, consume from the global one - uint global_pos = (global_spill_stack_size - 1) - (context_id - local_spill_stack_size); - entry = scheduler->bfs2_spill_stack.entries[global_pos]; - state = VCONTEXT_STATE_EXECUTING; - } - - // contexts which received work set themselves up for the next BFS1 dispatch - uint num_bfs_wgs = 0; - uint num_bfs_dispatches = 0; - if (state == VCONTEXT_STATE_EXECUTING) - { - context->dispatch_primref_begin = entry.primref_begin; - context->dispatch_primref_end = entry.primref_end; - context->bvh2_root = entry.bvh2_root; - context->tree_depth = entry.tree_depth; - context->batch_index = entry.batch_index; - - context->num_left = 0; - context->num_right = 0; - context->lr_mask = 0; - - batch_index = entry.batch_index; - num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE); - num_bfs_dispatches = 1; - } - - - if (local_spill_stack_size > BFS_NUM_VCONTEXTS) - { - // write out additional spills if we produced more work than we can consume - uint excess_spills = local_spill_stack_size - BFS_NUM_VCONTEXTS; - uint write_base = global_spill_stack_size; - uint lid = get_local_id(0); - if (lid < excess_spills) - scheduler->bfs2_spill_stack.entries[write_base + lid] = SLM_local_spill_stack[BFS_NUM_VCONTEXTS + lid]; - - if (lid == 0) - scheduler->bfs2_spill_stack.size = global_spill_stack_size + excess_spills; - } - else if (global_spill_stack_size > 0) - { - // otherwise, if we consumed any spills from the global stack, update the stack size - if (get_local_id(0) == 0) - { - uint global_spills_consumed = min(global_spill_stack_size, BFS_NUM_VCONTEXTS - local_spill_stack_size); - scheduler->bfs2_spill_stack.size = global_spill_stack_size - global_spills_consumed; - } - } - - - // Do various WG reductions.. 
the code below is a hand-written version of the following: - // - // uint bfs_dispatch_queue_pos = work_group_scan_exclusive_add( num_bfs_dispatches ); - // uint reduce_num_bfs_wgs = work_group_reduce_add(num_bfs_wgs); - // uint reduce_num_bfs_dispatches = work_group_reduce_add(num_bfs_dispatches); - uint bfs_dispatch_queue_pos; - uint reduce_num_bfs_dispatches; - uint reduce_num_bfs_wgs; - local uint partial_dispatches[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE]; - local uint partial_wgs[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE]; - { - partial_dispatches[get_sub_group_id()] = sub_group_reduce_add(num_bfs_dispatches); - partial_wgs[get_sub_group_id()] = sub_group_reduce_add(num_bfs_wgs); - - uint sg_prefix = sub_group_scan_exclusive_add(num_bfs_dispatches); - - uint prefix_dispatches = 0; - uint total_dispatches = 0; - uint total_wgs = 0; - ushort lane = get_sub_group_local_id(); - - barrier(CLK_LOCAL_MEM_FENCE); - - for (ushort i = 0; i < SCHEDULER_NUM_SGS; i += SCHEDULER_SG_SIZE) // this loop is intended to be fully unrolled after compilation - { - uint p_dispatch = partial_dispatches[i + lane]; - uint p_wg = partial_wgs[i + lane]; - - prefix_dispatches += (i + lane < get_sub_group_id()) ? p_dispatch : 0; - total_dispatches += p_dispatch; - total_wgs += p_wg; - } - - bfs_dispatch_queue_pos = sg_prefix + sub_group_reduce_add(prefix_dispatches); - reduce_num_bfs_dispatches = sub_group_reduce_add(total_dispatches); - reduce_num_bfs_wgs = sub_group_reduce_add(total_wgs); - } - - // insert records into BFS queue - if (num_bfs_dispatches) - { - scheduler->bfs_queue.wg_count[bfs_dispatch_queue_pos] = num_bfs_wgs; - scheduler->bfs_queue.records[bfs_dispatch_queue_pos].context_id = context_id; - scheduler->bfs_queue.records[bfs_dispatch_queue_pos].batch_index = batch_index; - } - - - // store modified vcontext state if it has changed - if (initial_state != state) - scheduler->vcontext_state[context_id] = state; - - - // store workgroup counters - if (get_local_id(0) == 0) - { - scheduler->bfs_queue.num_dispatches = reduce_num_bfs_dispatches; - scheduler->num_bfs_wgs = reduce_num_bfs_wgs; - scheduler->num_dfs_wgs = SLM_dfs_dispatch_count; - } - - const uint contexts_to_clear = min( (uint)BFS_NUM_VCONTEXTS, (uint)(local_spill_stack_size+global_spill_stack_size) ); - - for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() ) - BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info ); - - for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() ) - LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds ); -} - -#if 0 -uint record_search( struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue ) -{ - uint group = get_group_id(0); - ushort lane = get_sub_group_local_id(); - uint num_dispatches = queue->num_dispatches; - uint base = 0; - for (uint i = 0; i < num_dispatches; i += get_sub_group_size()) - { - uint counts = intel_sub_group_block_read(&queue->wg_count[i]); - - for (uint j = 0; j < get_sub_group_size(); j++) - { - uint n = sub_group_broadcast(counts, j); - if (group < n) - { - *record_out = queue->records[i + j]; - return group; - } - group -= n; - } - } - - return 0; // NOTE: unreachable in practice -} -#endif - - -uint record_search(struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue) -{ - uint group = get_group_id(0); - - uint num_dispatches = queue->num_dispatches; - - uint dispatch_id = 0; - uint local_id = 0; - uint i = 0; - do - { - uint counts = 
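The comment above names the intent; a plain-C emulation of the same two-level scheme may make it easier to follow. Each subgroup writes its partial sum into shared memory, and every lane then combines its subgroup-local exclusive scan with the partials of all preceding subgroups; summing all partials likewise gives the workgroup totals. SG_SIZE and WG_SIZE below are illustrative stand-ins for SCHEDULER_SG_SIZE and SCHEDULER_WG_SIZE.

#define SG_SIZE 16
#define WG_SIZE 256
#define NUM_SGS (WG_SIZE / SG_SIZE)

static void wg_exclusive_scan_add(const unsigned v[WG_SIZE], unsigned out[WG_SIZE])
{
    unsigned partials[NUM_SGS];

    /* stage 1: per-subgroup reduction (sub_group_reduce_add) */
    for (int sg = 0; sg < NUM_SGS; sg++) {
        partials[sg] = 0;
        for (int lane = 0; lane < SG_SIZE; lane++)
            partials[sg] += v[sg * SG_SIZE + lane];
    }

    /* stage 2: subgroup-local exclusive scan plus the sum of earlier subgroups */
    for (int sg = 0; sg < NUM_SGS; sg++) {
        unsigned earlier = 0;
        for (int s = 0; s < sg; s++)
            earlier += partials[s];

        unsigned sg_prefix = 0;
        for (int lane = 0; lane < SG_SIZE; lane++) {
            out[sg * SG_SIZE + lane] = earlier + sg_prefix;
            sg_prefix += v[sg * SG_SIZE + lane];
        }
    }
}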
intel_sub_group_block_read(&queue->wg_count[i]); - uint prefix = sub_group_scan_exclusive_add(counts); - - uint g = group - prefix; - uint ballot = intel_sub_group_ballot(g < counts); - if (ballot) - { - uint lane = ctz(ballot); - dispatch_id = i + lane; - local_id = intel_sub_group_shuffle(g, lane); - break; - } - - group -= sub_group_broadcast(prefix + counts, get_sub_group_size() - 1); - - i += get_sub_group_size(); - } while (i < num_dispatches); - - - *record_out = queue->records[dispatch_id]; - return local_id; -} - - - - -struct BFSDispatchArgs get_bfs_args(struct BFSDispatchRecord* record, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals, uint local_group_id) -{ - uint context_id = record->context_id; - struct VContext* context = &scheduler->contexts[context_id]; - bool odd_pass = context->tree_depth & 1; - - struct BFSDispatchArgs args; - args.scheduler = scheduler; - args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, odd_pass ); - args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, odd_pass ); - args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); - args.wg_primref_begin = context->dispatch_primref_begin + local_group_id * BFS_WG_SIZE; - args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, context->dispatch_primref_end ); - args.dispatch_primref_begin = context->dispatch_primref_begin; - args.dispatch_primref_end = context->dispatch_primref_end; - args.context_id = context_id; - args.context = &scheduler->contexts[context_id]; - args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE; - args.bvh2_root = context->bvh2_root; - args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); - args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); - args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals ); - return args; -} - -struct BFSDispatchArgs get_bfs_args_queue( global struct BFSDispatchQueue* queue, - global struct VContextScheduler* scheduler, - global struct SAHBuildGlobals* globals ) -{ - - // TODO_OPT: Load this entire prefix array into SLM instead of searching.. - // Or use sub-group ops - - struct BFSDispatchRecord record; - uint local_group_id = record_search(&record, queue); - - return get_bfs_args(&record, scheduler, globals, local_group_id); -} - - -struct BFSDispatchArgs get_bfs_args_from_record( struct BFSDispatchRecord* record, - global struct VContextScheduler* scheduler, - global struct SAHBuildGlobals* globals ) -{ - return get_bfs_args(record, scheduler, globals, 0); -} - - -struct BFSDispatchArgs get_bfs_args_batchable( - global struct BFSDispatchQueue* queue, - global struct VContextScheduler* scheduler, - global struct SAHBuildGlobals* globals_buffer ) -{ - - // TODO_OPT: Load this entire prefix array into SLM instead of searching.. 
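What record_search computes, restated as a scalar plain-C sketch: each dispatch i owns wg_count[i] consecutive hardware workgroups, and a flat group id is resolved to a dispatch index plus a group id local to that dispatch by walking the running prefix sum. The kernel above performs the same walk a whole subgroup of counts at a time, using a ballot to find the first dispatch whose range contains the group.

static unsigned flat_group_to_dispatch(const unsigned *wg_count,
                                       unsigned num_dispatches,
                                       unsigned group,
                                       unsigned *local_group_out)
{
    for (unsigned i = 0; i < num_dispatches; i++) {
        if (group < wg_count[i]) {
            *local_group_out = group;   /* group id within dispatch i */
            return i;
        }
        group -= wg_count[i];
    }
    /* callers are expected not to launch more groups than the queue holds */
    *local_group_out = 0;
    return 0;
}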
- // Or use sub-group ops - - struct BFSDispatchRecord record; - uint local_group_id = record_search(&record, queue); - - global struct SAHBuildGlobals* globals = globals_buffer + record.batch_index; - - return get_bfs_args(&record, scheduler, globals, local_group_id); -} - - -struct BFSDispatchArgs get_bfs_args_from_record_batchable( - struct BFSDispatchRecord* record, - global struct VContextScheduler* scheduler, - global struct SAHBuildGlobals* globals_buffer ) -{ - global struct SAHBuildGlobals* globals = globals_buffer + record->batch_index; - - return get_bfs_args(record, scheduler, globals, 0); -} - -struct BFSDispatchArgs get_bfs_args_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals ) -{ - uint context_id = 0; - - uint num_refs = SAHBuildGlobals_GetTotalPrimRefs( globals ); - - struct BFSDispatchArgs args; - args.scheduler = scheduler; - args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, false ); - args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, false ); - args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); - args.wg_primref_begin = get_group_id(0) * BFS_WG_SIZE; - args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, num_refs ); - args.dispatch_primref_begin = 0; - args.dispatch_primref_end = num_refs; - args.context_id = context_id; - args.context = &scheduler->contexts[context_id]; - args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE; - args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); - args.bvh2_root = BVH2_GetRoot( args.bvh2 ); - args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); - args.do_mask_processing = SAHBuildGlobals_NeedMasks(globals); - return args; -} - - -inline void BinMapping_init( struct BinMapping* binMapping, struct AABB3f* centBounds, const uint bins ) -{ - const float4 eps = 1E-34f; - const float4 omega = 1E+34f; - float3 l = AABB3f_load_lower( centBounds ); - float3 u = AABB3f_load_upper( centBounds ); - float4 diag; - diag.xyz = max( eps.xyz, u - l ); - diag.w = 0; - float4 scale = (float4)(0.99f * (float)bins) / diag; - scale = select( (float4)(0.0f), scale, diag > eps ); - scale = select( (float4)(0.0f), scale, diag < omega ); - binMapping->scale = scale; - binMapping->ofs.xyz = l.xyz; - binMapping->ofs.w = 0; -} - - -inline ulong getBestSplit( float3 sah, uint ID, const float4 scale, const ulong defaultSplit ) -{ - ulong splitX = (((ulong)as_uint( sah.x )) << 32) | ((uint)ID << 2) | 0; - ulong splitY = (((ulong)as_uint( sah.y )) << 32) | ((uint)ID << 2) | 1; - ulong splitZ = (((ulong)as_uint( sah.z )) << 32) | ((uint)ID << 2) | 2; - /* ignore zero sized dimensions */ - splitX = select( splitX, defaultSplit, (ulong)(scale.x == 0) ); - splitY = select( splitY, defaultSplit, (ulong)(scale.y == 0) ); - splitZ = select( splitZ, defaultSplit, (ulong)(scale.z == 0) ); - ulong bestSplit = min( min( splitX, splitY ), splitZ ); - bestSplit = sub_group_reduce_min( bestSplit ); - return bestSplit; -} - - - -inline float left_to_right_area16( struct AABB3f* low ) -{ - struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low ); - return halfArea_AABB3f( &low_prefix ); -} - -inline uint left_to_right_counts16( uint low ) -{ - return sub_group_scan_exclusive_add( low ); -} - -inline float right_to_left_area16( struct AABB3f* low ) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - const uint ID = subgroup_size - 1 
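getBestSplit above leans on a common packing trick: because non-negative IEEE-754 floats order the same way as their raw bit patterns, the SAH cost can sit in the high 32 bits of a 64-bit key with the bin index and axis in the low bits, so a single unsigned min reduction selects the cheapest split and breaks ties deterministically. A plain-C sketch of the encoding (illustrative, not the original helpers):

#include <stdint.h>
#include <string.h>

static uint64_t make_split_key(float sah, uint32_t bin, uint32_t axis)
{
    uint32_t sah_bits;
    memcpy(&sah_bits, &sah, sizeof(sah_bits));        /* as_uint(sah) */
    return ((uint64_t)sah_bits << 32) | (bin << 2) | axis;
}

static void unpack_split_key(uint64_t key, float *sah, uint32_t *bin, uint32_t *axis)
{
    uint32_t sah_bits = (uint32_t)(key >> 32);
    memcpy(sah, &sah_bits, sizeof(*sah));
    *axis = (uint32_t)key & 3;
    *bin  = ((uint32_t)key & 0xffffffffu) >> 2;
}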
- subgroupLocalID; - struct AABB3f low_reverse = AABB3f_sub_group_shuffle( low, ID ); - struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse ); - const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID ); - return low_area; -} - -inline uint right_to_left_counts16( uint low ) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - const uint ID = subgroup_size - 1 - subgroupLocalID; - const uint low_reverse = intel_sub_group_shuffle( low, ID ); - const uint low_prefix = sub_group_scan_inclusive_add( low_reverse ); - return intel_sub_group_shuffle( low_prefix, ID ); -} - -inline float2 left_to_right_area32( struct AABB3f* low, struct AABB3f* high ) -{ - struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low ); - struct AABB3f low_reduce = AABB3f_sub_group_reduce( low ); - struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max( high ); - AABB3f_extend( &high_prefix, &low_reduce ); - const float low_area = halfArea_AABB3f( &low_prefix ); - const float high_area = halfArea_AABB3f( &high_prefix ); - return (float2)(low_area, high_area); -} - -inline uint2 left_to_right_counts32( uint low, uint high ) -{ - const uint low_prefix = sub_group_scan_exclusive_add( low ); - const uint low_reduce = sub_group_reduce_add( low ); - const uint high_prefix = sub_group_scan_exclusive_add( high ); - return (uint2)(low_prefix, low_reduce + high_prefix); -} - -inline float2 right_to_left_area32( struct AABB3f* low, struct AABB3f* high ) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - const uint ID = subgroup_size - 1 - subgroupLocalID; - struct AABB3f low_reverse = AABB3f_sub_group_shuffle( high, ID ); - struct AABB3f high_reverse = AABB3f_sub_group_shuffle( low, ID ); - struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse ); - struct AABB3f low_reduce = AABB3f_sub_group_reduce( &low_reverse ); - struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max( &high_reverse ); - AABB3f_extend( &high_prefix, &low_reduce ); - const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &high_prefix ), ID ); - const float high_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID ); - return (float2)(low_area, high_area); -} - -inline uint2 right_to_left_counts32( uint low, uint high ) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - const uint ID = subgroup_size - 1 - subgroupLocalID; - const uint low_reverse = intel_sub_group_shuffle( high, ID ); - const uint high_reverse = intel_sub_group_shuffle( low, ID ); - const uint low_prefix = sub_group_scan_inclusive_add( low_reverse ); - const uint low_reduce = sub_group_reduce_add( low_reverse ); - const uint high_prefix = sub_group_scan_inclusive_add( high_reverse ) + low_reduce; - return (uint2)(intel_sub_group_shuffle( high_prefix, ID ), intel_sub_group_shuffle( low_prefix, ID )); -} - -inline uint fastDivideBy6_uint( uint v ) -{ -#if 1 - const ulong u = (ulong)v >> 1; - return (uint)((u * 0x55555556ul) >> 32); -#else - return v / 6; -#endif -} - -inline uint3 fastDivideBy6_uint3( uint3 v ) -{ - return (uint3)(fastDivideBy6_uint( v.x ), fastDivideBy6_uint( v.y ), fastDivideBy6_uint( v.z )); -} - -#define SAH_LOG_BLOCK_SHIFT 2 - -inline struct BFS_Split BinInfo_reduce( struct BFS_BinInfo* binInfo, const float4 scale ) -{ - const uint subgroupLocalID = 
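fastDivideBy6_uint above is the classic multiply-by-reciprocal division: v/6 is taken as (v/2)/3, and the divide by 3 becomes a multiply by 0x55555556 (roughly 2^32/3) followed by a 32-bit shift. A plain-C version with an exhaustive check over a small range:

#include <assert.h>
#include <stdint.h>

static uint32_t fast_div6(uint32_t v)
{
    uint64_t u = (uint64_t)v >> 1;                 /* v / 2     */
    return (uint32_t)((u * 0x55555556ull) >> 32);  /* (v/2) / 3 */
}

static void check_div6(void)
{
    for (uint32_t v = 0; v < (1u << 20); v++)
        assert(fast_div6(v) == v / 6);
}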
get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - - struct AABB3f boundsX = BinInfo_get_AABB( binInfo, subgroupLocalID, 0 ); - - const float lr_areaX = left_to_right_area16( &boundsX ); - const float rl_areaX = right_to_left_area16( &boundsX ); - - struct AABB3f boundsY = BinInfo_get_AABB( binInfo, subgroupLocalID, 1 ); - - const float lr_areaY = left_to_right_area16( &boundsY ); - const float rl_areaY = right_to_left_area16( &boundsY ); - - struct AABB3f boundsZ = BinInfo_get_AABB( binInfo, subgroupLocalID, 2 ); - - const float lr_areaZ = left_to_right_area16( &boundsZ ); - const float rl_areaZ = right_to_left_area16( &boundsZ ); - - const uint3 counts = BinInfo_get_counts( binInfo, subgroupLocalID ); - - const uint lr_countsX = left_to_right_counts16( counts.x ); - const uint rl_countsX = right_to_left_counts16( counts.x ); - const uint lr_countsY = left_to_right_counts16( counts.y ); - const uint rl_countsY = right_to_left_counts16( counts.y ); - const uint lr_countsZ = left_to_right_counts16( counts.z ); - const uint rl_countsZ = right_to_left_counts16( counts.z ); - - const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ); - const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ); - - const uint3 lr_count = fastDivideBy6_uint3( (uint3)(lr_countsX, lr_countsY, lr_countsZ) + 6 - 1 ); - const uint3 rl_count = fastDivideBy6_uint3( (uint3)(rl_countsX, rl_countsY, rl_countsZ) + 6 - 1 ); - float3 sah = fma( lr_area, convert_float3( lr_count ), rl_area * convert_float3( rl_count ) ); - - /* first bin is invalid */ - sah.x = select( (float)(INFINITY), sah.x, subgroupLocalID != 0 ); - sah.y = select( (float)(INFINITY), sah.y, subgroupLocalID != 0 ); - sah.z = select( (float)(INFINITY), sah.z, subgroupLocalID != 0 ); - - const ulong defaultSplit = (((ulong)as_uint( (float)(INFINITY) )) << 32); - - const ulong bestSplit = getBestSplit( sah, subgroupLocalID, scale, defaultSplit ); - - struct BFS_Split split; - split.sah = as_float( (uint)(bestSplit >> 32) ); - split.dim = (uint)bestSplit & 3; - split.pos = (uint)bestSplit >> 2; - - return split; -} - - -struct BFS_BinInfoReduce3_SLM -{ - uint sah[3*BFS_NUM_BINS]; -}; - - - -inline struct BFS_Split BinInfo_reduce3( local struct BFS_BinInfoReduce3_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale ) -{ - // process each bin/axis combination across sub-groups - for (uint i = get_sub_group_id(); i < 3 * BFS_NUM_BINS; i += get_num_sub_groups()) - { - uint my_bin = i % BFS_NUM_BINS; - uint my_axis = i / BFS_NUM_BINS; - - float3 left_lower = (float3)(INFINITY,INFINITY,INFINITY); - float3 left_upper = -left_lower; - float3 right_lower = (float3)(INFINITY,INFINITY,INFINITY); - float3 right_upper = -right_lower; - - // load the other bins and assign them to the left or to the right - // of this subgroup's bin - uint lane = get_sub_group_local_id(); - struct AABB3f sg_bins = BinInfo_get_AABB(binInfo,lane,my_axis); - - bool is_left = (lane < my_bin); - float3 lower = AABB3f_load_lower(&sg_bins); - float3 upper = AABB3f_load_upper(&sg_bins); - - float3 lower_l = select_min( lower, is_left ); - float3 upper_l = select_max( upper, is_left ); - float3 lower_r = select_min( lower, !is_left ); - float3 upper_r = select_max( upper, !is_left ); - - lower_l = sub_group_reduce_min_float3( lower_l ); - lower_r = sub_group_reduce_min_float3( lower_r ); - upper_l = sub_group_reduce_max_float3( upper_l ); - upper_r = sub_group_reduce_max_float3( upper_r ); - float3 dl = upper_l - lower_l; - float3 dr = upper_r - lower_r; - 
float area_l = dl.x* (dl.y + dl.z) + (dl.y * dl.z); - float area_r = dr.x* (dr.y + dr.z) + (dr.y * dr.z); - - // get the counts - uint sg_bin_count = BinInfo_get_count(binInfo, lane, my_axis); - uint count_l = (is_left) ? sg_bin_count : 0; - uint count_r = (is_left) ? 0 : sg_bin_count; - count_l = sub_group_reduce_add(count_l); - count_r = sub_group_reduce_add(count_r); - - // compute sah - count_l = fastDivideBy6_uint(count_l + 6 - 1); - count_r = fastDivideBy6_uint(count_r + 6 - 1); - float lr_partial = area_l * count_l; - float rl_partial = area_r * count_r; - float sah = lr_partial + rl_partial; - - // first bin is invalid - sah = select((float)(INFINITY), sah, my_bin != 0); - - // ignore zero sized dimensions - sah = select( sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) ); - sah = select( sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) ); - sah = select( sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) ); - - // tuck the axis into the bottom bits of sah cost. - // The result is an integer between 0 and +inf (7F800000) - // If we have 3 axes with infinite sah cost, we will select axis 0 - slm->sah[i] = (as_uint(sah)&~0x3) | my_axis; - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - // reduce split candidates down to one subgroup - // sah is strictly positive, so integer compares can be used - // which results in a faster sub_group_reduce_min() - // - uint best_sah = 0xffffffff; - - uint lid = get_sub_group_local_id(); - if (lid < BFS_NUM_BINS) - { - best_sah = slm->sah[lid]; - lid += BFS_NUM_BINS; - best_sah = min( best_sah, slm->sah[lid] ); - lid += BFS_NUM_BINS; - best_sah = min( best_sah, slm->sah[lid] ); - } - - uint reduced_bestsah = sub_group_reduce_min( best_sah ); - uint best_bin = ctz(intel_sub_group_ballot(best_sah == reduced_bestsah)); - uint best_axis = as_uint(reduced_bestsah) & 0x3; - - struct BFS_Split ret; - ret.sah = as_float(reduced_bestsah); - ret.dim = best_axis; - ret.pos = best_bin; - return ret; -} - - -struct BFS_BinInfoReduce_SLM -{ - struct - { - float sah; - uint bin; - } axisInfo[3]; -}; - - - -inline struct BFS_Split BinInfo_reduce2( local struct BFS_BinInfoReduce_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale, uint num_primrefs) -{ - ushort my_axis = get_sub_group_id(); - ushort my_bin = get_sub_group_local_id(); - - if (my_axis < 3) - { - struct AABB3f aabb = BinInfo_get_AABB(binInfo, my_bin, my_axis); - uint count = BinInfo_get_count(binInfo, my_bin, my_axis); - - float lr_area = left_to_right_area16(&aabb); - float rl_area = right_to_left_area16(&aabb); - - uint lr_count = sub_group_scan_exclusive_add(count); - uint rl_count = num_primrefs - lr_count; - - lr_count = fastDivideBy6_uint(lr_count + 6 - 1); - rl_count = fastDivideBy6_uint(rl_count + 6 - 1); - float lr_partial = lr_area * lr_count; - float rl_partial = rl_area * rl_count; - float sah = lr_partial + rl_partial; - - // first bin is invalid - sah = select((float)(INFINITY), sah, my_bin != 0); - - float best_sah = sub_group_reduce_min( sah ); - uint best_bin = ctz(intel_sub_group_ballot(sah == best_sah)); - - // ignore zero sized dimensions - best_sah = select( best_sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) ); - best_sah = select( best_sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) ); - best_sah = select( best_sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) ); - - if (get_sub_group_local_id() == 0) - { - slm->axisInfo[my_axis].sah = best_sah; - slm->axisInfo[my_axis].bin = best_bin; - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - float sah = 
(float)(INFINITY); - if( get_sub_group_local_id() < 3 ) - sah = slm->axisInfo[get_sub_group_local_id()].sah; - - float bestsah = min(sub_group_broadcast(sah, 0), min(sub_group_broadcast(sah, 1), sub_group_broadcast(sah, 2))); - uint bestAxis = ctz( intel_sub_group_ballot(bestsah == sah) ); - - struct BFS_Split split; - split.sah = bestsah; - split.dim = bestAxis; - split.pos = slm->axisInfo[bestAxis].bin; - return split; -} - - -inline bool is_left( struct BinMapping* binMapping, struct BFS_Split* split, struct AABB* primref ) -{ - const uint dim = split->dim; - const float lower = primref->lower[dim]; - const float upper = primref->upper[dim]; - const float c = lower + upper; - const uint pos = convert_uint_rtz( (c - binMapping->ofs[dim]) * binMapping->scale[dim] ); - return pos < split->pos; -} - -struct BFS_Pass1_SLM -{ - struct BFS_BinInfo bin_info; -// struct BFS_BinInfoReduce3_SLM reduce3; -}; - - -void DO_BFS_pass1( local struct BFS_Pass1_SLM* slm, - uint thread_primref_id, - bool thread_primref_valid, - struct BFSDispatchArgs args - ) -{ - local struct BFS_BinInfo* local_bin_info = &slm->bin_info; - global struct VContext* context = args.context; - struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root ); // root AABB is initialized to centroid bounds - - struct BinMapping bin_mapping; - BinMapping_init( &bin_mapping, ¢roid_bounds, BFS_NUM_BINS ); - - // fetch this thread's primref - PrimRef ref; - if ( thread_primref_valid ) - ref = args.primref_buffer[thread_primref_id]; - - // init bin info - BinInfo_init( local_bin_info ); - - // fence on local bin-info init - barrier( CLK_LOCAL_MEM_FENCE ); - - // merge this thread's primref into local bin info - BinInfo_add_primref( &bin_mapping, local_bin_info, &ref, thread_primref_valid ); - - // fence on local bin-info update - barrier( CLK_LOCAL_MEM_FENCE ); - - BinInfo_merge(&context->global_bin_info, local_bin_info); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size(BFS_WG_SIZE,1,1))) -__attribute__((intel_reqd_sub_group_size(16))) -kernel void -BFS_pass1_indexed( - global struct VContextScheduler* scheduler, - global struct SAHBuildGlobals* sah_globals ) -{ - local struct BFS_Pass1_SLM slm; - struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals ); - - bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end; - uint thread_primref_id = 0; - if ( thread_primref_valid ) - thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )]; - - DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args ); -} - - -__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) -__attribute__((intel_reqd_sub_group_size(16))) -kernel void -BFS_pass1_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) -{ - local struct BFS_Pass1_SLM slm; - struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals ); - - uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 ); - bool thread_primref_valid = thread_primref_id < args.wg_primref_end; - - DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args ); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -kernel void -BFS_pass1_indexed_batchable( - global struct VContextScheduler* scheduler, - global struct SAHBuildGlobals* globals_buffer ) -{ - local struct BFS_Pass1_SLM slm; - struct 
BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer ); - - bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end; - uint thread_primref_id = 0; - if (thread_primref_valid) - thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)]; - - DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -kernel void -BFS_pass1_initial_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) -{ - local struct BFS_Pass1_SLM slm; - struct BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer ); - - uint thread_primref_id = args.wg_primref_begin + get_local_id(0); - bool thread_primref_valid = thread_primref_id < args.wg_primref_end; - - DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args); -} - - -///////////////////////////////////////////////////////////////////////////////////////////////// -/// -/// BVH2 construction -- BFS Phase Pass2 -/// -///////////////////////////////////////////////////////////////////////////////////////////////// - -struct BFS_Pass2_SLM -{ - struct BFS_BinInfoReduce3_SLM reduce3; - //struct AABB3f left_centroid_bounds; - //struct AABB3f right_centroid_bounds; - //struct AABB3f left_geom_bounds; - //struct AABB3f right_geom_bounds; - LRBounds lr_bounds; - uint left_count; - uint right_count; - uint lr_mask; - uint left_primref_base; - uint right_primref_base; -// uint num_wgs; - -// uint output_indices[BFS_WG_SIZE]; -}; - - - - - - - -void DO_BFS_pass2( - local struct BFS_Pass2_SLM* slm, - uint thread_primref_id, - bool thread_primref_valid, - struct BFSDispatchArgs args -) -{ - global struct VContext* context = args.context; - - struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root ); - - // load the thread's primref - PrimRef ref; - if ( thread_primref_valid ) - ref = args.primref_buffer[thread_primref_id]; - - struct BinMapping bin_mapping; - BinMapping_init( &bin_mapping, ¢roid_bounds, BFS_NUM_BINS ); - - // initialize working SLM space - LRBounds_init(&slm->lr_bounds); - if(get_local_id(0) == 0) - { - slm->left_count = 0; - slm->right_count = 0; - - if( args.do_mask_processing ) - slm->lr_mask = 0; - } - - // compute split - every workgroup does the same computation - // local barrier inside BinInfo_reduce3 - struct BFS_Split split = BinInfo_reduce3( &slm->reduce3, &context->global_bin_info,bin_mapping.scale ); - - uint wg_prim_count = args.wg_primref_end - args.wg_primref_begin; - - // partition primrefs into L/R subsets... - bool go_left = false; - if (split.sah == (float)(INFINITY)) // no valid split, split in the middle.. 
This can happen due to floating-point limit cases in huge scenes - go_left = get_local_id(0) < (wg_prim_count / 2); - else - go_left = is_left( &bin_mapping, &split, &ref ); - - // assign this primref a position in the output array, and expand corresponding centroid-bounds - uint local_index; - { - float3 centroid = ref.lower.xyz + ref.upper.xyz; - - uint l_ballot = intel_sub_group_ballot( go_left && thread_primref_valid ); - uint r_ballot = intel_sub_group_ballot( !go_left && thread_primref_valid ); - if (l_ballot) - { - bool active_lane = l_ballot & (1 << get_sub_group_local_id()); - float3 Cmin, Cmax, Gmin, Gmax; - Cmin = select_min( centroid.xyz, active_lane ); - Cmax = select_max( centroid.xyz, active_lane ); - Gmin = select_min( ref.lower.xyz, active_lane ); - Gmax = select_max( ref.upper.xyz, active_lane ); - - Cmin = sub_group_reduce_min_float3( Cmin ); - Cmax = sub_group_reduce_max_float3( Cmax ); - Gmin = sub_group_reduce_min_float3( Gmin ); - Gmax = sub_group_reduce_max_float3( Gmax ); - - LRBounds_merge_left( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax ); - } - - if (r_ballot) - { - bool active_lane = r_ballot & (1 << get_sub_group_local_id()); - float3 Cmin, Cmax, Gmin, Gmax; - Cmin = select_min(centroid.xyz, active_lane); - Cmax = select_max(centroid.xyz, active_lane); - Gmin = select_min(ref.lower.xyz, active_lane); - Gmax = select_max(ref.upper.xyz, active_lane); - - Cmin = sub_group_reduce_min_float3(Cmin); - Cmax = sub_group_reduce_max_float3(Cmax); - Gmin = sub_group_reduce_min_float3(Gmin); - Gmax = sub_group_reduce_max_float3(Gmax); - - LRBounds_merge_right( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax ); - } - - if( args.do_mask_processing ) - { - uint mask =0; - if (thread_primref_valid) - { - mask = PRIMREF_instanceMask(&ref) ; - mask = go_left ? mask : mask<<8; - } - - // TODO OPT: there is no 'sub_group_reduce_or' and IGC does not do the reduction trick - // for atomics on sub-group uniform addresses - for( uint i= get_sub_group_size()/2; i>0; i/= 2) - mask = mask | intel_sub_group_shuffle_down(mask,mask,i); - if( get_sub_group_local_id() == 0 ) - atomic_or_local( &slm->lr_mask, mask ); - } - - uint l_base = 0; - uint r_base = 0; - if( get_sub_group_local_id() == 0 && l_ballot ) - l_base = atomic_add_local( &slm->left_count, popcount(l_ballot) ); - if( get_sub_group_local_id() == 0 && r_ballot ) - r_base = atomic_add_local( &slm->right_count, popcount(r_ballot) ); - - sub_group_barrier( CLK_LOCAL_MEM_FENCE ); - l_base = sub_group_broadcast(l_base,0); - r_base = sub_group_broadcast(r_base,0); - - l_base = l_base + subgroup_bit_prefix_exclusive( l_ballot ); - r_base = r_base + subgroup_bit_prefix_exclusive( r_ballot ); - - local_index = (go_left) ? 
l_base : r_base; - } - - - barrier( CLK_LOCAL_MEM_FENCE ); - - // merge local into global - // TODO_OPT: Look at spreading some of this across subgroups - if ( get_sub_group_id() == 0 ) - { - // allocate primref space for this wg and merge local/global centroid bounds - uint num_left = slm->left_count; - { - if (num_left && get_sub_group_local_id() == 0) - { - num_left = atomic_add_global( &context->num_left, num_left ); - slm->left_primref_base = args.dispatch_primref_begin + num_left; - } - } - uint num_right = slm->right_count; - { - if (num_right && get_sub_group_local_id() == 0) - { - num_right = atomic_add_global( &context->num_right, num_right ); - slm->right_primref_base = (args.dispatch_primref_end - 1) - num_right; - } - } - - if( args.do_mask_processing && get_sub_group_local_id() == 0 ) - atomic_or_global( &context->lr_mask, slm->lr_mask ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - LRBounds_merge( &context->lr_bounds, &slm->lr_bounds ); - - // move thread's primref ID into correct position in output index buffer - if (thread_primref_valid) - { - uint pos = go_left ? slm->left_primref_base + local_index - : slm->right_primref_base - local_index; - - args.primref_index_out[pos] = thread_primref_id; - } -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -kernel void -BFS_pass2_indexed( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) -{ - local struct BFS_Pass2_SLM slm; - struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals ); - - bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end; - uint thread_primref_id = 0; - if ( thread_primref_valid ) - thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )]; - - DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args ); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -kernel void -BFS_pass2_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals ) -{ - local struct BFS_Pass2_SLM slm; - struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals ); - - uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 ); - bool thread_primref_valid = thread_primref_id < args.wg_primref_end; - - DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args ); -} - - -__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -kernel void -BFS_pass2_indexed_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer ) -{ - local struct BFS_Pass2_SLM slm; - struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer ); - - bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end; - uint thread_primref_id = 0; - if (thread_primref_valid) - thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)]; - - DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args); - -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -kernel void -BFS_pass2_initial_batchable(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* 
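The partition above uses a standard subgroup compaction idiom: rather than one atomic per primref, each subgroup issues a single atomic_add of popcount(ballot) to reserve a block of output slots, and every lane offsets into that block by the count of set ballot bits below its lane id. A scalar plain-C emulation of one 16-lane subgroup (illustrative only):

#include <stdint.h>

static int popcount32(uint32_t x)
{
    int c = 0;
    while (x) { x &= x - 1; c++; }
    return c;
}

/* 'go' marks which of the 16 lanes want a slot; *shared_counter plays the
 * role of the SLM counter updated with a single atomic per subgroup */
static void allocate_slots(const int go[16], unsigned *shared_counter,
                           unsigned slot_out[16])
{
    uint32_t ballot = 0;
    for (int lane = 0; lane < 16; lane++)
        if (go[lane])
            ballot |= 1u << lane;

    unsigned base = *shared_counter;          /* atomic_add(counter, popcount)  */
    *shared_counter += popcount32(ballot);

    for (int lane = 0; lane < 16; lane++)
        if (go[lane])                         /* subgroup_bit_prefix_exclusive  */
            slot_out[lane] = base + popcount32(ballot & ((1u << lane) - 1));
}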
globals_buffer) -{ - local struct BFS_Pass2_SLM slm; - struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer ); - - uint thread_primref_id = args.wg_primref_begin + get_local_id(0); - bool thread_primref_valid = thread_primref_id < args.wg_primref_end; - - DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args); -} - - - - -///////////////////////////////////////////////////////////////////////////////////////////////// -/// -/// BVH2 construction -- DFS Phase -/// -///////////////////////////////////////////////////////////////////////////////////////////////// - -struct DFSArgs -{ - uint primref_base; - uint global_bvh2_base; - bool do_mask_processing; - ushort num_primrefs; - global uint* primref_indices_in; - global uint* primref_indices_out; - global PrimRef* primref_buffer; - global struct BVH2* global_bvh2; -}; - - -struct DFSPrimRefAABB -{ - half lower[3]; - half upper[3]; -}; - -void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb ) -{ - bb->lower[0] = 1; - bb->lower[1] = 1; - bb->lower[2] = 1; - bb->upper[0] = 0; - bb->upper[1] = 0; - bb->upper[2] = 0; -} - -void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v ) -{ - aabb->lower[0] = min( aabb->lower[0], v->lower[0] ); - aabb->lower[1] = min( aabb->lower[1], v->lower[1] ); - aabb->lower[2] = min( aabb->lower[2], v->lower[2] ); - aabb->upper[0] = max( aabb->upper[0], v->upper[0] ); - aabb->upper[1] = max( aabb->upper[1], v->upper[1] ); - aabb->upper[2] = max( aabb->upper[2], v->upper[2] ); -} - -half DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb ) -{ - const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]); - return fma( d.x, (d.y + d.z), d.y * d.z ); -} - -struct DFSPrimRef -{ - struct DFSPrimRefAABB aabb; - ushort2 meta; -}; - -void DFSPrimRef_SetBVH2Root( struct DFSPrimRef* ref, ushort root ) -{ - ref->meta.y = root; -} - -uint DFSPrimRef_GetInputIndex( struct DFSPrimRef* ref ) -{ - return ref->meta.x; -} - -uint DFSPrimRef_GetBVH2Parent( struct DFSPrimRef* ref ) -{ - return ref->meta.y; -} - - -struct PrimRefSet -{ - struct DFSPrimRefAABB AABB[DFS_WG_SIZE]; - ushort2 meta[DFS_WG_SIZE]; - uint input_indices[DFS_WG_SIZE]; -}; - - - - -local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id ) -{ - return &refs->AABB[id]; -} -struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id ) -{ - struct DFSPrimRef r; - r.aabb = refs->AABB[id]; - r.meta = refs->meta[id]; - return r; -} -void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id ) -{ - refs->AABB[id] = ref.aabb; - refs->meta[id] = ref.meta; -} - -void PrimRefSet_SetPrimRef_FullPrecision( struct AABB3f* root_aabb, local struct PrimRefSet* refs, PrimRef ref, ushort id ) -{ - float3 root_l = AABB3f_load_lower( root_aabb ); - float3 root_u = AABB3f_load_upper( root_aabb ); - float3 d = root_u - root_l; - float scale = 1.0f / max( d.x, max( d.y, d.z ) ); - - float3 l = ref.lower.xyz; - float3 u = ref.upper.xyz; - half3 lh = convert_half3_rtz( (l - root_l) * scale ); - half3 uh = convert_half3_rtp( (u - root_l) * scale ); - - refs->AABB[id].lower[0] = lh.x; - refs->AABB[id].lower[1] = lh.y; - refs->AABB[id].lower[2] = lh.z; - refs->AABB[id].upper[0] = uh.x; - refs->AABB[id].upper[1] = uh.y; - refs->AABB[id].upper[2] = uh.z; - refs->meta[id].x = id; - refs->meta[id].y = 0; -} - - - -void DFS_CreatePrimRefSet( struct DFSArgs 
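PrimRefSet_SetPrimRef_FullPrecision above keeps the compressed boxes conservative by scaling each primref into the root AABB and rounding the lower corner toward zero (convert_half3_rtz) and the upper corner up (convert_half3_rtp), so the half-precision box always encloses the original. The same idea on an integer grid, as a plain-C analogy (256 steps standing in for the half-float grid; illustrative only, and it assumes a non-degenerate root interval that contains the input bounds):

#include <math.h>

static void quantize_interval(float lo, float hi,            /* original bounds */
                              float root_lo, float root_hi,  /* root AABB axis  */
                              unsigned *qlo, unsigned *qhi)
{
    float scale = 255.0f / (root_hi - root_lo);
    *qlo = (unsigned)floorf((lo - root_lo) * scale);  /* round down: stays <= lo */
    *qhi = (unsigned)ceilf((hi - root_lo) * scale);   /* round up:   stays >= hi */
    /* dequantizing [qlo, qhi] through the same scale yields an interval that
     * contains [lo, hi], which is the property the builder relies on */
}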
args, - local struct PrimRefSet* prim_refs ) -{ - ushort id = get_local_id( 0 ); - ushort num_primrefs = args.num_primrefs; - - struct AABB3f box = BVH2_GetNodeBox( args.global_bvh2, args.global_bvh2_base ); - if ( id < num_primrefs ) - { - PrimRef ref = args.primref_buffer[args.primref_indices_in[id]]; - prim_refs->input_indices[id] = args.primref_indices_in[id]; - PrimRefSet_SetPrimRef_FullPrecision( &box, prim_refs, ref, id ); - } -} - -struct ThreadRangeInfo -{ - uchar start; - uchar local_num_prims; - uchar bvh2_root; - bool active; -}; - -struct BVHBuildLocals // size: ~3.8K -{ - uchar2 axis_and_left_count[ DFS_WG_SIZE ]; - struct ThreadRangeInfo range[ DFS_WG_SIZE ]; - uint sah[ DFS_WG_SIZE ]; -}; - -#define LOCAL_BVH2_NODE_COUNT (2*(DFS_WG_SIZE) -1) - -struct LocalBVH2 -{ - uint nodes[LOCAL_BVH2_NODE_COUNT]; - uint num_nodes; - - // bit layout is for a node is - // uchar child_ptr; // this is right_child_index >> 1. right child's msb is always 0 - // uchar primref_base; // index of the node's first primref. will be 0 at the root - // uchar parent_dist; // distance in nodes from this node to its parent - // uchar prim_counter; // number of prims in this subtree. For a complete tree (256 prims), the root may be off by 1 - - // for a WG size of 256, 8b is enough for parent distance, because the tree is built in level order - // the maximum distance between parent and child occurs for a complete tree. - // in this scenario the left-most leaf has index 255, its parent has index 127, the deltas to the children are 128 and 129 -}; - - -void LocalBVH2_Initialize( struct LocalBVH2* bvh2, ushort num_prims ) -{ - bvh2->num_nodes = 1; - bvh2->nodes[0] = min(num_prims,(ushort)255); -} - - - -void LocalBVH2_Initialize_Presplit(struct LocalBVH2* bvh2, ushort num_prims, ushort left_count, ushort right_count ) -{ - bvh2->num_nodes = 3; - bvh2->nodes[0] = min(num_prims, (ushort)255); - - ushort bvh2_root = 0; - ushort child_place = 1; - - uint child_ptr = (child_place + 1) >> 1; - bvh2->nodes[bvh2_root] |= (child_ptr) << 24; - - uint parent_dist = child_place - bvh2_root; - - // initialize child nodes - ushort primref_base_left = 0; - ushort primref_base_right = left_count; - uint left = (primref_base_left << 16) + ((parent_dist << 8)) + left_count; - uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8) + right_count; - bvh2->nodes[child_place] = left; - bvh2->nodes[child_place + 1] = right; -} - - -void LocalBVH2_CreateInnerNode( local struct LocalBVH2* bvh2, ushort bvh2_root, uint primref_base_left, uint primref_base_right ) -{ - ushort child_place = atomic_add_local( &(bvh2-> num_nodes), 2 ); - - uint child_ptr = (child_place + 1) >> 1; - bvh2->nodes[bvh2_root] |= (child_ptr) << 24; - - uint parent_dist = child_place - bvh2_root; - - // initialize child nodes - uint left = (primref_base_left << 16) + ((parent_dist << 8)); - uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8); - bvh2->nodes[child_place] = left; - bvh2->nodes[child_place + 1] = right; -} - -ushort2 LocalBVH2_GetChildIndices( struct LocalBVH2* bvh2, ushort bvh2_root ) -{ - ushort right_idx = (bvh2->nodes[bvh2_root] & 0xff000000) >> 23; - return (ushort2)(right_idx - 1, right_idx); -} - - -ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* bvh2, ushort bvh2_root ) -{ - // increment only the lower 8 bits. 
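A plain-C restatement of the 32-bit LocalBVH2 node layout described in the comment above: byte 3 holds child_ptr (the right child index shifted right by one; children are allocated in pairs with the left child at an odd index, so the right index is even and its low bit can be dropped), byte 2 the primref base, byte 1 the backwards distance to the parent, and byte 0 the primitive counter. Pack/unpack helpers for illustration only:

#include <stdint.h>

static uint32_t node_pack(uint32_t right_child, uint32_t primref_base,
                          uint32_t parent_dist, uint32_t prim_count)
{
    return ((right_child >> 1) << 24) | (primref_base << 16) |
           (parent_dist << 8) | prim_count;
}

static uint32_t node_right_child(uint32_t n)  { return ((n >> 24) & 0xff) << 1; }
static uint32_t node_left_child(uint32_t n)   { return node_right_child(n) - 1; }
static uint32_t node_primref_base(uint32_t n) { return (n >> 16) & 0xff; }
static uint32_t node_prim_count(uint32_t n)   { return n & 0xff; }
static uint32_t node_parent(uint32_t n, uint32_t self_index)
{
    return self_index - ((n >> 8) & 0xff);    /* parent_dist is a backwards offset */
}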
Algorithm will not overflow by design - return atomic_inc_local( &bvh2->nodes[bvh2_root] ) & 0xff; -} - -ushort LocalBVH2_SetLeafPrimCount(local struct LocalBVH2* bvh2, ushort bvh2_root, ushort count) -{ - return bvh2->nodes[bvh2_root] |= (count& 0xff); -} - -bool LocalBVH2_IsRoot( struct LocalBVH2* bvh2, ushort node_id ) -{ - return node_id == 0; -} - -ushort LocalBVH2_GetLeafPrimrefStart( struct LocalBVH2* bvh2, ushort bvh2_node_id ) -{ - return (bvh2->nodes[bvh2_node_id] >> 16) & 255; -} - -bool LocalBVH2_IsLeftChild( struct LocalBVH2* bvh2, ushort parent_node, ushort current_node ) -{ - return (current_node & 1); // nodes are allocated in pairs. first node is root, left child is an odd index -} - -ushort LocalBVH2_GetParent( struct LocalBVH2* bvh2, ushort node ) -{ - return node - ((bvh2->nodes[node] >> 8) & 255); -} - -uint LocalBVH2_GetNodeCount( struct LocalBVH2* bvh2 ) -{ - return bvh2->num_nodes; -} - -bool LocalBVH2_IsLeaf( struct LocalBVH2* bvh2, ushort node_index ) -{ - return (bvh2->nodes[node_index] & 255) <= TREE_ARITY; -} - -ushort LocalBVH2_GetLeafPrimCount( struct LocalBVH2* bvh2, ushort node_index ) -{ - return (bvh2->nodes[node_index] & 255); -} - -void DFS_ConstructBVH2( local struct LocalBVH2* bvh2, - local struct PrimRefSet* prim_refs, - ushort bvh2_root, - ushort prim_range_start, - ushort local_num_prims, - ushort global_num_prims, - local struct BVHBuildLocals* locals, - local uint* num_active_threads ) -{ - ushort tid = get_local_id( 0 ); - ushort primref_position = tid; - - bool active_thread = tid < global_num_prims; - - // Handle cases where initial binner creates leaves - if ( active_thread && local_num_prims <= TREE_ARITY ) - { - struct DFSPrimRef ref = PrimRefSet_GetPrimRef(prim_refs, primref_position); - DFSPrimRef_SetBVH2Root(&ref, bvh2_root); - PrimRefSet_SetPrimRef(prim_refs, ref, primref_position); - active_thread = false; - if (primref_position == prim_range_start) - atomic_sub_local(num_active_threads, local_num_prims); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - locals->range[ tid ].start = prim_range_start; - locals->range[ tid ].local_num_prims = local_num_prims; - locals->range[ tid ].bvh2_root = bvh2_root; - locals->range[ tid ].active = active_thread; - - do - { - if(active_thread && prim_range_start == primref_position) - locals->sah[primref_position] = UINT_MAX; - - barrier( CLK_LOCAL_MEM_FENCE ); - - if ( active_thread ) - { - local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); - - // each thread evaluates a possible split candidate. 
Scan primrefs and compute sah cost - // do this axis-by-axis to keep register pressure low - float best_sah = INFINITY; - ushort best_axis = 3; - ushort best_count = 0; - - struct DFSPrimRefAABB box_left[3]; - struct DFSPrimRefAABB box_right[3]; - float CSplit[3]; - ushort count_left[3]; - - for ( ushort axis = 0; axis < 3; axis++ ) - { - DFSPrimRefAABB_init( &box_left[axis] ); - DFSPrimRefAABB_init( &box_right[axis] ); - - CSplit[axis] = my_box->lower[axis] + my_box->upper[axis]; - count_left[axis] = 0; - } - - // scan primrefs in our subtree and partition using this thread's prim as a split plane - { - struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start ); - - for ( ushort p = 1; p < local_num_prims; p++ ) - { - struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration - - for( ushort axis = 0; axis < 3; axis++ ) - { - float c = box.lower[axis] + box.upper[axis]; - - if ( c < CSplit[axis] ) - { - // this primitive is to our left. - DFSPrimRefAABB_extend( &box_left[axis], &box ); - count_left[axis]++; - } - else - { - // this primitive is to our right - DFSPrimRefAABB_extend( &box_right[axis], &box ); - } - } - - box = next_box; - } - - // last iteration without preloading box - for( ushort axis = 0; axis < 3; axis++ ) - { - float c = box.lower[axis] + box.upper[axis]; - - if ( c < CSplit[axis] ) - { - // this primitive is to our left. - DFSPrimRefAABB_extend( &box_left[axis], &box ); - count_left[axis]++; - } - else - { - // this primitive is to our right - DFSPrimRefAABB_extend( &box_right[axis], &box ); - } - } - - } - - for ( ushort axis = 0; axis < 3; axis++ ) - { - float Al = DFSPrimRefAABB_halfArea( &box_left[axis] ); - float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] ); - - // Avoid NANs in SAH calculation in the corner case where all prims go right - // In this case we set Al=Ar, because such a split will only be selected if all primrefs - // are co-incident.. 
In that case, we will fall back to split-in-the-middle and both subtrees - // should store the same quantized area value - if ( count_left[axis] == 0 ) - Al = Ar; - - // compute sah cost - ushort count_right = local_num_prims - count_left[axis]; - float sah = Ar * count_right + Al * count_left[axis]; - - // keep this split if it is better than the previous one, or if the previous one was a corner-case - if ( sah < best_sah || best_count == 0 ) - { - // yes, keep it - best_axis = axis; - best_sah = sah; - best_count = count_left[axis]; - } - } - - // write split information to SLM - locals->axis_and_left_count[primref_position].x = best_axis; - locals->axis_and_left_count[primref_position].y = best_count; - uint sah = as_uint(best_sah); - // break ties by axis to ensure deterministic split selection - // otherwise builder can produce non-deterministic tree structure run to run - // based on the ordering of primitives (which can vary due to non-determinism in atomic counters) - // Embed split axis and index into sah value; compute min over sah and max over axis - sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | tid ); - - // reduce on split candidates in our local subtree and decide the best one - atomic_min_local( &locals->sah[ prim_range_start ], sah); - } - - - barrier( CLK_LOCAL_MEM_FENCE ); - - ushort split_index = locals->sah[ prim_range_start ] & 255; - ushort split_axis = locals->axis_and_left_count[split_index].x; - ushort split_left_count = locals->axis_and_left_count[split_index].y; - - if ( (primref_position == split_index) && active_thread ) - { - // first thread in a given subtree creates the inner node - ushort start_left = prim_range_start; - ushort start_right = prim_range_start + split_left_count; - if ( split_left_count == 0 ) - start_right = start_left + (local_num_prims / 2); // handle split-in-the-middle case - - LocalBVH2_CreateInnerNode( bvh2, bvh2_root, start_left, start_right ); - } - - - barrier( CLK_LOCAL_MEM_FENCE ); - - struct DFSPrimRef ref; - ushort new_primref_position; - - if ( active_thread ) - { - ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); - bool go_left; - - if ( split_left_count == 0 ) - { - // We chose a split with no left-side prims - // This will only happen if all primrefs are located in the exact same position - // In that case, fall back to split-in-the-middle - split_left_count = (local_num_prims / 2); - go_left = (primref_position - prim_range_start < split_left_count); - } - else - { - // determine what side of the split this thread's primref belongs on - local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); - local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index ); - float c = my_box->lower[split_axis] + my_box->upper[split_axis]; - float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis]; - go_left = c < Csplit; - } - - // adjust state variables for next loop iteration - bvh2_root = (go_left) ? kids.x : kids.y; - local_num_prims = (go_left) ? split_left_count : (local_num_prims - split_left_count); - prim_range_start = (go_left) ? 
prim_range_start : prim_range_start + split_left_count; - - // determine the new primref position by incrementing a counter in the destination subtree - new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root ); - - // load our primref from its previous position - ref = PrimRefSet_GetPrimRef( prim_refs, primref_position ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - if ( active_thread ) - { - // write our primref into its sorted position and note which node it went in - DFSPrimRef_SetBVH2Root( &ref, bvh2_root ); - PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position ); - primref_position = new_primref_position; - - - // deactivate all threads whose subtrees are small enough to form a leaf - if ( local_num_prims <= TREE_ARITY ) - { - active_thread = false; - if( primref_position == prim_range_start ) - atomic_sub_local( num_active_threads, local_num_prims ); - } - - locals->range[ primref_position ].start = prim_range_start; - locals->range[ primref_position ].local_num_prims = local_num_prims; - locals->range[ primref_position ].bvh2_root = bvh2_root; - locals->range[ primref_position ].active = active_thread; - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - // if we'll have next iteration then load from SLM - if(*num_active_threads) - { - prim_range_start = locals->range[ tid ].start; - local_num_prims = locals->range[ tid ].local_num_prims; - bvh2_root = locals->range[ tid ].bvh2_root; - active_thread = locals->range[ tid ].active; - primref_position = tid; - } - else - { - break; - } - - } while ( true ); - -} - - -#define REFIT_BIT_DWORDS (LOCAL_BVH2_NODE_COUNT - DFS_WG_SIZE)/32 - -struct RefitBits -{ - uint bits[REFIT_BIT_DWORDS]; -}; - -struct DFS_SLM -{ - union - { - struct LocalBVH2 bvh2; - struct { - struct AABB3f centroid_bounds; - uint left_count; - uint right_count; - struct BFS_BinInfo bins; - struct BFS_BinInfoReduce3_SLM reduce3; - } binning; - - } u1; - - union - { - struct { - struct PrimRefSet prim_refs; - struct BVHBuildLocals locals; - } pass0; - - struct AABB3f node_boxes[LOCAL_BVH2_NODE_COUNT]; - - } u2; - - union - { - uchar bytes[DFS_WG_SIZE]; - uint dwords[DFS_WG_SIZE/4]; - } mask_info; - - struct RefitBits refit_bits; - -}; - - -void DFS_InitialBinningPass( - local struct BFS_BinInfo* bins, - local struct BFS_BinInfoReduce3_SLM* reduce3, - uniform local struct AABB3f* centroid_bounds, - local struct PrimRefSet* refs, - local uint* left_counter, - local uint* right_counter, - ushort num_refs ) -{ - uint tid = get_local_id(0); - - // initialize SLM structures - if (tid == 0) - { - AABB3f_init(centroid_bounds); - *left_counter = 0; - *right_counter = 0; - } - - BinInfo_init(bins); - - PrimRef ref; - struct DFSPrimRef dfs_ref; - - if (tid < num_refs) - { - dfs_ref = PrimRefSet_GetPrimRef(refs, tid); - struct DFSPrimRefAABB box = dfs_ref.aabb; - ref.lower.xyz = (float3)(box.lower[0], box.lower[1], box.lower[2]); - ref.upper.xyz = (float3)(box.upper[0], box.upper[1], box.upper[2]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // compute centroid bounds so that we can bin - if (tid < num_refs) - { - float3 centroid = ref.lower.xyz + ref.upper.xyz; - Uniform_AABB3f_atomic_merge_local_sub_group_lu(centroid_bounds, centroid, centroid); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // add primrefs to bins - struct BinMapping mapping; - BinMapping_init(&mapping, centroid_bounds, BFS_NUM_BINS); - - BinInfo_add_primref( &mapping, bins, &ref, tidu1.bvh2; - - global struct BVH2* global_bvh2 = args.global_bvh2; - - PrimRef ref; - uint parent_node; - - { 
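/* Editor's note (not part of the GRL sources): further down in Do_DFS, the bottom-up
 * refit decides which thread climbs to a parent with a "second arrival wins" rule:
 * each node owns one bit in RefitBits, a finished child XORs that bit, and only the
 * thread that observes the bit already set (its sibling subtree is done) continues
 * upward.  A minimal C11-atomics model of that arrival test, with hypothetical names: */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* One flag bit per node, packed 32 to a dword, as in struct RefitBits. */
static bool refit_second_child_arrived(atomic_uint *refit_bits, unsigned parent_node)
{
    uint32_t mask = 1u << (parent_node % 32u);
    uint32_t prev = atomic_fetch_xor(&refit_bits[parent_node / 32u], mask);
    /* bit previously clear -> first child done, stop here;
     * bit previously set   -> sibling already finished, caller may refit the parent */
    return (prev & mask) != 0;
}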
- local struct BVHBuildLocals* locals = &slm->u2.pass0.locals; - local struct PrimRefSet* prim_refs = &slm->u2.pass0.prim_refs; - - DFS_CreatePrimRefSet(args, prim_refs); - - uint local_id = get_local_id(0); - - ushort bvh2_root = 0; - ushort prim_range_start = 0; - ushort local_num_prims = args.num_primrefs; - - if(local_id == 0) - *num_active_threads = local_num_prims; - - // barrier for DFS_CreatePrimRefSet and num_active_threads - barrier(CLK_LOCAL_MEM_FENCE); - - // initial binning pass if number of primrefs is large - if( args.num_primrefs > 32 ) - { - DFS_InitialBinningPass(&slm->u1.binning.bins, &slm->u1.binning.reduce3, &slm->u1.binning.centroid_bounds, prim_refs, - &slm->u1.binning.left_count, &slm->u1.binning.right_count, args.num_primrefs); - - barrier(CLK_LOCAL_MEM_FENCE); - - ushort left_count = slm->u1.binning.left_count; - ushort right_count = args.num_primrefs - left_count; - if (get_local_id(0) == 0) - LocalBVH2_Initialize_Presplit(bvh2, args.num_primrefs, left_count, right_count); - - bvh2_root = (local_id < left_count) ? 1 : 2; - local_num_prims = (local_id < left_count) ? left_count : right_count; - prim_range_start = (local_id < left_count) ? 0 : left_count; - } - else - { - if (get_local_id(0) == 0) - LocalBVH2_Initialize(bvh2, args.num_primrefs); - } - - DFS_ConstructBVH2( bvh2, prim_refs, bvh2_root, prim_range_start, local_num_prims, args.num_primrefs, locals, num_active_threads); - - // move the prim refs into their sorted position - // keep this thread's primref around for later use - if ( local_id < args.num_primrefs ) - { - struct DFSPrimRef dfs_ref = PrimRefSet_GetPrimRef( prim_refs, local_id ); - - uint input_id = DFSPrimRef_GetInputIndex( &dfs_ref ); - - parent_node = DFSPrimRef_GetBVH2Parent( &dfs_ref ); - - uint primref_index = prim_refs->input_indices[input_id]; - ref = args.primref_buffer[primref_index]; - args.primref_indices_out[local_id] = primref_index; - args.primref_indices_in[local_id] = primref_index; - // these buffers are not read again until the end of kernel - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - } - - - // initialize flags for determining when subtrees are done refit - if ( get_local_id( 0 ) < REFIT_BIT_DWORDS ) - slm->refit_bits.bits[get_local_id( 0 )] = 0; - - - // stash full-precision primref AABBs in slm storage - local struct AABB3f* slm_boxes = &slm->u2.node_boxes[0]; - bool active_thread = get_local_id( 0 ) < args.num_primrefs; - if( active_thread ) - { - AABB3f_set( &slm_boxes[get_local_id( 0 )], ref.lower.xyz, ref.upper.xyz ); - - // stash instance masks in SLM storage - if( args.do_mask_processing ) - slm->mask_info.bytes[get_local_id(0)] = PRIMREF_instanceMask( &ref ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - // Refit leaf nodes - uint box_index; - if ( active_thread ) - { - // the thread for the first primref in every leaf is the one that will ascend - // remaining threads merge their AABB/mask into the first one and terminate - uint first_ref = LocalBVH2_GetLeafPrimrefStart( bvh2, parent_node ); - if ( first_ref != get_local_id( 0 ) ) - { - AABB3f_atomic_merge_local_lu( &slm_boxes[first_ref], ref.lower.xyz, ref.upper.xyz ); - - if( args.do_mask_processing ) - { - uint dword_index = first_ref/4; - uint shift = (first_ref%4)*8; - uint mask = PRIMREF_instanceMask(&ref) << shift; - atomic_or_local( &slm->mask_info.dwords[dword_index], mask ); - } - active_thread = false; // switch off all primref threads except the first one - } - - box_index = first_ref; - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - if ( active_thread ) - { - 
uint current_node = parent_node; - parent_node = LocalBVH2_GetParent( bvh2, current_node ); - - // write out the leaf node's AABB - uint num_prims = LocalBVH2_GetLeafPrimCount( bvh2, current_node ); - uint prim_offs = args.primref_base + LocalBVH2_GetLeafPrimrefStart( bvh2, current_node ); - - uint mask = 0xff; - if( args.do_mask_processing ) - mask = slm->mask_info.bytes[box_index]; - - BVH2_WriteLeafNode( global_bvh2, args.global_bvh2_base + current_node, &slm_boxes[box_index], prim_offs, num_prims, mask ); - - // we no longer need the BVH2 bits for this node, so re-purpose the memory to store the AABB index - bvh2->nodes[current_node] = box_index; - - // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed - uint thread_mask = (1 << (parent_node % 32)); - if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], thread_mask ) & thread_mask) == 0 ) - active_thread = false; - } - - // count how many active threads in sub_group we have and increment wg's number of active threads - uint sg_active = sub_group_reduce_add(active_thread ? 1 : 0); - if(get_sub_group_local_id() == 0) - { - atomic_add_local(num_active_threads, sg_active); - } - - // refit internal nodes: - // walk up the tree and refit AABBs - - do - { - barrier( CLK_LOCAL_MEM_FENCE ); // we need this barrier because we need to make sure all threads read num_active_threads before modifying it - if ( active_thread ) - { - uint current_node = parent_node; - parent_node = LocalBVH2_GetParent( bvh2, current_node ); - - // pull left/right box indices from current node - ushort2 kids = LocalBVH2_GetChildIndices( bvh2, current_node ); - - uint left_box = bvh2->nodes[kids.x]; - uint right_box = bvh2->nodes[kids.y]; - - struct AABB3f left = slm_boxes[left_box]; - struct AABB3f right = slm_boxes[right_box]; - AABB3f_extend( &left, &right ); - - uint2 child_offsets = (uint2)( - args.global_bvh2_base + kids.x, - args.global_bvh2_base + kids.y); - - uint mask = 0xff; - if( args.do_mask_processing ) - { - mask = slm->mask_info.bytes[left_box] - | slm->mask_info.bytes[right_box]; - slm->mask_info.bytes[left_box] = mask; - } - - BVH2_WriteInnerNode( args.global_bvh2, args.global_bvh2_base+current_node, &left, child_offsets, mask ); - - slm_boxes[left_box] = left; - bvh2->nodes[current_node] = left_box; - - // stop at the root - if ( LocalBVH2_IsRoot( bvh2, current_node ) ) - { - active_thread = false; - atomic_dec_local(num_active_threads); - } - else - { - // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed - uint mask = (1 << (parent_node % 32)); - if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], mask ) & mask) == 0 ) - { - active_thread = false; - atomic_dec_local(num_active_threads); - } - } - } - - barrier( CLK_LOCAL_MEM_FENCE ); - } while ( *num_active_threads > 0 ); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size(DFS_WG_SIZE,1,1) )) -__attribute__( (intel_reqd_sub_group_size(16)) ) -kernel void -DFS( global struct VContextScheduler* scheduler, - global struct SAHBuildGlobals* globals_buffer ) -{ - local struct DFS_SLM slm; - local struct DFSDispatchRecord record; - local uint num_active_threads; - - if ( get_local_id( 0 ) == 0 ) - { - // pop an entry off the DFS dispatch queue - //uint wg_index = atomic_dec_global( &scheduler->num_dfs_wgs ) - 1; - //record = scheduler->dfs_queue.records[wg_index]; - - // TODO: The version above races, but is considerably faster... 
investigate - uint wg_index = get_group_id(0); - record = scheduler->dfs_queue.records[wg_index]; - write_mem_fence( CLK_LOCAL_MEM_FENCE ); - atomic_dec_global( &scheduler->num_dfs_wgs ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - - bool odd_pass = record.tree_depth & 1; - - global struct SAHBuildGlobals* sah_globals = globals_buffer + record.batch_index; - - struct DFSArgs args; - args.num_primrefs = record.num_primrefs; - args.primref_indices_in = SAHBuildGlobals_GetPrimrefIndices_In( sah_globals, odd_pass ); - args.primref_indices_out = SAHBuildGlobals_GetPrimrefIndices_Out( sah_globals, odd_pass ); - args.primref_buffer = SAHBuildGlobals_GetPrimrefs( sah_globals ); - args.global_bvh2 = SAHBuildGlobals_GetBVH2( sah_globals ); - args.primref_indices_in += record.primref_base; - args.primref_indices_out += record.primref_base; - args.primref_base = record.primref_base; - args.global_bvh2_base = record.bvh2_base; - args.do_mask_processing = SAHBuildGlobals_NeedMasks( sah_globals ); - - Do_DFS( args, &slm, &num_active_threads ); - -} - - -///////////////////////////////////////////////////////////////////////////////////////////////// -/// -/// BVH2 to BVH6 -/// -///////////////////////////////////////////////////////////////////////////////////////////////// - - - -struct BuildFlatTreeArgs -{ - ushort leaf_size_in_bytes; - ushort leaf_type; - ushort inner_node_type; - bool do_mask_processing; - - global uint* primref_indices; - global PrimRef* primref_buffer; - global struct Globals* globals; - global struct BVHBase* bvh_base; - global struct BVH2* bvh2; -}; - - -// lane i in the return value is the index of the ith largest primref in the input -// the return value can be used with shuffle() to move data into its sorted position -// the elements of 'key' must be unique.. only the first 6 elements are sorted -varying ushort SUBGROUP_get_sort_indices_N6( varying uint key ) -{ - // each lane computes the number of items larger than it - // this is its position in the descending order - // TODO_OPT: Compiler can vectorize these uint16 adds by packing into lower and upper halves of same GPR.... make sure it does it - // if compiler is not generating optimal code, consider moving to Cm - - varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0; - varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0; - varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0; - varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0; - varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0; - varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0; - varying ushort a = cmp0 + cmp2 + cmp4; - varying ushort b = cmp1 + cmp3 + cmp5; - varying ushort num_larger = a + b; - - // each lane determines which of the input elements it should pull - varying ushort lane = get_sub_group_local_id(); - a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0; - b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0; - a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0; - b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0; - a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0; - b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0; - return a + b; -} - -uint SUBGROUP_area_to_sort_key( varying float area, uniform ushort num_children ) -{ - varying ushort lane = get_sub_group_local_id(); - area = (lane < num_children) ? 
area : 0; // put inactive nodes last - - // drop LSBs and break ties by lane number to ensure unique keys - // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal. - // If we do not do this it can lead to non-deterministic tree structure - return (as_uint(area) & 0xffffff80) + (lane^(get_sub_group_size()-1)); -} - -// lane i in the return value is the index of the ith largest primref in the input -// the return value can be used with shuffle() to move data into its sorted position -// the elements of 'key' must be unique.. only the first 6 elements are sorted -varying ushort SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16( varying uint key ) -{ - // each lane computes the number of items larger than it - // this is its position in the descending order - // TODO_OPT: Compiler can vectorize these uint16 adds by packing into lower and upper halves of same GPR.... make sure it does it - // if compiler is not generating optimal code, consider moving to Cm - - varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0; - varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0; - varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0; - varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0; - varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0; - varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0; - varying ushort a = cmp0 + cmp2 + cmp4; - varying ushort b = cmp1 + cmp3 + cmp5; - varying ushort num_larger = a + b; - - varying ushort cmp0_1 = (sub_group_broadcast(key, 8) > key) ? 1 : 0; - varying ushort cmp1_1 = (sub_group_broadcast(key, 9) > key) ? 1 : 0; - varying ushort cmp2_1 = (sub_group_broadcast(key, 10) > key) ? 1 : 0; - varying ushort cmp3_1 = (sub_group_broadcast(key, 11) > key) ? 1 : 0; - varying ushort cmp4_1 = (sub_group_broadcast(key, 12) > key) ? 1 : 0; - varying ushort cmp5_1 = (sub_group_broadcast(key, 13) > key) ? 1 : 0; - varying ushort a_1 = cmp0_1 + cmp2_1 + cmp4_1; - varying ushort b_1 = cmp1_1 + cmp3_1 + cmp5_1; - varying ushort num_larger_1 = a_1 + b_1; - - // each lane determines which of the input elements it should pull - varying ushort lane = get_sub_group_local_id(); - if(lane < 8) - { - a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0; - b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0; - a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0; - b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0; - a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0; - b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0; - } - else - { - a = (sub_group_broadcast(num_larger_1, 8) == lane-8) ? 8 : 8; - b = (sub_group_broadcast(num_larger_1, 9) == lane-8) ? 1 : 0; - a += (sub_group_broadcast(num_larger_1, 10) == lane-8) ? 2 : 0; - b += (sub_group_broadcast(num_larger_1, 11) == lane-8) ? 3 : 0; - a += (sub_group_broadcast(num_larger_1, 12) == lane-8) ? 4 : 0; - b += (sub_group_broadcast(num_larger_1, 13) == lane-8) ? 5 : 0; - } - - return a + b; -} - -uint SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16( varying float area, uniform ushort num_children ) -{ - varying ushort lane = get_sub_group_local_id() % 8; - area = (lane < num_children) ? area : 0; // put inactive nodes last - - // drop LSBs and break ties by lane number to ensure unique keys - // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal. 
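/* Editor's note (not part of the GRL sources): a scalar C model of the rank-based
 * 6-wide sort implemented by SUBGROUP_get_sort_indices_N6 above.  Each element's
 * descending rank is "how many keys are larger than mine", and slot i of the result
 * names the element whose rank is i, so the result can drive a gather/shuffle.  Keys
 * must be distinct, which is what the tie-breaking in SUBGROUP_area_to_sort_key
 * guarantees (low mantissa bits dropped, descending lane id added). */
#include <stdint.h>
#include <string.h>

#define N6 6

/* Same key construction as SUBGROUP_area_to_sort_key, scalarized. */
static uint32_t area_to_sort_key(float area, unsigned lane, unsigned sub_group_size)
{
    uint32_t bits;
    memcpy(&bits, &area, sizeof(bits));                       /* as_uint(area) */
    return (bits & 0xffffff80u) + (lane ^ (sub_group_size - 1u));
}

/* out_index[i] = index of the i-th largest key; requires all keys to be distinct. */
static void sort_indices_n6(const uint32_t key[N6], unsigned out_index[N6])
{
    for (unsigned i = 0; i < N6; i++) {
        unsigned rank = 0;
        for (unsigned j = 0; j < N6; j++)
            if (key[j] > key[i])
                rank++;                 /* position of element i in descending order */
        out_index[rank] = i;            /* invert the rank permutation */
    }
}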
- // If we do not do this it can lead to non-deterministic tree structure - return (as_uint(area) & 0xffffff80) + (lane^7); -} - -ushort SUBGROUP_BuildFlatTreeNode( - uniform struct BuildFlatTreeArgs args, - uniform uint bvh2_root, - uniform struct InternalNode* qnode, - uniform uint qnode_index, - varying uint3* sg_children_out // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z) - // if a leaf is created, receives number of primrefs (z) -) // return value is the number of child nodes or 0 for a leaf -{ - global struct BVH2* bvh2 = args.bvh2; - varying ushort lane = get_sub_group_local_id(); - - global struct BVHBase* base = args.bvh_base; - - - if ( !BVH2_IsInnerNode( bvh2, bvh2_root ) ) - { - uniform ushort num_prims = BVH2_GetLeafPrimCount( bvh2, bvh2_root ); - uniform uint primref_start = BVH2_GetLeafPrimStart( bvh2, bvh2_root ); - varying uint primref_index = primref_start + ((lane < num_prims) ? lane : 0); - - varying uint ref_id = args.primref_indices[primref_index]; - varying PrimRef ref = args.primref_buffer[ref_id]; - uniform char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); - uniform char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes; - - uniform int offset = (int)(leaf_mem - (char*)qnode); - offset = offset >> 6; - - varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&ref), num_prims ); - varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key); - ref = PrimRef_sub_group_shuffle(&ref, sort_index); - ref_id = intel_sub_group_shuffle(ref_id, sort_index); - - if (lane < num_prims) - args.primref_indices[primref_index] = ref_id; - - uint global_num_prims = args.globals->numPrimitives; - char* bvh_mem = (char*) args.bvh_base; - - if(lane < num_prims) - args.primref_indices[primref_index + global_num_prims] = qnode - (struct InternalNode*)bvh_mem; - - if (args.leaf_type == NODE_TYPE_INSTANCE) - subgroup_setInstanceQBVHNodeN( offset, &ref, num_prims, (struct QBVHNodeN*)qnode, lane < num_prims ? PRIMREF_instanceMask(&ref) : 0 ); - else - subgroup_setQBVHNodeN( offset, args.leaf_type, &ref, num_prims, (struct QBVHNodeN*)qnode, BVH_NODE_DEFAULT_MASK ); - - sg_children_out->z = num_prims; - return 0; - } - else - { - // collapse BVH2 into BVH6. - // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough - uniform ushort num_children = 2; - - uniform uint2 kids = BVH2_GetChildIndices( bvh2, bvh2_root ); - varying uint sg_bvh2_node = kids.x; - if ( lane == 1 ) - sg_bvh2_node = kids.y; - - do - { - // choose the inner node with maximum area to replace. - // Its left child goes in its old location. Its right child goes in a new lane - - // TODO_OPT: We re-read the AABBs again and again to compute area - // ... store per-lane boxes instead and pre-compute areas - - varying float sg_area = BVH2_GetNodeArea( bvh2, sg_bvh2_node ); - varying bool sg_is_inner = BVH2_IsInnerNode( bvh2, sg_bvh2_node ); - sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf - - uniform float max_area = sub_group_reduce_max_N6( sg_area ); - varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner; - uniform uint mask = intel_sub_group_ballot( sg_reducable ); - - // TODO_OPT: Some of these ops seem redundant.. 
look at trimming further - - if ( mask == 0 ) - break; - - // choose the inner node with maximum area to replace - uniform ushort victim_child = ctz( mask ); - uniform uint victim_node = sub_group_broadcast( sg_bvh2_node, victim_child ); - kids = BVH2_GetChildIndices( bvh2, victim_node ); - - if ( lane == victim_child ) - sg_bvh2_node = kids.x; - else if ( lane == num_children ) - sg_bvh2_node = kids.y; - - num_children++; - - } while ( num_children < TREE_ARITY ); - - // allocate inner node space - uniform uint kids_offset; - if (get_sub_group_local_id() == 0) - kids_offset = allocate_inner_nodes( args.bvh_base, num_children ); - kids_offset = sub_group_broadcast(kids_offset, 0); - - uniform struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset); - uniform int offset = (int)((char*)kid - (char*)qnode) >> 6; - -#if 0 - uniform uint kids_offset; - if ( get_sub_group_local_id() == 0 ) - kids_offset = alloc_node_mem( args.globals, sizeof( struct QBVHNodeN ) * num_children ); - kids_offset = sub_group_broadcast( kids_offset, 0 ); - - - // create inner node - uniform struct QBVHNodeN* kid = (struct QBVHNodeN*) ((char*)(args.bvh_base) + kids_offset); - uniform int offset = (int)((char*)kid - (char*)qnode) >> 6; -#endif - uniform uint child_type = args.inner_node_type; - - // sort child nodes in descending order by AABB area - varying struct AABB box = BVH2_GetAABB( bvh2, sg_bvh2_node ); - varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&box), num_children ); - varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key); - box = AABB_sub_group_shuffle(&box, sort_index); - sg_bvh2_node = intel_sub_group_shuffle(sg_bvh2_node, sort_index); - - uniform uint node_mask = (args.do_mask_processing) ? BVH2_GetMask( bvh2, bvh2_root ) : 0xff; - - subgroup_setQBVHNodeN( offset, child_type, &box, num_children, (struct QBVHNodeN*)qnode, node_mask ); - - // return child information - *sg_children_out = (uint3)(sg_bvh2_node, qnode_index + offset + get_sub_group_local_id(), num_children ); - return num_children; - } -} - -ushort SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16( - uniform struct BuildFlatTreeArgs args, - varying uint bvh2_root, - varying struct InternalNode* qnode_base, - varying uint qnode_index, - varying uint3* sg_children_out, // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z) - // if a leaf is created, receives number of primrefs (z) - bool active_lane -) // return value is the number of child nodes or 0 for a leaf -{ - global struct BVH2* bvh2 = args.bvh2; - varying ushort SIMD16_lane = get_sub_group_local_id(); - varying ushort SIMD8_lane = get_sub_group_local_id() % 8; - varying ushort SIMD8_id = get_sub_group_local_id() / 8; - varying ushort lane = get_sub_group_local_id(); - global struct BVHBase* base = args.bvh_base; - - struct BVH2NodeMetaData nodeMetaData = BVH2_GetNodeMetaData( bvh2, bvh2_root ); - - bool is_leaf = active_lane && !BVH2NodeMetaData_IsInnerNode( &nodeMetaData ); - bool is_inner = active_lane && BVH2NodeMetaData_IsInnerNode( &nodeMetaData ); - - uchar mask = BVH_NODE_DEFAULT_MASK; - if(is_inner) - mask = (args.do_mask_processing) ? 
BVH2NodeMetaData_GetMask( &nodeMetaData ) : 0xff; - - int offset; - - varying struct InternalNode* qnode = qnode_base + qnode_index; - // TOOD: we don't need unions, I left them only for readability - union { - uint num_prims; - uint num_children; - } lane_num_data; - - union { - PrimRef ref; // this is in fact AABB - struct AABB box; - } lane_box_data; - - union { - uint ref_id; - uint sg_bvh2_node; - } lane_id_data; - - // for leafs - varying uint primref_index; - - if(is_leaf) - { - lane_num_data.num_prims = BVH2NodeMetaData_GetLeafPrimCount( &nodeMetaData ); - uint primref_start = BVH2NodeMetaData_GetLeafPrimStart( &nodeMetaData ); - primref_index = primref_start + ((SIMD8_lane < lane_num_data.num_prims) ? SIMD8_lane : 0); - - lane_id_data.ref_id = args.primref_indices[primref_index]; - lane_box_data.ref = args.primref_buffer[lane_id_data.ref_id]; - char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); - char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes; - - offset = (int)(leaf_mem - (char*)qnode); - offset = offset >> 6; - } - - - if(intel_sub_group_ballot(is_inner)) - { - // collapse BVH2 into BVH6. - // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough - - uint2 kids; - if(is_inner) - { - lane_num_data.num_children = 2; - kids = BVH2_GetChildIndices( bvh2, bvh2_root ); - - lane_id_data.sg_bvh2_node = kids.x; - if ( SIMD8_lane == 1 ) - lane_id_data.sg_bvh2_node = kids.y; - } - - bool active = is_inner; - do - { - // choose the inner node with maximum area to replace. - // Its left child goes in its old location. Its right child goes in a new lane - - // TODO_OPT: We re-read the AABBs again and again to compute area - // ... store per-lane boxes instead and pre-compute areas - - varying float sg_area = 0; - varying bool sg_is_inner = false; - if(active) - { - sg_area = BVH2_GetNodeArea( bvh2, lane_id_data.sg_bvh2_node ); - sg_is_inner = BVH2_IsInnerNode( bvh2, lane_id_data.sg_bvh2_node ); - sg_area = (sg_is_inner && SIMD8_lane < lane_num_data.num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf - } - - float max_area = sub_group_reduce_max_N6_2xSIMD8_in_SIMD16( sg_area ); - varying bool sg_reducable = max_area == sg_area && sg_is_inner && (SIMD8_lane < lane_num_data.num_children); - uint mask = intel_sub_group_ballot( sg_reducable ) & (0xFF << SIMD8_id * 8); // we'll end up with two different masks for two SIMD8 in SIMD16 due to bits masking - - // TODO_OPT: Some of these ops seem redundant.. look at trimming further - - if ( mask == 0 ) - active = false; - - // choose the inner node with maximum area to replace - ushort victim_child = ctz( mask ); - uint victim_node = intel_sub_group_shuffle( lane_id_data.sg_bvh2_node, victim_child ); - if(active) - { - kids = BVH2_GetChildIndices( bvh2, victim_node ); - - if ( SIMD16_lane == victim_child ) // we use SIMD16_lane, cause victim_child was calculated based on SIMD16 i.e. second node will have victim from 8..13 - lane_id_data.sg_bvh2_node = kids.x; - else if ( SIMD8_lane == lane_num_data.num_children ) - lane_id_data.sg_bvh2_node = kids.y; - - lane_num_data.num_children++; - - if(lane_num_data.num_children >= TREE_ARITY) - active = false; - } - - } while ( intel_sub_group_ballot(active) ); // if any active, then continue - - // sum children from both halfs of SIMD16 to allocate nodes only once per sub_group - uniform ushort num_children = is_inner ? 
lane_num_data.num_children : 0; - uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); - uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); - - num_children = first_SIMD8_num_children + second_SIMD8_num_children; - uint kids_offset; - - // allocate inner node space - if(num_children && SIMD16_lane == 0) - kids_offset = allocate_inner_nodes( args.bvh_base, num_children ); - kids_offset = sub_group_broadcast(kids_offset, 0); - if((is_inner)) - { - kids_offset += SIMD8_id * first_SIMD8_num_children; - - struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset); - - offset = (int)((char*)kid - (char*)qnode) >> 6; - lane_box_data.box = BVH2_GetAABB( bvh2, lane_id_data.sg_bvh2_node ); - } - } - - // sort child nodes in descending order by AABB area - varying uint key = SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16(AABB_halfArea(&lane_box_data.box), lane_num_data.num_children ); - varying ushort sort_index = SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16(key); - lane_box_data.box = PrimRef_sub_group_shuffle(&lane_box_data.box, sort_index); - lane_id_data.sg_bvh2_node = intel_sub_group_shuffle(lane_id_data.sg_bvh2_node, sort_index); - - char* bvh_mem = (char*) args.bvh_base; - if (is_leaf && SIMD8_lane < lane_num_data.num_prims) - { - args.primref_indices[primref_index] = lane_id_data.ref_id; - args.primref_indices[primref_index + args.globals->numPrimitives] = qnode - (struct InternalNode*)bvh_mem; - } - - bool degenerated = false; - uint node_type = is_leaf ? args.leaf_type : args.inner_node_type; - - if(args.leaf_type == NODE_TYPE_INSTANCE) - degenerated = subgroup_setInstanceBox_2xSIMD8_in_SIMD16(&lane_box_data.box, lane_num_data.num_children, &mask, SIMD8_lane < lane_num_data.num_prims ? PRIMREF_instanceMask(&lane_box_data.ref) : 0, is_leaf); - - subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, node_type, &lane_box_data.box, lane_num_data.num_children, mask, (struct QBVHNodeN*)(qnode), degenerated, active_lane); - - // return child information - if(is_inner) - { - sg_children_out->x = lane_id_data.sg_bvh2_node; - sg_children_out->y = qnode_index + offset + SIMD8_lane; - } - - sg_children_out->z = lane_num_data.num_children; - - return is_inner ? 
lane_num_data.num_children : 0; -} - -void check_primref_integrity( global struct SAHBuildGlobals* globals ) -{ - global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); - global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, 0 ); - dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals ); - if ( get_local_id( 0 ) == 0 ) - { - for ( uint i = 0; i < num_primrefs; i++ ) - { - primref_out[i] = 0; - } - - for ( uint i = 0; i < num_primrefs; i++ ) - primref_out[primref_in[i]]++; - - for ( uint i = 0; i < num_primrefs; i++ ) - if ( primref_out[i] != 1 ) - printf( "Foo: %u %u\n", i, primref_out[i] ); - } -} - - - - -void check_bvh2(global struct SAHBuildGlobals* globals ) -{ - global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(globals); - global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); - global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out(globals, 0); - dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs(globals); - - if (get_local_id(0) == 0) - { - for (uint i = 0; i < num_primrefs; i++) - primref_out[i] = 0; - - uint stack[256]; - uint sp=0; - uint r = BVH2_GetRoot(bvh2); - stack[sp++] = r; - while (sp) - { - r = stack[--sp]; - if (BVH2_IsInnerNode(bvh2,r)) - { - uint2 kids = BVH2_GetChildIndices( bvh2, r); - if (kids.x >= bvh2->num_nodes || kids.y >= bvh2->num_nodes) - { - printf("BVH2!! Bad node index found!\n"); - return; - } - - stack[sp++] = kids.x; - stack[sp++] = kids.y; - } - else - { - uint ref = BVH2_GetLeafPrimStart(bvh2,r); - uint count = BVH2_GetLeafPrimCount(bvh2,r); - if( count == 0 ) - { - printf("BVH2!! Empty leaf found!\n"); - return; - } - for (uint i = 0; i < count; i++) - { - if (ref + i > num_primrefs) - { - printf("BVH2!! Bad leaf range!\n"); - return; - } - uint c = primref_out[ref+i]; - if (c != 0) - { - printf("BVH2!! overlapped prim ranges\n"); - return; - } - primref_out[ref+i] = 1; - if (primref_in[ref + i] >= num_primrefs) - { - printf("BAD PRIMREF ID FOUND!\n"); - return; - } - } - } - } - } - - printf("bvh2 is ok!\n"); -} - - -#if 0 -// TODO_OPT: Enable larger WGs. 
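/* Editor's note (not part of the GRL sources): the disabled build_qnodes kernel below
 * and the build_qnodes_pc* variants that follow all use the same breadth-first
 * scheme: pop a (BVH2 node, flat-tree node) pair from a root buffer, emit that
 * node's up-to-TREE_ARITY children with SUBGROUP_BuildFlatTreeNode, and push each
 * child pair back to be expanded later.  A serial C model of that work list, with
 * hypothetical names (the real kernels spill to a global root buffer instead of
 * assuming the local buffer is large enough): */
#include <stddef.h>

#define FLAT_ARITY 6

typedef struct { unsigned bvh2_node; unsigned qnode; } FlattenWorkItem;

/* Stand-in for SUBGROUP_BuildFlatTreeNode: writes the flat-tree node for 'item' and
 * returns how many child pairs it stored in 'children' (0 for a leaf). */
typedef unsigned (*EmitChildrenFn)(FlattenWorkItem item,
                                   FlattenWorkItem children[FLAT_ARITY], void *ctx);

static void flatten_from_root(FlattenWorkItem root, FlattenWorkItem *work,
                              size_t capacity, EmitChildrenFn emit_children, void *ctx)
{
    size_t head = 0, tail = 0;
    work[tail++] = root;
    while (head < tail) {                       /* consume until no pairs remain */
        FlattenWorkItem children[FLAT_ARITY];
        unsigned n = emit_children(work[head++], children, ctx);
        for (unsigned i = 0; i < n && tail < capacity; i++)
            work[tail++] = children[i];         /* each child becomes a new root */
    }
}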
WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size(256,1,1)) ) -__attribute__( (intel_reqd_sub_group_size(8) ) ) -kernel void -build_qnodes( global struct SAHBuildGlobals* globals, global struct VContextScheduler* scheduler ) -{ - globals = globals + (scheduler->num_trivial_builds + scheduler->num_single_builds); - globals = globals + get_group_id(0); - - - struct BuildFlatTreeArgs args; - args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals ); - args.leaf_type = SAHBuildGlobals_GetLeafType( globals ); - args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals ); - args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); - args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); - args.bvh_base = SAHBuildGlobals_GetBVHBase( globals ); - args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); - args.globals = (global struct Globals*) globals->p_globals; - args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals ); - - dword alloc_backpointers = SAHBuildGlobals_NeedBackPointers( globals ); - global uint2* root_buffer = (global uint2*) globals->p_qnode_root_buffer; - global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base ); - global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); - - local uint nodes_produced; - if ( get_sub_group_id() == 0 ) - { - // allocate first node - if (get_sub_group_local_id() == 0) - allocate_inner_nodes( args.bvh_base, 1 ); - - // first subgroup does first node - varying uint3 children_info; - uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, BVH2_GetRoot(args.bvh2), qnodes, 0, &children_info ); - - if ( get_sub_group_local_id() < num_children ) - root_buffer[get_sub_group_local_id()] = children_info.xy; - - if ( alloc_backpointers ) - { - // set root's backpointer - if( get_sub_group_local_id() == 0 ) - back_pointers[0] = (0xffffffc0) | (children_info.z << 3); - - // point child backpointers at the parent - if( get_sub_group_local_id() < num_children ) - back_pointers[children_info.y] = 0; - } - - if ( get_sub_group_local_id() == 0 ) - nodes_produced = num_children; - } - - barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE ); - - - uniform uint buffer_index = get_sub_group_id(); - uniform bool sg_active = buffer_index < nodes_produced; - - while ( work_group_any( sg_active ) ) - { - if( sg_active ) - { - uniform uint bvh2_node = root_buffer[buffer_index].x; - uniform uint qnode_index = root_buffer[buffer_index].y; - - // build a node - varying uint3 children_info; - uniform ushort num_children = SUBGROUP_BuildFlatTreeNode( args, bvh2_node, qnodes + qnode_index, qnode_index, &children_info ); - - // handle backpointers - if ( alloc_backpointers ) - { - // update this node's backpointer with child count - if ( get_sub_group_local_id() == 0 ) - back_pointers[qnode_index] |= (children_info.z << 3); - - // point child backpointers at parent - if ( get_sub_group_local_id() < num_children ) - back_pointers[children_info.y] = (qnode_index << 6); - } - - if ( num_children ) - { - // allocate space in the child buffer - uint root_buffer_position = 0; - if ( get_sub_group_local_id() == 0 ) - root_buffer_position = atomic_add_local( &nodes_produced, num_children ); - root_buffer_position = sub_group_broadcast( root_buffer_position, 0 ); - - // store child indices in root buffer - if ( get_sub_group_local_id() < num_children ) - 
root_buffer[root_buffer_position + get_sub_group_local_id()] = children_info.xy; - } - } - - // sync everyone - work_group_barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, - memory_scope_work_group ); - - - if( sg_active ) - buffer_index += get_num_sub_groups(); - - sg_active = (buffer_index < nodes_produced); - } -} -#endif - - - - - - - -inline bool buffer_may_overflow( uint capacity, uint current_size, uint elements_processed_per_sub_group ) -{ - uint num_consumed = min( get_num_sub_groups() * elements_processed_per_sub_group, current_size ); - uint space_available = (capacity - current_size) + num_consumed; - uint space_needed = TREE_ARITY * num_consumed; - return space_available < space_needed; -} - -inline uint build_qnodes_pc( - global struct SAHBuildGlobals* globals, - bool alloc_backpointers, - bool process_masks, - uint first_qnode, - uint first_bvh2_node, - - local uint2* SLM_local_root_buffer, - local uint* SLM_ring_tail, - const uint RING_SIZE -) - -{ - struct BuildFlatTreeArgs args; - args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals ); - args.leaf_type = SAHBuildGlobals_GetLeafType( globals ); - args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals ); - args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 ); - args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals ); - args.bvh_base = SAHBuildGlobals_GetBVHBase( globals ); - args.bvh2 = SAHBuildGlobals_GetBVH2( globals ); - args.globals = (global struct Globals*) globals->p_globals; - args.do_mask_processing = process_masks; - - global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base ); - global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); - - // first subgroup adds first node - if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0) - { - SLM_local_root_buffer[0].x = first_bvh2_node; - SLM_local_root_buffer[0].y = first_qnode; - *SLM_ring_tail = 1; - - } - - uint ring_head = 0; - uint ring_tail = 1; - uint ring_size = 1; - - barrier( CLK_LOCAL_MEM_FENCE ); - - const uniform uint elements_processed_in_sg = 2; - - while ( ring_size > 0 && !buffer_may_overflow( RING_SIZE, ring_size, elements_processed_in_sg ) ) - { - ushort SIMD16_lane = get_sub_group_local_id(); - - // SIMD16 as 2xSIMD8 - ushort SIMD8_lane = get_sub_group_local_id() % 8; - ushort SIMD8_id = get_sub_group_local_id() / 8; - bool active_lane; - - uniform uint nodes_consumed = min( get_num_sub_groups() * elements_processed_in_sg, ring_size ); // times two because we process two nodes in subgroup - uniform bool sg_active = get_sub_group_id() * elements_processed_in_sg < nodes_consumed; - ushort num_children = 0; - varying uint3 children_info = 0; - - uint bvh2_node = 0; - uint qnode_index = 0; - - if (sg_active) - { - ushort consumed_pos = get_sub_group_id() * elements_processed_in_sg + SIMD8_id; - active_lane = consumed_pos < nodes_consumed ? true : false; - consumed_pos = consumed_pos < nodes_consumed ? 
consumed_pos : consumed_pos-1; - - uint buffer_index = (ring_head + consumed_pos) % RING_SIZE; - - bvh2_node = SLM_local_root_buffer[buffer_index].x; - qnode_index = SLM_local_root_buffer[buffer_index].y; - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - if (sg_active) - { - // build a node - num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, bvh2_node, qnodes, qnode_index, &children_info, active_lane); - - // handle backpointers - // TODO_OPT: This should be separate shaders not a runtime branch - // doing it this way for now because GRLTLK does not make dynamic shader selection on host very easy. - // this needs to change... GRLTLK should - - if (alloc_backpointers && active_lane) - { - // update this node's backpointer with child count - if (SIMD8_lane == 0) - back_pointers[qnode_index] |= (children_info.z << 3); - - // point child backpointers at parent - if (SIMD8_lane < num_children) - back_pointers[children_info.y] = (qnode_index << 6); - } - - // save data - - uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); - uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); - uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children; - - uint root_buffer_position = 0; - - // allocate space in the child buffer - if (SIMD16_lane == 0 && SIMD16_num_children) - root_buffer_position = atomic_add_local(SLM_ring_tail, SIMD16_num_children); - - root_buffer_position = sub_group_broadcast( root_buffer_position, 0 ); - root_buffer_position += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16 - - // store child indices in root buffer - if (SIMD8_lane < num_children) - { - uint store_pos = (root_buffer_position + SIMD8_lane) % RING_SIZE; - SLM_local_root_buffer[store_pos] = children_info.xy; - } - } - - // sync everyone - barrier( CLK_LOCAL_MEM_FENCE ); - - ring_head += nodes_consumed; - ring_tail = *SLM_ring_tail; - ring_size = ring_tail - ring_head; - } - - return ring_head; -} - - - - -inline void amplify_and_spill( - global struct SAHBuildGlobals* globals, - dword alloc_backpointers, - uint first_qnode, - uint first_bvh2_node, - global uint2* global_root_buffer, - local uint* root_buffer_counter, - const uint RING_SIZE -) - -{ - struct BuildFlatTreeArgs args; - args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals); - args.leaf_type = SAHBuildGlobals_GetLeafType(globals); - args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals); - args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); - args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals); - args.bvh_base = SAHBuildGlobals_GetBVHBase(globals); - args.bvh2 = SAHBuildGlobals_GetBVH2(globals); - args.globals = (global struct Globals*) globals->p_globals; - - global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base); - global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base); - - - varying uint3 children_info; - uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, first_bvh2_node, qnodes + first_qnode, first_qnode, &children_info); - - if (alloc_backpointers) - { - // set first node's backpointer - if (get_sub_group_local_id() == 0) - { - // if first node is root, use root sentinel in backpointer - // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread) - uint bp = 0xffffffc0; - if (first_qnode != 0) - bp = 
back_pointers[first_qnode]; - bp |= (children_info.z << 3); - - back_pointers[first_qnode] = bp; - } - - // point child backpointers at the parent - if (get_sub_group_local_id() < num_children) - back_pointers[children_info.y] = (first_qnode << 6); - } - - if (num_children) - { - uint spill_pos = 0; - if (get_sub_group_local_id() == 0) - spill_pos = atomic_add_local(root_buffer_counter,num_children); - - spill_pos = sub_group_broadcast(spill_pos, 0); - - if (get_sub_group_local_id() < num_children) - global_root_buffer[spill_pos+get_sub_group_local_id()] = children_info.xy; - } - -} - - - - -inline void build_qnodes_pc_kickoff_func( - global struct SAHBuildGlobals* globals, - global uint2* root_buffer, - bool alloc_backpointers, - bool process_masks, - - local uint2* SLM_local_root_buffer, - local uint* SLM_spill_pos, - local uint* SLM_ring_tail, - int RING_SIZE -) -{ - // allocate first node - if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0 ) - allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(globals), 1 ); - - *SLM_spill_pos=0; - - uint ring_head = build_qnodes_pc( globals, alloc_backpointers, process_masks, - 0, BVH2_GetRoot(SAHBuildGlobals_GetBVH2(globals)), SLM_local_root_buffer, SLM_ring_tail, RING_SIZE ); - - - uint n = *SLM_ring_tail - ring_head; - if (n > 0) - { -#if 0 - // do an additional round of amplification so we can get more nodes into the root buffer and go wider in the next phase - /// JDB TODO: this is causing hangs on DG2 for metro, so disabling for now... - for (uint i = get_sub_group_id(); i < n; i+= get_num_sub_groups() ) - { - uint consume_pos = (ring_head + i) % RING_SIZE; - uniform uint bvh2_root = SLM_local_root_buffer[consume_pos].x; - uniform uint qnode_root = SLM_local_root_buffer[consume_pos].y; - - amplify_and_spill( globals, alloc_backpointers, qnode_root, bvh2_root, root_buffer, SLM_spill_pos, RING_SIZE ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); -#else - for (uint i = get_local_id(0); i < n; i += get_local_size(0)) - root_buffer[i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; -#endif - - if (get_local_id(0) == 0) - { - globals->root_buffer_num_produced = n; - globals->root_buffer_num_produced_hi = 0; - globals->root_buffer_num_consumed = 0; - globals->root_buffer_num_consumed_hi = 0; - } - } -} - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( 256, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -kernel void -build_qnodes_pc_kickoff( - global struct SAHBuildGlobals* globals, - global uint2* root_buffer, - dword sah_flags -) -{ - bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; - bool process_masks = sah_flags & SAH_FLAG_NEED_MASKS; - - - const int RING_SIZE = 64; - - local uint2 SLM_local_root_buffer[RING_SIZE]; - local uint SLM_spill_pos; - local uint SLM_ring_tail; - - build_qnodes_pc_kickoff_func(globals, - root_buffer, - alloc_backpointers, - process_masks, - SLM_local_root_buffer, - &SLM_spill_pos, - &SLM_ring_tail, - RING_SIZE - ); -} - - - - -inline void build_qnodes_pc_amplify_func( - global struct SAHBuildGlobals* globals, - global uint2* root_buffer, - bool alloc_backpointers, - bool process_masks, - - local uint2* SLM_local_root_buffer, - local uint* SLM_broadcast, - local uint* SLM_ring_tail, - int RING_SIZE - ) -{ - // TODO_OPT: Probably don't need this atomic.. 
could clear 'num_consumed' every time - // and just use get_group_id() - // - - if (get_local_id(0) == 0) - *SLM_broadcast = atomic_inc_global(&globals->root_buffer_num_consumed); - - barrier( CLK_LOCAL_MEM_FENCE ); - - uniform uint consume_pos = *SLM_broadcast; - uniform uint bvh2_root = root_buffer[consume_pos].x; - uniform uint qnode_root = root_buffer[consume_pos].y; - - uint ring_head = build_qnodes_pc(globals, alloc_backpointers,process_masks, - qnode_root, bvh2_root, SLM_local_root_buffer, SLM_ring_tail, RING_SIZE); - - // TODO_OPT: Instead of spilling the nodes, do one more round of amplification and write - // generated children directly into the root buffer. This should allow faster amplification - - // spill root buffer contents - uint n = *SLM_ring_tail - ring_head; - if (n > 0) - { - - if (get_local_id(0) == 0) - *SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n); - - barrier( CLK_LOCAL_MEM_FENCE ); - uint produce_pos = *SLM_broadcast; - - for (uint i = get_local_id(0); i < n; i += get_local_size(0)) - root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; - } -} - - - - - -// Process two nodes per wg during amplification phase. -// DOing it this way ensures maximum parallelism -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -kernel void -build_qnodes_pc_amplify( - global struct SAHBuildGlobals* globals, - global uint2* root_buffer, - dword sah_flags ) -{ - bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; - - struct BuildFlatTreeArgs args; - args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals); - args.leaf_type = SAHBuildGlobals_GetLeafType(globals); - args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals); - args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0); - args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals); - args.bvh_base = SAHBuildGlobals_GetBVHBase(globals); - args.bvh2 = SAHBuildGlobals_GetBVH2(globals); - args.globals = (global struct Globals*) globals->p_globals; - args.do_mask_processing = sah_flags & SAH_FLAG_NEED_MASKS; - - global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base); - global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base); - - ushort SIMD16_lane = get_sub_group_local_id(); - - // SIMD16 as 2xSIMD8 - ushort SIMD8_lane = get_sub_group_local_id() % 8; - ushort SIMD8_id = get_sub_group_local_id() / 8; - bool active_lane = false; - - uint consume_pos; - consume_pos = globals->root_buffer_num_consumed + get_group_id(0) * 2; // times 2 because we process two nodes in workgroup - consume_pos += SIMD8_id; - - active_lane = consume_pos < globals->root_buffer_num_to_consume ? true : false; - consume_pos = consume_pos < globals->root_buffer_num_to_consume ? 
consume_pos : consume_pos-1; - - uint first_bvh2_node = root_buffer[consume_pos].x; - uint first_qnode = root_buffer[consume_pos].y; - - varying uint3 children_info; - ushort num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, first_bvh2_node, qnodes, first_qnode, &children_info, active_lane); - - if (alloc_backpointers && active_lane) - { - // set first node's backpointer - if (SIMD8_lane == 0) - { - // if first node is root, use root sentinel in backpointer - // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread) - uint bp = 0xffffffc0; - if (first_qnode != 0) - bp = back_pointers[first_qnode]; - bp |= (children_info.z << 3); - - back_pointers[first_qnode] = bp; - } - - // point child backpointers at the parent - if (SIMD8_lane < num_children) - back_pointers[children_info.y] = (first_qnode << 6); - } - - // save data - { - // sum children from both halfs of SIMD16 to do only one atomic per sub_group - uint produce_pos; - uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0); - uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8); - uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children; - - if (SIMD16_lane == 0 && SIMD16_num_children) - produce_pos = atomic_add_global(&globals->root_buffer_num_produced, SIMD16_num_children); - - produce_pos = sub_group_broadcast(produce_pos, 0); - produce_pos += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16 - - if (SIMD8_lane < num_children) - { - root_buffer[produce_pos + SIMD8_lane] = children_info.xy; - } - } -} - - -////////// -// -// Batched version of qnode creation -// -////////// - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) -kernel void -build_qnodes_init_scheduler_batched(global struct QnodeScheduler* scheduler, dword num_builds, dword num_max_qnode_global_root_buffer_entries) -{ - - scheduler->batched_build_offset = scheduler->num_trivial_builds + scheduler->num_single_builds; - scheduler->batched_build_count = num_builds - scheduler->batched_build_offset; - scheduler->num_max_qnode_global_root_buffer_entries = num_max_qnode_global_root_buffer_entries; - - const uint num_builds_to_process = scheduler->batched_build_count; - const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; - - scheduler->batched_builds_to_process = num_builds_to_process; - scheduler->num_qnode_grb_curr_entries = (num_builds_to_process + 15) / 16; // here we store number of workgroups for "build_qnodes_begin_batchable" kernel - scheduler->num_qnode_grb_new_entries = num_builds_to_process; - scheduler->qnode_global_root_buffer.curr_entries_offset = max_qnode_grb_entries; -} - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -kernel void -build_qnodes_begin_batchable(global struct QnodeScheduler* scheduler, - global struct SAHBuildGlobals* builds_globals) -{ - const uint tid = get_group_id(0) * get_local_size(0) + get_local_id(0); - - const uint num_builds_to_process = scheduler->batched_builds_to_process; - - if(tid < num_builds_to_process) - { - const uint build_idx = scheduler->batched_build_offset + tid; - - uint bvh2_node = BVH2_GetRoot(SAHBuildGlobals_GetBVH2(&builds_globals[build_idx])); - uint qnode = 0; - struct QNodeGlobalRootBufferEntry entry = { bvh2_node, qnode, build_idx, 1}; - 
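/* Editor's note (not part of the GRL sources): the backpointer updates in the qnode
 * kernels above appear to pack the parent internal-node index into bits [31:6] and
 * the node's child count into bits [5:3], with the all-ones parent field
 * (0xffffffc0) acting as the root sentinel; the low three bits are not touched by
 * this code.  Hypothetical helpers showing that packing, inferred from the shifts
 * used above: */
#include <stdbool.h>
#include <stdint.h>

#define BACKPOINTER_ROOT_PARENT 0xffffffc0u

static uint32_t backpointer_pack(uint32_t parent_qnode, uint32_t num_children)
{
    return (parent_qnode << 6) | (num_children << 3);
}

static uint32_t backpointer_parent(uint32_t bp)       { return bp >> 6; }
static uint32_t backpointer_num_children(uint32_t bp) { return (bp >> 3) & 0x7u; }
static bool     backpointer_is_root(uint32_t bp)
{
    return (bp & 0xffffffc0u) == BACKPOINTER_ROOT_PARENT;
}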
scheduler->qnode_global_root_buffer.entries[tid] = entry; - - builds_globals[build_idx].root_buffer_num_produced = 0; - builds_globals[build_idx].root_buffer_num_produced_hi = 0; - builds_globals[build_idx].root_buffer_num_consumed = 0; - builds_globals[build_idx].root_buffer_num_consumed_hi = 0; - - // allocate first node for this build - //allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx]), 1 ); - SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx])->nodeDataCur++; - } -} - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( 1, 1, 1 )) ) -kernel void -build_qnodes_scheduler(global struct QnodeScheduler* scheduler) -{ - const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; - - uint new_entries = min(scheduler->num_qnode_grb_new_entries, max_qnode_grb_entries); - - scheduler->num_qnode_grb_curr_entries = new_entries; - scheduler->num_qnode_grb_new_entries = 0; - scheduler->qnode_global_root_buffer.curr_entries_offset = scheduler->qnode_global_root_buffer.curr_entries_offset ? 0 : max_qnode_grb_entries; -} - - - - -// TODO_OPT: Enable larger WGs. WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( 32, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -kernel void -build_qnodes_pc_amplify_batched( - global struct SAHBuildGlobals* builds_globals, - global struct QnodeScheduler* scheduler - ) -{ - const uint group_id = get_group_id(0); - - global struct QNodeGlobalRootBuffer* global_root_buffer = &scheduler->qnode_global_root_buffer; - const uint curr_entries_offset = global_root_buffer->curr_entries_offset; - struct QNodeGlobalRootBufferEntry entry = global_root_buffer->entries[curr_entries_offset + group_id]; - - const uint build_id = entry.build_idx; - - global struct SAHBuildGlobals* globals = &builds_globals[build_id]; - global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer; - bool alloc_backpointers = SAHBuildGlobals_NeedBackPointers(globals); - bool process_masks = SAHBuildGlobals_NeedMasks(globals); - - const int RING_SIZE = 32; // for 2 SGs, 16 should result in 2 rounds: one SG produces 6, then 2 SGs consume 2 and produce 12 - // for 4 SGs, 32 results in 2 rounds: one SG produces 6, 4 SGs consume 4 and produce 24, resulting in 26 - - local uint2 SLM_local_root_buffer[RING_SIZE]; - local uint SLM_broadcast; - local uint SLM_ring_tail; - local uint SLM_grb_broadcast; - - - //// This below can be moved to separate function if needed for TLAS //// - - uniform uint bvh2_root = entry.bvh2_node; - uniform uint qnode_root = entry.qnode; - - uint ring_head = build_qnodes_pc(globals, alloc_backpointers, process_masks, - qnode_root, bvh2_root, SLM_local_root_buffer, &SLM_ring_tail, RING_SIZE); - - // spill root buffer contents - uint n = SLM_ring_tail - ring_head; - if (n > 0) - { - const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; - - if (get_local_id(0) == 0) - { - SLM_grb_broadcast = atomic_add_global(&scheduler->num_qnode_grb_new_entries, n); - - if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, then make space in build's root_buffer - SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n); - else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then make space in build's root_buffer - SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n - 
(max_qnode_grb_entries - SLM_grb_broadcast)); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - uint produce_pos = SLM_broadcast; - - uint grb_produce_num = n; // grb stands for global_root_buffer - uint lrb_produce_num = 0; // lrb stands for local root buffer, meaning this build's root_buffer - - if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, don't write to it - { - grb_produce_num = 0; - lrb_produce_num = n; - } - else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then decrease amount of entries and store rest in build's root buffer - { - grb_produce_num = max_qnode_grb_entries - SLM_grb_broadcast; - lrb_produce_num = n - grb_produce_num; - } - - // save data to global_root_buffer - for(uint i = get_local_id(0); i < grb_produce_num; i += get_local_size(0)) - { - const uint2 slm_record = SLM_local_root_buffer[(ring_head + i) % RING_SIZE]; - - struct QNodeGlobalRootBufferEntry new_entry; - new_entry.bvh2_node = slm_record.x; - new_entry.qnode = slm_record.y; - new_entry.build_idx = entry.build_idx; - - const uint new_entries_offset = curr_entries_offset ? 0 : max_qnode_grb_entries; - global_root_buffer->entries[new_entries_offset + SLM_grb_broadcast + i] = new_entry; - } - - // if anything left, write to build's root buffer - for (uint i = get_local_id(0); i < lrb_produce_num; i += get_local_size(0)) - root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i + grb_produce_num) % RING_SIZE]; - } -} - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -kernel void -build_qnodes_try_to_fill_grb_batched( - global struct SAHBuildGlobals* builds_globals, - global struct QnodeScheduler* scheduler - ) -{ - const uint build_id = scheduler->batched_build_offset + get_group_id(0); - global struct SAHBuildGlobals* globals = &builds_globals[build_id]; - global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer; - - global struct QNodeGlobalRootBuffer* qnode_root_buffer = (global struct QNodeGlobalRootBuffer*)&scheduler->qnode_global_root_buffer; - - const uint num_produced = globals->root_buffer_num_produced; - const uint num_consumed = globals->root_buffer_num_consumed; - const uint entries = num_produced - num_consumed; // entries to build's root buffer - - if(!entries) - return; - - uint global_root_buffer_offset; - if(get_local_id(0) == 0) - global_root_buffer_offset = atomic_add_global(&scheduler->num_qnode_grb_new_entries, entries); - - global_root_buffer_offset = sub_group_broadcast(global_root_buffer_offset, 0); - - const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries; - - if(global_root_buffer_offset >= max_qnode_grb_entries) // if global_root_buffer is full, then return - return; - - uint global_root_buffer_produce_num = entries; - if(global_root_buffer_offset + entries >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then reduce number of entries to push - global_root_buffer_produce_num = max_qnode_grb_entries - global_root_buffer_offset; - - for(uint i = get_local_id(0); i < global_root_buffer_produce_num; i += get_local_size(0)) - { - const uint2 entry = root_buffer[num_consumed + i]; - - struct QNodeGlobalRootBufferEntry new_entry; - new_entry.bvh2_node = entry.x; - new_entry.qnode = entry.y; - new_entry.build_idx = build_id; - - const uint new_entries_offset = qnode_root_buffer->curr_entries_offset ? 
0 : max_qnode_grb_entries; - qnode_root_buffer->entries[new_entries_offset + global_root_buffer_offset + i] = new_entry; - } - - if(get_local_id(0) == 0) - globals->root_buffer_num_consumed += global_root_buffer_produce_num; -} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl deleted file mode 100644 index 1f64ef3fbe2..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl +++ /dev/null @@ -1,2025 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "intrinsics.h" -#include "AABB3f.h" -#include "AABB.h" -#include "GRLGen12.h" -#include "quad.h" -#include "common.h" -#include "instance.h" - -#include "api_interface.h" - -#include "binned_sah_shared.h" - - -#if 0 -#define LOOP_TRIPWIRE_INIT uint _loop_trip=0; - -#define LOOP_TRIPWIRE_INCREMENT(max_iterations) \ - _loop_trip++;\ - if ( _loop_trip > max_iterations )\ - {\ - if( get_local_id(0) == 0 )\ - printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!! group=%u\n", get_group_id(0) );\ - break;\ - } -#else - -#define LOOP_TRIPWIRE_INIT -#define LOOP_TRIPWIRE_INCREMENT(max_iterations) - -#endif - - -// ========================================================= -// DFS -// ========================================================= - -// there are 128 threads x SIMD16 == 2048 lanes in a DSS -// There is 128KB of SLM. Upper limit of 64KB per WG, so target is 2 groups of 1024 lanes @ 64K each -// --> Full occupancy requires using less than 64B per lane -// -// Groups of 256 lanes gives us 16KB per group -// - -// We use subgroups very heavily here in order to avoid -// use of per-thread scratch space for intermediate values - -#define DFS_WG_SIZE 256 -#define DFS_NUM_SUBGROUPS 16 -#define DFS_BVH2_NODE_COUNT (2*(DFS_WG_SIZE)-1) -#define TREE_ARITY 6 - -// FlatTree node limits: -// these are the derivations if we always collapse to one primitive and pack nodes as tightly as possible -// If BVH2 construction is allowed to terminate early and place multiple prims in a leaf, these numbers will be too low -#if 0 - -// maximum flattree size is the number of inner nodes in a full M-ary tree with one leaf per primitive -// This is given by I = (L-1)/(M-1) -// For a 256 thread workgroup, L=256, M=6, this gives: 51 -#define DFS_MAX_FLATTREE_NODES 51 - - -// A flattree leaf is a node which contains only primitives. -// -// The maximum number of leaves is related to the number of nodes as: -// L(N) = ((M-1)*N + 1) / M -// -#define DFS_MAX_FLATTREE_LEAFS 43 // = 43 for 256 thread WG (L=256, M=6) - -#else - -// This is the result of estimate_qbvh6_nodes(256) - -#define DFS_MAX_FLATTREE_LEAFS 256 -#define DFS_MAX_FLATTREE_NODES 307 // 256 fat-leaves + 51 inner nodes. 
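The 51 quoted above lines up with the inner-node bound I = (L-1)/(M-1) stated earlier in this file; spelled out as a worked check (illustrative only, not text from the original source):

/* I = (L - 1) / (M - 1) = (256 - 1) / (6 - 1) = 255 / 5 = 51 inner nodes
 * over 256 fat-leaves, giving DFS_MAX_FLATTREE_NODES = 256 + 51 = 307     */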
51 = ceil(256/5) -#define DFS_MAX_FLATTREE_DEPTH 52 // number of inner nodes in the worst-case tree - -#endif - -#define uniform -#define varying - - -struct DFSArgs -{ - global struct BVHBase* bvh_base; - global PrimRef* primref_buffer; - ushort leaf_node_type; - ushort inner_node_type; - ushort leaf_size_in_bytes; - bool need_backpointers; - bool need_masks; - ushort num_primrefs; - global uint* primref_index_buffer; -}; - - -struct DFSPrimRefAABB -{ - half lower[3]; - half upper[3]; -}; - -GRL_INLINE void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb ) -{ - bb->lower[0] = 1; - bb->lower[1] = 1; - bb->lower[2] = 1; - bb->upper[0] = 0; - bb->upper[1] = 0; - bb->upper[2] = 0; -} - -GRL_INLINE void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v ) -{ - aabb->lower[0] = min( aabb->lower[0], v->lower[0] ); - aabb->lower[1] = min( aabb->lower[1], v->lower[1] ); - aabb->lower[2] = min( aabb->lower[2], v->lower[2] ); - aabb->upper[0] = max( aabb->upper[0], v->upper[0] ); - aabb->upper[1] = max( aabb->upper[1], v->upper[1] ); - aabb->upper[2] = max( aabb->upper[2], v->upper[2] ); -} - -GRL_INLINE float DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb ) -{ - const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]); - return fma( d.x, (d.y + d.z), d.y * d.z ); -} - -GRL_INLINE struct DFSPrimRefAABB DFSPrimRefAABB_sub_group_reduce( struct DFSPrimRefAABB* aabb ) -{ - struct DFSPrimRefAABB bounds; - bounds.lower[0] = sub_group_reduce_min( aabb->lower[0] ); - bounds.lower[1] = sub_group_reduce_min( aabb->lower[1] ); - bounds.lower[2] = sub_group_reduce_min( aabb->lower[2] ); - bounds.upper[0] = sub_group_reduce_max( aabb->upper[0] ); - bounds.upper[1] = sub_group_reduce_max( aabb->upper[1] ); - bounds.upper[2] = sub_group_reduce_max( aabb->upper[2] ); - return bounds; -} - -struct DFSPrimRef -{ - struct DFSPrimRefAABB aabb; - uint2 meta; -}; - -struct PrimRefMeta -{ - uchar2 meta; -}; - -GRL_INLINE uint PrimRefMeta_GetInputIndex( struct PrimRefMeta* it ) -{ - return it->meta.x; -} -GRL_INLINE uint PrimRefMeta_GetInstanceMask( struct PrimRefMeta* it ) -{ - return it->meta.y; -} - - -struct PrimRefSet -{ - struct AABB3f root_aabb; - struct DFSPrimRefAABB AABB[DFS_WG_SIZE]; - uint2 meta[DFS_WG_SIZE]; - -}; - -GRL_INLINE local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id ) -{ - return &refs->AABB[id]; -} - -GRL_INLINE float PrimRefSet_GetMaxAABBArea( local struct PrimRefSet* refs ) -{ - float3 root_l = AABB3f_load_lower( &refs->root_aabb ); - float3 root_u = AABB3f_load_upper( &refs->root_aabb ); - float3 d = root_u - root_l; - float scale = 1.0f / max( d.x, max( d.y, d.z ) ); - - half3 dh = convert_half3_rtp( d * scale ); - return fma( dh.x, (dh.y + dh.z), dh.y * dh.z ); -} - -GRL_INLINE float3 ulp3( float3 v ) { - - return fabs(v) * FLT_EPSILON; -} - -GRL_INLINE struct AABB PrimRefSet_ConvertAABB( local struct PrimRefSet* refs, struct DFSPrimRefAABB* box ) -{ - float3 root_l = AABB3f_load_lower( &refs->root_aabb ); - float3 root_u = AABB3f_load_upper( &refs->root_aabb ); - float3 d = root_u - root_l; - float scale = max( d.x, max( d.y, d.z ) ); - - float3 l = convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) ); - float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) ); - l = l * scale + root_l ; - u = u * scale + root_l ; - - // clamping is necessary in case that a vertex lies exactly in the upper AABB plane. 
- // If we use unclamped values, roundoff error in the scale factor calculation can cause us - // to snap to a flattened AABB that lies outside of the original one, resulting in missed geometry. - u = min( u, root_u ); - l = min( l, root_u ); - - struct AABB r; - r.lower.xyz = l.xyz; - r.upper.xyz = u.xyz; - return r; -} - -GRL_INLINE PrimRef PrimRefSet_GetFullPrecisionAABB( local struct PrimRefSet* refs, ushort id ) -{ - struct AABB r; - r = PrimRefSet_ConvertAABB( refs, &refs->AABB[id] ); - r.lower.w = 0; - r.upper.w = 0; - return r; -} - -GRL_INLINE uint PrimRefSet_GetInputIndex( local struct PrimRefSet* refs, ushort id ) -{ - return refs->meta[id].x; -} - -GRL_INLINE uint PrimRefSet_GetInstanceMask( local struct PrimRefSet* refs, ushort id ) -{ - return refs->meta[id].y; -} -GRL_INLINE struct PrimRefMeta PrimRefSet_GetMeta( local struct PrimRefSet* refs, ushort id ) -{ - struct PrimRefMeta meta; - meta.meta.x = refs->meta[id].x; - meta.meta.y = refs->meta[id].y; - return meta; -} - - -GRL_INLINE struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id ) -{ - struct DFSPrimRef r; - r.aabb = refs->AABB[id]; - r.meta = refs->meta[id]; - return r; -} - - -GRL_INLINE void PrimRefSet_SetPrimRef_FullPrecision( local struct PrimRefSet* refs, PrimRef ref, ushort id ) -{ - - float3 root_l = AABB3f_load_lower( &refs->root_aabb ); - float3 root_u = AABB3f_load_upper( &refs->root_aabb ); - float3 d = root_u - root_l; - float scale = 1.0f / max(d.x, max(d.y,d.z)); - - float3 l = ref.lower.xyz; - float3 u = ref.upper.xyz; - half3 lh = convert_half3_rtz( (l - root_l) * scale ); - half3 uh = convert_half3_rtp( (u - root_l) * scale ); - - refs->AABB[id].lower[0] = lh.x; - refs->AABB[id].lower[1] = lh.y; - refs->AABB[id].lower[2] = lh.z; - refs->AABB[id].upper[0] = uh.x; - refs->AABB[id].upper[1] = uh.y; - refs->AABB[id].upper[2] = uh.z; - refs->meta[id].x = id; - refs->meta[id].y = PRIMREF_instanceMask(&ref); - - -} - -GRL_INLINE void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id ) -{ - refs->AABB[id] = ref.aabb; - refs->meta[id] = ref.meta; -} - -GRL_INLINE struct AABB3f PrimRefSet_GetRootAABB( local struct PrimRefSet* refs ) -{ - return refs->root_aabb; -} - -GRL_INLINE void SUBGROUP_PrimRefSet_Initialize( local struct PrimRefSet* refs ) -{ - if ( get_sub_group_local_id() == 0 ) - AABB3f_init( &refs->root_aabb ); // TODO_OPT: subgroup-vectorized version of AABB3f_init -} - - -GRL_INLINE void PrimRefSet_Printf( local struct PrimRefSet* refs, ushort num_prims ) -{ - - barrier( CLK_LOCAL_MEM_FENCE ); - if ( get_local_id( 0 ) == 0 ) - { - printf( "Scene AABB:\n" ); - struct AABB3f rootBox = PrimRefSet_GetRootAABB( refs ); - AABB3f_print( &rootBox ); - - float ma = PrimRefSet_GetMaxAABBArea( refs ); - - for ( uint i = 0; i < num_prims; i++ ) - { - printf( "Ref: %u\n", i ); - struct AABB r = PrimRefSet_GetFullPrecisionAABB( refs, i ); - AABB_print( &r ); - - float a = DFSPrimRefAABB_halfArea( PrimRefSet_GetAABBPointer( refs, i ) ); - printf( "Scaled Area: %f / %f = %f \n", a, ma, a / ma ); - - } - } - barrier( CLK_LOCAL_MEM_FENCE ); -} - - - -GRL_INLINE void PrimRefSet_CheckBounds( local struct PrimRefSet* refs, ushort num_prims, PrimRef* primref_buffer ) -{ - - barrier( CLK_LOCAL_MEM_FENCE ); - if ( get_local_id( 0 ) == 0 ) - { - - for ( uint i = 0; i < num_prims; i++ ) - { - PrimRef ref = primref_buffer[i]; - struct AABB r2 = PrimRefSet_GetFullPrecisionAABB( refs, i ); - - struct DFSPrimRefAABB* box = &refs->AABB[i]; - float3 l = 
convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) ); - float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) ); - - printf( " halfs:{%x,%x,%x}{%x,%x,%x}\n", as_uint(l.x), as_uint(l.y), as_uint(l.z), as_uint(u.x), as_uint(u.y), as_uint(u.z) ); - - printf( " {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%u,%u,%u,%u,%u,%u}\n", - ref.lower.x, ref.lower.y, ref.lower.z, r2.lower.x, r2.lower.y, r2.lower.z, - ref.upper.x, ref.upper.y, ref.upper.z, r2.upper.x, r2.upper.y, r2.upper.z, - r2.lower.x <= ref.lower.x, - r2.lower.y <= ref.lower.y, - r2.lower.z <= ref.lower.z, - - r2.upper.x >= ref.upper.x, - r2.upper.y >= ref.upper.y, - r2.upper.z >= ref.upper.z ); - - } - - } - barrier( CLK_LOCAL_MEM_FENCE ); -} - - - -struct LocalBVH2 -{ - uint num_nodes; - uint nodes[DFS_BVH2_NODE_COUNT]; - - // nodes are a bitfield: - // bits 8:0 (9b) ==> number of primrefs in this subtree - // - // bits 17:9 (9b) ==> for an inner node: contains offset to a pair of children - // ==> for a leaf node: contains index of the first primref in this leaf - // - // bits 30:18 (13b) ==> quantized AABB area (relative to root box) - // bit 31 (1b) ==> is_inner flag - // - // NOTE: The left child offset of any node is always odd.. therefore, it is possible to recover a bit if we need it - // by storing only the 8 MSBs -}; - -#define DFS_BVH2_AREA_QUANT 8191.0f - - - -GRL_INLINE void SUBGROUP_LocalBVH2_Initialize( local struct LocalBVH2* tree, ushort num_prims ) -{ - tree->num_nodes = 1; // include the root node - tree->nodes[0] = num_prims; // initialize root node as a leaf containing the full subtree - -} - -GRL_INLINE void LocalBVH2_CreateInnerNode( local struct LocalBVH2* tree, ushort node_index, - ushort start_left, ushort start_right, - ushort quantized_left_area, ushort quantized_right_area ) -{ - uint child_pos = atomic_add_local( &tree->num_nodes, 2 ); - - // set the inner node flag and child position in the parent - // leave the other bits intact - uint parent_node = tree->nodes[node_index]; - parent_node |= 0x80000000; - parent_node = (parent_node & ~(0x1ff<<9)) | (child_pos << 9); - tree->nodes[node_index] = parent_node; - - // setup children as leaf nodes with prim-count zero - uint left_child = (convert_uint(start_left) << 9) | (convert_uint( quantized_left_area ) << 18); - uint right_child = (convert_uint(start_right) << 9) | (convert_uint( quantized_right_area ) << 18); - tree->nodes[child_pos] = left_child; - tree->nodes[child_pos + 1] = right_child; - -} - -GRL_INLINE ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* tree, ushort node_index ) -{ - // increment only the lower bits. 
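As a concrete instance of the bitfield layout documented for LocalBVH2 above, a hypothetical node word (values invented for illustration, not taken from the original source) would be assembled as:

/* inner flag | quantized area 0x123 | children at slot 9 | 37 prims in subtree */
uint node = 0x80000000u | (0x123u << 18) | (9u << 9) | 37u;

With at most DFS_WG_SIZE = 256 primitives per workgroup, the 9-bit count in bits 8:0 never carries into the child-offset field at bit 9, which is what the full-word atomic increment performed by this helper relies on.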
Given correct tree construction algorithm this will not overflow into MSBs - return (atomic_inc_local( &tree->nodes[node_index] )) & 0x1ff; -} - -GRL_INLINE ushort LocalBVH2_GetNodeArea( local struct LocalBVH2* tree, ushort nodeID ) -{ - return (tree->nodes[nodeID] >> 18) & 0x1FFF; -} - -GRL_INLINE bool LocalBVH2_IsInnerNode( local struct LocalBVH2* tree, ushort nodeID ) -{ - return (tree->nodes[nodeID] & 0x80000000) != 0; -} - - -GRL_INLINE ushort2 LocalBVH2_GetChildIndices( local struct LocalBVH2* tree, ushort nodeID ) -{ - ushort idx = ((tree->nodes[nodeID] >> 9) & 0x1FF); - return (ushort2)(idx, idx + 1); -} - -GRL_INLINE ushort LocalBVH2_GetSubtreePrimCount( local struct LocalBVH2* tree, ushort node ) -{ - return tree->nodes[node] & 0x1FF; -} - -GRL_INLINE ushort LocalBVH2_GetLeafPrimStart( local struct LocalBVH2* tree, ushort node ) -{ - return ((tree->nodes[node] >> 9) & 0x1FF); -} - - -GRL_INLINE void LocalBVH2_Printf( local struct LocalBVH2* tree ) -{ - barrier( CLK_LOCAL_MEM_FENCE ); - - if ( get_local_id( 0 ) == 0 ) - { - printf( "Nodes: %u\n", tree->num_nodes ); - - for ( uint i = 0; i < tree->num_nodes; i++ ) - { - uint num_prims = LocalBVH2_GetSubtreePrimCount( tree, i ); - printf( "%3u : 0x%08x %3u 0x%04x ", i, tree->nodes[i], num_prims, LocalBVH2_GetNodeArea(tree,i) ); - if ( LocalBVH2_IsInnerNode( tree, i ) ) - { - ushort2 kids = LocalBVH2_GetChildIndices( tree, i ); - printf( " INNER ( %3u %3u )\n", kids.x, kids.y ); - } - else - { - printf( " LEAF {" ); - for ( uint j = 0; j < num_prims; j++ ) - printf( " %3u ", LocalBVH2_GetLeafPrimStart( tree, i ) + j ); - printf( "}\n" ); - } - } - } - - barrier( CLK_LOCAL_MEM_FENCE ); -} - -struct FlatTreeInnerNode -{ - uint DW0; // lower 16b are index of corresponding LocalBVH2 node.. Bits 30:16 are an atomic flag used during refit. 
Bit 31 is a leaf marker - ushort parent_index; - ushort first_child; - uchar index_in_parent; - uchar num_children; - - //struct DFSPrimRefAABB AABB; -}; - -struct FlatTree -{ - uint num_nodes; - uint qnode_byte_offset; // byte offset from the BVHBase to the flat-tree's first QNode - uint qnode_base_index; - - struct FlatTreeInnerNode nodes[DFS_MAX_FLATTREE_NODES]; - uchar primref_back_pointers[DFS_WG_SIZE]; -}; - -GRL_INLINE void FlatTree_Printf( local struct FlatTree* flat_tree ) -{ - barrier( CLK_LOCAL_MEM_FENCE ); - if ( get_local_id( 0 ) == 0 ) - { - printf( "NumNodes: %u\n", flat_tree->num_nodes ); - for ( uint i = 0; i < flat_tree->num_nodes; i++ ) - { - ushort bvh2_node = flat_tree->nodes[i].DW0 & 0xffff; - printf( "%2u Parent: %2u Index_in_parent: %u, NumKids: %u FirstKid: %3u bvh2: %3u DW0: 0x%x\n", - i, - flat_tree->nodes[i].parent_index, - flat_tree->nodes[i].index_in_parent, - flat_tree->nodes[i].num_children, - flat_tree->nodes[i].first_child, - bvh2_node, - flat_tree->nodes[i].DW0 ); - } - } - barrier( CLK_LOCAL_MEM_FENCE ); -} - - - - -GRL_INLINE ushort FlatTree_GetNodeCount( local struct FlatTree* flat_tree ) -{ - return flat_tree->num_nodes; -} - -GRL_INLINE uint FlatTree_GetParentIndex( local struct FlatTree* flat_tree, ushort id ) -{ - return flat_tree->nodes[id].parent_index; -} - -GRL_INLINE ushort FlatTree_GetBVH2Root( local struct FlatTree* flat_tree, ushort node_index ) -{ - return (flat_tree->nodes[node_index].DW0) & 0xffff; -} - -GRL_INLINE ushort FlatTree_GetNumChildren( local struct FlatTree* flat_tree, ushort node_index ) -{ - return flat_tree->nodes[node_index].num_children; -} - -GRL_INLINE bool FlatTree_IsLeafNode( local struct FlatTree* flat_tree, ushort node_index ) -{ - return (flat_tree->nodes[node_index].DW0 & 0x80000000) != 0; -} - - -GRL_INLINE uint FlatTree_GetQNodeByteOffset( struct FlatTree* flat_tree, ushort node_index ) -{ - return flat_tree->qnode_byte_offset + node_index * sizeof(struct QBVHNodeN); -} - -GRL_INLINE uint FlatTree_GetQNodeIndex( struct FlatTree* flat_tree, ushort node_index ) -{ - return flat_tree->qnode_base_index + node_index; -} - -GRL_INLINE void FlatTree_AllocateQNodes( struct FlatTree* flat_tree, struct DFSArgs args ) -{ - uint node_base = 64*allocate_inner_nodes( args.bvh_base, flat_tree->num_nodes ); - flat_tree->qnode_base_index = (node_base - BVH_ROOT_NODE_OFFSET) / sizeof( struct QBVHNodeN ); - flat_tree->qnode_byte_offset = node_base; -} - -GRL_INLINE ushort FlatTree_GetFirstChild( struct FlatTree* flat_tree, ushort node_index ) -{ - return flat_tree->nodes[node_index].first_child; -} - -GRL_INLINE ushort FlatTree_GetPrimRefStart( struct FlatTree* flat_tree, ushort node_index ) -{ - return flat_tree->nodes[node_index].first_child; -} -GRL_INLINE ushort FlatTree_GetPrimRefCount( struct FlatTree* flat_tree, ushort node_index ) -{ - return flat_tree->nodes[node_index].num_children; -} - -GRL_INLINE uint FlatTree_BuildBackPointer( local struct FlatTree* flat_tree, ushort node_index ) -{ - uint parent_index = flat_tree->nodes[node_index].parent_index + flat_tree->qnode_base_index; - parent_index = (parent_index << 6) | (FlatTree_GetNumChildren( flat_tree, node_index ) << 3); - return parent_index; -} - - -GRL_INLINE void SUBGROUP_FlatTree_Initialize( uniform local struct FlatTree* flat_tree, struct DFSArgs args ) -{ - if ( get_sub_group_local_id() == 0 ) - { - flat_tree->num_nodes = 1; - flat_tree->nodes[0].DW0 = 0; // point first node at BVH2 root node, which is assumed to be at index zero - } - -} -/* -GRL_INLINE void 
SUBGROUP_FlatTree_ReduceAndSetAABB( uniform local struct FlatTree* flat_tree, - uniform ushort node_index, - varying local struct DFSPrimRefAABB* box ) -{ - // TODO_OPT: Replace this with an optimized reduction which exploits the fact that we only ever have 6 active lanes - // Try using the "negated max" trick here to compute min/max simultaneously, with max in top 6 lanes - // This will replace 6 reductions with 3 - - // TODO_OPT: This only utilizes up to 6 SIMD lanes. We can use up to 12 of them by putting - // min into even lanes, and -max into odd lanes, and using a manual min-reduction on pairs of lanes - - struct DFSPrimRefAABB bb = DFSPrimRefAABB_sub_group_reduce( box ); - if( get_sub_group_local_id() ) - flat_tree->nodes[node_index].AABB = bb; -} -*/ - -GRL_INLINE void SUBGROUP_FlatTree_CreateInnerNode( uniform local struct FlatTree* flat_tree, - uniform ushort flat_tree_root, - varying ushort sg_child_bvh2_root, - uniform ushort num_children ) -{ - uniform uint lane = get_sub_group_local_id(); - - // increment counter to allocate new nodes.. set required root node fields - uniform uint child_base; - if ( lane == 0 ) - { - child_base = atomic_add_local( &flat_tree->num_nodes, num_children ); - flat_tree->nodes[flat_tree_root].first_child = (uchar) child_base; - flat_tree->nodes[flat_tree_root].num_children = num_children; - - // initialize mask bits for this node's live children - uint child_mask = ((1 << num_children) - 1) << 16; - flat_tree->nodes[flat_tree_root].DW0 |= child_mask; - } - - child_base = sub_group_broadcast( child_base, 0 ); - - // initialize child nodes - if ( lane < num_children ) - { - varying uint child = child_base + lane; - flat_tree->nodes[child].DW0 = sg_child_bvh2_root; - flat_tree->nodes[child].index_in_parent = lane; - flat_tree->nodes[child].parent_index = flat_tree_root; - } - -} - - - -GRL_INLINE void SUBGROUP_FlatTree_CreateLeafNode( uniform local struct FlatTree* flat_tree, - uniform ushort flat_tree_root, - uniform ushort primref_start, - uniform ushort num_prims ) -{ - ushort lane = get_sub_group_local_id(); - if ( lane < num_prims ) - { - flat_tree->primref_back_pointers[primref_start + lane] = (uchar) flat_tree_root; - if ( lane == 0 ) - { - flat_tree->nodes[flat_tree_root].first_child = (uchar) primref_start; - flat_tree->nodes[flat_tree_root].num_children = (uchar) num_prims; - flat_tree->nodes[flat_tree_root].DW0 |= 0x80000000; - } - } -} - - -GRL_INLINE uniform bool SUBGROUP_FlatTree_SignalRefitComplete( uniform local struct FlatTree* flat_tree, uniform ushort* p_node_index ) -{ - uniform ushort node_index = *p_node_index; - uniform ushort parent = flat_tree->nodes[node_index].parent_index; - uniform ushort index_in_parent = flat_tree->nodes[node_index].index_in_parent; - - // clear the corresponding mask bit in the parent node - uniform uint child_mask = (0x10000 << index_in_parent); - uniform uint old_mask_bits = 0; - if( get_sub_group_local_id() == 0 ) - old_mask_bits = atomic_xor( &flat_tree->nodes[parent].DW0, child_mask ); - - old_mask_bits = sub_group_broadcast( old_mask_bits, 0 ); - - // if we cleared the last mask bit, this subgroup proceeds up the tree and refits the next node - // otherwise, it looks for something else to do - if ( ((old_mask_bits^child_mask) & 0xffff0000) == 0 ) - { - *p_node_index = parent; - return true; - } - - return false; -} - -/* -GRL_INLINE local struct DFSPrimRefAABB* FlatTree_GetChildAABB( local struct FlatTree* flat_tree, - local struct PrimRefSet* prim_refs, - ushort node_index, ushort child_index ) 
-{ - ushort child_id = FlatTree_GetFirstChild( flat_tree, node_index ) + child_index; - - if( !FlatTree_IsLeafNode( flat_tree, node_index ) ) - return &flat_tree->nodes[child_id].AABB; - else - return PrimRefSet_GetAABBPointer( prim_refs, child_id ); -} -*/ -GRL_INLINE uint FlatTree_GetPrimRefBackPointer( local struct FlatTree* flat_tree, ushort primref_index ) -{ - return flat_tree->primref_back_pointers[primref_index] * sizeof(struct QBVHNodeN) + flat_tree->qnode_byte_offset; -} - - -GRL_INLINE void FlatTree_check_boxes(local struct FlatTree* flat_tree, - global struct AABB* primref_buffer, - local struct AABB3f* boxes, - local struct PrimRefMeta* meta ) - -{ - barrier(CLK_LOCAL_MEM_FENCE); - if (get_local_id(0) == 0) - { - printf("checking flattree bounds...\n"); - - for (uint i = 0; i < flat_tree->num_nodes; i++) - { - struct AABB rb; - rb.lower.xyz = AABB3f_load_lower(&boxes[i]); - rb.upper.xyz = AABB3f_load_upper(&boxes[i]); - - uint offs = FlatTree_GetFirstChild( flat_tree, i ); - uint count = FlatTree_GetNumChildren( flat_tree, i ); - - for (uint c = 0; c < count; c++) - { - struct AABB lb; - if (FlatTree_IsLeafNode( flat_tree, i )) - { - lb = primref_buffer[ PrimRefMeta_GetInputIndex( &meta[offs+c] ) ]; - } - else - { - lb.lower.xyz = AABB3f_load_lower(&boxes[ offs+c ]); - lb.upper.xyz = AABB3f_load_upper(&boxes[ offs+c ]); - } - - if( !AABB_subset( &lb, &rb ) ) - printf("Bad bounds!! child %u of %u %f : %f %f : %f %f : %f %f : %f %f : %f %f : %f \n", - c, i , - rb.lower.x, rb.upper.x, rb.lower.y, rb.upper.y, rb.lower.z, rb.upper.z, - lb.lower.x, lb.upper.x, lb.lower.y, lb.upper.y, lb.lower.z, lb.upper.z - ); - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); -} - - -struct FlatTreeScheduler -{ - int num_leafs; - uint writeout_produce_count; - uint writeout_consume_count; - uint active_subgroups; - uint num_built_nodes; - uint num_levels; // number of depth levels in the tree - - //uchar leaf_indices[DFS_MAX_FLATTREE_LEAFS]; // indices of leaf FlatTree nodes to be refitted - //uchar writeout_indices[DFS_MAX_FLATTREE_NODES]; // indices of flattree nodes to be written out or collapsed - - ushort level_ordered_nodes[DFS_MAX_FLATTREE_NODES]; // node indices sorted by depth (pre-order, high depth before low depth) - ushort level_start[DFS_MAX_FLATTREE_DEPTH]; // first node at given level in the level-ordered node array - uint level_count[DFS_MAX_FLATTREE_DEPTH]; // number of nodes at given level -}; - -GRL_INLINE void SUBGROUP_FlatTreeScheduler_Initialize( uniform local struct FlatTreeScheduler* scheduler ) -{ - scheduler->num_built_nodes = 0; - scheduler->num_leafs = 0; - scheduler->writeout_produce_count = 0; - scheduler->writeout_consume_count = 0; - scheduler->active_subgroups = DFS_NUM_SUBGROUPS; -} -/* -GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueLeafForRefit( uniform local struct FlatTreeScheduler* scheduler, - uniform ushort leaf ) -{ - if ( get_sub_group_local_id() == 0 ) - scheduler->leaf_indices[atomic_inc( &scheduler->num_leafs )] = leaf; -}*/ - -GRL_INLINE void SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node ) -{ - if ( get_sub_group_local_id() == 0 ) - atomic_inc_local( &scheduler->num_built_nodes ); -} - -GRL_INLINE uint FlatTreeScheduler_GetNumBuiltNodes( uniform local struct FlatTreeScheduler* scheduler ) -{ - return scheduler->num_built_nodes; -} - -/* -GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node ) -{ - if ( 
get_sub_group_local_id() == 0 ) - scheduler->writeout_indices[atomic_inc( &scheduler->writeout_produce_count )] = node; -}*/ - -/* -GRL_INLINE bool SUBGROUP_FlatTreeScheduler_GetRefitTask( uniform local struct FlatTreeScheduler* scheduler, uniform ushort* leaf_idx ) -{ - // schedule the leaves in reverse order to ensure that later leaves - // complete before earlier ones.. This prevents contention during the WriteOut stage - // - // There is a barrier between this function and 'QueueLeafForRefit' so we can safely decrement the same counter - // that we incremented earlier - varying int idx = 0; - if( get_sub_group_local_id() == 0 ) - idx = atomic_dec( &scheduler->num_leafs ); - - sub_group_barrier( CLK_LOCAL_MEM_FENCE ); - idx = sub_group_broadcast( idx, 0 ); - - if ( idx <= 0 ) - return false; - - *leaf_idx = scheduler->leaf_indices[idx-1]; - return true; -}*/ - -/* -// Signal the scheduler that a subgroup has reached the DONE state. -// Return true if this is the last subgroup to be done -void SUBGROUP_FlatTreeScheduler_SubGroupDone( local struct FlatTreeScheduler* scheduler ) -{ - if ( get_sub_group_local_id() == 0 ) - atomic_dec( &scheduler->active_subgroups ); -} -*/ - -/* - -#define STATE_SCHEDULE_REFIT 0x1234 -#define STATE_SCHEDULE_WRITEOUT 0x5679 -#define STATE_REFIT 0xabcd -#define STATE_WRITEOUT 0xefef -#define STATE_DONE 0xaabb - -// Get a flattree node to write out. Returns the new scheduler state -GRL_INLINE ushort SUBGROUP_FlatTreeScheduler_GetWriteOutTask( uniform local struct FlatTreeScheduler* scheduler, - uniform ushort num_nodes, - uniform ushort* node_idx ) -{ - uniform ushort return_state = STATE_WRITEOUT; - uniform ushort idx = 0; - if ( get_sub_group_local_id() == 0 ) - { - idx = atomic_inc( &scheduler->writeout_consume_count ); - - if ( idx >= scheduler->writeout_produce_count ) - { - // more consumers than there are produced tasks.... 
- - if ( scheduler->writeout_produce_count == num_nodes ) - { - // if all nodes have been written out, flattening is done - return_state = STATE_DONE; - } - else - { - // some writeout tasks remain, and have not been produced by refit threads yet - // we need to put this one back - atomic_dec( &scheduler->writeout_consume_count ); - return_state = STATE_SCHEDULE_WRITEOUT; - } - } - else - { - // scheduled successfully - idx = scheduler->writeout_indices[idx]; - } - } - - *node_idx = sub_group_broadcast( idx, 0 ); - return sub_group_broadcast( return_state, 0 ); - -} -*/ - - -/* -GRL_INLINE void FlatTreeScheduler_Printf( local struct FlatTreeScheduler* scheduler ) -{ - barrier( CLK_LOCAL_MEM_FENCE ); - - if ( get_local_id( 0 ) == 0 ) - { - printf( "***SCHEDULER***\n" ); - printf( "built_nodes=%u active_sgs=%u leafs=%u wo_p=%u wo_c=%u\n", scheduler->num_built_nodes, scheduler->active_subgroups, scheduler->num_leafs, - scheduler->writeout_produce_count, scheduler->writeout_consume_count ); - printf( "leafs for refit: {" ); - - int nleaf = max( scheduler->num_leafs, 0 ); - - for ( uint i = 0; i < nleaf; i++ ) - printf( "%u ", scheduler->leaf_indices[i] ); - printf( "}\n" ); - - printf( "writeout queue: %u:%u {", scheduler->writeout_produce_count, scheduler->writeout_consume_count ); - for ( uint i = 0; i < scheduler->writeout_produce_count; i++ ) - printf( "%u ", scheduler->writeout_indices[i] ); - printf( "}\n" ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - -} -*/ - - -GRL_INLINE void SUBGROUP_BuildFlatTreeNode( local struct LocalBVH2* bvh2, - local struct FlatTree* flat_tree, - local struct FlatTreeScheduler* scheduler, - uniform ushort flat_tree_root ) -{ - varying ushort lane = get_sub_group_local_id(); - varying ushort bvh2_root = FlatTree_GetBVH2Root( flat_tree, flat_tree_root ); - - if ( !LocalBVH2_IsInnerNode( bvh2, bvh2_root ) ) - { - uniform ushort num_prims = LocalBVH2_GetSubtreePrimCount( bvh2, bvh2_root ); - uniform ushort primref_start = LocalBVH2_GetLeafPrimStart( bvh2, bvh2_root ); - - SUBGROUP_FlatTree_CreateLeafNode( flat_tree, flat_tree_root, primref_start, num_prims ); - } - else - { - // collapse BVH2 into BVH6. - // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough - uniform ushort num_children = 2; - - uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); - varying ushort sg_bvh2_node = kids.x; - if ( lane == 1 ) - sg_bvh2_node = kids.y; - - do - { - // choose the inner node with maximum area to replace. - // Its left child goes in its old location. Its right child goes in a new lane - - varying ushort sg_area = LocalBVH2_GetNodeArea( bvh2, sg_bvh2_node ); - varying bool sg_is_inner = LocalBVH2_IsInnerNode( bvh2, sg_bvh2_node ); - sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf - - uniform ushort max_area = sub_group_reduce_max( sg_area ); - varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner; - uniform uint mask = intel_sub_group_ballot( sg_reducable ); - - // TODO_OPT: Some of these ops seem redundant.. look at trimming further - // TODO_OPT: sub_group_reduce_max results in too many instructions...... unroll the loop and specialize it.. 
- // or ask IGC to give us a version that declares a static maximum number of subgroups to use - - if ( mask == 0 ) - break; - - // choose the inner node with maximum area to replace - uniform ushort victim_child = ctz( mask ); - uniform ushort victim_node = sub_group_broadcast( sg_bvh2_node, victim_child ); - uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, victim_node ); - - if ( lane == victim_child ) - sg_bvh2_node = kids.x; - else if ( lane == num_children ) - sg_bvh2_node = kids.y; - - - num_children++; - - - }while ( num_children < TREE_ARITY ); - - SUBGROUP_FlatTree_CreateInnerNode( flat_tree, flat_tree_root, sg_bvh2_node, num_children ); - } - -} - - -GRL_INLINE void SUBGROUP_DFS_BuildFlatTree( uniform local struct LocalBVH2* bvh2, - uniform local struct FlatTree* flat_tree, - uniform local struct FlatTreeScheduler* scheduler - ) -{ - - uniform ushort flat_tree_node_index = get_sub_group_id(); - uniform ushort num_nodes = 1; - uniform ushort num_built = 0; - - uint tid = get_local_id(0); - if (tid < DFS_MAX_FLATTREE_DEPTH) - { - scheduler->level_start[tid] = DFS_MAX_FLATTREE_NODES; - scheduler->level_count[tid] = 0; - scheduler->num_levels = 0; - } - - LOOP_TRIPWIRE_INIT; - - do - { - // process one flat tree node per sub group, as many as are available - // - // The first pass will only run one sub-group, the second up to 6, the third up to 36, and so on - // nodes will be processed in breadth-first order, but they are not guaranteed to be stored in this order - // due to use of atomic counters for node allocation - // - if ( flat_tree_node_index < num_nodes ) - { - SUBGROUP_BuildFlatTreeNode( bvh2, flat_tree, scheduler, flat_tree_node_index ); - SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( scheduler, flat_tree_node_index ); - flat_tree_node_index += get_num_sub_groups(); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - // bump up the node count if new nodes were created - // stop as soon as all flattree nodes have been processed - num_nodes = FlatTree_GetNodeCount( flat_tree ); - num_built = FlatTreeScheduler_GetNumBuiltNodes( scheduler ); - - barrier( CLK_LOCAL_MEM_FENCE ); - - LOOP_TRIPWIRE_INCREMENT( 300 ); - - } while ( num_built < num_nodes ); - - barrier( CLK_LOCAL_MEM_FENCE ); - - - // determine depth of each node, compute node ranges and counts for each depth level, - // and prepare a depth-ordered node index array - uint depth = 0; - uint level_pos = 0; - for( uint i=tid; ilevel_count[depth] ); - - // compute total number of levels - atomic_max_local( &scheduler->num_levels, depth+1 ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - for( uint i=tid; ilevel_count[d]; - - scheduler->level_start[depth] = level_start; - - // scatter node indices into level-ordered node array - scheduler->level_ordered_nodes[level_start + level_pos] = tid; - } - - barrier( CLK_LOCAL_MEM_FENCE ); - -} - -/* -GRL_INLINE bool SUBGROUP_RefitNode( uniform local struct FlatTree* flat_tree, - uniform local struct PrimRefSet* prim_refs, - uniform ushort* p_node_index ) -{ - - // fetch and reduce child AABBs across the subgroup - uniform ushort node_index = *p_node_index; - uniform ushort num_kids = FlatTree_GetNumChildren( flat_tree, node_index ); - varying ushort sg_child_index = (get_sub_group_local_id() < num_kids) ? 
get_sub_group_local_id() : 0; - - varying local struct DFSPrimRefAABB* box = FlatTree_GetChildAABB( flat_tree, prim_refs, node_index, sg_child_index ); - - SUBGROUP_FlatTree_ReduceAndSetAABB( flat_tree, node_index, box ); - - if ( node_index == 0 ) - return false; // if we just refitted the root, we can stop now - - // signal the parent node that this node was refitted. If this was the last child to be refitted - // returns true and sets 'node_index' to the parent node, so that this thread can continue refitting - return SUBGROUP_FlatTree_SignalRefitComplete( flat_tree, p_node_index ); -}*/ - -GRL_INLINE struct QBVHNodeN* qnode_ptr( BVHBase* bvh_mem, uint byte_offset ) -{ - return (struct QBVHNodeN*)(((char*)bvh_mem) + byte_offset); -} - -GRL_INLINE void SUBGROUP_WriteQBVHNode( - uniform local struct FlatTree* flat_tree, - uniform local struct PrimRefMeta* primref_meta, - uniform local struct AABB3f* boxes, - uniform ushort flat_tree_root, - uniform struct DFSArgs args, - uniform local uchar* masks - ) -{ - - - uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root ); - uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root ); - - varying ushort lane = get_sub_group_local_id(); - varying ushort sg_child_index = (lane < num_children) ? lane : 0; - - uniform ushort child_base = FlatTree_GetFirstChild( flat_tree, flat_tree_root ); - - varying struct AABB sg_box4; - if (FlatTree_IsLeafNode( flat_tree, flat_tree_root )) - { - // fetch AABBs for primrefs - sg_box4 = args.primref_buffer[ PrimRefMeta_GetInputIndex( &primref_meta[child_base + sg_child_index] ) ]; - - } - else - { - // fetch AABBs for child nodes - sg_box4.lower.xyz = AABB3f_load_lower( &boxes[child_base+sg_child_index] ); - sg_box4.upper.xyz = AABB3f_load_upper( &boxes[child_base+sg_child_index] ); - } - - - struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) ); - - uniform int offset; - uniform uint child_type; - if ( is_leaf ) - { - char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); - - leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes; - - offset = (int)(leaf_mem - (char*)qnode); - child_type = args.leaf_node_type; - } - else - { - struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) ); - offset = (int) ((char*)kid - (char*)qnode); - child_type = args.inner_node_type; - } - offset = offset >> 6; - - if (child_type == NODE_TYPE_INSTANCE) - { - uint instanceMask = PrimRefMeta_GetInstanceMask( &primref_meta[child_base + sg_child_index] ); - subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? 
instanceMask : 0 ); - } - else - { - uint mask = BVH_NODE_DEFAULT_MASK; - if( args.need_masks ) - mask = masks[flat_tree_root]; - - subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode, mask ); - } - - if ( args.need_backpointers ) - { - global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); - uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root ); - uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root ); - back_pointers[idx] = bp; - } - - /* - // TODO_OPT: Eventually this section should also handle leaf splitting due to mixed primref types - // For now this is done by the leaf creation pipeline, but that path should probably be refactored - // such that all inner node creation is done in one place - - uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root ); - uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root ); - - varying ushort lane = get_sub_group_local_id(); - varying ushort sg_child_index = (lane < num_children) ? lane : 0; - - varying local struct DFSPrimRefAABB* sg_box = FlatTree_GetChildAABB( flat_tree, prim_refs, flat_tree_root, sg_child_index ); - - varying struct AABB sg_box4 = PrimRefSet_ConvertAABB( prim_refs, sg_box ); - - struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) ); - - uniform int offset; - uniform uint child_type; - if ( is_leaf ) - { - char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base ); - - leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes; - - offset = (int)(leaf_mem - (char*)qnode); - child_type = args.leaf_node_type; - } - else - { - struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) ); - offset = (int) ((char*)kid - (char*)qnode); - child_type = args.inner_node_type; - } - offset = offset >> 6; - - if (child_type == NODE_TYPE_INSTANCE) - { - uint instanceMask = PrimRefSet_GetInstanceMask( prim_refs, FlatTree_GetPrimRefStart(flat_tree, flat_tree_root) + lane ); - subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? 
instanceMask : 0 ); - } - else - subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode ); - - if ( args.need_backpointers ) - { - global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base ); - uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root ); - uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root ); - back_pointers[idx] = bp; - } - */ -} - -/* -GRL_INLINE void SUBGROUP_DFS_RefitAndWriteOutFlatTree( - uniform local struct FlatTree* flat_tree, - uniform local struct PrimRefSet* prim_refs, - uniform local struct FlatTreeScheduler* scheduler, - uniform struct DFSArgs args) -{ - - uniform ushort state = STATE_SCHEDULE_REFIT; - uniform ushort node_index = 0; - uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree); - - { - LOOP_TRIPWIRE_INIT; - - bool active = true; - bool continue_refit = false; - while (1) - { - if (active) - { - if (continue_refit || SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index)) - { - continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index); - } - else - { - active = false; - if (get_sub_group_local_id() == 0) - atomic_dec(&scheduler->active_subgroups); - - sub_group_barrier(CLK_LOCAL_MEM_FENCE); - } - } - - barrier(CLK_LOCAL_MEM_FENCE); // finish all atomics - if (scheduler->active_subgroups == 0) - break; - barrier(CLK_LOCAL_MEM_FENCE); // finish all checks.. prevent race between thread which loops around and thread which doesn't - - LOOP_TRIPWIRE_INCREMENT(200); - } - } - - for (uint i = get_sub_group_id(); i < num_nodes; i += get_num_sub_groups()) - SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, i, args); - - barrier(CLK_LOCAL_MEM_FENCE); - - - // JDB: Version below attempts to interleave refit and qnode write-out - // This could theoretically reduce thread idle time, but it is more complex and does more atomics for scheduling - -#if 0 - // after we've constructed the flat tree (phase 1), there are two things that need to happen: - // PHASE 2: Refit the flat tree, computing all of the node ABBs - // PHASE 3: Write the nodes out to memory - // - // all of this is sub-group centric. Different subgroups can execute phases 2 and 3 concurrently - // - - // TODO_OPT: The scheduling algorithm might need to be re-thought. - // Fused EUs are very hard to reason about. It's possible that by scheduling independent - // SGs in this way we would lose a lot of performance due to fused EU serialization. 
- // Needs to be tested experimentally if such a thing is possible - - uniform ushort state = STATE_SCHEDULE_REFIT; - uniform ushort node_index = 0; - uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree); - - LOOP_TRIPWIRE_INIT; - - do - { - // barrier necessary to protect access to scheduler->active_subgroups - barrier(CLK_LOCAL_MEM_FENCE); - - if (state == STATE_SCHEDULE_REFIT) - { - if (SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index)) - state = STATE_REFIT; - else - state = STATE_SCHEDULE_WRITEOUT; // fallthrough - } - if (state == STATE_SCHEDULE_WRITEOUT) - { - state = SUBGROUP_FlatTreeScheduler_GetWriteOutTask(scheduler, num_nodes, &node_index); - if (state == STATE_DONE) - SUBGROUP_FlatTreeScheduler_SubGroupDone(scheduler); - } - - - // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask' - // Note that in theory we could have the write-out tasks spin until the refit tasks clear, which would make this barrier unnecessary - // However, we cannot do this safely on SKUs which do not support independent subgroup forward progress. - barrier(CLK_LOCAL_MEM_FENCE); - - if (state == STATE_REFIT) - { - uniform ushort prev_node = node_index; - uniform bool continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index); - - SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut(scheduler, prev_node); - - if (!continue_refit) - state = STATE_SCHEDULE_REFIT; - } - else if (state == STATE_WRITEOUT) - { - SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, node_index, args); - state = STATE_SCHEDULE_WRITEOUT; - } - // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask' - barrier(CLK_LOCAL_MEM_FENCE); - - LOOP_TRIPWIRE_INCREMENT(200); - - } while (scheduler->active_subgroups > 0); - -#endif -} -*/ - -GRL_INLINE void DFS_CreatePrimRefSet( struct DFSArgs args, - local struct PrimRefSet* prim_refs ) -{ - ushort id = get_local_id( 0 ); - ushort num_primrefs = args.num_primrefs; - - - PrimRef ref; - struct AABB3f local_aabb; - if ( id < num_primrefs ) - { - ref = args.primref_buffer[id]; - AABB3f_set_lower( &local_aabb, ref.lower.xyz ); - AABB3f_set_upper( &local_aabb, ref.upper.xyz ); - } - else - { - AABB3f_init( &local_aabb ); - } - - AABB3f_atomic_merge_localBB_nocheck( &prim_refs->root_aabb, &local_aabb ); - - barrier( CLK_LOCAL_MEM_FENCE ); - - if ( id < num_primrefs ) - PrimRefSet_SetPrimRef_FullPrecision( prim_refs, ref, id ); -} - - - -struct BVHBuildLocals -{ - float Al[DFS_WG_SIZE]; - float Ar[DFS_WG_SIZE]; - uchar2 axis_and_left_count[ DFS_WG_SIZE ]; - uint sah[DFS_WG_SIZE]; - uint num_active_threads; -}; - - -GRL_INLINE void DFS_ConstructBVH2( local struct LocalBVH2* bvh2, - local struct PrimRefSet* prim_refs, - ushort num_prims, - local struct BVHBuildLocals* locals ) -{ - ushort tid = get_local_id( 0 ); - - ushort bvh2_root = 0; - ushort prim_range_start = 0; - ushort primref_position = tid; - - bool active_thread = tid < num_prims; - float root_area = PrimRefSet_GetMaxAABBArea( prim_refs ); - float area_scale = DFS_BVH2_AREA_QUANT / root_area; - - locals->num_active_threads = num_prims; - barrier( CLK_LOCAL_MEM_FENCE ); - - LOOP_TRIPWIRE_INIT; - - do - { - if(active_thread && prim_range_start == primref_position) - locals->sah[primref_position] = UINT_MAX; - - if ( active_thread ) - { - local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); - - // each thread evaluates a possible split candidate. 
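The cost each candidate computes in the scan below is the standard surface-area heuristic with the terms that are identical for every candidate dropped, so only the relative ordering matters. A sketch of the relation, using the names that appear in the loop (illustrative, not text from the original source):

/* sah(candidate) = Al * Nl + Ar * Nr
 *   Al, Ar : half-areas of the boxes of the prims falling left/right of the
 *            candidate's centroid plane on the chosen axis
 *   Nl, Nr : how many prims fall on each side (count_left / count_right below)
 * the common 1/A(parent) factor and the fixed traversal cost are omitted,
 * since they do not change which candidate wins the atomic_min reduction. */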
Scan primrefs and compute sah cost - // do this axis-by-axis to keep register pressure low - float best_sah = INFINITY; - ushort best_axis = 3; - ushort best_count = 0; - float best_al = INFINITY; - float best_ar = INFINITY; - - struct DFSPrimRefAABB box_left[3]; - struct DFSPrimRefAABB box_right[3]; - float CSplit[3]; - ushort count_left[3]; - - for ( ushort axis = 0; axis < 3; axis++ ) - { - DFSPrimRefAABB_init( &box_left[axis] ); - DFSPrimRefAABB_init( &box_right[axis] ); - - CSplit[axis] = my_box->lower[axis] + my_box->upper[axis]; - count_left[axis] = 0; - } - - // scan primrefs in our subtree and partition using this thread's prim as a split plane - { - struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start ); - - for ( ushort p = 1; p < num_prims; p++ ) - { - struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration - - for( ushort axis = 0; axis < 3; axis++ ) - { - float c = box.lower[axis] + box.upper[axis]; - - if ( c < CSplit[axis] ) - { - // this primitive is to our left. - DFSPrimRefAABB_extend( &box_left[axis], &box ); - count_left[axis]++; - } - else - { - // this primitive is to our right - DFSPrimRefAABB_extend( &box_right[axis], &box ); - } - } - - box = next_box; - } - - // last iteration without preloading box - for( ushort axis = 0; axis < 3; axis++ ) - { - float c = box.lower[axis] + box.upper[axis]; - - if ( c < CSplit[axis] ) - { - // this primitive is to our left. - DFSPrimRefAABB_extend( &box_left[axis], &box ); - count_left[axis]++; - } - else - { - // this primitive is to our right - DFSPrimRefAABB_extend( &box_right[axis], &box ); - } - } - } - - for ( ushort axis = 0; axis < 3; axis++ ) - { - float Al = DFSPrimRefAABB_halfArea( &box_left[axis] ); - float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] ); - - // Avoid NANs in SAH calculation in the corner case where all prims go right - // In this case we set Al=Ar, because such a split will only be selected if all primrefs - // are co-incident.. 
In that case, we will fall back to split-in-the-middle and both subtrees - // should store the same quantized area value - if ( count_left[axis] == 0 ) - Al = Ar; - - // compute sah cost - ushort count_right = num_prims - count_left[axis]; - float sah = Ar * count_right + Al * count_left[axis]; - - // keep this split if it is better than the previous one, or if the previous one was a corner-case - if ( sah < best_sah || best_count == 0 ) - { - // yes, keep it - best_axis = axis; - best_sah = sah; - best_count = count_left[axis]; - best_al = Al; - best_ar = Ar; - } - } - - - // write split information to SLM - locals->Al[primref_position] = best_al; - locals->Ar[primref_position] = best_ar; - locals->axis_and_left_count[primref_position].x = best_axis; - locals->axis_and_left_count[primref_position].y = best_count; - - uint sah = as_uint(best_sah); - // break ties by axis to ensure deterministic split selection - // otherwise builder can produce non-deterministic tree structure run to run - // based on the ordering of primitives (which can vary due to non-determinism in atomic counters) - // Embed split axis and index into sah value; compute min over sah and max over axis - sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | primref_position ); - - // reduce on split candidates in our local subtree and decide the best one - atomic_min_local( &locals->sah[ prim_range_start ], sah); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - ushort split_index = locals->sah[ prim_range_start ] & 255; - ushort split_axis = locals->axis_and_left_count[split_index].x; - ushort split_left_count = locals->axis_and_left_count[split_index].y; - float split_al = locals->Al[split_index]; - float split_ar = locals->Ar[split_index]; - - if ( (primref_position == prim_range_start) && active_thread ) - { - // first thread in a given subtree creates the inner node - ushort quantized_left_area = convert_ushort_rtn( split_al * area_scale ); - ushort quantized_right_area = convert_ushort_rtn( split_ar * area_scale ); - ushort start_left = prim_range_start; - ushort start_right = prim_range_start + split_left_count; - if ( split_left_count == 0 ) - start_right = start_left + (num_prims / 2); // handle split-in-the-middle case - - LocalBVH2_CreateInnerNode( bvh2, bvh2_root, - start_left, start_right, - quantized_left_area, quantized_right_area ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - struct DFSPrimRef ref; - ushort new_primref_position; - - if ( active_thread ) - { - ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root ); - bool go_left; - - if ( split_left_count == 0 ) - { - // We chose a split with no left-side prims - // This will only happen if all primrefs are located in the exact same position - // In that case, fall back to split-in-the-middle - split_left_count = (num_prims / 2); - go_left = (primref_position - prim_range_start < split_left_count); - } - else - { - // determine what side of the split this thread's primref belongs on - local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position ); - local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index ); - float c = my_box->lower[split_axis] + my_box->upper[split_axis]; - float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis]; - go_left = c < Csplit; - } - - // adjust state variables for next loop iteration - bvh2_root = (go_left) ? kids.x : kids.y; - num_prims = (go_left) ? split_left_count : (num_prims - split_left_count); - prim_range_start = (go_left) ? 
prim_range_start : prim_range_start + split_left_count; - - // determine the new primref position by incrementing a counter in the destination subtree - new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root ); - - // load our primref from its previous position - ref = PrimRefSet_GetPrimRef( prim_refs, primref_position ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - if ( active_thread ) - { - // write our primref into its sorted position - PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position ); - primref_position = new_primref_position; - - // deactivate all threads whose subtrees are small enough to form a leaf - if ( num_prims <= TREE_ARITY ) - { - active_thread = false; - atomic_dec_local( &locals->num_active_threads ); - } - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - LOOP_TRIPWIRE_INCREMENT( 50 ); - - - } while ( locals->num_active_threads > 0 ); - - -} - - - -// fast path for #prims <= TREE_ARITY -GRL_INLINE void Trivial_DFS( struct DFSArgs args ) -{ - - ushort tid = get_local_id( 0 ); - - PrimRef myRef; - AABB_init( &myRef ); - if( tid < args.num_primrefs ) - myRef = args.primref_buffer[tid]; - - uint node_offset; - if ( tid == 0 ) - node_offset = 64*allocate_inner_nodes( args.bvh_base, 1 ); - node_offset = sub_group_broadcast(node_offset,0); - - char* bvh_mem = (char*) args.bvh_base; - struct QBVHNodeN* qnode = (struct QBVHNodeN*) (bvh_mem + node_offset); - - uint child_type = args.leaf_node_type; - uint prim_base = args.bvh_base->quadLeafStart*64 ; - - char* leaf_mem = bvh_mem + prim_base; - int offset = (int)( leaf_mem - (char*)qnode ); - - if (child_type == NODE_TYPE_INSTANCE) - { - subgroup_setInstanceQBVHNodeN( offset >> 6, &myRef, args.num_primrefs, qnode, tid < args.num_primrefs ? PRIMREF_instanceMask(&myRef) : 0 ); - } - else - subgroup_setQBVHNodeN( offset >> 6, child_type, &myRef, args.num_primrefs, qnode, BVH_NODE_DEFAULT_MASK ); - - if ( tid < args.num_primrefs ) - { - global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs; - uint bp = node_offset; - - // TODO_OPT: Leaf creation pipeline can be made simpler by having a sideband buffer containing - // fatleaf index + position in fatleaf for each primref, instead of forcing leaf creation shader to reconstruct it - // should also probably do the fat-leaf splitting here - args.primref_buffer[tid] = myRef; - args.primref_index_buffer[tid] = tid; - - primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN); - - if ( tid == 0 && args.need_backpointers ) - { - uint bp = ((uint)-1) << 6; - bp |= (args.num_primrefs) << 3; - *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) = bp; - } - } -} - - - - - -void SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( uniform local struct FlatTree* flat_tree, - uniform local struct FlatTreeScheduler* flat_scheduler, - uniform local struct AABB3f* boxes, - uniform local struct PrimRefMeta* primref_meta, - uniform global struct AABB* primref_buffer, - uniform local uchar* masks, - bool need_masks ) - -{ - uniform int num_levels = (int) flat_scheduler->num_levels; - varying ushort lane = get_sub_group_local_id(); - - // iterate over depth levels in the tree... 
deepest to shallowest - for (uniform int level = num_levels - 1; level >= 0; level--) - { - // loop over a range of flattree nodes at this level, one node per sub-group - // TODO_OPT: Try and enable this code to process two nodes in a SIMD16 subgroup - uniform ushort level_start = flat_scheduler->level_start[level]; - uniform ushort level_node_count = flat_scheduler->level_count[level]; - - for (uniform ushort i = get_sub_group_id(); i < level_node_count; i += get_num_sub_groups()) - { - uniform ushort node_index = flat_scheduler->level_ordered_nodes[ level_start + i ]; - - varying struct AABB box; - AABB_init(&box); - - uniform uint child_base = FlatTree_GetFirstChild( flat_tree, node_index ); - uniform uint num_children = FlatTree_GetNumChildren( flat_tree, node_index ); - varying uint child_index = child_base + ((laneflat_tree, args ); - else if ( get_sub_group_id() == 2 ) - SUBGROUP_LocalBVH2_Initialize( &slm->u.s1.bvh2, args.num_primrefs ); - else if ( get_sub_group_id() == 4 ) - SUBGROUP_FlatTreeScheduler_Initialize( &slm->flat_scheduler ); - else if ( get_sub_group_id() == 6 ) - SUBGROUP_PrimRefSet_Initialize( &slm->u.s1.prim_refs ); - - barrier( CLK_LOCAL_MEM_FENCE ); - - // load the PrimRefs - DFS_CreatePrimRefSet( args, &slm->u.s1.prim_refs ); - - // build the BVH2 - DFS_ConstructBVH2( &slm->u.s1.bvh2, &slm->u.s1.prim_refs, args.num_primrefs, &slm->u.s1.bvh2_locals ); - - // copy out metadata for primrefs now that they have been sorted - if( tid < args.num_primrefs ) - { - slm->primitive_meta[tid] = PrimRefSet_GetMeta( &slm->u.s1.prim_refs, tid ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // collapse into a FlatTree - SUBGROUP_DFS_BuildFlatTree( &slm->u.s1.bvh2, &slm->flat_tree, &slm->flat_scheduler ); - - // allocate output QBVH6 nodes - if ( get_local_id( 0 ) == 0 ) - FlatTree_AllocateQNodes( &slm->flat_tree, args ); - - barrier( CLK_LOCAL_MEM_FENCE ); - - SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( &slm->flat_tree, &slm->flat_scheduler, &slm->u.s2.boxes[0], slm->primitive_meta, args.primref_buffer, slm->u.s2.masks, args.need_masks ); - - //FlatTree_Printf( &slm->flat_tree ); - //FlatTree_check_boxes ( &slm->flat_tree, args.primref_buffer, &slm->u.s2.boxes[0], slm->primitive_meta ); - - SUBGROUP_DFS_WriteNodes( &slm->flat_tree, &slm->u.s2.boxes[0], slm->primitive_meta, args, slm->u.s2.masks ); - - - // generate sorted primref index buffer and backpointers to feed the leaf creation pipeilne - if ( tid < args.num_primrefs ) - { - uint input_index = PrimRefMeta_GetInputIndex(&slm->primitive_meta[tid]); - - uint bp = FlatTree_GetPrimRefBackPointer( &slm->flat_tree, tid ); - global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs; - - args.primref_index_buffer[tid] = input_index; - - primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN); - - if ( tid == 0 && args.need_backpointers ) - { - *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) |= ((uint)-1) << 6; - } - } -} - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -kernel void DFS( global struct Globals* globals, - global char* bvh_mem, - global PrimRef* primref_buffer, - global uint* primref_index_buffer, - uint alloc_backpointers - ) -{ - struct DFSArgs args; - args.bvh_base = (global struct BVHBase*) bvh_mem; - args.leaf_node_type = globals->leafPrimType; - args.inner_node_type = NODE_TYPE_INTERNAL; - args.leaf_size_in_bytes = globals->leafSize; - args.primref_buffer = 
primref_buffer; - args.need_backpointers = alloc_backpointers != 0; - args.num_primrefs = globals->numPrimitives; - args.primref_index_buffer = primref_index_buffer; - args.need_masks = args.leaf_node_type == NODE_TYPE_INSTANCE; - - if ( args.num_primrefs <= TREE_ARITY ) - { - // TODO_OPT: This decision should be made using indirect dispatch - if( get_sub_group_id() == 0 ) - Trivial_DFS( args ); - return; - } - - local struct Single_WG_build_SLM slm; - - execute_single_WG_build( args, &slm ); -} - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -kernel void DFS_single_wg( - global struct Globals* globals, - global char* bvh_mem, - global PrimRef* primref_buffer, - global uint* primref_index_buffer, - uint sah_flags -) -{ - struct DFSArgs args; - args.bvh_base = (global struct BVHBase*) bvh_mem; - args.leaf_node_type = globals->leafPrimType; - args.inner_node_type = NODE_TYPE_INTERNAL; - args.leaf_size_in_bytes = globals->leafSize; - args.primref_buffer = primref_buffer; - args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; - args.num_primrefs = globals->numPrimitives; - args.primref_index_buffer = primref_index_buffer; - args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS; - - local struct Single_WG_build_SLM slm; - - execute_single_WG_build( args, &slm ); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -kernel void DFS_trivial( - global struct Globals* globals, - global char* bvh_mem, - global PrimRef* primref_buffer, - global uint* primref_index_buffer, - uint sah_flags -) -{ - struct DFSArgs args; - args.bvh_base = (global struct BVHBase*) bvh_mem; - args.leaf_node_type = globals->leafPrimType; - args.inner_node_type = NODE_TYPE_INTERNAL; - args.leaf_size_in_bytes = globals->leafSize; - args.primref_buffer = primref_buffer; - args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS; - args.num_primrefs = globals->numPrimitives; - args.primref_index_buffer = primref_index_buffer; - args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS; - - Trivial_DFS( args ); -} - - -struct DFSArgs dfs_args_from_sah_globals( global struct SAHBuildGlobals* sah_globals ) -{ - struct DFSArgs args; - args.bvh_base = (global struct BVHBase*) sah_globals->p_bvh_base; - args.leaf_node_type = sah_globals->leaf_type; - args.inner_node_type = NODE_TYPE_INTERNAL; - args.leaf_size_in_bytes = sah_globals->leaf_size; - args.primref_buffer = (global PrimRef*) sah_globals->p_primrefs_buffer; - args.need_backpointers = sah_globals->flags & SAH_FLAG_NEED_BACKPOINTERS; - args.num_primrefs = sah_globals->num_primrefs; - args.primref_index_buffer = (global uint*) sah_globals->p_primref_index_buffers; - args.need_masks = sah_globals->flags & SAH_FLAG_NEED_MASKS; - - return args; -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(DFS_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -kernel void DFS_single_wg_batchable( - global struct SAHBuildGlobals* globals_buffer, - global struct VContextScheduler* scheduler -) -{ - global struct SAHBuildGlobals* sah_globals = globals_buffer + scheduler->num_trivial_builds + get_group_id(0); - - struct DFSArgs args = dfs_args_from_sah_globals( sah_globals ); - - local struct Single_WG_build_SLM slm; - - execute_single_WG_build(args, &slm); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) 
-__attribute__((intel_reqd_sub_group_size(16))) -kernel void DFS_trivial_batchable( - global struct SAHBuildGlobals* globals_buffer -) -{ - global struct SAHBuildGlobals* sah_globals = globals_buffer + get_group_id(0); - - struct DFSArgs args = dfs_args_from_sah_globals(sah_globals); - - Trivial_DFS(args); -} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl b/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl deleted file mode 100644 index bb220b30612..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl +++ /dev/null @@ -1,357 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "api_interface.h" -#include "common.h" -#include "instance.h" - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(32, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel -primref_to_quads(global struct Globals *globals, - global struct AABB *primref, - global char *primref_index, - global char *bvh_mem, - global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, - const uint stride, - const uint offset, - const uint allow_update) -{ - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart ); - uint quadIndicesStart = bvh->quadIndicesDataStart; - - const uint numPrimitives = globals->numPrimitives; - uint i = get_group_id( 0 ) * get_local_size( 0 ) + get_local_id(0); - if (i < numPrimitives) - { - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - - const uint primrefID = *(uint *)(primref_index + i * stride + offset); - - const uint geomID = PRIMREF_geomID(&primref[primrefID]); - const uint primID0 = PRIMREF_primID0(&primref[primrefID]); - const uint primID1 = PRIMREF_primID1(&primref[primrefID]); - const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]); - - const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0); - const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1); - - const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1); - - uint vertex_stride = geomDesc[geomID].Desc.Triangles.VertexBufferByteStride; - - const uint4 indices = q.a; - - const uint mask = 0xff; // FIXME: hardcoded mask - float3 vtx0, vtx1, vtx2, vtx3; - GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices); - - uint j0 = q.lb.x; - uint j1 = q.lb.y; - uint j2 = q.lb.z; - uint shaderIndex = (mask << 24) | geomID; - uint geomIndex = geomID | (geomFlags << 30); - uint primIndex0 = primID0; - const uint delta = primID1 - primID0; - const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4)); - uint primIndex1Delta = delta | (j << 16) | (1 << 22); - - uint4 pack0 = (uint4)(shaderIndex, geomIndex, primIndex0, primIndex1Delta); - float4 pack1 = (float4)(vtx0.x, vtx0.y, vtx0.z, vtx1.x); - float4 pack2 = (float4)(vtx1.y, vtx1.z, vtx2.x, vtx2.y); - float4 pack3 = (float4)(vtx2.z, vtx3.x, vtx3.y, vtx3.z); - - global uint4* dst = (global uint4*)&quads[i]; - store_uint4_L1WB_L3WB(dst, 0, pack0); - store_uint4_L1WB_L3WB(dst, 1, as_uint4(pack1)); - store_uint4_L1WB_L3WB(dst, 2, as_uint4(pack2)); - store_uint4_L1WB_L3WB(dst, 3, as_uint4(pack3)); - - if(allow_update) - { - global uint4* vertex_indice_ptr = (global uint4*)(((char*)bvh) + (64u * quadIndicesStart + 32 * i)); - - uint4 pack_indices = (uint4) ( indices.x , indices.y, indices.z, indices.w ); - - store_uint4_L1WB_L3WB( vertex_indice_ptr, 0, pack0 ); - store_uint4_L1WB_L3WB( vertex_indice_ptr, 1, 
pack_indices * vertex_stride); - } - - if (i == 0) - bvh->quadLeafCur += numPrimitives ; - } - - - -#if 0 - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart ); - - const uint numPrimitives = globals->numPrimitives; - const uint startID = get_group_id( 0 ) * get_local_size( 0 ); - const uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives); - - for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) - { - const uint primrefID = *(uint *)(primref_index + i * stride + offset); - - const uint geomID = PRIMREF_geomID(&primref[primrefID]); - const uint primID0 = PRIMREF_primID0(&primref[primrefID]); - const uint primID1 = PRIMREF_primID1(&primref[primrefID]); - const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]); - - const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0); - const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1); - - const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1); - - const uint4 indices = q.a; - const uint mask = 0xff; // FIXME: hardcoded mask - float3 vtx0, vtx1, vtx2, vtx3; - GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices); - - setQuad(&quads[i], (float4)(vtx0,0), (float4)(vtx1,0), (float4)(vtx2,0), (float4)(vtx3,0), q.lb.x, q.lb.y, q.lb.z, geomID, primID0, primID1, mask, geomFlags ); - } - - if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0) - bvh->quadLeafCur += numPrimitives ; -#endif -} - -GRL_INLINE void create_procedural_leaf(global struct Globals *globals, - global struct AABB *primref, - local uint *primrefids, - uint numProcedurals, - struct QBVHNodeN *qnode, - global char *bvh_mem, - global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - if (get_local_id(0) >= 8) - return; - - global struct BVHBase* bvh_base = (global struct BVHBase*)bvh_mem; - - /* first read geomID of all primitives */ - uint primrefID = -1; - uint geomID = -1; - uint geomFlags = 0; - if (get_local_id(0) < numProcedurals) - { - primrefID = primrefids[get_local_id(0)]; - geomID = PRIMREF_geomID(&primref[primrefID]); - geomFlags = PRIMREF_geomFlags( &primref[primrefID] ); - } - - // cannot sort by geomID as bounds in parent node are then wrong - //ulong geomID_primrefID = (((ulong)geomID) << 32) | ((ulong)primrefID); - //geomID_primrefID = sort8_ascending_ulong(geomID_primrefID); - //geomID = geomID_primrefID >> 32; - //primrefID = geomID_primrefID; - - /* We have to split at geomID boundaries into multiple leaves. This - * block calculates the lane where a leaf starts and ends. */ - const uint geomIDprev = intel_sub_group_shuffle_up(0xFFFFFFFFu, geomID, 1u); - const uint geomIDnext = intel_sub_group_shuffle_down(geomID, 0xFFFFFFFFu, 1u); - const uint leaf_start = geomIDprev != geomID; - const uint leaf_end = geomIDnext != geomID; - const uint leaf_start_next = intel_sub_group_shuffle_down(leaf_start, 0u, 1u); - - /* This computes which leaf a lane processes. E.g. form geomID = - * [3,3,4,4,4,0] we get leaf_id = [0,0,1,1,1,2] */ - //const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); // FIXME: exclusive? - - /* This computes the n'th primitive a lane processes inside its - * leaf. For the example above we compute leaf_prim = - * [0,1,0,1,2,0]. */ - const uint leaf_prim = get_local_id(0) - sub_group_scan_inclusive_max(leaf_start ? 
get_local_id(0) : 0); - - /* from here on we allocate data and write to memory, thus only - * lanes that process a primitive should continue. */ - if (get_local_id(0) >= numProcedurals) - return; - - /* Here we allocate a single memory block for each required - * ProceduralLeaf node. We do this from a single lane to ensure - * the allocation is contiguous. */ - uint leaf_base_offset = 0; - uint n_leafs = sub_group_reduce_add(leaf_start); - if (get_local_id(0) == 0) - leaf_base_offset = allocate_procedural_leaves( bvh_base, n_leafs ); - leaf_base_offset = sub_group_broadcast(leaf_base_offset, 0); - - /* Compute the leaf offset for each lane. */ - uint leaf_offset = leaf_base_offset + sub_group_scan_inclusive_add(leaf_start) - 1; - - struct ProceduralLeaf *pleaf = ((global struct ProceduralLeaf *)(bvh_mem)) + leaf_offset; - - /* write the procedural leaf headers */ - if (leaf_end) - { - pleaf->leafDesc.shaderIndex_geomMask = 0xFF000000 | (geomID & 0x00FFFFFF); // FIXME: use accessor function. Future extensions may have shaderIndex != geomID - pleaf->leafDesc.geomIndex_flags = geomID | (geomFlags<<30); // FIXME: Use setter function - pleaf->DW1 = 0xFFFFFFF0 | (leaf_prim + 1); // !!! - } - /* write the procedural leaf primIDs */ - pleaf->_primIndex[leaf_prim] = PRIMREF_primID0(&primref[primrefID]); - - /* update leaf node offset inside parent node */ - if (get_local_id(0) == 0) - { - QBVH6Node_set_offset(qnode, pleaf); - QBVH6Node_set_type(qnode, NODE_TYPE_PROCEDURAL); - } - - /* Let parent node children point to proper procedural leaf block - * and primitive. */ - qnode->childData[get_local_id(0)] = leaf_start_next | (leaf_prim << 2); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -GRL_ANNOTATE_BIG_REG_REQ -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -primref_to_procedurals(global struct Globals *globals, - global struct AABB *primref, - global char *primref_index, - global char *bvh_mem, - global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, - const uint stride, - const uint offset) -{ - global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; - - const uint numPrimitives = globals->numPrimitives; - uint startID = get_group_id( 0 ) * get_local_size( 0 ); - uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives); - - uint offset1 = stride * globals->numPrimitives; - if (stride == 8) - offset1 = 4; - - uint prev_start_back_pointer = startID == 0 ? -1 : *(uint *)(primref_index + (startID-1) * stride + offset1); - /* start at leaf start */ - while (startID < numPrimitives) - { - const uint back_pointer = *(uint *)(primref_index + startID * stride + offset1); - if (back_pointer != prev_start_back_pointer) - break; - startID++; - } - - uint prev_end_back_pointer = *(uint *)(primref_index + (endID-1) * stride + offset1); - /* end at next leaf start */ - while (endID < numPrimitives) - { - const uint back_pointer = *(uint *)(primref_index + endID * stride + offset1); - if (back_pointer != prev_end_back_pointer) - break; - endID++; - } - - local uint procedurals[16]; - - for (uint lid = startID + get_local_id(0); lid < endID + get_local_id(0);) - { - /* load leaf start points and back_pointer */ - const uint primrefID = *(uint *)(primref_index + lid * stride + offset); - uint back_pointer = *(uint *)(primref_index + lid * stride + offset1); - uint prev_back_pointer = get_local_id(0) == 0 ? 
-1 : *(uint *)(primref_index + (lid-1) * stride + offset1); - - const uint leaf_start = back_pointer != prev_back_pointer; - uint leaf_start_back_pointer = sub_group_broadcast(back_pointer, 0); - - /* compute number of primitives inside the leaf starting at lid */ - const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); - uint numPrimitives = 0; - if (back_pointer == leaf_start_back_pointer && lid < endID) - numPrimitives = sub_group_reduce_add(1); - numPrimitives = sub_group_broadcast(numPrimitives, 0); - - procedurals[get_local_id(0)] = primrefID; - - struct QBVHNodeN *qnode = (struct QBVHNodeN *)bvh_mem + back_pointer; - - create_procedural_leaf(globals, primref, procedurals, numPrimitives, qnode, bvh_mem, geomDesc); - - lid += numPrimitives; - } -} - -GRL_INLINE void create_HW_instance_leaf( - global struct BVHBase* bvh, - global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc, - uint dstLeafId, - uint instanceIndex, - uint rootNodeByteOffset, - uint instanceMask) -{ - /* convert DXR instance to instance leaf node */ - global struct HwInstanceLeaf* leaves = (__global struct HwInstanceLeaf*)BVHBase_quadLeaves(bvh); - HwInstanceLeaf_Constructor(&leaves[dstLeafId], instDesc, instanceIndex, rootNodeByteOffset, instanceMask); -} - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel create_HW_instance_nodes( - global const struct Globals *globals, - global char *primref_index, - global struct AABB *primref, - global struct BVHBase *bvh, - global struct GRL_RAYTRACING_INSTANCE_DESC *src_instances, - uint32_t stride, - uint32_t offset) -{ - uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id(); - uint num_prims = globals->numPrimitives; - if (dstLeafId >= num_prims) - return; - if( dstLeafId == 0 ) - bvh->instanceLeafEnd += 2*num_prims; - - /* get instance ID */ - const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset); - const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]); - const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]); - const uint instMask = PRIMREF_instanceMask(&primref[primrefID]); - create_HW_instance_leaf(bvh, &src_instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask ); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel create_HW_instance_nodes_pointers( - global const struct Globals *globals, - global char *primref_index, - global struct AABB *primref, - global struct BVHBase *bvh, - global void *instances_in, - uint32_t stride, - uint32_t offset) -{ - uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id(); - uint num_prims = globals->numPrimitives; - if (dstLeafId >= num_prims) - return; - if (dstLeafId == 0) - bvh->instanceLeafEnd += 2 * num_prims; - - global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = - (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; - - /* get instance ID */ - const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset); - const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]); - const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]); - const uint instMask = PRIMREF_instanceMask(&primref[primrefID]); - create_HW_instance_leaf(bvh, instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask ); -} diff --git 
a/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl b/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl deleted file mode 100644 index bc9cf590f51..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl +++ /dev/null @@ -1,556 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "api_interface.h" -#include "common.h" - -#define GRID_SIZE 1024 - -/* - This presplit item contains for each primitive a number of splits to - perform (priority) and the primref index. - */ - -struct PresplitItem -{ - unsigned int index; - float priority; -}; - -/* - - This function splits a line v0->v1 at position pos in dimension dim - and merges the bounds for the left and right line segments into - lbounds and rbounds. - - */ - -GRL_INLINE void splitLine(const uint dim, - const float pos, - const float4 v0, - const float4 v1, - struct AABB *lbounds, - struct AABB *rbounds) -{ - const float v0d = v0[dim]; - const float v1d = v1[dim]; - - /* this point is on left side */ - if (v0d <= pos) - AABB_extend_point(lbounds, v0); - - /* this point is on right side */ - if (v0d >= pos) - AABB_extend_point(rbounds, v0); - - /* the edge crosses the splitting location */ - if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) - { - const float f = (pos - v0d) / (v1d - v0d); - const float4 c = f * (v1 - v0) + v0; - AABB_extend_point(lbounds, c); - AABB_extend_point(rbounds, c); - } -} - -/* - - This function splits a clipped triangle v0,v1,v2 with bounds prim at - position pos in dimension dim and merges the bounds for the left and - right clipped triangle fragments into lbounds and rbounds. - - */ - -GRL_INLINE void splitTriangle(struct AABB *prim, - const uint dim, - const float pos, - const float4 v0, - const float4 v1, - const float4 v2, - struct AABB *lbounds, - struct AABB *rbounds) -{ - /* clip each triangle edge */ - splitLine(dim, pos, v0, v1, lbounds, rbounds); - splitLine(dim, pos, v1, v2, lbounds, rbounds); - splitLine(dim, pos, v2, v0, lbounds, rbounds); - - /* the triangle itself was clipped already, thus clip against triangle bounds */ - AABB_intersect(lbounds, prim); - AABB_intersect(rbounds, prim); -} - -float calculate_priority(struct AABB *prim, global GRL_RAYTRACING_GEOMETRY_DESC *geom) -{ - /* calculate projected area of first triangles */ - const uint primID0 = PRIMREF_primID0(prim); - const uint3 tri0 = GRL_load_triangle(geom, primID0); - const float4 av0 = GRL_load_vertex(geom, tri0.x); - const float4 av1 = GRL_load_vertex(geom, tri0.y); - const float4 av2 = GRL_load_vertex(geom, tri0.z); - const float area_tri0 = areaProjectedTriangle(av0, av1, av2); - - /* calculate projected area of second triangle */ - const uint primID1 = PRIMREF_primID1(prim); - const uint3 tri1 = GRL_load_triangle(geom, primID1); - const float4 bv0 = GRL_load_vertex(geom, tri1.x); - const float4 bv1 = GRL_load_vertex(geom, tri1.y); - const float4 bv2 = GRL_load_vertex(geom, tri1.z); - const float area_tri1 = areaProjectedTriangle(bv0, bv1, bv2); - - /* as priority we use the AABB area */ - const float area_aabb = AABB_halfArea(prim); - float priority = area_aabb; - - /* prefer triangles with a large potential SAH gain. 
*/ - const float area_tris = area_tri0 + area_tri1; - const float area_ratio = min(4.0f, area_aabb / max(1E-12f, area_tris)); - priority *= area_ratio; - - /* ignore too small primitives */ - //const float4 size = AABB_size(prim); - //const float max_size = max(size.x,max(size.y,size.z)); - //if (max_size < 0.5f*max_scene_size/GRID_SIZE) - // priority = 0.0f; - - return priority; -} - -/* - - This kernel calculates for each primitive an estimated splitting priority. - - */ - - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel compute_num_presplits(global struct Globals *globals, - global struct BVHBase* bvh_base, - global struct AABB *primref, - global struct PresplitItem *presplit, - global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - //assert(sizeof(PresplitItem) == sizeof_PresplitItem); - - /* calculate the range of primitives each work group should process */ - const uint numPrimitives = globals->numPrimitives; - const uint startID = (get_group_id(0) + 0) * numPrimitives / get_num_groups(0); - const uint endID = (get_group_id(0) + 1) * numPrimitives / get_num_groups(0); - - /* get scene bounding box size */ - const float3 scene_size = AABB3f_size(&bvh_base->Meta.bounds); - const float max_scene_size = max(scene_size.x, max(scene_size.y, scene_size.z)); - - /* each work group iterates over its range of primitives */ - for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) - { - const uint geomID = PRIMREF_geomID(&primref[i]); - - /* splitting heuristic for triangles */ - if (GRL_is_triangle(&geomDesc[geomID])) - { - presplit[i].index = i; - presplit[i].priority = calculate_priority(&primref[i], &geomDesc[geomID]); - } - - /* splitting of procedurals is not supported */ - else if (GRL_is_procedural(&geomDesc[geomID])) - { - presplit[i].index = i; - presplit[i].priority = 0.0f; - } - - else - { - //assert(false); - } - } - - if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0) - globals->numOriginalPrimitives = globals->numPrimitives; -} - -/* - - This kernel computes the sum of all priorities. 
- - */ - - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -priority_sum(global struct Globals *globals, - global struct PresplitItem *presplit, - uint numPrimitivesToSplit) -{ - const uint N = globals->numPrimitives; - const uint j = get_local_id(0); - const uint J = get_local_size(0); - const uint BLOCKSIZE = (N + J - 1) / J; - const uint start = min((j + 0) * BLOCKSIZE, N); - const uint end = min((j + 1) * BLOCKSIZE, N); - - float prioritySum = 0; - for (uint i = start; i < end; i++) - prioritySum += presplit[i].priority; - - prioritySum = work_group_reduce_add(prioritySum); - globals->presplitPrioritySum = prioritySum; - -#if 0 - work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); - - float scale = 1.0f; - for (uint i = 0; i < 10; i++) - { - //if (j == 0) - //printf("prioritySum = %f\n",scale*prioritySum); - - uint numSplits = 0; - for (uint i = start; i < end; i++) - numSplits += presplit[i].priority / (scale*prioritySum)*numPrimitivesToSplit; - - numSplits = work_group_reduce_add(numSplits); - - if (numSplits > numPrimitivesToSplit) - break; - - //if (j == 0) - // printf("numSplits = %i (%i)\n",numSplits,numPrimitivesToSplit); - - globals->presplitPrioritySum = scale * prioritySum; - scale -= 0.05f; - } -#endif -} - -GRL_INLINE void heapify_down(struct AABB *array, uint size) -{ - /* we start at the root */ - uint cur_node_id = 0; - struct AABB *cur_node = array; - - while (true) - { - int larger_node_id = cur_node_id; - struct AABB *larger_node = cur_node; - - /* check if left child is largest */ - const int left_node_id = 2 * cur_node_id + 1; - struct AABB *left_node = &array[left_node_id]; - if (left_node_id < size && AABB_halfArea(left_node) > AABB_halfArea(larger_node)) - { - larger_node_id = left_node_id; - larger_node = left_node; - } - - /* check if right child is largest */ - const int right_node_id = 2 * cur_node_id + 2; - struct AABB *right_node = &array[right_node_id]; - if (right_node_id < size && AABB_halfArea(right_node) > AABB_halfArea(larger_node)) - { - larger_node_id = right_node_id; - larger_node = right_node; - } - - /* if current node is largest heap property is fulfilled and we are done */ - if (larger_node_id == cur_node_id) - break; - - /* otherwise we swap cur and largest */ - struct AABB tmp = *cur_node; - *cur_node = *larger_node; - *larger_node = tmp; - - /* we continue downwards with the largest node */ - cur_node_id = larger_node_id; - cur_node = larger_node; - } -} - -GRL_INLINE void heapify_up(struct AABB *array, uint cur_node_id) -{ - /* stop if we start at the root */ - if (cur_node_id == 0) - return; - - struct AABB *cur_node = &array[cur_node_id]; - - /* we loop until we reach the root node */ - while (cur_node_id) - { - /* get parent node */ - uint parent_node_id = (cur_node_id - 1) / 2; - struct AABB *parent_node = &array[parent_node_id]; - - /* if parent is larger then current we fulfill the heap property and can terminate */ - if (AABB_halfArea(parent_node) > AABB_halfArea(cur_node)) - break; - - /* otherwise we swap cur and parent */ - struct AABB tmp = *cur_node; - *cur_node = *parent_node; - *parent_node = tmp; - - /* and continue upwards */ - cur_node_id = parent_node_id; - cur_node = parent_node; - } -} - -/* splits a quad primref */ -GRL_INLINE void splitQuadPrimRef(global GRL_RAYTRACING_GEOMETRY_DESC *geom, - struct AABB *cur, uint dim, float fsplit, - struct AABB *left, struct AABB *right) -{ - /* left and right bounds 
to compute */ - AABB_init(left); - AABB_init(right); - - /* load first triangle and split it */ - const uint primID0 = PRIMREF_primID0(cur); - const uint3 tri0 = GRL_load_triangle(geom, primID0); - const float4 av0 = GRL_load_vertex(geom, tri0.x); - const float4 av1 = GRL_load_vertex(geom, tri0.y); - const float4 av2 = GRL_load_vertex(geom, tri0.z); - splitTriangle(cur, dim, fsplit, av0, av1, av2, left, right); - - /* load second triangle and split it */ - const uint primID1 = PRIMREF_primID1(cur); - const uint3 tri1 = GRL_load_triangle(geom, primID1); - const float4 bv0 = GRL_load_vertex(geom, tri1.x); - const float4 bv1 = GRL_load_vertex(geom, tri1.y); - const float4 bv2 = GRL_load_vertex(geom, tri1.z); - splitTriangle(cur, dim, fsplit, bv0, bv1, bv2, left, right); - - /* copy the PrimRef payload into left and right */ - left->lower.w = cur->lower.w; - left->upper.w = cur->upper.w; - right->lower.w = cur->lower.w; - right->upper.w = cur->upper.w; -} - -/* - - This kernel performs the actual pre-splitting. It selects split - locations based on an implicit octree over the scene. - - */ - -#define USE_HEAP 0 -#define HEAP_SIZE 32u - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -//__attribute__((intel_reqd_sub_group_size(16))) -void kernel -perform_presplits(global struct Globals *globals, - global struct BVHBase* bvh_base, - global struct AABB *primref, - global struct PresplitItem *presplit, - global char *bvh_mem, - global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, - uint numPrimitivesToSplit) -{ - /* calculate the range of primitives each work group should process */ - const uint numPrimitives = globals->numPrimitives; - int pstart = globals->numOriginalPrimitives - numPrimitivesToSplit; - pstart = max(0, pstart); - const uint numPrimitivesToProcess = globals->numPrimitives - pstart; - const uint startID = (get_group_id(0) + 0) * numPrimitivesToProcess / get_num_groups(0); - const uint endID = (get_group_id(0) + 1) * numPrimitivesToProcess / get_num_groups(0); - - /* calculates the 3D grid */ - float4 grid_base; - grid_base.xyz = AABB3f_load_lower( &bvh_base->Meta.bounds ); - grid_base.w = 0; - - float4 grid_extend; - grid_extend.xyz = AABB3f_size(&bvh_base->Meta.bounds); - grid_extend.w=0; - - grid_extend = max(grid_extend.x, max(grid_extend.y, grid_extend.z)); - const float4 grid_scale = select(GRID_SIZE / grid_extend, 0.0f, grid_extend == 0.0f); - const float inv_grid_size = 1.0f / GRID_SIZE; - - /* we have to update centroid bounds */ - struct AABB centroidBounds; - AABB_init(¢roidBounds); - - /* initialize heap */ - struct AABB heap[HEAP_SIZE]; - uint heap_size = 0; - - /* each work group iterates over its range of primitives */ - for (uint j = startID + get_local_id(0); j < endID; j += get_local_size(0)) - { - /* array is in ascending order */ - //const uint ID = numPrimitives-1-j; - const uint ID = pstart + j; - const float prob = presplit[ID].priority; - const uint i = presplit[ID].index; - const uint geomID = PRIMREF_geomID(&primref[i]); - - /* do not split primitives with low splitting priority */ - if (prob <= 0.0f) - continue; - - /* we support splitting only for triangles */ - if (!GRL_is_triangle(&geomDesc[geomID])) - continue; - - /* compute number of split primitives to produce */ - uint numSplitPrims = prob / globals->presplitPrioritySum * numPrimitivesToSplit; - numSplitPrims = min(HEAP_SIZE, numSplitPrims); - - /* stop if not splits have to get performed */ - if (numSplitPrims <= 1) - continue; - - /* add primref to heap 
*/ - heap[0] = primref[i]; - heap_size = 1; - uint heap_pos = 0; - - /* iterate until all splits are done */ - uint prims = 1; - uint last_heap_size = heap_size; - while (prims < numSplitPrims) - { - /* map the primitive bounds to the grid */ - const float4 lower = heap[heap_pos].lower; - const float4 upper = heap[heap_pos].upper; - const float4 glower = (lower - grid_base) * grid_scale + 0.2f; - const float4 gupper = (upper - grid_base) * grid_scale - 0.2f; - uint4 ilower = convert_uint4_rtz(glower); - uint4 iupper = convert_uint4_rtz(gupper); - - /* this ignores dimensions that are empty */ - if (glower.x >= gupper.x) - iupper.x = ilower.x; - if (glower.y >= gupper.y) - iupper.y = ilower.y; - if (glower.z >= gupper.z) - iupper.z = ilower.z; - - /* Now we compute a morton code for the lower and upper grid - * coordinates. */ - const uint lower_code = bitInterleave3D(ilower); - const uint upper_code = bitInterleave3D(iupper); - - /* if all bits are equal then we cannot split */ - if (lower_code == upper_code) - { -#if !USE_HEAP - prims++; // !!!!!!! - - heap_pos++; - if (heap_pos == last_heap_size) - { - heap_pos = 0; - last_heap_size = heap_size; - } - continue; -#else - if (heap_size == 1) - break; - - const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1); - primref[offset] = heap[heap_pos]; - - presplit[offset].index = offset; - presplit[offset].priority = calculate_priority(&heap[heap_pos], &geomDesc[geomID]); - - heap[0] = heap[--heap_size]; - heapify_down(heap, heap_size); - continue; -#endif - } - - /* We find the bit position of the first differing bit from the - * top down. This bit indicates a split position inside an - * implicit octree. */ - const uint diff = 31 - clz(lower_code ^ upper_code); - - /* compute octree level and dimension to perform the split in */ - const uint level = diff / 3; - const uint dim = diff % 3; - - /* now we compute the grid position of the split */ - const uint isplit = iupper[dim] & ~((1 << level) - 1); - - /* compute world space position of split */ - const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend[dim]; - - /* split primref into left and right part */ - struct AABB left, right; - splitQuadPrimRef(&geomDesc[geomID], &heap[heap_pos], dim, fsplit, &left, &right); - prims++; - - /* update centroid bounds */ - AABB_extend_point(¢roidBounds, AABB_centroid2(&left)); - AABB_extend_point(¢roidBounds, AABB_centroid2(&right)); - -#if !USE_HEAP - - heap[heap_pos] = left; - heap[heap_size] = right; - heap_size++; - - heap_pos++; - if (heap_pos == last_heap_size) - { - heap_pos = 0; - last_heap_size = heap_size; - } -#else - - /* insert left element into heap */ - heap[0] = left; - heapify_down(heap, heap_size); - - /* insert right element into heap */ - heap[heap_size] = right; - heapify_up(heap, heap_size); - - heap_size++; -#endif - } - - /* copy primities to primref array */ - primref[i] = heap[0]; - - presplit[ID].index = i; - presplit[ID].priority = calculate_priority(&heap[0], &geomDesc[geomID]); - - for (uint k = 1; k < heap_size; k++) - { - const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1); - primref[offset] = heap[k]; - - presplit[offset].index = offset; - presplit[offset].priority = calculate_priority(&heap[k], &geomDesc[geomID]); - } - } - - /* merge centroid bounds into global bounds */ - centroidBounds = AABB_sub_group_reduce(¢roidBounds); - if (get_sub_group_local_id() == 0) - AABB_global_atomic_merge(&globals->centroidBounds, ¢roidBounds); - - 
work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); - - /* update number of primitives on finish */ - if (Globals_OnFinish(globals)) - { - globals->numPrimitives = globals->numPrimitives + globals->numSplittedPrimitives; - globals->numSplittedPrimitives = 0; - - /* update first build record */ // FIXME: should be done in builder itself - global struct BuildRecord *record = (global struct BuildRecord *)(bvh_mem + bvh_base->quadLeafStart*64); - record->end = globals->numPrimitives; - } -} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.cl b/src/intel/vulkan/grl/gpu/bvh_build_primref.cl deleted file mode 100644 index 1dd9a3cdd92..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_primref.cl +++ /dev/null @@ -1,674 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "api_interface.h" -#include "common.h" -#include "instance.h" - -#include "bvh_build_primref.h" - -//#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable -//int sub_group_non_uniform_any(int predicate); - -#define WINDOW_SIZE 16 - -/* Representation of two merged triangles. */ -struct QuadIndices -{ - uint primID0, primID1; - uint v0, v1, v2, v3; -}; - -/* - - This function calculates a PrimRef from a merged quad and writes - this PrimRef to memory. - - */ -GRL_INLINE void create_prim_ref(const uint geomID, - const struct QuadIndices quad, - global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, - struct AABB *geometryBounds, - struct AABB *centroidBounds, - global uint *numPrimitives, - global struct AABB *primref) -{ - - /* load quad vertices */ - const float4 vtx0 = GRL_load_vertex(geomDesc, quad.v0); // FIXME: these multiple load_vertex calls should get merged - const float4 vtx1 = GRL_load_vertex(geomDesc, quad.v1); - const float4 vtx2 = GRL_load_vertex(geomDesc, quad.v2); - const float4 vtx3 = GRL_load_vertex(geomDesc, quad.v3); - - /* calculate bounds for quad */ - float4 lower = min(min(vtx0, vtx1), min(vtx2, vtx3)); - float4 upper = max(max(vtx0, vtx1), max(vtx2, vtx3)); - - /* extend geometry and centroid bounds */ - const float4 centroid2 = lower + upper; - AABB_extendlu(geometryBounds, lower, upper); - AABB_extendlu(centroidBounds, centroid2, centroid2); - - PrimRef ref; - PRIMREF_setAABB( &ref, lower.xyz, upper.xyz ); - PRIMREF_setQuadMetaData( &ref, quad.primID0, quad.primID1, geomID, GRL_get_Flags( geomDesc ) ); - - /* store primref to memory */ - const uint offset = atomic_add_global(numPrimitives, 1); - primref[offset] = ref; -} - -/* - - This function calculates a PrimRef from a procedural and writes - this PrimRef to memory. 
- - */ -GRL_INLINE void create_prim_ref_procedural(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, - const uint geomID, - const uint primID, - struct AABB *geometryBounds, - struct AABB *centroidBounds, - global uint *numPrimitives, - global struct AABB *primref) -{ - /* load aabb from memory */ - struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); - - /* extend geometry and centroid bounds */ - float4 lower = (float4)(aabb.MinX, aabb.MinY, aabb.MinZ, 0.0f); - float4 upper = (float4)(aabb.MaxX, aabb.MaxY, aabb.MaxZ, 0.0f); - const float4 centroid2 = lower + upper; - AABB_extendlu(geometryBounds, lower, upper); - AABB_extendlu(centroidBounds, centroid2, centroid2); - - /* encode geomID, primID */ - uint geomFlags = GRL_get_Flags(&geomDesc[geomID]); - - PrimRef ref; - PRIMREF_setAABB( &ref, lower.xyz, upper.xyz ); - PRIMREF_setProceduralMetaData( &ref, geomID, primID, geomFlags ); - - /* store primref to memory */ - const uint offset = atomic_add_global(numPrimitives, 1); - primref[offset] = ref; -} - -/* - - This function performs a binary search to calculate the geomID and - primID of the i'th primitive of the scene. For the search a - prefix_sum array is used that stores for each location j the sum of - the number of primitives of all meshes k with k 1) - { - const uint m = (l + r) / 2; - k = prefix_sum[m]; - if (k <= i) - { - l = m; - } - else if (i < k) - { - r = m; - } - } - - struct GeomPrimID id; - id.geomID = l; - id.primID = i - prefix_sum[l]; - return id; -} - -/* - - Checks if a vertex contains only finite floating point numbers. - - */ - -GRL_INLINE bool isfinite_vertex(float4 vtx) -{ - return isfinite(vtx.x) && isfinite(vtx.y) && isfinite(vtx.z); -} - - -/* - Create primrefs from array of instance descriptors. - */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel -primrefs_from_DXR_instances(global struct Globals *globals, - global struct BVHBase* bvh, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, - uint numInstances, - global struct AABB *primrefs, - uint allowUpdate) -{ - const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; - if (instanceIndex < numInstances) - { - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; - - primrefs_from_instances( - globals, - bvh, - instance, - instanceIndex, - primrefs, - 0, - allowUpdate); - } -} - -/* - Create primrefs from array of instance descriptors. - */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -void kernel -primrefs_from_DXR_instances_indirect(global struct Globals *globals, - global struct BVHBase* bvh, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, - global struct IndirectBuildRangeInfo* indirect_data, - global struct AABB *primrefs, - uint allowUpdate) -{ - // TODO: On DG2, we have 8 dwords of 'inline data' which can be pushed - // directly to the kernel. THe rest of the kernel args are pulled using - // loads from memory. 
It may be more efficient to put 'numInstances' and - // 'allowUpdate' into 'globals' - - const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; - - if (instanceIndex < indirect_data->primitiveCount) - { - instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) - (((global char*)instances) + indirect_data->primitiveOffset); - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; - primrefs_from_instances( - globals, - bvh, - instance, - instanceIndex, - primrefs, - 0, - allowUpdate); - } -} - -/* - Create primrefs from array of pointers to instance descriptors. - */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel -primrefs_from_DXR_instances_pointers(global struct Globals *globals, - global struct BVHBase* bvh, - global void *instances_in, - uint numInstances, - global struct AABB *primrefs, - uint allowUpdate) -{ - global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = - (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; - - const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; - if (instanceIndex < numInstances) - { - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; - - primrefs_from_instances( - globals, - bvh, - instance, - instanceIndex, - primrefs, - 0, - allowUpdate); - } -} - -/* - Create primrefs from array of pointers to instance descriptors. - */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -void kernel -primrefs_from_DXR_instances_pointers_indirect(global struct Globals *globals, - global struct BVHBase* bvh, - global void *instances_in, - global struct AABB *primrefs, - global struct IndirectBuildRangeInfo* indirect_data, - uint allowUpdate) -{ - global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = - (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; - - const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; - - if (instanceIndex < indirect_data->primitiveCount) - { - instances = (global const struct GRL_RAYTRACING_INSTANCE_DESC**) - (((global char*)instances) + indirect_data->primitiveOffset); - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; - - primrefs_from_instances( - globals, - bvh, - instance, - instanceIndex, - primrefs, - 0, - allowUpdate); - } -} - - -/////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////// - -bool can_pair( uint3 a, uint3 b ) -{ - bool match0 = any( a.xxx == b.xyz ) ? 1 : 0; - bool match1 = any( a.yyy == b.xyz ) ? 1 : 0; - bool match2 = any( a.zzz == b.xyz ) ? 1 : 0; - return (match0 + match1 + match2) >= 2; -} - -void reduce_bounds( - float3 lower, - float3 upper, - global struct Globals* globals, - global struct BVHBase* bvh ) -{ - - // reduce centroid bounds... 
make sure to exclude lanes with invalid AABBs - float3 cent = lower + upper; - float3 cent_lower = select( (float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper); - float3 cent_upper = select(-(float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper); - - // reduce geo bounds - AABB3f_atomic_merge_global_sub_group_lu( &bvh->Meta.bounds, lower, upper ); - AABB_global_atomic_merge_sub_group_lu(&globals->centroidBounds, cent_lower, cent_upper ); -} - - -struct TriState -{ - bool valid; - uint prim_index; - uint pairing; - uint3 indices; - float3 lower; - float3 upper; -}; - -#define NOT_PAIRED 0xffffffff - -void load_triangle_data(uniform global char* index_buffer, - uniform const uint index_format, - uniform global char* vertex_buffer, - uniform const uint vertex_format, - uniform const uint vertex_stride, - uniform global float* transform_buffer, - uniform uint total_vert_count, - struct TriState* state, - float4* v) -{ - state->indices = GRL_load_indices_from_buffer(index_buffer, index_format, state->prim_index ); - - const uint last_vertex = total_vert_count - 1; - const uint x = min(state->indices.x, last_vertex); - const uint y = min(state->indices.y, last_vertex); - const uint z = min(state->indices.z, last_vertex); - - GRL_load_triangle_vertices(vertex_buffer, vertex_format, vertex_stride, transform_buffer, x, y, z, v); -} - -struct TriState load_triangle( uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - uniform uint base, - uniform uint num_prims, - uniform uint total_vert_count ) -{ - - struct TriState state; - state.pairing = NOT_PAIRED; - state.valid = false; - state.prim_index = base + get_sub_group_local_id(); - state.lower = (float3)(INFINITY, INFINITY, INFINITY); - state.upper = -(float3)(INFINITY, INFINITY, INFINITY); - - if (state.prim_index < num_prims) - { - state.valid = true; - float4 v[3]; - load_triangle_data((global char*)geomDesc->Desc.Triangles.pIndexBuffer, - geomDesc->Desc.Triangles.IndexFormat, - (global char*)geomDesc->Desc.Triangles.pVertexBuffer, - geomDesc->Desc.Triangles.VertexFormat, - geomDesc->Desc.Triangles.VertexBufferByteStride, - (global float*)geomDesc->Desc.Triangles.pTransformBuffer, - total_vert_count, - &state, - v); - - if (state.indices.x >= total_vert_count || state.indices.y >= total_vert_count || state.indices.z >= total_vert_count || - !isfinite_vertex(v[0]) || !isfinite_vertex(v[1]) || !isfinite_vertex(v[2]) || - state.indices.x == state.indices.y || state.indices.x == state.indices.z || state.indices.y == state.indices.z) - { - state.valid = false; - } - else - { - state.lower.xyz = min(v[2].xyz, min(v[1].xyz, v[0].xyz)); - state.upper.xyz = max(v[2].xyz, max(v[1].xyz, v[0].xyz)); - } - } - return state; -} - -void broadcast_triangles_local( struct TriState* state ) -{ - varying uint my_prim = state->prim_index; - varying uint my_pairing = state->pairing; - varying float3 my_lower = state->lower; - varying float3 my_upper = state->upper; - varying bool valid = state->valid; - varying uint3 indices = state->indices; - - for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++) - { - // don't broadcast invalid prims - if ( !sub_group_broadcast( valid, broadcast_lane ) ) - continue; - - uint broadcast_pairing = sub_group_broadcast(my_pairing, broadcast_lane); - uint broadcast_prim = sub_group_broadcast(my_prim, broadcast_lane); - - if (broadcast_pairing == NOT_PAIRED) - { - // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it - bool pairable = 
false; - uint3 other_indices = sub_group_broadcast_uint3( indices, broadcast_lane ); - if (broadcast_prim != my_prim && my_pairing == NOT_PAIRED && valid ) - { - pairable = can_pair( indices, other_indices ); - } - - - uint pairable_lane = ctz(intel_sub_group_ballot(pairable)); - if (valid && pairable_lane < get_sub_group_size()) - { - // pair the broadcast primitive with the first lane that can accept it - float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane); - float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane); - if (get_sub_group_local_id() == pairable_lane) - { - my_pairing = broadcast_prim; - my_lower.xyz = min(my_lower.xyz, broadcast_lower); - my_upper.xyz = max(my_upper.xyz, broadcast_upper); - } - - // pair the broadcast primitive with the same that was paired to it - uint pairable_prim = sub_group_broadcast(my_pairing, pairable_lane); - if (get_sub_group_local_id() == broadcast_lane) - { - my_pairing = pairable_prim; - } - } - } - else - { - // - // if this lane was already paired with the broadcasting tri - // in an earlier loop iteration, then record the pairing in this lane's registers - float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane); - float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane); - if (broadcast_pairing == my_prim) - { - my_pairing = broadcast_prim; - my_lower.xyz = min(my_lower.xyz, broadcast_lower); - my_upper.xyz = max(my_upper.xyz, broadcast_upper); - } - } - } - - state->pairing = my_pairing; - state->lower = my_lower; - state->upper = my_upper; -} - - -void broadcast_triangles_nonlocal(struct TriState* state, const struct TriState* other ) -{ - varying uint my_prim = state->prim_index; - varying uint my_pairing = state->pairing; - varying float3 my_lower = state->lower; - varying float3 my_upper = state->upper; - varying bool valid = state->valid; - varying uint3 indices = state->indices; - - for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++) - { - // don't broadcast invalid prims - if (!sub_group_broadcast(other->valid, broadcast_lane)) - continue; - - uint broadcast_pairing = sub_group_broadcast(other->pairing, broadcast_lane); - uint broadcast_prim = sub_group_broadcast(other->prim_index, broadcast_lane); - - if (broadcast_pairing == NOT_PAIRED) - { - // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it - bool pairable = false; - if ( my_pairing == NOT_PAIRED && valid ) - { - uint3 other_indices = sub_group_broadcast_uint3(other->indices, broadcast_lane); - pairable = can_pair(indices, other_indices); - } - - // pair the broadcast primitive with the first lane that can accept it - uint pairable_mask = intel_sub_group_ballot(pairable); - if (valid && (ctz(pairable_mask) == get_sub_group_local_id())) - { - my_pairing = broadcast_prim; - my_lower.xyz = min(my_lower.xyz, sub_group_broadcast_float3(other->lower.xyz, broadcast_lane)); - my_upper.xyz = max(my_upper.xyz, sub_group_broadcast_float3(other->upper.xyz, broadcast_lane)); - } - } - - } - - state->pairing = my_pairing; - state->lower = my_lower; - state->upper = my_upper; -} - -GRL_INLINE void do_triangles_to_primrefs( - global struct Globals* globals, - global struct BVHBase* bvh, - global struct AABB* primref, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - uint geomID_and_flags, - const uint num_prims) -{ - uint geomID = geomID_and_flags & 0x00ffffff; - uint geom_flags = geomID_and_flags >> 24; - 
uint prim_base = get_group_id(0) * get_local_size(0); - uint total_vert_count = GRL_get_triangles_VertexCount(geomDesc); - - struct TriState tri = load_triangle( geomDesc, prim_base, num_prims, total_vert_count ); - broadcast_triangles_local( &tri ); - - - // we will produce output if the lane creates a triangle (my_pairing == NOT_PAIRED) - // or for the lane corresponding to the larger of two triangles - bool will_write = (tri.pairing > tri.prim_index) && tri.valid; - uint write_mask = intel_sub_group_ballot(will_write); - uint write_offs = subgroup_bit_prefix_exclusive( write_mask ); - uint write_count = popcount(write_mask); - - // allocate space in primref buffer - uint write_base; - if( get_sub_group_local_id() == 0 ) - write_base = atomic_add_global( &globals->numPrimitives, write_count ); - write_offs += sub_group_broadcast( write_base, 0 ); - - uint primID0 = tri.prim_index; - uint primID1 = (tri.pairing != NOT_PAIRED) ? tri.pairing : tri.prim_index; - - if (will_write) - { - PrimRef ref; - PRIMREF_setAABB(&ref, tri.lower.xyz, tri.upper.xyz); - PRIMREF_setQuadMetaData(&ref, primID0, primID1, geomID, geom_flags); - uint8 val = (uint8)( - as_uint(ref.lower.x), as_uint(ref.lower.y), as_uint(ref.lower.z), as_uint(ref.lower.w), - as_uint(ref.upper.x), as_uint(ref.upper.y), as_uint(ref.upper.z), as_uint(ref.upper.w)); - store_uint8_L1WB_L3WB((global uint8*)(primref + write_offs), 0, val); - } - - reduce_bounds( tri.lower, tri.upper, globals, bvh ); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -triangles_to_primrefs( - global struct Globals* globals, - global struct BVHBase* bvh, - global struct AABB* primref, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - uint geomID_and_flags, - uint num_prims - ) -{ - do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel -triangles_to_primrefs_indirect( - global struct Globals* globals, - global struct BVHBase* bvh, - global struct AABB* primref, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - global struct IndirectBuildRangeInfo* indirect_data, - uint geomID_and_flags) -{ - const uint num_prims = indirect_data->primitiveCount; - do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); -} - -GRL_INLINE void do_procedurals_to_primrefs( - global struct Globals* globals, - global struct BVHBase* bvh, - global struct AABB* primref, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - uint geomID_and_flags, - const uint num_prims) -{ - uint geomID = geomID_and_flags & 0x00ffffff; - uint geomFlags = geomID_and_flags >> 24; - - uint primID = get_group_id(0) * get_local_size(0) + get_sub_group_local_id(); - - bool create_primref = false; - float3 lower = (float3)(INFINITY, INFINITY, INFINITY); - float3 upper = -(float3)(INFINITY, INFINITY, INFINITY); - if (primID < num_prims) - { - /* check if procedural is valid */ - struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(geomDesc, primID); - const bool valid_min = isfinite(aabb.MinX) && isfinite(aabb.MinY) && isfinite(aabb.MinZ); - const bool valid_max = isfinite(aabb.MaxX) && isfinite(aabb.MaxY) && isfinite(aabb.MaxZ); - if (valid_min & valid_max) - { - /* load aabb from memory */ - float3 l = (float3)(aabb.MinX, aabb.MinY, aabb.MinZ); - float3 u = (float3)(aabb.MaxX, aabb.MaxY, aabb.MaxZ); - - // convert degenerate boxes to points at the box 
centroid - lower = min( l, u ); - upper = max( l, u ); - - create_primref = true; - } - } - - uint write_mask = intel_sub_group_ballot(create_primref); - uint write_offs = subgroup_bit_prefix_exclusive(write_mask); - uint write_count = popcount(write_mask); - - // allocate space in primref buffer - uint write_base; - if (get_sub_group_local_id() == 0) - write_base = atomic_add_global(&globals->numPrimitives, write_count); - write_offs += sub_group_broadcast(write_base, 0); - - // write the primref - if (create_primref) - { - PrimRef ref; - PRIMREF_setAABB(&ref, lower.xyz, upper.xyz); - PRIMREF_setProceduralMetaData(&ref, geomID, primID, geomFlags); - primref[write_offs] = ref; - } - - reduce_bounds(lower, upper, globals, bvh); - -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -procedurals_to_primrefs( - global struct Globals* globals, - global struct BVHBase* bvh, - global struct AABB* primref, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - uint geomID_and_flags, - uint num_prims - ) -{ - do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel -procedurals_to_primrefs_indirect( - global struct Globals* globals, - global struct BVHBase* bvh, - global struct AABB* primref, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - global const struct IndirectBuildRangeInfo* indirect_data, - uint geomID_and_flags - ) -{ - const uint num_prims = indirect_data->primitiveCount; - do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims); -} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.h b/src/intel/vulkan/grl/gpu/bvh_build_primref.h deleted file mode 100644 index 25e2d3df194..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_primref.h +++ /dev/null @@ -1,246 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#if 0 -/* - -Create primrefs from array of instance descriptors. - -*/ - -void store_instance_primref( - global struct BVHBase* top_bvh, - global struct Globals* globals, - global PrimRef* primrefs, - bool alloc_primref, - PrimRef new_primref ) -{ - uint allocatePrimref = alloc_primref ? 1 : 0; - uint index = 0; - uint numAllocations = sub_group_reduce_add(allocatePrimref); - - if (get_sub_group_local_id() == 0) - { - index = atomic_add_global(&globals->numPrimitives, numAllocations); - } - - index = sub_group_broadcast(index, 0); - index = index + sub_group_scan_exclusive_add(allocatePrimref); - - if (allocatePrimref) - { - primrefs[index] = new_primref; - } - - struct AABB centroidBounds; - centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref); - struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref); - struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(¢roidBounds); - - if (get_sub_group_local_id() == 0) - { - AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz); - AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds); - } -} - - - -// Compute transformed blas AABB. 
Returns false if instance is degenerate -bool create_instance_primref( - PrimRef* ref_out, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, - global struct BVHBase* bvh, - uint instanceMask, - uint instanceIndex - ) -{ - struct AABB3f bbox; - bool alloc_primref = false; - uint rootNodeOffset = NO_NODE_OFFSET; - if (bvh != 0) - { - alloc_primref = true; - AABB3f AS_bounds = BVHBase_GetRootAABB(bvh); - - const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]); - const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]); - - if (!valid_min || !valid_max || instanceMask == 0) - { - // degenerated instance case - - // TODO this should be under if ( allocate backpointers ) - { - // we have to allocate the primref because this instance can be updated to non-degenerated - // take the origin of the instance as a bounding box. - - bbox.lower[0] = instance->Transform[3]; - bbox.lower[1] = instance->Transform[7]; - bbox.lower[2] = instance->Transform[11]; - bbox.upper[0] = instance->Transform[3]; - bbox.upper[1] = instance->Transform[7]; - bbox.upper[2] = instance->Transform[11]; - instanceMask = 0; - } - } - else - { - rootNodeOffset = BVH_ROOT_NODE_OFFSET; - float transformOverhead = 0.0f; - bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead); - } - } - - *ref_out = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, 0); - return alloc_primref; -} - -GRL_INLINE void primrefs_from_instances( - global struct Globals* globals, - global struct BVHBase* top_bvh, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, - uint instanceIndex, - global struct AABB* primrefs) -{ - bool alloc_primref = false; - PrimRef new_primref; - AABB_init(&new_primref); - - if (instance) - { - uint mask = GRL_get_InstanceMask(instance); - global struct BVHBase* bvh = (global struct BVHBase*)instance->AccelerationStructure; - alloc_primref = create_instance_primref(&new_primref, instance, bvh, mask, instanceIndex); - } - - store_instance_primref(top_bvh, globals, primrefs, alloc_primref, new_primref); -} -#endif - -#if 1 -GRL_INLINE void primrefs_from_instances( - global struct Globals* globals, - global struct BVHBase* top_bvh, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, - uint instanceIndex, - global struct AABB* primrefs, - global GRL_RAYTRACING_AABB* procedural_aabb, - uint allowUpdate - ) -{ - struct AABB3f bbox; - uint allocatePrimref = 0; - - uint rootNodeOffset = NO_NODE_OFFSET; - uint instanceMask = 0; - - bool is_procedural = (procedural_aabb != 0); - - if( instance ) - { - instanceMask = GRL_get_InstanceMask(instance) ; - if ( is_procedural ) - { - // procedural instance primref - allocatePrimref = 1; - - float3 lower = (float3)(procedural_aabb->MinX, procedural_aabb->MinY, procedural_aabb->MinZ); - float3 upper = (float3)(procedural_aabb->MaxX, procedural_aabb->MaxY, procedural_aabb->MaxZ); - - if (instanceMask == 0 || any(lower > upper)) - { - bbox.lower[0] = instance->Transform[3]; - bbox.lower[1] = instance->Transform[7]; - bbox.lower[2] = instance->Transform[11]; - bbox.upper[0] = instance->Transform[3]; - bbox.upper[1] = instance->Transform[7]; - bbox.upper[2] = instance->Transform[11]; - instanceMask = 0; - } - else - { - bbox = transform_aabb(lower, upper, instance->Transform); - } - } - else - { - 
// HW-instance primref - - global struct BVHBase* bvh = instance ? - (global struct BVHBase*)instance->AccelerationStructure : - 0; - - if (bvh != 0) - { - AABB3f AS_bounds = BVHBase_GetRootAABB(bvh); - - const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]); - const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]); - - - if (valid_min && valid_max && instanceMask != 0) - { - allocatePrimref = 1; - rootNodeOffset = BVH_ROOT_NODE_OFFSET; - float transformOverhead = 0.0f; - bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead); - } - else if (allowUpdate) - { - // degenerated instance case - // we have to allocate the primref because this instance can be updated to non-degenerated - // take the origin of the instance as a bounding box. - allocatePrimref = 1; - bbox.lower[0] = instance->Transform[3]; - bbox.lower[1] = instance->Transform[7]; - bbox.lower[2] = instance->Transform[11]; - bbox.upper[0] = instance->Transform[3]; - bbox.upper[1] = instance->Transform[7]; - bbox.upper[2] = instance->Transform[11]; - instanceMask = 0; - } - } - } - } - - uint index = 0; - uint numAllocations = sub_group_reduce_add(allocatePrimref); - - if (get_sub_group_local_id() == 0) - { - index = atomic_add_global(&globals->numPrimitives, numAllocations); - } - - index = sub_group_broadcast(index, 0); - index = index + sub_group_scan_exclusive_add(allocatePrimref); - - struct AABB new_primref; - struct AABB centroidBounds; - if (allocatePrimref) - { - new_primref = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, is_procedural); - primrefs[index] = new_primref; - centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref); - } - else - { - AABB_init(&new_primref); - AABB_init(¢roidBounds); - } - - - struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref); - struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(¢roidBounds); - - if (get_sub_group_local_id() == 0) - { - AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz); - AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds); - } -} -#endif diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.cl b/src/intel/vulkan/grl/gpu/bvh_build_refit.cl deleted file mode 100644 index bcda2fa54ec..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_refit.cl +++ /dev/null @@ -1,491 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "bvh_build_refit.h" -#include "api_interface.h" -#include "common.h" - - - - - -#if 0 -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) -void kernel -update_instance_leaves( global struct BVHBase* bvh, - uint64_t dxrInstancesArray, - uint64_t dxrInstancesPtr, - global struct AABB3f* instance_aabb_scratch -) -{ - uint num_leaves = BVHBase_GetNumHWInstanceLeaves( bvh ); - uint id = get_local_id( 0 ) + get_local_size( 0 ) * get_group_id( 0 ); - if ( id >= num_leaves ) - return; - - global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray = - (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray; - global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray = - (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr; - - global struct HwInstanceLeaf* leafs = (global 
struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh ); - - /* iterate over all children of the instance node and get their bounds */ - - uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex( &leafs[id] ); - global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL; - if ( dxrInstancesArray != NULL ) - instance = &instancesArray[instanceIdx]; - else - instance = instancesPtrArray[instanceIdx]; - - struct AffineSpace3f xfm = AffineSpace3f_load_row_major( instance->Transform ); - global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure; - struct AABB3f newSubtreeBounds = instanceBvh->Meta.bounds; - struct AABB3f bbox = AABB3f_transform( xfm, newSubtreeBounds ); // JDB TODO: Use faster abs-matrix method - - const bool valid_min = isfinite( bbox.lower[0] ) && isfinite( bbox.lower[1] ) && isfinite( bbox.lower[2] ); - const bool valid_max = isfinite( bbox.upper[0] ) && isfinite( bbox.upper[1] ) && isfinite( bbox.upper[2] ); - - uint mask = GRL_get_InstanceMask(instance); - - uint offset = instanceBvh->rootNodeOffset; - if ( !valid_min || !valid_max ) - { - bbox.lower[0] = xfm.p.x; - bbox.lower[1] = xfm.p.y; - bbox.lower[2] = xfm.p.z; - bbox.upper[0] = xfm.p.x; - bbox.upper[1] = xfm.p.y; - bbox.upper[2] = xfm.p.z; - offset = NO_NODE_OFFSET; - mask = 0; - } - - instance_aabb_scratch[id] = bbox; - - HwInstanceLeaf_Constructor( &leafs[id], instance, instanceIdx, offset, mask ); // TODO: No instance opening for refittable BVH -} -#endif - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel -update_instance_leaves(global struct BVHBase* bvh, - uint64_t dxrInstancesArray, - uint64_t dxrInstancesPtr, - global struct AABB3f* instance_aabb_scratch -) -{ - uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh); - uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); - if (id >= num_leaves) - return; - - DO_update_instance_leaves( - bvh, - dxrInstancesArray, - dxrInstancesPtr, - instance_aabb_scratch, - id, - 0 ); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel -update_instance_leaves_indirect(global struct BVHBase* bvh, - uint64_t dxrInstancesArray, - uint64_t dxrInstancesPtr, - global struct AABB3f* instance_aabb_scratch, - global struct IndirectBuildRangeInfo* indirect_data) -{ - uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh); - uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); - if (id >= num_leaves) - return; - - DO_update_instance_leaves( - bvh, - dxrInstancesArray + indirect_data->primitiveOffset, - dxrInstancesPtr, - instance_aabb_scratch, - id, - 0 ); -} - -#if 0 -/* - - This kernel refit a BVH. The algorithm iterates over all BVH nodes - to find all leaf nodes, which is where refitting starts. For these - leaf nodes bounds get recalculated and then propagates up the tree. - - One kernel instance considers a range of inner nodes as startpoints. 
- */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(8, 1, 1))) void kernel refit( - global struct BVHBase *bvh, - global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, - global struct AABB3f* instance_leaf_aabbs ) -{ - /* here we temporarily store the bounds for the children of a node */ - struct AABB childrenAABB[BVH_NODE_N6]; - - /* get pointer to inner nodes and back pointers */ - global struct QBVHNodeN *inner_nodes = BVHBase_rootNode(bvh); - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - - /* construct range of nodes that each work group will process */ - const uint numInnerNodes = BVHBase_numNodes(bvh); - const uint startID = (get_group_id(0) + 0) * numInnerNodes / get_num_groups(0); - const uint endID = (get_group_id(0) + 1) * numInnerNodes / get_num_groups(0); - - /* each workgroup iterates over its range of nodes */ - for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) - { - global struct QBVHNodeN* curNode = &inner_nodes[i]; - uint numChildren = refit_bottom(bvh, geosArray, - instance_leaf_aabbs, - curNode, - childrenAABB, - *InnerNode_GetBackPointer(backPointers, i)); - if (numChildren != 0) - { - /* update bounds of node */ - QBVHNodeN_setBounds(curNode, childrenAABB, numChildren); - - /* refit upper parts of the BVH */ - // TODO: this will not gonna work for mixed nodes - refit_bottom_up(curNode, bvh, childrenAABB, numChildren); - } - } -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(8, 1, 1))) -void kernel Find_refit_treelets( - global struct BVHBase* bvh, - global TreeletNodeData* treelets, - global uint* scratchStartpoints, - global uint* startpointAlloc) -{ - find_refit_treelets(bvh, - treelets, - scratchStartpoints, - startpointAlloc); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel Assign_refit_startpoints_to_treelets( - global struct BVHBase* bvh, - global TreeletNodeData* treelets, - global uint* scratchStartpoints) -{ - assign_refit_startpoints_to_treelets(bvh, treelets, scratchStartpoints); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(128, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel Finalize_treelets_in_groups( - global struct BVHBase* bvh, - global uint* scratchStartpoints ) -{ - local uint depths[FINALIZE_TREELETS_SLM_DEPTHS_SPACE]; - - finalize_treelets_in_groups(bvh, scratchStartpoints, depths); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(256, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel Refit_quads_tree_per_group(global SquashedInput* psqinputs) -{ - uint group_id = get_group_id(0); - SquashedInput sqinput = psqinputs[group_id]; - global struct BVHBase* bvh = sqinput.pBvh; - uint numLeaves = BVHBase_GetNumQuads(bvh); - global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh); - - global void* input = sqinput.pInput; - global struct AABB* bbox_scratch = sqinput.bbox_scratch; - - uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64; - global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; - uint id = get_local_id(0); - - for (uint leaf_id = id; leaf_id < numLeaves; leaf_id += get_local_size(0)) - { - struct AABB theAABB; - refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB); - theAABB.lower.w = as_float(0xABBADEFFu); - bbox_scratch[leafsIndexOffset + leaf_id] = theAABB; - } -} - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL 
-__attribute__((reqd_work_group_size(32, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel Refit_quads( - global struct BVHBase* bvh, - global void* input, - global struct AABB* bbox_scratch, - uint numGroupsExecuted, - global SquashedInputGroupDesc* sqinput) -{ - uint numLeafs = BVHBase_GetNumQuads(bvh); - if (numLeafs == 0) return; - global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh); - - global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; - uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64; - - uint numLeafsPerGr = (numLeafs + (numGroupsExecuted - 1)) / numGroupsExecuted; - - uint id_start = get_group_id(0) * numLeafsPerGr + get_local_id(0); - uint id_end = min(id_start + numLeafsPerGr, numLeafs); - for (uint id = id_start; id < id_end; id+= get_local_size(0)) - { - struct AABB theAABB; - refit_bottom_child_quad(leafs + id, geosArray, &theAABB); - theAABB.lower.w = as_float(0xABBADEFFu); - bbox_scratch[leafsIndexOffset + id] = theAABB; - } - - if (get_group_id(0) == 0 && get_local_id(0) < 16) - { - - uint groupnr; - uint treeletCnt = *BVHBase_GetRefitTreeletCntPtr(bvh); - if (get_sub_group_local_id() == 0) { - groupnr = atomic_add_global(&sqinput->totalNumGroups, treeletCnt); - } - groupnr = sub_group_broadcast(groupnr, 0); - for (uint subtree = get_sub_group_local_id(); subtree < treeletCnt; subtree += get_sub_group_size()) - { - uint gr = groupnr + subtree; - //printf("tree %llx, treelet %d/%d, grId %d, numStartpoints %d\n", bvh, subtree,treeletCnt, gr, BVHBase_GetRefitTreeletDescs(bvh)[subtree].numStartpoints); - sqinput[gr].bvh = (qword)bvh; - sqinput[gr].scratch = (qword)bbox_scratch; - sqinput[gr].groupInTree = subtree; - } - //if (get_local_id(0)==0 && treeletCnt > 1) - //{ - // printf("tree %llx, tip treelet %d/%d = numStartpoints %d depth %d\n", bvh, treeletCnt, treeletCnt, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].numStartpoints, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].maxDepth); - //} - } -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(256, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel -Refit_tree_per_group_quad( - global SquashedInput* psqinputs) -{ - uint group_id = get_group_id(0); - SquashedInput sqinput = psqinputs[group_id]; - global struct BVHBase* bvh = sqinput.pBvh; - global struct AABB* bbox_scratch = sqinput.bbox_scratch; - global void* pInput = sqinput.pInput; - local Treelet_by_single_group_locals loc; - - if (*BVHBase_GetRefitTreeletCntPtr(bvh) == 0) - return; - -#if REFIT_DEBUG_CHECKS - uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh); - if (bottoms_cnt != 1) { - if (get_local_id(0) == 0) - { - printf("Error: this tree has more than 1 treelets!\n"); - } - return; - } -#endif - - /* get pointer to inner nodes and back pointers */ - uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); - - // uniform per group - uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh); - - uint numLeafs = bvh->quadLeafCur - bvh->quadLeafStart; - - if (numLeafs == 0) { return; } - - uint numLeafsByOneThread = (numLeafs + (get_local_size(0) - 1)) / get_local_size(0); - - update_quads(bvh, pInput, bbox_scratch, get_local_id(0), numLeafsByOneThread); - - mem_fence_workgroup_default(); work_group_barrier(0); - - RefitTreelet trltDsc = *pTrltDsc; - - refit_treelet_by_single_group( - bbox_scratch, - &loc, - bvh, - trltDsc, - false, - true); - - if (trltDsc.maxDepth > 0) - { - 
mem_fence_workgroup_default(); work_group_barrier(0); - post_refit_encode_qnode_tree_per_group(bbox_scratch,bvh); - } -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(256, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel -Refit_treelet_per_group( - global SquashedInputGroupDesc* sqinput) -{ - uint group_id = get_group_id(0); - global struct AABB* bbox_scratch = (global struct AABB* )sqinput[group_id].scratch; - global struct BVHBase* bvh = (global struct BVHBase* )sqinput[group_id].bvh; - group_id = sqinput[group_id].groupInTree; - - /* get pointer to inner nodes and back pointers */ - uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); - - uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh); - - // uniform per group - uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh); - - bool should_we_process_treetip = true; - local Treelet_by_single_group_locals loc; - local bool* l_should_we_process_treetip = (local bool*)&loc; -#if REFIT_VERBOSE_LOG - if (group_id != 0) return; -#endif - - if (bottoms_cnt > 1) - { -#if REFIT_VERBOSE_LOG - for (; group_id < bottoms_cnt; group_id++) - { - if (get_local_id(0) == 0) { printf("\n ====== treelet %d ====== \n", group_id); } - work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, memory_scope_device); -#endif - bool rootProcThread = refit_treelet_by_single_group( - bbox_scratch, - &loc, - bvh, - pTrltDsc[group_id], - true, - false); - - // we have to make last group that finishes go up and process the treetip - if (rootProcThread) - { - - mem_fence_gpu_invalidate(); - uint finished_cnt = atomic_inc_global((global uint*) & bvh->refitTreeletCnt2); - should_we_process_treetip = finished_cnt + 1 == bottoms_cnt; - - * l_should_we_process_treetip = should_we_process_treetip; - - if (should_we_process_treetip) mem_fence_gpu_invalidate(); - } -#if REFIT_VERBOSE_LOG - } -#endif - work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group); - - should_we_process_treetip = *l_should_we_process_treetip; - } - - if (should_we_process_treetip) - { - //this group will process treetip - if (get_local_id(0) == 0) { bvh->refitTreeletCnt2 = 0; } - if (bottoms_cnt == 1) { bottoms_cnt = 0; } - refit_treelet_by_single_group( - bbox_scratch, - &loc, - bvh, - pTrltDsc[bottoms_cnt], - true, - true); - } -} - -/* - This kernel refit a BVH. The algorithm iterates over all BVH nodes - to find all leaf nodes, which is where refitting starts. For these - leaf nodes bounds get recalculated and then propagates up the tree. - - One kernel instance considers exactly one inner_node startpoint. - not range of inner nodes. 
- */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(8, 1, 1))) void kernel -Refit_per_one_startpoint( - global struct BVHBase* bvh, - global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, - global struct AABB3f* instance_leaf_aabbs ) -{ - /* here we temporarily store the bounds for the children of a node */ - struct AABB childrenAABB[BVH_NODE_N6]; - - /* get pointer to inner nodes and back pointers */ - global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - - /* get the inner node that we will consider as a bottom startpoint */ - const uint numInnerNodes = BVHBase_numNodes(bvh); - const uint innerNodeIdx = (get_group_id(0) + 0) * get_local_size(0) + get_local_id(0); - - if (innerNodeIdx >= numInnerNodes) return; - - global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx]; - uint numChildren = refit_bottom( - bvh, - geosArray, - instance_leaf_aabbs, - curNode, - childrenAABB, - *InnerNode_GetBackPointer(backPointers, innerNodeIdx)); - - if (numChildren != 0) - { - /* update bounds of node */ - QBVHNodeN_setBounds(curNode, childrenAABB, numChildren); - - /* refit upper parts of the BVH */ - /* TODO: this will not gonna work for mixed nodes */ - refit_bottom_up(curNode, bvh, childrenAABB, numChildren); - } -} - -#endif - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel -Refit_indirect_sg( - global struct BVHBase* bvh, - global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, - global struct AABB3f* instance_leaf_aabbs) -{ - DO_Refit_per_one_startpoint_sg(bvh, geosArray, instance_leaf_aabbs, 0); - -} diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_refit.h deleted file mode 100644 index 522a44b23a7..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_refit.h +++ /dev/null @@ -1,546 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "common.h" -#include "api_interface.h" -#include "instance.h" -#include "GRLGen12.h" -#include "libs/lsc_intrinsics.h" - - -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel -DO_update_instance_leaves(global struct BVHBase* bvh, - uint64_t dxrInstancesArray, - uint64_t dxrInstancesPtr, - global struct AABB3f* instance_aabb_scratch, - uint id , - global struct GRL_RAYTRACING_AABB* procedural_box -) -{ - - global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray = - (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray; - global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray = - (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr; - - global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh); - - - /* iterate over all children of the instance node and get their bounds */ - - uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex(&leafs[id]); - global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL; - if (dxrInstancesArray != NULL) - instance = &instancesArray[instanceIdx]; - else - instance = instancesPtrArray[instanceIdx]; - - uint mask = GRL_get_InstanceMask(instance); - uint offset = NO_NODE_OFFSET; - - struct AffineSpace3f xfm = AffineSpace3f_load_row_major(instance->Transform); - struct AABB3f bbox; - - if (procedural_box != 0) - { - bbox.lower[0] = procedural_box->MinX; - bbox.lower[1] = procedural_box->MinY; - bbox.lower[2] = procedural_box->MinZ; - bbox.upper[0] = procedural_box->MaxX; - bbox.upper[1] = 
procedural_box->MaxY; - bbox.upper[2] = procedural_box->MaxZ; - } - else - { - global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure; - bbox = instanceBvh->Meta.bounds; - offset = BVH_ROOT_NODE_OFFSET; - } - - - const bool valid_min = isfinite(bbox.lower[0]) && isfinite(bbox.lower[1]) && isfinite(bbox.lower[2]); - const bool valid_max = isfinite(bbox.upper[0]) && isfinite(bbox.upper[1]) && isfinite(bbox.upper[2]); - - if (!valid_min || !valid_max ) - { - bbox.lower[0] = xfm.p.x; - bbox.lower[1] = xfm.p.y; - bbox.lower[2] = xfm.p.z; - bbox.upper[0] = xfm.p.x; - bbox.upper[1] = xfm.p.y; - bbox.upper[2] = xfm.p.z; - offset = NO_NODE_OFFSET; - mask = 0; - } - else - { - bbox = AABB3f_transform(xfm, bbox); // JDB TODO: Use faster abs-matrix method - } - - instance_aabb_scratch[id] = bbox; - - HwInstanceLeaf_Constructor(&leafs[id], instance, instanceIdx, offset, mask); // TODO: No instance opening for refittable BVH -} - -/* - This function starts at some BVH node and refits all nodes upwards - to the root. At some node the algorithm only proceeds upwards if - all children of the current node have already been processed. This - is checked as each time a node is reached an atomic counter is - incremented, which will reach the number of children of the node at - some time. - */ - -GRL_INLINE void refit_bottom_up(global struct QBVHNodeN *qnode_start, // start node to refit (already processed) - global struct BVHBase *bvh, // pointer to BVH - struct AABB *childrenAABB, // temporary data to use - uint numChildrenTotal) -{ - global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - - /* compute the index of the start node */ - uint curNodeIndex = qnode_start - nodeData; - - /* the start node got already processed, thus go to its parent node */ - curNodeIndex = *InnerNode_GetBackPointer(backPointers,curNodeIndex) >> 6; - - /* end at root node */ - while (curNodeIndex != 0x03FFFFFF) - { - /* increment refit counter that counts refitted children of current node */ - const uint parentPointer = 1 + atomic_inc_global( (__global uint *) InnerNode_GetBackPointer(backPointers, curNodeIndex)); - - /* if all children got refitted, then continue */ - const uint numChildrenRefitted = (parentPointer >> 0) & 0x7; - numChildrenTotal = (parentPointer >> 3) & 0x7; - if (numChildrenRefitted != numChildrenTotal) - return; - - /* reset refit counter for next refit */ - *InnerNode_GetBackPointer(backPointers, curNodeIndex) &= 0xfffffff8; - - /* get bounds of all children from child nodes directly */ - global struct QBVHNodeN *qnode = nodeData + curNodeIndex; - global struct QBVHNodeN *qnode_child = (global struct QBVHNodeN *)QBVHNodeN_childrenPointer(qnode); - for (uint k = 0; k < numChildrenTotal; k++) - childrenAABB[k] = getAABB_QBVHNodeN(qnode_child + k); - - /* update node bounds of all children */ - QBVHNodeN_setBounds(qnode, childrenAABB, numChildrenTotal); - - write_mem_fence(CLK_GLOBAL_MEM_FENCE); - - /* make parent node the current node */ - curNodeIndex = parentPointer >> 6; - } - - /* update QBVH6 bounds */ - struct AABB bounds; - AABB_init(&bounds); - - for (uint i = 0; i < numChildrenTotal; i++) - AABB_extend(&bounds, &childrenAABB[i]); - - setBVHBaseBounds(bvh, &bounds); -} - - -GRL_INLINE void SUBGROUP_refit_bottom_up( - uniform global struct QBVHNodeN* qnode_start, // start node to refit (already processed) - uniform global struct BVHBase* bvh, // pointer to BVH - varying struct AABB reduce_bounds, - 
uniform uint numChildrenTotal, - varying ushort lane, - varying ushort head_lane) -{ - uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); - uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); - - /* compute the index of the start node */ - uniform uint curNodeIndex = qnode_start - nodeData; - - /* the start node got already processed, thus go to its parent node */ - uniform curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6; - - varying struct AABB childrenAABB; - - /* end at root node */ - while ( curNodeIndex != 0x03FFFFFF ) - { - mem_fence_gpu_invalidate(); - - /* increment refit counter that counts refitted children of current node */ - uniform uint parentPointer = 1; - if (lane == 0) - { - // acquire fence ensures that all previous writes complete before the atomic starts - parentPointer += atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, curNodeIndex)); - } - - parentPointer = intel_sub_group_shuffle( parentPointer, head_lane ); - - /* if all children got refitted, then continue */ - uniform uint numChildrenRefitted = (parentPointer >> 0) & 0x7; - numChildrenTotal = (parentPointer >> 3) & 0x7; - if ( numChildrenRefitted != numChildrenTotal ) - return; - - /* reset refit counter for next refit */ - if (lane == 0) - { - *InnerNode_GetBackPointer(backPointers, curNodeIndex) = (parentPointer & 0xfffffff8); - } - - /* get bounds of all children from child nodes directly */ - global struct QBVHNodeN* qnode = nodeData + curNodeIndex; - global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode ); - - varying ushort child_idx = (lane < numChildrenTotal) ? lane : 0; - childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx ); - - /* update node bounds of all children */ - reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB ); - reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, head_lane ); - - subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildrenTotal, lane); - - /* update node mask */ - uchar childrenMask = qnode_child[child_idx].instMask; - - qnode->instMask = sub_group_reduce_or_N6(childrenMask); - - /* make parent node the current node */ - curNodeIndex = parentPointer >> 6; - } - - /* update QBVH6 bounds */ - - if( lane == 0 ) - setBVHBaseBounds( bvh, &reduce_bounds ); -} - - -GRL_INLINE void quadCopyVertices( - const struct QuadLeaf* pQuad, - struct QuadLeaf* newQuad) -{ - const uint4* s = (const uint4*) & (pQuad->v[0][0]); - uint4* d = (uint4*) & (newQuad->v[0][0]); - const uint8* s2 = (const uint8*)(s+1); - uint8* d2 = (uint8*)(d+1); - *d = *s; - *d2 = *s2; -} - - -GRL_INLINE void get_updated_quad( - global const struct QuadLeaf* pQuad, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDescs, - struct QuadLeaf* newQuad) -{ - struct QuadLeaf tempQuad; - - // fetch non vtx data; - { - uint4* tempQuad4U = (uint4*)&tempQuad; - global const uint4* pQuad4U = (global const uint4*)pQuad; - *tempQuad4U = *pQuad4U; - } - - /* get the geomID and primID0/1 for both quad triangles */ - const uint geomID = PrimLeaf_GetGeoIndex(&tempQuad.leafDesc); - const uint primID0 = tempQuad.primIndex0; - const uint primID1 = tempQuad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&tempQuad); - ushort fourth_vert = 0; - - if (primID1 != primID0) - { - ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(&tempQuad); - fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert; - fourth_vert = ((packed_indices & 0x30) == 0x30) ? 
2 : fourth_vert; - } - - global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDescs + geomID; - - uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert); - - // read the indices of the 4 verts we want - float3 vtx0, vtx1, vtx2, vtx3; - GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices); - - QuadLeaf_SetVertices(&tempQuad, vtx0, vtx1, vtx2, vtx3); - - *newQuad = tempQuad; -} - -// This calculates children BBs for innerNode having *all* children leafs. -// mixed nodes will be updated by passing through bottom-up thread. -GRL_INLINE uint refit_bottom( global struct BVHBase* bvh, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - global struct AABB3f* instance_leaf_aabbs, - global struct QBVHNodeN* curNode, - struct AABB *childrenAABB, - uint backPointer) -{ - uint numChildren = 0; - - /* we start refit at leaf nodes, this case is for quad nodes */ - if (curNode->type == BVH_QUAD_NODE) - { - global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode); - - /* iterate over all quads of the quad node and get their bounds */ - numChildren = (backPointer >> 3) & 0x7; - for (uint k = 0; k < numChildren; k++) - { - struct QuadLeaf Q; - get_updated_quad(&quads[k], geomDesc, &Q); - quadCopyVertices(&Q, &quads[k]); - childrenAABB[k] = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad - } - } - - /* we start refit at leaf nodes, this case is for procedural nodes */ - else if (curNode->type == BVH_PROCEDURAL_NODE) - { - global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode); - - /* iterate over all children of the procedural node and get their bounds */ - numChildren = (backPointer >> 3) & 0x7; - for (uint k = 0; k < numChildren; k++) - { - /* extract geomID and primID from leaf */ - const uint startPrim = QBVHNodeN_startPrim(curNode, k); - const uint geomID = ProceduralLeaf_geomIndex(leaf); - const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf! - - /* read bounds from geometry descriptor */ - struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); - childrenAABB[k].lower.x = aabb.MinX; - childrenAABB[k].lower.y = aabb.MinY; - childrenAABB[k].lower.z = aabb.MinZ; - childrenAABB[k].upper.x = aabb.MaxX; - childrenAABB[k].upper.y = aabb.MaxY; - childrenAABB[k].upper.z = aabb.MaxZ; - - /* advance leaf pointer to next child */ - leaf += QBVHNodeN_blockIncr(curNode, k); - } - } - - /* we start refit at leaf nodes, this case is for instance nodes */ - else if (curNode->type == BVH_INSTANCE_NODE) - { - global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode); - global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh ); - - /* iterate over all children of the instance node and get their bounds */ - numChildren = (backPointer >> 3) & 0x7; - for (uint k = 0; k < numChildren; k++) - { - uint leafindex = (instancesLeaves + k) - leafBase; - childrenAABB[k].lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] ); - childrenAABB[k].upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] ); - } - } - - return numChildren; -} - - - - - -// This calculates children BBs for innerNode having *all* children leafs. -// mixed nodes will be updated by passing through bottom-up thread. 
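(Aside on the backpointer encoding used throughout the refit kernels above and below: each inner node's backpointer packs the parent node index in bits 6 and up, the total child count in bits 3..5, and a "children refitted so far" counter in bits 0..2, with 0x03FFFFFF as the root's parent sentinel. The plain-C sketch below mirrors that bit layout to show how refit_bottom_up()/SUBGROUP_refit_bottom_up() decide whether the arriving thread was the last child and may continue upward; the helper names are illustrative only and not part of GRL, and the device code uses atomic_inc_global() where this sketch uses a plain increment.)

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helpers mirroring the backpointer layout used by the
 * deleted refit code:
 *   bits  0..2  : number of children already refitted (atomic counter)
 *   bits  3..5  : total number of children of this node
 *   bits  6..31 : parent node index, 0x03FFFFFF marks the root        */
#define BP_ROOT_SENTINEL 0x03FFFFFFu

static uint32_t bp_parent(uint32_t bp)       { return bp >> 6; }
static uint32_t bp_num_children(uint32_t bp) { return (bp >> 3) & 0x7; }
static uint32_t bp_num_refitted(uint32_t bp) { return bp & 0x7; }

/* One (non-atomic) arrival at a node: bump the counter and report whether
 * this caller was the last child, i.e. may refit the parent and ascend.  */
static bool bp_arrive(uint32_t *bp)
{
    uint32_t after = ++(*bp);            /* device code: 1 + atomic_inc_global() */
    bool last = bp_num_refitted(after) == bp_num_children(after);
    if (last)
        *bp &= 0xfffffff8u;              /* reset the counter for the next refit pass */
    return last;
}

int main(void)
{
    /* parent index 42, 3 children, no child refitted yet */
    uint32_t bp = (42u << 6) | (3u << 3);
    for (int child = 0; child < 3; child++)
        printf("child %d arrives, last = %d\n", child, bp_arrive(&bp));
    printf("parent index = %u, is root? %d\n",
           bp_parent(bp), bp_parent(bp) == BP_ROOT_SENTINEL);
    return 0;
}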
-GRL_INLINE uint SUBGROUP_refit_bottom( - uniform global struct BVHBase* bvh, - uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - uniform global struct AABB3f* instance_leaf_aabbs, - uniform global struct QBVHNodeN* curNode, - uniform uint backPointer, - varying struct AABB* childrenAABB, - varying uchar* childrenMask, - varying ushort lane, - global uchar* is_procedural_instance - ) -{ - uniform uint numChildren = 0; - bool enable_procedural_instance = (is_procedural_instance != 0); - - /* we start refit at leaf nodes, this case is for quad nodes */ - if (curNode->type == BVH_QUAD_NODE) - { - /* iterate over all quads of the quad node and get their bounds */ - numChildren = (backPointer >> 3) & 0x7; - - uniform global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode); - - struct QuadLeaf Q; - if (lane < numChildren) - { - get_updated_quad(&quads[lane], geomDesc, &Q); - - *childrenAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad - - quadCopyVertices(&Q, &quads[lane]); - *childrenMask = 0xff; - } - // FIXME: support leaves with more than one quad - } - - /* we start refit at leaf nodes, this case is for procedural nodes */ - else if (curNode->type == BVH_PROCEDURAL_NODE) - { - uniform global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode); - - - - /* iterate over all children of the procedural node and get their bounds */ - numChildren = (backPointer >> 3) & 0x7; - - varying uint incr = (lane < numChildren) ? InternalNode_GetChildBlockIncr((struct InternalNode*)curNode, lane) : 0; - incr = sub_group_scan_exclusive_add(incr); - - if( lane < numChildren ) - { - /* extract geomID and primID from leaf */ - varying uint start_prim = InternalNode_GetChildStartPrim((struct InternalNode*)curNode, lane ); - varying global struct ProceduralLeaf* my_leaf = leaf + incr; - const uint geomID = ProceduralLeaf_geomIndex(my_leaf); - const uint primID = ProceduralLeaf_primIndex(my_leaf, start_prim); - - /* read bounds from geometry descriptor */ - struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); - childrenAABB->lower.x = aabb.MinX; - childrenAABB->lower.y = aabb.MinY; - childrenAABB->lower.z = aabb.MinZ; - childrenAABB->upper.x = aabb.MaxX; - childrenAABB->upper.y = aabb.MaxY; - childrenAABB->upper.z = aabb.MaxZ; - *childrenMask = 0xff; - } - } - - /* we start refit at leaf nodes, this case is for instance nodes */ - else if ( !enable_procedural_instance && curNode->type == BVH_INSTANCE_NODE) - { - uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode); - uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh); - - /* iterate over all children of the instance node and get their bounds and masks */ - numChildren = (backPointer >> 3) & 0x7; - if( lane < numChildren ) - { - uint leafindex = (instancesLeaves + lane) - leafBase; - childrenAABB->lower.xyz = AABB3f_load_lower(&instance_leaf_aabbs[leafindex]); - childrenAABB->upper.xyz = AABB3f_load_upper(&instance_leaf_aabbs[leafindex]); - *childrenMask = HwInstanceLeaf_GetInstanceMask(&leafBase[leafindex]); - } - } - else if (enable_procedural_instance && curNode->type == BVH_INTERNAL_NODE) - { - // Handle procedural-instance leaves - // TODO: Generalize this! 
Should re-write the kernel to work with arbitrary mixed-mode leaves - - numChildren = (backPointer >> 3) & 0x7; - uint childType = BVH_INTERNAL_NODE; - if ( lane < numChildren ) - { - childType = InternalNode_GetChildType( (struct InternalNode*)curNode, lane ); - if (childType != BVH_INTERNAL_NODE) - { - uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer( curNode ); - uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh ); - uint leafindex = (instancesLeaves + lane) - leafBase; - childrenAABB->lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] ); - childrenAABB->upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] ); - *childrenMask = HwInstanceLeaf_GetInstanceMask( &leafBase[leafindex] ); - - // see if the child has flipped from procedural to non-procedural and update the child type field as needed - uint instanceIndex = HwInstanceLeaf_GetInstanceIndex( &leafBase[leafindex] ); - uint newChildType = is_procedural_instance[instanceIndex] ? BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE; - if (newChildType != childType) - { - InternalNode_SetChildType( (struct InternalNode*)curNode, lane, newChildType ); - } - } - } - - - // don't ascend the tree for a true internal node - if (sub_group_all(childType == BVH_INTERNAL_NODE)) - numChildren = 0; - } - - return numChildren; -} - -#define SG_REFIT_WG_SIZE 8 - -void DO_Refit_per_one_startpoint_sg( - global struct BVHBase* bvh, - global GRL_RAYTRACING_GEOMETRY_DESC* geosArray, - global struct AABB3f* instance_leaf_aabbs, - global uchar* is_procedural_instance ) -{ - /* get pointer to inner nodes and back pointers */ - global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh); - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - - /* get the inner node that we will consider as a bottom startpoint */ - const uint numInnerNodes = BVHBase_numNodes(bvh); - const uint innerNodeIdx = get_sub_group_global_id(); - - varying ushort lane = get_sub_group_local_id(); - - if (innerNodeIdx >= numInnerNodes) return; - - varying struct AABB childrenAABB; // one child AABB per lane - AABB_init(&childrenAABB); - - varying uchar childrenMask = 0; // one child mask per lane - - global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx]; - uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); - uint numChildren = SUBGROUP_refit_bottom( - bvh, - geosArray, - instance_leaf_aabbs, - curNode, - backPointer, - &childrenAABB, - &childrenMask, - lane, - is_procedural_instance - ); - - - if (numChildren != 0) - { - /* update bounds of node */ - struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&childrenAABB); - reduce_bounds = AABB_sub_group_shuffle(&reduce_bounds, 0); - subgroup_QBVHNodeN_setBounds(curNode, reduce_bounds, childrenAABB, numChildren, lane); - - /* update mask of node */ - uchar mask = sub_group_reduce_or_N6(childrenMask); - curNode->instMask = mask; - - /* Leave this fence for now for all threads, if WG size is increased (tried 128) and fence is done - only by the first thread (similar to morton phase1) the machine hangs. 
*/ - mem_fence_gpu_invalidate(); - - /* refit upper parts of the BVH */ - /* TODO: this will not gonna work for mixed nodes */ - SUBGROUP_refit_bottom_up(curNode, bvh, reduce_bounds, numChildren, lane, 0); - } -} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl b/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl deleted file mode 100644 index 0a4bd3466af..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl +++ /dev/null @@ -1,1917 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "common.h" -#include "instance.h" - -#define DBG(x) - -#define ENABLE_CHECKS 0 - -#define ENABLE_32BINS_IN_BREADTH_FIRST_PHASE 1 - -/* todo: */ -/* - new cross WG code path for first splits */ -/* - optimize find best child loop sequence */ -/* - subgroup_setQBVHNodeN needs work on 6 slots in parallel */ - -#define DIVIDE_BY_6 1 - -inline uint getNumPrims(struct BuildRecord *buildRecord) -{ - return buildRecord->end - buildRecord->start; -} - -inline void printBuildRecord(struct BuildRecord *record) -{ - printf("centroidBounds\n"); - AABB_print(&record->centroidBounds); - printf("start %d end %d size %d depth %d \n", record->start, record->end, record->end - record->start, getBuildRecursionDepth(record)); -} - -inline void printBinInfo2(struct BinInfo2 *record) -{ - printf("boundsX[%d]\n", BINS * 2); - for (uint b = 0; b < BINS * 2; b++) - { - AABB3f_print(&record->boundsX[b]); - printf("counts.x = %d\n", record->counts[b].x); - } - printf("boundsY[%d]\n", BINS * 2); - for (uint b = 0; b < BINS * 2; b++) - { - AABB3f_print(&record->boundsY[b]); - printf("counts.y = %d\n", record->counts[b].y); - } - printf("boundsZ[%d]\n", BINS * 2); - for (uint b = 0; b < BINS * 2; b++) - { - AABB3f_print(&record->boundsZ[b]); - printf("counts.z = %d\n", record->counts[b].z); - } -} - -inline void initBinMapping(struct BinMapping *binMapping, struct AABB *centBounds, const uint bins) -{ - const float4 eps = 1E-34f; - const float4 diag = max(eps, centBounds->upper - centBounds->lower); - const float4 scale = (float4)(0.99f * (float)bins) / diag; - binMapping->scale = select((float4)(0.0f), scale, diag > eps); - binMapping->ofs = centBounds->lower; -} - -inline void atomicExtendLocalBuildRecord(local struct BuildRecord *buildRecord, global struct AABB *primref) -{ - const float4 centroid2 = primref->lower + primref->upper; - AABB_local_atomic_merge(&buildRecord->centroidBounds, centroid2, centroid2); -} - -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- -// ---------------------------------------------------------------------------------------- - -inline void initBinInfo(struct BinInfo *binInfo) -{ - for (uint i = 0; i < BINS; i++) - { - AABB3f_init(&binInfo->boundsX[i]); - AABB3f_init(&binInfo->boundsY[i]); - AABB3f_init(&binInfo->boundsZ[i]); - binInfo->counts[i] = (uint3)(0); - } -} - -inline void subgroup_initBinInfo(struct BinInfo *binInfo) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - - for (uint i = subgroupLocalID; i < BINS; i += subgroup_size) - { - AABB3f_init(&binInfo->boundsX[i]); - AABB3f_init(&binInfo->boundsY[i]); - AABB3f_init(&binInfo->boundsZ[i]); - binInfo->counts[i] = (uint3)(0); - } -} - -inline void parallel_initBinInfo(struct BinInfo *binInfo) -{ - const uint localID = 
get_local_id(0); - if (localID < BINS) - { - AABB3f_init(&binInfo->boundsX[localID]); - AABB3f_init(&binInfo->boundsY[localID]); - AABB3f_init(&binInfo->boundsZ[localID]); - binInfo->counts[localID] = (uint3)(0); - } -} - -inline void atomicUpdateLocalBinInfo(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref) -{ - const float4 lower = primref->lower; - const float4 upper = primref->upper; - const float4 p = lower + upper; - const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); - AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper); - AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper); - AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper); - atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); - atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1); - atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); -} - -inline void atomicUpdateLocalBinInfo_nocheck(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref) -{ - const float4 lower = primref->lower; - const float4 upper = primref->upper; - const float4 p = lower + upper; - const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); - AABB3f_atomic_merge_local_nocheck(&binInfo->boundsX[i.x], lower, upper); - AABB3f_atomic_merge_local_nocheck(&binInfo->boundsY[i.y], lower, upper); - AABB3f_atomic_merge_local_nocheck(&binInfo->boundsZ[i.z], lower, upper); - atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); - atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1); - atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); -} - -inline void updateBins(struct BinMapping *binMapping, struct BinInfo *binInfo, global struct AABB *primref) -{ - const float4 lower = primref->lower; - const float4 upper = primref->upper; - const float4 p = lower + upper; - const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); - AABB3f_extendlu(&binInfo->boundsX[i.x], lower.xyz, upper.xyz); - AABB3f_extendlu(&binInfo->boundsY[i.y], lower.xyz, upper.xyz); - AABB3f_extendlu(&binInfo->boundsZ[i.z], lower.xyz, upper.xyz); - binInfo->counts[i.x].x++; - binInfo->counts[i.y].y++; - binInfo->counts[i.z].z++; -} - -// ===================================================================================================================== -// ===================================================================================================================== -// ===================================================================================================================== - -inline void parallel_initBinInfo2(struct BinInfo2 *binInfo, const uint bins) -{ - const uint localID = get_local_id(0); - if (localID < bins) - { - AABB3f_init(&binInfo->boundsX[localID]); - AABB3f_init(&binInfo->boundsY[localID]); - AABB3f_init(&binInfo->boundsZ[localID]); - binInfo->counts[localID] = (uint3)(0); - } -} - -inline void atomicUpdateLocalBinInfo2(struct BinMapping *binMapping, local struct BinInfo2 *binInfo, global struct AABB *primref) -{ - const float4 lower = primref->lower; - const float4 upper = primref->upper; - const float4 p = lower + upper; - const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale); - AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper); - AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper); - AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper); - atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1); - atomic_add((local uint 
*)&binInfo->counts[i.y] + 1, 1); - atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1); -} - -inline void atomicUpdateGlobalFromLocalBinInfo2(global struct BinInfo2 *dest, local struct BinInfo2 *source, const uint bins) -{ - const uint localID = get_local_id(0); - if (localID < bins) - { - AABB3f_atomic_merge_global_local(&dest->boundsX[localID], &source->boundsX[localID]); - AABB3f_atomic_merge_global_local(&dest->boundsY[localID], &source->boundsY[localID]); - AABB3f_atomic_merge_global_local(&dest->boundsZ[localID], &source->boundsZ[localID]); - atomic_add((global uint *)&dest->counts[localID] + 0, source->counts[localID].x); - atomic_add((global uint *)&dest->counts[localID] + 1, source->counts[localID].y); - atomic_add((global uint *)&dest->counts[localID] + 2, source->counts[localID].z); - } -} - -inline uint subgroup_getMaxAreaChild(struct AABB *childrenAABB, const uint numChildren) -{ - const uint subgroupLocalID = get_sub_group_local_id(); -#if 0 - /*! find best child to split */ - const float area = (subgroupLocalID < numChildren) & (as_uint(childrenAABB[subgroupLocalID].upper.w) > cfg_minLeafSize) ? childrenAABB[subgroupLocalID].lower.w : -(float)INFINITY; - const float maxArea = sub_group_reduce_max(area); - const uint mask = intel_sub_group_ballot(area == maxArea); - const uint bestChild = maxArea != -(float)INFINITY ? ctz(mask) : -1; -#else - float bestArea = -(float)INFINITY; - int bestChild = -1; - for (int i = 0; i < numChildren; i++) - { - /* ignore leaves as they cannot get split */ - if (as_uint(childrenAABB[i].upper.w) <= cfg_minLeafSize) - continue; - - /* find child with largest surface area */ - if (childrenAABB[i].lower.w > bestArea) - { - bestChild = i; - bestArea = childrenAABB[i].lower.w; - } - } -#endif - return bestChild; -} - -inline bool AABB_verifyBounds(struct BuildRecord *buildRecord, struct AABB *geometryBounds, struct AABB *primref) -{ - const float4 centroid2 = primref->lower + primref->upper; - - if (centroid2.x < buildRecord->centroidBounds.lower.x) - return false; - if (centroid2.y < buildRecord->centroidBounds.lower.y) - return false; - if (centroid2.z < buildRecord->centroidBounds.lower.z) - return false; - - if (centroid2.x > buildRecord->centroidBounds.upper.x) - return false; - if (centroid2.y > buildRecord->centroidBounds.upper.y) - return false; - if (centroid2.z > buildRecord->centroidBounds.upper.z) - return false; - - if (primref->lower.x < geometryBounds->lower.x) - return false; - if (primref->lower.y < geometryBounds->lower.y) - return false; - if (primref->lower.z < geometryBounds->lower.z) - return false; - - if (primref->upper.x > geometryBounds->upper.x) - return false; - if (primref->upper.y > geometryBounds->upper.y) - return false; - if (primref->upper.z > geometryBounds->upper.z) - return false; - - return true; -} - -/* initialize primref index array */ -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -create_primref_index(global struct Globals *globals, - global struct AABB *primref, - global unsigned int *primref_index) -{ - const uint local_size = get_local_size(0); - const uint taskID = get_group_id(0); - const uint numTasks = get_num_groups(0); - const uint localID = get_local_id(0); - - const uint startID = (taskID + 0) * globals->numPrimitives / numTasks; - const uint endID = (taskID + 1) * globals->numPrimitives / numTasks; - for (uint primID = startID + localID; primID < endID; primID += local_size) - primref_index[primID] = primID; -} - -// 
========================================================================================================== -// ========================================================================================================== -// ========================================================================================================== - -inline float left_to_right_area16(struct AABB3f *low) -{ - struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low); - return halfArea_AABB3f(&low_prefix); -} - -inline uint left_to_right_counts16(uint low) -{ - return sub_group_scan_exclusive_add(low); -} - -inline float right_to_left_area16(struct AABB3f *low) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - const uint ID = subgroup_size - 1 - subgroupLocalID; - struct AABB3f low_reverse = AABB3f_sub_group_shuffle(low, ID); - struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse); - const float low_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID); - return low_area; -} - -inline uint right_to_left_counts16(uint low) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - const uint ID = subgroup_size - 1 - subgroupLocalID; - const uint low_reverse = sub_group_broadcast(low, ID); - const uint low_prefix = sub_group_scan_inclusive_add(low_reverse); - return sub_group_broadcast(low_prefix, ID); -} - -inline float2 left_to_right_area32(struct AABB3f *low, struct AABB3f *high) -{ - struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low); - struct AABB3f low_reduce = AABB3f_sub_group_reduce(low); - struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max(high); - AABB3f_extend(&high_prefix, &low_reduce); - const float low_area = halfArea_AABB3f(&low_prefix); - const float high_area = halfArea_AABB3f(&high_prefix); - return (float2)(low_area, high_area); -} - -inline uint2 left_to_right_counts32(uint low, uint high) -{ - const uint low_prefix = sub_group_scan_exclusive_add(low); - const uint low_reduce = sub_group_reduce_add(low); - const uint high_prefix = sub_group_scan_exclusive_add(high); - return (uint2)(low_prefix, low_reduce + high_prefix); -} - -inline float2 right_to_left_area32(struct AABB3f *low, struct AABB3f *high) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - const uint ID = subgroup_size - 1 - subgroupLocalID; - struct AABB3f low_reverse = AABB3f_sub_group_shuffle(high, ID); - struct AABB3f high_reverse = AABB3f_sub_group_shuffle(low, ID); - struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse); - struct AABB3f low_reduce = AABB3f_sub_group_reduce(&low_reverse); - struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max(&high_reverse); - AABB3f_extend(&high_prefix, &low_reduce); - const float low_area = sub_group_broadcast(halfArea_AABB3f(&high_prefix), ID); - const float high_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID); - return (float2)(low_area, high_area); -} - -inline uint2 right_to_left_counts32(uint low, uint high) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - const uint ID = subgroup_size - 1 - subgroupLocalID; - const uint low_reverse = sub_group_broadcast(high, ID); - const uint high_reverse = sub_group_broadcast(low, ID); - const uint low_prefix = sub_group_scan_inclusive_add(low_reverse); - const uint 
low_reduce = sub_group_reduce_add(low_reverse); - const uint high_prefix = sub_group_scan_inclusive_add(high_reverse) + low_reduce; - return (uint2)(sub_group_broadcast(high_prefix, ID), sub_group_broadcast(low_prefix, ID)); -} - -inline ulong getBestSplit(float3 sah, uint ID, const float4 scale, const ulong defaultSplit) -{ - ulong splitX = (((ulong)as_uint(sah.x)) << 32) | ((uint)ID << 2) | 0; - ulong splitY = (((ulong)as_uint(sah.y)) << 32) | ((uint)ID << 2) | 1; - ulong splitZ = (((ulong)as_uint(sah.z)) << 32) | ((uint)ID << 2) | 2; - /* ignore zero sized dimensions */ - splitX = select(splitX, defaultSplit, (ulong)(scale.x == 0)); - splitY = select(splitY, defaultSplit, (ulong)(scale.y == 0)); - splitZ = select(splitZ, defaultSplit, (ulong)(scale.z == 0)); - ulong bestSplit = min(min(splitX, splitY), splitZ); - bestSplit = sub_group_reduce_min(bestSplit); - return bestSplit; -} - -inline uint fastDivideBy6_uint(uint v) -{ -#if 1 - const ulong u = (ulong)v >> 1; - return (uint)((u * 0x55555556ul) >> 32); -#else - return v / 6; -#endif -} - -inline uint3 fastDivideBy6_uint3(uint3 v) -{ - return (uint3)(fastDivideBy6_uint(v.x), fastDivideBy6_uint(v.y), fastDivideBy6_uint(v.z)); -} - -inline struct Split reduceBinsAndComputeBestSplit16(struct BinInfo *binInfo, const float4 scale, uint startID, uint endID) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - - struct AABB3f boundsX = binInfo->boundsX[subgroupLocalID]; - - const float lr_areaX = left_to_right_area16(&boundsX); - const float rl_areaX = right_to_left_area16(&boundsX); - - struct AABB3f boundsY = binInfo->boundsY[subgroupLocalID]; - - const float lr_areaY = left_to_right_area16(&boundsY); - const float rl_areaY = right_to_left_area16(&boundsY); - - struct AABB3f boundsZ = binInfo->boundsZ[subgroupLocalID]; - - const float lr_areaZ = left_to_right_area16(&boundsZ); - const float rl_areaZ = right_to_left_area16(&boundsZ); - - const uint3 counts = binInfo->counts[subgroupLocalID]; - - const uint lr_countsX = left_to_right_counts16(counts.x); - const uint rl_countsX = right_to_left_counts16(counts.x); - const uint lr_countsY = left_to_right_counts16(counts.y); - const uint rl_countsY = right_to_left_counts16(counts.y); - const uint lr_countsZ = left_to_right_counts16(counts.z); - const uint rl_countsZ = right_to_left_counts16(counts.z); - - const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ); - const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ); - -#if DIVIDE_BY_6 == 0 - const uint blocks_shift = SAH_LOG_BLOCK_SHIFT; - uint3 blocks_add = (uint3)((1 << blocks_shift) - 1); - const uint3 lr_count = ((uint3)(lr_countsX, lr_countsY, lr_countsZ) + blocks_add) >> blocks_shift; - const uint3 rl_count = ((uint3)(rl_countsX, rl_countsY, rl_countsZ) + blocks_add) >> blocks_shift; -#else - const uint3 lr_count = fastDivideBy6_uint3((uint3)(lr_countsX, lr_countsY, lr_countsZ) + BVH_NODE_N6 - 1); - const uint3 rl_count = fastDivideBy6_uint3((uint3)(rl_countsX, rl_countsY, rl_countsZ) + BVH_NODE_N6 - 1); -#endif - float3 sah = fma(lr_area, convert_float3(lr_count), rl_area * convert_float3(rl_count)); - - /* first bin is invalid */ - - sah.x = select((float)(INFINITY), sah.x, subgroupLocalID != 0); - sah.y = select((float)(INFINITY), sah.y, subgroupLocalID != 0); - sah.z = select((float)(INFINITY), sah.z, subgroupLocalID != 0); - - const uint mid = (startID + endID) / 2; - const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0; 
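(Aside on two details of the split search that are easy to miss in the deleted code above: fastDivideBy6_uint() computes v/6 as (v>>1) multiplied by 0x55555556, the fixed-point reciprocal of 3, and getBestSplit() packs each candidate into a 64-bit key with the SAH cost bits in the high word and (bin position << 2) | dimension in the low word, so a single sub_group_reduce_min() selects the cheapest split with ties broken by position and axis. The standalone C check below illustrates both; it is an editor's sketch, not GRL code.)

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* v / 6 via (v / 2) * ceil(2^32 / 3), as in fastDivideBy6_uint(). */
static uint32_t div6(uint32_t v)
{
    uint64_t u = (uint64_t)v >> 1;
    return (uint32_t)((u * 0x55555556ull) >> 32);
}

/* Pack an SAH candidate the way getBestSplit() does:
 * high 32 bits = IEEE bits of the (non-negative) SAH cost,
 * low bits     = (bin position << 2) | dimension.
 * For non-negative floats the raw bits order the same way as the values,
 * so the numerically smallest key is the cheapest split.                 */
static uint64_t pack_split(float sah, uint32_t pos, uint32_t dim)
{
    uint32_t bits;
    memcpy(&bits, &sah, sizeof bits);     /* equivalent of as_uint() */
    return ((uint64_t)bits << 32) | (pos << 2) | dim;
}

int main(void)
{
    for (uint32_t v = 0; v < 1000000; v++)
        assert(div6(v) == v / 6);

    uint64_t a = pack_split(12.5f, 7, 0); /* cheaper candidate  */
    uint64_t b = pack_split(19.0f, 3, 2); /* costlier candidate */
    uint64_t best = a < b ? a : b;
    printf("best split: pos=%u dim=%u\n",
           ((uint32_t)best) >> 2, (uint32_t)(best & 3));
    return 0;
}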
- - const ulong bestSplit = getBestSplit(sah, subgroupLocalID, scale, defaultSplit); - - struct Split split; - split.sah = as_float((uint)(bestSplit >> 32)); - split.dim = (uint)bestSplit & 3; - split.pos = (uint)bestSplit >> 2; - - return split; -} - -inline struct Split reduceBinsAndComputeBestSplit32(struct BinInfo2 *binInfo, const float4 scale, uint startID, uint endID) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - - struct AABB3f boundsX_low = binInfo->boundsX[subgroupLocalID]; - struct AABB3f boundsX_high = binInfo->boundsX[subgroupLocalID + subgroup_size]; - - const float2 lr_areaX = left_to_right_area32(&boundsX_low, &boundsX_high); - const float2 rl_areaX = right_to_left_area32(&boundsX_low, &boundsX_high); - - struct AABB3f boundsY_low = binInfo->boundsY[subgroupLocalID]; - struct AABB3f boundsY_high = binInfo->boundsY[subgroupLocalID + subgroup_size]; - - const float2 lr_areaY = left_to_right_area32(&boundsY_low, &boundsY_high); - const float2 rl_areaY = right_to_left_area32(&boundsY_low, &boundsY_high); - - struct AABB3f boundsZ_low = binInfo->boundsZ[subgroupLocalID]; - struct AABB3f boundsZ_high = binInfo->boundsZ[subgroupLocalID + subgroup_size]; - - const float2 lr_areaZ = left_to_right_area32(&boundsZ_low, &boundsZ_high); - const float2 rl_areaZ = right_to_left_area32(&boundsZ_low, &boundsZ_high); - - const uint3 counts_low = binInfo->counts[subgroupLocalID]; - const uint3 counts_high = binInfo->counts[subgroupLocalID + subgroup_size]; - - const uint2 lr_countsX = left_to_right_counts32(counts_low.x, counts_high.x); - const uint2 rl_countsX = right_to_left_counts32(counts_low.x, counts_high.x); - const uint2 lr_countsY = left_to_right_counts32(counts_low.y, counts_high.y); - const uint2 rl_countsY = right_to_left_counts32(counts_low.y, counts_high.y); - const uint2 lr_countsZ = left_to_right_counts32(counts_low.z, counts_high.z); - const uint2 rl_countsZ = right_to_left_counts32(counts_low.z, counts_high.z); - - const uint blocks_shift = SAH_LOG_BLOCK_SHIFT; - uint3 blocks_add = (uint3)((1 << blocks_shift) - 1); - - /* low part: bins 0..15 */ - const float3 lr_area_low = (float3)(lr_areaX.x, lr_areaY.x, lr_areaZ.x); - const float3 rl_area_low = (float3)(rl_areaX.x, rl_areaY.x, rl_areaZ.x); - -#if DIVIDE_BY_6 == 0 - const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x) + blocks_add) >> blocks_shift; - const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x) + blocks_add) >> blocks_shift; - -#else - //const uint3 lr_count_low = ((uint3)(lr_countsX.x,lr_countsY.x,lr_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6; - //const uint3 rl_count_low = ((uint3)(rl_countsX.x,rl_countsY.x,rl_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6; - - /* skip blocks for breadth-first phase */ - const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x)); - const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x)); - -#endif - - float3 sah_low = fma(lr_area_low, convert_float3(lr_count_low), rl_area_low * convert_float3(rl_count_low)); - - /* first bin is invalid */ - // sah_low.x = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.x; - // sah_low.y = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.y; - // sah_low.z = (subgroupLocalID == 0) ? 
(float)(INFINITY) : sah_low.z; - - sah_low.x = select((float)(INFINITY), sah_low.x, subgroupLocalID != 0); - sah_low.y = select((float)(INFINITY), sah_low.y, subgroupLocalID != 0); - sah_low.z = select((float)(INFINITY), sah_low.z, subgroupLocalID != 0); - - /* high part: bins 16..31 */ - - const float3 lr_area_high = (float3)(lr_areaX.y, lr_areaY.y, lr_areaZ.y); - const float3 rl_area_high = (float3)(rl_areaX.y, rl_areaY.y, rl_areaZ.y); -#if DIVIDE_BY_6 == 0 - const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y) + blocks_add) >> blocks_shift; - const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y) + blocks_add) >> blocks_shift; -#else - //const uint3 lr_count_high = ((uint3)(lr_countsX.y,lr_countsY.y,lr_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6; - //const uint3 rl_count_high = ((uint3)(rl_countsX.y,rl_countsY.y,rl_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6; - - /* skip blocks for breadth-first phase */ - const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y)); - const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y)); - -#endif - const float3 sah_high = fma(lr_area_high, convert_float3(lr_count_high), rl_area_high * convert_float3(rl_count_high)); - - const uint mid = (startID + endID) / 2; - const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0; - - const ulong bestSplit_low = getBestSplit(sah_low, subgroupLocalID, scale, defaultSplit); - const ulong bestSplit_high = getBestSplit(sah_high, subgroupLocalID + subgroup_size, scale, defaultSplit); - const ulong bestSplit = min(bestSplit_low, bestSplit_high); - - struct Split split; - split.sah = as_float((uint)(bestSplit >> 32)); - split.dim = (uint)bestSplit & 3; - split.pos = (uint)bestSplit >> 2; - - return split; -} - -// ===================================================================== - -inline float leafSAH(float geometryArea, uint prims, uint block_shift) -{ - return geometryArea * convert_float((prims + (1 << block_shift) - 1) >> block_shift); -} - -inline bool is_left(struct BinMapping *binMapping, struct Split *split, struct AABB *primref) -{ - const uint dim = split->dim; - const float lower = primref->lower[dim]; - const float upper = primref->upper[dim]; - const float c = lower + upper; - const uint pos = convert_uint_rtz((c - binMapping->ofs[dim]) * binMapping->scale[dim]); - return pos < split->pos; -} - -inline void serial_find_split(global struct AABB *primref, - struct BinMapping *binMapping, - struct BuildRecord *buildRecord, - local struct Split *split, - local struct BinInfo *binInfo, - global uint *primref_index0, - global uint *primref_index1) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - - const uint startID = buildRecord->start; - const uint endID = buildRecord->end; - - subgroup_initBinInfo(binInfo); - - for (uint t = startID + subgroupLocalID; t < endID; t += subgroup_size) - { - const uint index = primref_index0[t]; - primref_index1[t] = index; - atomicUpdateLocalBinInfo_nocheck(binMapping, binInfo, &primref[index]); - } -} - -inline void serial_partition_index(global struct AABB *primref, - struct BinMapping *binMapping, - struct BuildRecord *buildRecord, - struct Split *inSplit, - struct BuildRecord *outLeft, - struct BuildRecord *outRight, - struct AABB *outGeometryBoundsLeft, - struct AABB *outGeometryBoundsRight, - global uint *primref_index0, - global uint *primref_index1) -{ - const uint localID = 
get_local_id(0); - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroupID = get_sub_group_id(); - const uint subgroup_size = get_sub_group_size(); - - const uint begin = buildRecord->start; - const uint end = buildRecord->end; - struct Split split = *inSplit; - - struct BuildRecord left; - struct BuildRecord right; - initBuildRecord(&left, begin, end); - initBuildRecord(&right, begin, end); - - struct AABB leftAABB; - struct AABB rightAABB; - AABB_init(&leftAABB); - AABB_init(&rightAABB); - - global uint *l = primref_index0 + begin; - global uint *r = primref_index0 + end; - - /* no valid split, just split in the middle */ - if (split.sah == (float)(INFINITY)) - { - for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size) - { - const uint index = primref_index1[i]; - const uint count = sub_group_reduce_add(1); - extendBuildRecord(&left, &primref[index]); - AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); - l[subgroupLocalID] = index; - l += count; - } - - for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size) - { - const uint index = primref_index1[i]; - const uint count = sub_group_reduce_add(1); - extendBuildRecord(&right, &primref[index]); - AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); - r -= count; - r[subgroupLocalID] = index; - } - } - else - { - for (uint i = begin + subgroupLocalID; i < end; i += subgroup_size) - { - const uint index = primref_index1[i]; - const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; - const uint isRight = 1 - isLeft; - const uint countLeft = sub_group_reduce_add(isLeft); - const uint countRight = sub_group_reduce_add(isRight); - const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); - const uint prefixRight = sub_group_scan_exclusive_add(isRight); - - r -= countRight; - - if (isLeft) - { - extendBuildRecord(&left, &primref[index]); - AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); - l[prefixLeft] = index; - } - else - { - extendBuildRecord(&right, &primref[index]); - AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); - r[prefixRight] = index; - } - l += countLeft; - } - } - - left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); - right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); - leftAABB = AABB_sub_group_reduce(&leftAABB); - rightAABB = AABB_sub_group_reduce(&rightAABB); - - if (subgroupLocalID == 0) - { - uint pos = l - primref_index0; // single first thread needs to compute "pos" - left.end = pos; - right.start = pos; - - leftAABB.lower.w = AABB_halfArea(&leftAABB); - rightAABB.lower.w = AABB_halfArea(&rightAABB); - - leftAABB.upper.w = as_float(getNumPrimsBuildRecord(&left)); - rightAABB.upper.w = as_float(getNumPrimsBuildRecord(&right)); - - *outLeft = left; - *outRight = right; - *outGeometryBoundsLeft = leftAABB; - *outGeometryBoundsRight = rightAABB; - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - -#if ENABLE_CHECKS == 1 - if (subgroupLocalID == 0) - { - if (AABB_verify(outLeft)) - { - printf("outLeft:\n"); - printBuildRecord(outLeft); - } - if (AABB_verify(outRight)) - { - printf("outRight:\n"); - printBuildRecord(outRight); - } - if (AABB_verify(outGeometryBoundsLeft)) - { - printf("outGeometryBoundsLeft:\n"); - AABB_print(outGeometryBoundsLeft); - } - if (AABB_verify(outGeometryBoundsRight)) - { - printf("outGeometryBoundsRight:\n"); - AABB_print(outGeometryBoundsRight); - } - - for (uint i = outLeft->start; i < outLeft->end; i++) - { - 
const uint index = primref_index0[i]; - if (split.sah != (float)(INFINITY) && !is_left(binMapping, inSplit, &primref[index])) - printf("check left %d \n", i); - if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index])) - printf("check prim ref bounds left %d \n", i); - } - for (uint i = outRight->start; i < outRight->end; i++) - { - const uint index = primref_index0[i]; - if (split.sah != (float)(INFINITY) && is_left(binMapping, inSplit, &primref[index])) - printf("check right %d \n", i); - if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index])) - printf("check prim ref bounds right %d \n", i); - } - } -#endif -} - -inline uint subgroup_createLeaf_index(global struct BlockAllocator *allocator, - const uint start, - const uint end, - global struct AABB *primref, - uint primID, - global char *bvh_mem, - unsigned leafSize) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - const uint items = end - start; - -#if ENABLE_CHECKS == 1 - if (items > BVH_LEAF_N_MAX) - printf("error items %d \n", items); -#endif - - // JDB TODO: Why was this code commented out?? - //uint offset = (subgroupLocalID == 0) ? alloc_leaf_mem(globals,sizeof(struct Quad)*items) : 0; - //offset = sub_group_broadcast(offset,0); - - //uint offset = globals->leaf_mem_allocator_start + start * leafSize; - uint offset = allocator->start + start * leafSize; - return offset; -} - -inline uint get_qnode_index_for_backptr(void *qnode_base, void *qnode) -{ - size_t offset = ((size_t)qnode - (size_t)qnode_base) / sizeof(struct QBVHNodeN); - uint offset_u = (uint)offset; -#if ENABLE_CHECKS - if ((size_t)((offset_u << 6) >> 6) != offset) - { - printf("get_qnode_index_for_backptr - index out of reach"); - } -#endif - return offset_u; -} - -struct SerialBuildRecurseTemplateConst -{ - unsigned leafSize; - unsigned leafType; - bool allocateBackpointers; -}; - -// ==================================================================================== -// ==================================================================================== -// ==================================================================================== -// ==================================================================================== -// ==================================================================================== - -inline void parallel_find_split(global struct AABB *primref, - local struct BuildRecord *buildRecord, - local struct Split *bestSplit, - local struct BinInfo *binInfo, - global uint *primref_index0, - global uint *primref_index1) -{ - const uint localID = get_local_id(0); - const uint local_size = get_local_size(0); - const uint subgroupID = get_sub_group_id(); - - const uint startID = buildRecord->start; - const uint endID = buildRecord->end; - - struct BinMapping binMapping; - initBinMapping(&binMapping, &buildRecord->centroidBounds, BINS); - - /* init bininfo */ - parallel_initBinInfo(binInfo); - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - for (uint t = startID + localID; t < endID; t += local_size) - { - const uint index = primref_index0[t]; - primref_index1[t] = index; - atomicUpdateLocalBinInfo(&binMapping, binInfo, &primref[index]); - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - /* find best dimension */ - - if (subgroupID == 0) - { - *bestSplit = reduceBinsAndComputeBestSplit16(binInfo, binMapping.scale, startID, endID); - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); -} - -inline void parallel_find_split32(local uint *local_sync, - 
global struct AABB *primref, - local struct BuildRecord *buildRecord, - local struct Split *bestSplit, - local struct BinInfo2 *binInfo2, - global uint *primref_index0, - global uint *primref_index1) -{ - - const uint localID = get_local_id(0); - const uint local_size = get_local_size(0); - const uint subgroupID = get_sub_group_id(); - const uint numSubGroups = get_num_sub_groups(); - const uint subgroupLocalID = get_sub_group_local_id(); - - const uint startID = buildRecord->start; - const uint endID = buildRecord->end; - - struct BinMapping binMapping; - initBinMapping(&binMapping, &buildRecord->centroidBounds, 2 * BINS); - - /* init bininfo */ - parallel_initBinInfo2(binInfo2, 2 * BINS); - - if (localID == 0) - *local_sync = 0; - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - for (uint t = startID + localID; t < endID; t += local_size) - { - const uint index = primref_index0[t]; - primref_index1[t] = index; - atomicUpdateLocalBinInfo2(&binMapping, binInfo2, &primref[index]); - } - - /* find best split position using the last subgroup */ - sub_group_barrier(CLK_LOCAL_MEM_FENCE); - uint syncID = subgroupLocalID == 0 ? generic_atomic_add(local_sync, 1) : 0; - syncID = sub_group_broadcast(syncID, 0); - - if (syncID + 1 == numSubGroups) - { - *bestSplit = reduceBinsAndComputeBestSplit32(binInfo2, binMapping.scale, startID, endID); - DBG(if (localID == 0) printSplit(bestSplit)); - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); -} - -inline void parallel_partition_index(local uint *local_sync, - global struct AABB *primref, - struct BinMapping *binMapping, - const uint begin, - const uint end, - struct Split *inSplit, - local struct BuildRecord *outLeft, - local struct BuildRecord *outRight, - local struct AABB *outGeometryBoundsLeft, - local struct AABB *outGeometryBoundsRight, - global uint *primref_index0, - global uint *primref_index1, - uint *atomicCountLeft, - uint *atomicCountRight) -{ - const uint localID = get_local_id(0); - const uint local_size = get_local_size(0); - const uint subgroupID = get_sub_group_id(); - const uint numSubGroups = get_num_sub_groups(); - const uint subgroup_size = get_sub_group_size(); - const uint subgroupLocalID = get_sub_group_local_id(); - - const uint size = end - begin; - struct Split split = *inSplit; - - /* init bin bounds */ - if (localID == 0) - { - initBuildRecord(outLeft, begin, end); - initBuildRecord(outRight, begin, end); - AABB_init(outGeometryBoundsLeft); - AABB_init(outGeometryBoundsRight); - *atomicCountLeft = 0; - *atomicCountRight = 0; - *local_sync = 0; - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); // remove ? 
- - struct BuildRecord left; - struct BuildRecord right; - initBuildRecord(&left, begin, end); - initBuildRecord(&right, begin, end); - - struct AABB leftAABB; - struct AABB rightAABB; - AABB_init(&leftAABB); - AABB_init(&rightAABB); - - if (split.sah == (float)(INFINITY)) - { - if (subgroupID == 0) - { - for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size) - { - const uint index = primref_index1[i]; - extendBuildRecord(&left, &primref[index]); - AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); - primref_index0[i] = index; - } - - for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size) - { - const uint index = primref_index1[i]; - extendBuildRecord(&right, &primref[index]); - AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); - primref_index0[i] = index; - } - - left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); - right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); - leftAABB = AABB_sub_group_reduce(&leftAABB); - rightAABB = AABB_sub_group_reduce(&rightAABB); - - if (localID == 0) - { - outLeft->centroidBounds = left.centroidBounds; - outRight->centroidBounds = right.centroidBounds; - - *outGeometryBoundsLeft = leftAABB; - *outGeometryBoundsRight = rightAABB; - - outLeft->end = split.pos; - outRight->start = split.pos; - - outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft); - outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight); - outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft)); - outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight)); - } - } - } - else - { - - const int startID = begin + ((subgroupID + 0) * size / numSubGroups); - const int endID = begin + ((subgroupID + 1) * size / numSubGroups); - - for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size) - { - const uint index = primref_index1[i]; - const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; - const uint isRight = 1 - isLeft; - const uint countLeft = sub_group_reduce_add(isLeft); - const uint countRight = sub_group_reduce_add(isRight); - const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); - const uint prefixRight = sub_group_scan_exclusive_add(isRight); - - uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0; - offsetLeft = sub_group_broadcast(offsetLeft, 0); - uint offsetRight = subgroupLocalID == 0 ? 
generic_atomic_add(atomicCountRight, countRight) : 0; - offsetRight = sub_group_broadcast(offsetRight, 0); - - if (isLeft) - { - extendBuildRecord(&left, &primref[index]); - AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); - primref_index0[begin + offsetLeft + prefixLeft] = index; - } - else - { - extendBuildRecord(&right, &primref[index]); - AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); - primref_index0[end - (offsetRight + countRight) + prefixRight] = index; - } - } - left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds); - right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds); - leftAABB = AABB_sub_group_reduce(&leftAABB); - rightAABB = AABB_sub_group_reduce(&rightAABB); - - AABB_local_atomic_merge(&outLeft->centroidBounds, left.centroidBounds.lower, left.centroidBounds.upper); - AABB_local_atomic_merge(&outRight->centroidBounds, right.centroidBounds.lower, right.centroidBounds.upper); - - AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper); - AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper); - - sub_group_barrier(CLK_LOCAL_MEM_FENCE); - - if (subgroupLocalID == 0) - { - const uint sync = atomic_add(local_sync, 1); - if (sync + 1 == numSubGroups) - { - uint pos = begin + *atomicCountLeft; // single thread of last subgroup needs to compute "pos" - outLeft->end = pos; - outRight->start = pos; - - outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft); - outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight); - outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft)); - outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight)); - } - } - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - -#if ENABLE_CHECKS == 1 - if (localID == 0) - { - if (outLeft->end <= begin) - printf("pos begin error\n"); - if (outLeft->end > end) - printf("pos end error\n"); - - for (uint i = outLeft->start; i < outLeft->end; i++) - { - const uint index = primref_index0[i]; - //printf("left %d -> %d \n",i,index); - if (!is_left(binMapping, inSplit, &primref[index])) - printf("check left %d \n", i); - if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index])) - printf("check prim ref bounds left %d \n", i); - } - for (uint i = outRight->start; i < outRight->end; i++) - { - const uint index = primref_index0[i]; - //printf("right %d -> %d \n",i,index); - if (is_left(binMapping, inSplit, &primref[index])) - printf("check right %d \n", i); - if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index])) - printf("check prim ref bounds right %d \n", i); - } - } -#endif -} - - -#define ENABLE_LOOP_BREADTH_FIRST 0 -#if ENABLE_LOOP_BREADTH_FIRST -// TBD It might be that layout of this impact perf. 
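The breadth-first loop defined below keeps a small per-workgroup stack of build-record IDs it has just produced and only falls back to the shared atomic counter when that stack is empty, exiting once the counter runs past the number of available records. A rough single-threaded C model of that scheduling policy, with hypothetical names mirroring the locals used below:

#include <stdbool.h>

/* Scalar model of the record-acquisition policy: prefer locally produced
 * records, otherwise claim the next global one (or report exhaustion). */
struct sched {
    unsigned local_ids[64];      /* analogue of L->buildRecordIDs */
    unsigned num_local;          /* analogue of L->numBuildRecordIDs */
    unsigned global_counter;     /* analogue of globals->counter */
    unsigned num_records;        /* analogue of globals->numBuildRecords */
};

static bool next_record(struct sched *s, unsigned *record_id)
{
    if (s->num_local > 0) {
        *record_id = s->local_ids[--s->num_local];
        return true;
    }
    unsigned id = s->global_counter++;   /* generic_atomic_add(&counter, 1) */
    if (id >= s->num_records)
        return false;                    /* no more build records: exit */
    *record_id = id;
    return true;
}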
-struct BreadthFirstLoopLocals -{ - struct BuildRecord local_current; -#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 - struct BinInfo binInfo; -#else - struct BinInfo2 binInfo; -#endif - struct Split split; - struct BuildRecord children[BVH_NODE_N + 1]; - struct AABB childrenAABB[BVH_NODE_N + 1]; - uint atomicCountLeft; - uint atomicCountRight; - uint local_sync; - uint recordID; - uint buildRecordIDs[BUILDRECORD_STACK_SIZE]; - uint numBuildRecordIDs; - bool exit; -}; - - -inline void parallel_build_breadth_first_loopT(global struct Globals *globals, - global struct AABB *primref, - global uint *primref_index, - global char *bvh_mem, - uint subtreeThreshold, - local struct BreadthFirstLoopLocals *L, - struct BreadthFirstTemplateConst T) -{ - const uint global_size = get_global_size(0); - const uint local_size = get_local_size(0); - const uint localID = get_local_id(0); - const uint taskID = get_group_id(0); - const uint numTasks = get_num_groups(0); - - const uint subgroupID = get_sub_group_id(); - const uint subgroupLocalID = get_sub_group_local_id(); - - /* double buffered primref index array */ - global uint *primref_index0 = primref_index; - global uint *primref_index1 = primref_index + globals->numPrimitives; - - global struct BuildRecord *records = getBuildRecords(bvh_mem, globals); - -#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 - const uint bins = BINS; -#else - const uint bins = 2 * BINS; -#endif - - if (localID == 0) - { - L->numBuildRecordIDs = 0; - L->exit = false; - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - while (1) - { - if (localID == 0) - { - if (L->numBuildRecordIDs == 0) - { - L->recordID = generic_atomic_add(&globals->counter, 1); - if (L->recordID >= globals->numBuildRecords) - L->exit = true; - } - else - { - L->numBuildRecordIDs--; - L->recordID = L->buildRecordIDs[L->numBuildRecordIDs]; - } - L->local_current = records[L->recordID]; - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); - - /* no more buildrecords available ? */ - - if (L->exit) - break; - - local struct BuildRecord *current = &L->local_current; - const uint items = getNumPrims(current); - const uint depth = getBuildRecursionDepth(current); - - global unsigned int *num_records_output = &globals->numBuildRecords_extended; - - struct QBVHNodeN *qnode = (struct QBVHNodeN *)current->current; - - /* ignore small buildrecords */ - if (items < max(subtreeThreshold, cfg_minLeafSize)) - { - // do nothing - } - else - { - /*! find best split */ -#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 - parallel_find_split(primref, current, &L->split, &L->binInfo, primref_index0, primref_index1); -#else - parallel_find_split32(&L->local_sync, primref, current, &L->split, &L->binInfo, primref_index0, primref_index1); -#endif - uint numChildren = 2; - - /*! find best split */ - struct BinMapping binMapping; - initBinMapping(&binMapping, ¤t->centroidBounds, bins); - - parallel_partition_index(&L->local_sync, primref, &binMapping, current->start, current->end, &L->split, &L->children[0], &L->children[1], &L->childrenAABB[0], &L->childrenAABB[1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight); - - while (numChildren < BVH_NODE_N6) - { - /*! 
find best child to split */ - const uint bestChild = subgroup_getMaxAreaChild(L->childrenAABB, numChildren); - if (bestChild == -1) - break; - - /* perform best found split */ - local struct BuildRecord *brecord = &L->children[bestChild]; - local struct BuildRecord *lrecord = &L->children[numChildren + 0]; - local struct BuildRecord *rrecord = &L->children[numChildren + 1]; - -#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0 - parallel_find_split(primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1); -#else - parallel_find_split32(&L->local_sync, primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1); -#endif - - initBinMapping(&binMapping, &brecord->centroidBounds, bins); - - parallel_partition_index(&L->local_sync, primref, &binMapping, brecord->start, brecord->end, &L->split, lrecord, rrecord, &L->childrenAABB[numChildren + 0], &L->childrenAABB[numChildren + 1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight); - - *brecord = *rrecord; - L->childrenAABB[bestChild] = L->childrenAABB[numChildren + 1]; - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - numChildren++; - } - - //sub_group_barrier(CLK_LOCAL_MEM_FENCE); - - if (localID <= 16 && subgroupID == 0) - { - global struct BVHBase *bvh_base = (global struct BVHBase *)bvh_mem; - global struct QBVHNodeN *nodes_start = BVHBase_nodeData(bvh_base); - global uint *back_pointers = BVHBase_backPointers(bvh_base); - uint qnode_index = 0; - if (T.allocateBackpointers) - { - /* index of internal node, the domain of backpointers map*/ - qnode_index = get_qnode_index_for_backptr(nodes_start, qnode); - // the backpointer is already set, but we need to add/encode the num of children - // todo don't like the need of data read (we should just add), maybe should pass grandpa pointer in record..., or use atomic... - back_pointers[qnode_index] += (numChildren << 3); - } - - /* sort children based on rnage size */ - const uint numPrimsIDs = select((uint)0, (as_uint(L->childrenAABB[subgroupLocalID].upper.w) << 3) | subgroupLocalID, subgroupLocalID < numChildren); - //const uint IDs = sortBVHChildrenIDs(numPrimsIDs) & (BVH_NODE_N-1); - const uint IDs = numPrimsIDs & 7; - const uint pushIDs = convertToPushIndices8(IDs); - - /* alloc #numChildren nodes at once */ - const uint node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren); - - /* update single relative node pointer and type */ - const int offset = encodeOffset(bvh_mem, (global void *)qnode, node_offset) >> 6; - const uint type = BVH_INTERNAL_NODE; - - /* set parent pointer in child build records */ - if (subgroupLocalID < numChildren) - { - setBuildRecursionDepth(&L->children[subgroupLocalID], depth + 1); - global uchar *child_data_ptr = (global uchar *)bvh_mem + node_offset + pushIDs * sizeof(struct QBVHNodeN); - L->children[subgroupLocalID].current = child_data_ptr; - if (T.allocateBackpointers) - { - uint child_index = get_qnode_index_for_backptr(nodes_start, child_data_ptr); - back_pointers[child_index] = qnode_index << 6; - } - } - - /* write out qbvh node */ - subgroup_setQBVHNodeN(offset, type, &L->childrenAABB[IDs], numChildren, qnode); - - /* write out child buildrecords to memory */ - - uint global_records_offset = (subgroupLocalID == 0) ? 
atomic_add(num_records_output, numChildren - 1) : 0; - global_records_offset = sub_group_broadcast(global_records_offset, 0); - - if (localID == 0) - { - records[L->recordID] = L->children[0]; - L->buildRecordIDs[L->numBuildRecordIDs++] = L->recordID; - for (uint i = 1; i < numChildren; i++) - { - const uint ID = globals->numBuildRecords + global_records_offset + i - 1; - records[ID] = L->children[i]; - L->buildRecordIDs[L->numBuildRecordIDs++] = ID; - } - } - } - } - work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); - } - - /* last active HW thread ? */ - if (localID == 0) - { - const uint sync = atomic_add(&globals->sync, 1); - if (sync + 1 == numTasks) - { - globals->sync = 0; - /* set final number of buildrecords */ - globals->numBuildRecords += globals->numBuildRecords_extended; - globals->numBuildRecords_extended = 0; - globals->counter = 0; - } - } -} - -__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -parallel_build_breadth_first_loop(global struct Globals *globals, - global struct AABB *primref, - global uint *primref_index, - global char *bvh_mem, - uint subtreeThreshold) -{ - local struct BreadthFirstLoopLocals L; - static const struct BreadthFirstTemplateConst T = { - false // bool allocateBackpointers; - }; - - parallel_build_breadth_first_loopT(globals, - primref, - primref_index, - bvh_mem, - subtreeThreshold, - &L, - T); -} - -__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -parallel_build_breadth_first_loop_backpointers(global struct Globals *globals, - global struct AABB *primref, - global uint *primref_index, - global char *bvh_mem, - uint subtreeThreshold) -{ - local struct BreadthFirstLoopLocals L; - static const struct BreadthFirstTemplateConst T = { - true // bool allocateBackpointers; - }; - - parallel_build_breadth_first_loopT(globals, - primref, - primref_index, - bvh_mem, - subtreeThreshold, - &L, - T); -} -// =================================================== -// =============== experimental code ================= -// =================================================== -#endif - -#define ENABLE_GLOBAL_SPLIT 0 -#if ENABLE_GLOBAL_SPLIT -inline void parallel_partition_segment_index(local uint *local_sync, - global struct AABB *primref, - struct BinMapping *binMapping, - const uint begin, - const uint end, - const uint global_begin, - const uint global_end, - struct Split *inSplit, - local struct AABB *outLeft, - local struct AABB *outRight, - local struct AABB *outGeometryBoundsLeft, - local struct AABB *outGeometryBoundsRight, - global uint *primref_index0, - global uint *primref_index1, - uint *atomicCountLeft, - uint *atomicCountRight) -{ - const uint localID = get_local_id(0); - const uint local_size = get_local_size(0); - const uint subgroupID = get_sub_group_id(); - const uint numSubGroups = get_num_sub_groups(); - const uint subgroup_size = get_sub_group_size(); - const uint subgroupLocalID = get_sub_group_local_id(); - - const uint size = end - begin; - struct Split split = *inSplit; - - /* init bin bounds */ - if (localID == 0) - { - AABB_init(outLeft); - AABB_init(outRight); - AABB_init(outGeometryBoundsLeft); - AABB_init(outGeometryBoundsRight); - *local_sync = 0; - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - struct AABB left; - struct AABB right; - AABB_init(&left); - AABB_init(&right); - - struct AABB leftAABB; - struct AABB rightAABB; - AABB_init(&leftAABB); - 
AABB_init(&rightAABB); - - const int startID = begin + ((subgroupID + 0) * size / numSubGroups); - const int endID = begin + ((subgroupID + 1) * size / numSubGroups); - - for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size) - { - const uint index = primref_index1[i]; - const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0; - const uint isRight = 1 - isLeft; - const uint countLeft = sub_group_reduce_add(isLeft); - const uint countRight = sub_group_reduce_add(isRight); - const uint prefixLeft = sub_group_scan_exclusive_add(isLeft); - const uint prefixRight = sub_group_scan_exclusive_add(isRight); - - uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0; - offsetLeft = sub_group_broadcast(offsetLeft, 0); - uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0; - offsetRight = sub_group_broadcast(offsetRight, 0); - - if (isLeft) - { - AABB_extend_point(&left, AABB_centroid2(&primref[index])); - AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper); - primref_index0[global_begin + offsetLeft + prefixLeft] = index; - } - else - { - AABB_extend_point(&right, AABB_centroid2(&primref[index])); - AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper); - primref_index0[global_end - (offsetRight + countRight) + prefixRight] = index; - } - } - left = AABB_sub_group_reduce(&left); - right = AABB_sub_group_reduce(&right); - leftAABB = AABB_sub_group_reduce(&leftAABB); - rightAABB = AABB_sub_group_reduce(&rightAABB); - - AABB_local_atomic_merge(outLeft, left.lower, left.upper); - AABB_local_atomic_merge(outRight, right.lower, right.upper); - - AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper); - AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper); - - work_group_barrier(CLK_LOCAL_MEM_FENCE); -} - -__attribute__((reqd_work_group_size(BINS * 2, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel global_init_split_iteration(global struct Globals *globals, - global struct GlobalBuildRecord *global_record, - global char *bvh_mem, - const uint subTreeThreshold) -{ - const uint localID = get_local_id(0); - const uint taskID = get_group_id(0); - const uint numTasks = get_num_groups(0); - - global struct BuildRecord *records = getBuildRecords(bvh_mem, globals); - - /* for each build record with size > subTreeThreshold initialize a global build record */ - - const uint startID = (taskID + 0) * globals->numBuildRecords / numTasks; - const uint endID = (taskID + 1) * globals->numBuildRecords / numTasks; - - for (uint i = startID; i < endID; i++) - { - global struct BuildRecord *buildRecord = &records[i]; - DBG(if (localID == 0) printf("i %d subTreeThreshold %d size %d \n", i, subTreeThreshold, buildRecord->end - buildRecord->start)); - - if ((buildRecord->end - buildRecord->start) > subTreeThreshold) - { - uint ID = localID == 0 ? 
generic_atomic_add(&globals->numGlobalBuildRecords, 1) : 0; - - ID = work_group_broadcast(ID, 0); - global struct BinInfo2 *binInfo = &global_record[ID].binInfo; - global struct BinMapping *binMapping = &global_record[ID].binMapping; - initBinMapping(binMapping, &buildRecord->centroidBounds, 2 * BINS); - parallel_initBinInfo2(binInfo, 2 * BINS); - if (localID == 0) - { - global_record[ID].range.start = buildRecord->start; - global_record[ID].range.end = buildRecord->end; - global_record[ID].atomicCountLeft = 0; - global_record[ID].atomicCountRight = 0; - global_record[ID].buildRecordID = i; - AABB_init(&global_record[ID].leftCentroid); - AABB_init(&global_record[ID].rightCentroid); - AABB_init(&global_record[ID].leftGeometry); - AABB_init(&global_record[ID].rightGeometry); - } - } - } - DBG( - work_group_barrier(CLK_LOCAL_MEM_FENCE); - if (localID == 0) - printf("globals->numGlobalBuildRecords %d \n", globals->numGlobalBuildRecords);); -} - -__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel global_bin_iteration(global struct Globals *globals, - global struct AABB *primref, - global uint *primref_index, - global char *bvh_mem, - global struct GlobalBuildRecord *global_record) -{ - const uint localID = get_local_id(0); - const uint blockSize = get_local_size(0); - const uint taskID = get_group_id(0); - const uint numTasks = get_num_groups(0); - - const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; - - /* early out */ - if (numGlobalBuildRecords == 0) - return; - - /* double buffered primref index array */ - global uint *primref_index0 = primref_index; - global uint *primref_index1 = primref_index + globals->numPrimitives; - - uint numBlocks = 0; - - /* get total number of blocks, size of block == WG size */ - for (uint i = 0; i < numGlobalBuildRecords; i++) - numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize; - - const uint startBlockID = (taskID + 0) * numBlocks / numTasks; - const uint endBlockID = (taskID + 1) * numBlocks / numTasks; - uint numBlockIDs = endBlockID - startBlockID; - - uint splitRecordID = 0; - uint offset_start = 0; - uint offset_end = 0; - uint cur_blocks = 0; - - for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++) - { - const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; - const uint blocks = (sizeRecord + blockSize - 1) / blockSize; - if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks) - { - const uint preBlocks = startBlockID - blockCounter; - cur_blocks = min(numBlockIDs, blocks - preBlocks); - offset_start = preBlocks * blockSize; - offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord); - break; - } - blockCounter += blocks; - } - - if (localID == 0) - DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); - - local struct BinInfo2 local_binInfo; - parallel_initBinInfo2(&local_binInfo, 2 * BINS); - struct BinMapping binMapping = global_record[splitRecordID].binMapping; - - while (1) - { - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - const uint startID = global_record[splitRecordID].range.start + offset_start; - const uint endID = global_record[splitRecordID].range.start + offset_end; - - if (localID == 0) - DBG(printf("taskID %d startID %d endID %d \n", taskID, 
startID, endID)); - - for (uint i = startID + localID; i < endID; i += blockSize) - { - const uint index = primref_index0[i]; - primref_index1[i] = index; - atomicUpdateLocalBinInfo2(&binMapping, &local_binInfo, &primref[index]); - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); //FIXME: remove, do local sync - atomicUpdateGlobalFromLocalBinInfo2(&global_record[splitRecordID].binInfo, &local_binInfo, 2 * BINS); - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - numBlockIDs -= cur_blocks; - if (numBlockIDs == 0) - break; - - splitRecordID++; - parallel_initBinInfo2(&local_binInfo, 2 * BINS); - binMapping = global_record[splitRecordID].binMapping; - - const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; - const uint blocks = (sizeRecord + blockSize - 1) / blockSize; - cur_blocks = min(numBlockIDs, blocks); - offset_start = 0; - offset_end = min(cur_blocks * blockSize, sizeRecord); - - if (localID == 0) - DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); - } -} - -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -global_compute_best_split_iteration(global struct Globals *globals, - global char *bvh_mem, - global struct GlobalBuildRecord *global_record) -{ - const uint localID = get_local_id(0); - const uint blockSize = get_local_size(0); - const uint taskID = get_group_id(0); - const uint numTasks = get_num_groups(0); - - const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; - - /* early out */ - if (numGlobalBuildRecords == 0) - return; - - const uint startRecordID = (taskID + 0) * numGlobalBuildRecords / numTasks; - const uint endRecordID = (taskID + 1) * numGlobalBuildRecords / numTasks; - for (uint i = startRecordID; i < endRecordID; i++) - { - struct Split split = reduceBinsAndComputeBestSplit32(&global_record[i].binInfo, - global_record[i].binMapping.scale, - global_record[i].range.start, - global_record[i].range.end); - if (localID == 0) - { - global_record[i].split = split; - global_record[i].atomicCountLeft = 0; - global_record[i].atomicCountRight = 0; - DBG(printSplit(&global_record[i].split)); - } - } -} - -__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -global_partition_iteration(global struct Globals *globals, - global struct AABB *primref, - global uint *primref_index, - global char *bvh_mem, - global struct GlobalBuildRecord *global_record) -{ - - const uint localID = get_local_id(0); - const uint blockSize = get_local_size(0); - const uint taskID = get_group_id(0); - const uint numTasks = get_num_groups(0); - - const uint numGlobalBuildRecords = globals->numGlobalBuildRecords; - - /* early out */ - if (numGlobalBuildRecords == 0) - return; - - /* double buffered primref index array */ - global uint *primref_index0 = primref_index; - global uint *primref_index1 = primref_index + globals->numPrimitives; - - uint numBlocks = 0; - - /* get total number of blocks, size of block == WG size */ - for (uint i = 0; i < numGlobalBuildRecords; i++) - numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize; - - const uint startBlockID = (taskID + 0) * numBlocks / numTasks; - const uint endBlockID = (taskID + 1) * numBlocks / numTasks; - uint numBlockIDs = endBlockID - startBlockID; - - 
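As in global_bin_iteration above, work is distributed as fixed-size blocks (one workgroup's worth of primrefs) laid end to end across all global build records, and the search loop that follows maps a task's first block ID back to a record plus an offset inside it. A small C sketch of that mapping, assuming sizes[rec] holds range.end - range.start for each record:

/* Walk the records, each contributing ceil(size / block_size) blocks, until
 * the target block falls inside one of them; mirrors the loop below. */
static int locate_block(const unsigned *sizes, unsigned num_records,
                        unsigned block_size, unsigned start_block,
                        unsigned *record_out, unsigned *offset_out)
{
    unsigned block_counter = 0;
    for (unsigned rec = 0; rec < num_records; rec++) {
        unsigned blocks = (sizes[rec] + block_size - 1) / block_size;
        if (start_block >= block_counter && start_block < block_counter + blocks) {
            *record_out = rec;
            *offset_out = (start_block - block_counter) * block_size;  /* preBlocks * blockSize */
            return 1;
        }
        block_counter += blocks;
    }
    return 0;   /* block id out of range */
}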
uint splitRecordID = 0; - uint offset_start = 0; - uint offset_end = 0; - uint cur_blocks = 0; - - for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++) - { - const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; - const uint blocks = (sizeRecord + blockSize - 1) / blockSize; - if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks) - { - const uint preBlocks = startBlockID - blockCounter; - cur_blocks = min(numBlockIDs, blocks - preBlocks); - offset_start = preBlocks * blockSize; - offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord); - break; - } - blockCounter += blocks; - } - - if (localID == 0) - DBG(printf("partition taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); - - local struct AABB centroidAABB[2]; - local struct AABB geometryAABB[2]; - local uint local_sync; - - while (1) - { - - const uint startID = global_record[splitRecordID].range.start + offset_start; - const uint endID = global_record[splitRecordID].range.start + offset_end; - - struct BinMapping binMapping = global_record[splitRecordID].binMapping; - struct Split split = global_record[splitRecordID].split; - - const uint global_start = global_record[splitRecordID].range.start; - const uint global_end = global_record[splitRecordID].range.end; - - if (localID == 0) - DBG(printf("partition taskID %d startID %d endID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, startID, endID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks)); - - parallel_partition_segment_index(&local_sync, primref, &binMapping, startID, endID, global_start, global_end, &split, ¢roidAABB[0], ¢roidAABB[1], &geometryAABB[0], &geometryAABB[1], primref_index0, primref_index1, &global_record[splitRecordID].atomicCountLeft, &global_record[splitRecordID].atomicCountRight); - - /* update global structures */ - if (localID == 0) - { - AABB_global_atomic_merge(&global_record[splitRecordID].leftCentroid, ¢roidAABB[0]); - AABB_global_atomic_merge(&global_record[splitRecordID].rightCentroid, ¢roidAABB[1]); - AABB_global_atomic_merge(&global_record[splitRecordID].leftGeometry, &geometryAABB[0]); - AABB_global_atomic_merge(&global_record[splitRecordID].rightGeometry, &geometryAABB[1]); - } - - numBlockIDs -= cur_blocks; - if (numBlockIDs == 0) - break; - - splitRecordID++; - - const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start; - const uint blocks = (sizeRecord + blockSize - 1) / blockSize; - cur_blocks = min(numBlockIDs, blocks); - offset_start = 0; - offset_end = min(cur_blocks * blockSize, sizeRecord); - } -} - -inline void printBinaryNode(struct AABB *aabb) -{ - printf("lower %f upper %f lower.w %d upper.w %d \n", aabb->lower, aabb->upper, as_uint(aabb->lower.w), as_uint(aabb->upper.w)); -} - -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel global_finalize_iteration(global struct Globals *globals, - global struct GlobalBuildRecord *global_record, - global char *bvh_mem, - global struct AABB *binary_nodes) -{ - const uint localID = get_local_id(0); - const uint localSize = get_local_size(0); - const uint groupID = get_group_id(0); - const uint numGroups = get_num_groups(0); - - global struct 
BuildRecord *records = getBuildRecords(bvh_mem, globals); - - for (uint i = localID; i < globals->numGlobalBuildRecords; i += localSize) - { - const uint buildRecordID = global_record[i].buildRecordID; - const uint binaryNodeID = as_uint(records[buildRecordID].centroidBounds.lower.w); - /* left child buildrecord */ - const uint leftID = buildRecordID; - records[leftID].start = global_record[i].range.start; - records[leftID].end = global_record[i].range.start + global_record[i].atomicCountLeft; - records[leftID].centroidBounds = global_record[i].leftCentroid; - /* right child buildrecord */ - const uint rightID = generic_atomic_add(&globals->numBuildRecords, 1); - records[rightID].start = global_record[i].range.start + global_record[i].atomicCountLeft; - records[rightID].end = global_record[i].range.end; - records[rightID].centroidBounds = global_record[i].rightCentroid; - /* two binary nodes */ - const uint binaryChildID = generic_atomic_add(&globals->numGlobalBinaryNodes, 2); - binary_nodes[binaryNodeID].lower.w = as_float(binaryChildID + 0); - binary_nodes[binaryNodeID].upper.w = as_float(binaryChildID + 1); - binary_nodes[binaryChildID + 0] = global_record[i].leftGeometry; - binary_nodes[binaryChildID + 1] = global_record[i].rightGeometry; - binary_nodes[binaryChildID + 0].lower.w = as_float(leftID); - binary_nodes[binaryChildID + 0].upper.w = as_float(-1); - binary_nodes[binaryChildID + 1].lower.w = as_float(rightID); - binary_nodes[binaryChildID + 1].upper.w = as_float(-1); - records[leftID].centroidBounds.lower.w = as_float(binaryChildID + 0); - records[rightID].centroidBounds.lower.w = as_float(binaryChildID + 1); - } - - sub_group_barrier(CLK_LOCAL_MEM_FENCE); - - if (localID == 0) - { - const uint sync = atomic_add(&globals->sync, 1); - if (sync + 1 == numGroups) - { - globals->sync = 0; - DBG(printf("globals->numBuildRecords %d \n", globals->numBuildRecords)); - DBG( - for (uint i = 0; i < globals->numBuildRecords; i++) { - printf("i %d \n", i); - printBuildRecord(&records[i]); - } printf("Binary Tree \n"); - for (uint i = 0; i < globals->numGlobalBinaryNodes; i++) { - printf("i %d \n", i); - printBinaryNode(&binary_nodes[i]); - } - - ); - globals->numGlobalBuildRecords = 0; - } - } -} - -__attribute__((reqd_work_group_size(1, 1, 1))) void kernel global_build_top_level(global struct Globals *globals, - global struct GlobalBuildRecord *global_record, - global char *bvh_mem, - global struct AABB *binary_nodes) -{ -#define MAX_TOP_LEVEL_STACK_DEPTH 32 - struct AABB stack[MAX_TOP_LEVEL_STACK_DEPTH]; - global uchar *stackParentPtrs[MAX_TOP_LEVEL_STACK_DEPTH]; - struct AABB childrenAABB[BVH_NODE_N6]; - float childrenHalfArea[BVH_NODE_N6]; - - /* build records */ - global struct BuildRecord *record = getBuildRecords(bvh_mem, globals); - - struct BVHBase *base = (struct BVHBase *)bvh_mem; - struct QBVHNodeN *qnode_root = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset); - - uint stack_index = 1; - stack[0] = binary_nodes[0]; - stackParentPtrs[0] = (global uchar *)qnode_root; - - while (stack_index != 0) - { - stack_index--; - - childrenAABB[0] = stack[stack_index]; - struct QBVHNodeN *qnode = (struct QBVHNodeN *)stackParentPtrs[stack_index]; - childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]); - - /* buildrecord leaf => set parent pointer and continue*/ - DBG( - printf("stack_index %d \n", stack_index); - printf("as_uint(childrenAABB[0].upper.w) %d \n", as_uint(childrenAABB[0].upper.w));); - - if (as_uint(childrenAABB[0].upper.w) == -1) - { - const uint buildRecordID = 
as_uint(childrenAABB[0].lower.w); - DBG( - printf("leaf buildRecordID %d \n", buildRecordID); - printBuildRecord(&record[buildRecordID]);) - - record[buildRecordID].current = (global uchar *)qnode; - continue; - } - - childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]); - - uint numChildren = 1; - while (numChildren < BVH_NODE_N6) - { - // FIXME - - /*! find best child to split */ - float bestArea = -(float)INFINITY; - int bestChild = -1; - for (int i = 0; i < numChildren; i++) - { - /* ignore leaves as they cannot get split */ - if (as_uint(childrenAABB[i].upper.w) == -1) - continue; - - /* find child with largest surface area */ - if (childrenHalfArea[i] > bestArea) - { - bestChild = i; - bestArea = childrenAABB[i].lower.w; - } - } - if (bestChild == -1) - break; - const uint leftID = as_uint(childrenAABB[bestChild].lower.w); - const uint rightID = as_uint(childrenAABB[bestChild].upper.w); - childrenAABB[bestChild] = binary_nodes[leftID]; - childrenAABB[numChildren] = binary_nodes[rightID]; - childrenHalfArea[bestChild] = AABB_halfArea(&childrenAABB[bestChild]); - childrenHalfArea[numChildren] = AABB_halfArea(&childrenAABB[numChildren]); - numChildren++; - } - - const uint child_node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren); - - /* update single relative node pointer */ - const int offset = encodeOffset(bvh_mem, (global void *)qnode, child_node_offset) >> 6; - const uint type = BVH_INTERNAL_NODE; - - setQBVHNodeN(offset, type, childrenAABB, numChildren, qnode); - - DBG( - printQBVHNodeN(qnode); - printf("numChildren %d \n", numChildren); - for (uint i = 0; i < numChildren; i++) - AABB_print(&childrenAABB[i]);); - - /* update parent pointer of build records of all children */ - for (uint ID = 0; ID < numChildren; ID++) - { - stack[stack_index] = childrenAABB[ID]; - stackParentPtrs[stack_index] = (global uchar *)bvh_mem + child_node_offset + ID * sizeof(struct QBVHNodeN); - stack_index++; - } - } -} - -#endif diff --git a/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h deleted file mode 100644 index b8cf7288f6a..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h +++ /dev/null @@ -1,1507 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "bvh_build_refit.h" -#include "libs/lsc_intrinsics.h" - - -#define REFIT_DEBUG_CHECKS 0 -#define REFIT_VERBOSE_LOG 0 - -#define NUM_STARTPOINTS_IN_SLM (1024) - -GRL_INLINE void storeAABBToL1(struct AABB aabb, struct AABB* ptr) -{ - uint8 val = (uint8)( - as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w), - as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w)); - - store_uint8_L1WB_L3WB((__global uint8*) ptr, 0, val); -} - -GRL_INLINE void storeAABBToL3(struct AABB aabb, struct AABB* ptr) -{ - uint8 val = (uint8)( - as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w), - as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w)); - - store_uint8_L1UC_L3WB((__global uint8*) ptr, 0, val); -} - -typedef struct Treelet_by_single_group_locals -{ - uint startpoints[NUM_STARTPOINTS_IN_SLM]; -} Treelet_by_single_group_locals; - -typedef struct SquashedInputGroupDesc { - qword bvh; - qword scratch; - uint groupInTree; - uint totalNumGroups; //valid only for 0th element in array, otherwise its trash padding -} 
SquashedInputGroupDesc; - -// -// -// update primitives -// -// - -typedef struct SquashedInput { - global struct BVHBase* pBvh; - global void* pInput; - global struct AABB* bbox_scratch; -} SquashedInput; - - - -// updates one quad leaf and gets BBOX contatining it -GRL_INLINE void refit_bottom_child_quad( - global struct QuadLeaf* quad, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - struct AABB* childAABB) -{ - struct QuadLeaf Q; - get_updated_quad(quad, geomDesc, &Q); - quadCopyVertices(&Q, quad); - *childAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad -} - -// procedurals will have to go old path at first -#if 0 -// updates one procedural leaf and gets BBOX contatining it -GRL_INLINE void refit_bottom_child_procedural( - global struct ProceduralLeaf** pleaf, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - struct AABB* childAABB) -{ - global struct ProceduralLeaf* leaf = *pleaf; - /* extract geomID and primID from leaf */ - const uint startPrim = QBVHNodeN_startPrim(curNode, child_idx); - const uint geomID = ProceduralLeaf_geomIndex(leaf); - const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf! - - /* read bounds from geometry descriptor */ - struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID); - childAABB->lower.x = aabb.MinX; - childAABB->lower.y = aabb.MinY; - childAABB->lower.z = aabb.MinZ; - childAABB->upper.x = aabb.MaxX; - childAABB->upper.y = aabb.MaxY; - childAABB->upper.z = aabb.MaxZ; - - /* advance leaf pointer to next child */ - *pleaf = leaf + QBVHNodeN_blockIncr(curNode, child_idx); -} - - -GRL_INLINE void update_procedural_leafs( - global struct BVHBase* bvh, - global void* input, - global struct AABB* bbox_scratch, - uint id, - uint num_done_by_one_thread) -{ - uint numLeaves = BVHBase_GetNumQuads(bvh); - uint leafsIndexOffset = bvh->proceduralDataStart - BVH_ROOT_NODE_OFFSET / 64; - global ProceduralLeaf* leafs = (global QuadLeaf*)BVHBase_GetProceduralLeaves(bvh); - uint start_leaf = id * num_done_by_one_thread; - uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves); - - global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; - - for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++) - { - struct AABB theAABB; - refit_bottom_child_procedural(leafs + leaf_id, geosArray, &theAABB); - theAABB.lower.w = as_float(0xABBADEFF); - theAABB.upper.w = 0x00; - storeAABBToL1(theAABB, &bbox[leafsIndexOffset + leaf_id]); - } -} -#endif - -GRL_INLINE void update_quads( - global struct BVHBase* bvh, - global void* input, - global struct AABB* bbox_scratch, - uint id, - uint num_done_by_one_thread) -{ - uint numLeaves = BVHBase_GetNumQuads(bvh); - uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64; - global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh); - uint start_leaf = id * num_done_by_one_thread; - uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves); - - global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input; - - for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++) - { - struct AABB theAABB; - refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB); - theAABB.lower.w = as_float(0xABBADEFF); - theAABB.upper.w = 0x00; - storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]); - } -} - 
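update_quads() above rebuilds each quad leaf's box from the refreshed vertices and tags lower.w with the 0xABBADEFF marker so the bottom-up pass (and its REFIT_DEBUG_CHECKS) can distinguish boxes updated this pass from stale ones. A compact C sketch of that per-leaf bookkeeping, with a simplified AABB type standing in for struct AABB:

#include <math.h>
#include <string.h>

struct aabb { float lower[4]; float upper[4]; };

/* Bounds of the (up to) four refreshed quad vertices, tagged as "updated". */
static struct aabb quad_bounds(const float v[4][3])
{
    struct aabb b;
    for (int c = 0; c < 3; c++) { b.lower[c] = INFINITY; b.upper[c] = -INFINITY; }
    for (int i = 0; i < 4; i++)
        for (int c = 0; c < 3; c++) {
            b.lower[c] = fminf(b.lower[c], v[i][c]);
            b.upper[c] = fmaxf(b.upper[c], v[i][c]);
        }
    unsigned marker = 0xABBADEFFu;               /* "updated this pass" tag in lower.w */
    memcpy(&b.lower[3], &marker, sizeof(marker));
    b.upper[3] = 0.0f;
    return b;
}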
-///////////////////////////////////////////////////////////////////////////////////////////////////// -// -// core bottom-up update functions -// -// - -GRL_INLINE void quantise_bounds( - struct AABB* input_aabb, float3 len, float3 mant, float3 org, int3 exp, - uchar3* lower_uchar, - uchar3* upper_uchar) -{ - const float up = 1.0f + ulp; - const float down = 1.0f - ulp; - - struct AABB child_aabb = conservativeAABB(input_aabb); // conservative ??? - - float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); - lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); - float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); - upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); - - *lower_uchar = convert_uchar3_rtn(lower); - *upper_uchar = convert_uchar3_rtp(upper); -} - -typedef struct Qbounds_as_DW { - uint32_t xLL; uint32_t xLU; uint32_t xUU; - uint32_t yLL; uint32_t yLU; uint32_t yUU; - uint32_t zLL; uint32_t zLU; uint32_t zUU; -} Qbounds_as_DW; - -GRL_INLINE void encodeQuantisedDataAsDW( - uchar3 lower_uchar, - uchar3 upper_uchar, - uint idx, - Qbounds_as_DW* qbounds) -{ - uint shift_init = idx * 8; - if (idx >= 4) { - uint shift = (shift_init - 32); - qbounds->xLU |= ((uint)lower_uchar.x) << shift; - qbounds->yLU |= ((uint)lower_uchar.y) << shift; - qbounds->zLU |= ((uint)lower_uchar.z) << shift; - } - else { - qbounds->xLL |= ((uint)lower_uchar.x) << shift_init; - qbounds->yLL |= ((uint)lower_uchar.y) << shift_init; - qbounds->zLL |= ((uint)lower_uchar.z) << shift_init; - } - - if (idx < 2) { - uint shift = (shift_init + 16); - qbounds->xLU |= ((uint)upper_uchar.x) << shift; - qbounds->yLU |= ((uint)upper_uchar.y) << shift; - qbounds->zLU |= ((uint)upper_uchar.z) << shift; - } - else { - uint shift = (shift_init - 16); - - qbounds->xUU |= ((uint)upper_uchar.x) << shift; - qbounds->yUU |= ((uint)upper_uchar.y) << shift; - qbounds->zUU |= ((uint)upper_uchar.z) << shift; - } -} - -GRL_INLINE void encodeChildBounds(uchar3 lower_uchar, uchar3 upper_uchar, uint ch, struct InternalNode* qnode) -{ - qnode->lower_x[ch] = lower_uchar.x; qnode->upper_x[ch] = upper_uchar.x; - qnode->lower_y[ch] = lower_uchar.y; qnode->upper_y[ch] = upper_uchar.y; - qnode->lower_z[ch] = lower_uchar.z; qnode->upper_z[ch] = upper_uchar.z; -} - - -GRL_INLINE GRL_OVERLOADABLE void InternalNode_setBounds_skip_prev(struct InternalNode* qbvh_node, uint prevChildIdx, struct AABB* prev_input_aabb, struct AABB* input_aabb, uint childrenIndex, const uint numChildren, struct AABB* aabb_reduced) -{ - - int3 exp; - const float up = 1.0f + ulp; - struct AABB conservative_aabb = conservativeAABB(aabb_reduced); - const float3 len = AABB_size(&conservative_aabb).xyz * up; - const float3 mant = frexp_vec3(len, &exp); - const float3 org = conservative_aabb.lower.xyz; - - exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); - - qbvh_node->lower[0] = org.x; qbvh_node->lower[1] = org.y; qbvh_node->lower[2] = org.z; - - qbvh_node->exp_x = exp.x; qbvh_node->exp_y = exp.y; qbvh_node->exp_z = exp.z; - - Qbounds_as_DW qbounds = { 0x0 }; - - - { - uchar3 lower_uchar, upper_uchar; - quantise_bounds(prev_input_aabb, len, mant, org, exp, &lower_uchar, &upper_uchar); - - //encode invalid children. 
its enough to set 0x80 as lower_x bytes - uint shift = numChildren * 8; - uint shift2 = min(shift, 31u); - qbounds.xLL = (0x80808080u << shift2); - uint shift3 = max(shift, 32u) - 32; - qbounds.xLU = (ushort)(((ushort)0x8080) << (ushort)shift3); - - encodeQuantisedDataAsDW(lower_uchar, upper_uchar, prevChildIdx, &qbounds); - //encodeChildBounds(lower_uchar, upper_uchar, prevChildIdx, qbvh_node); - } - - uint ch = prevChildIdx == 0; - while (ch < numChildren) { - uchar3 lower_uchar, upper_uchar; - quantise_bounds(input_aabb + ch, len, mant, org, exp, &lower_uchar, &upper_uchar); - encodeQuantisedDataAsDW(lower_uchar, upper_uchar, ch, &qbounds); - //encodeChildBounds(lower_uchar, upper_uchar, ch, qbvh_node); - ch += 1 + (prevChildIdx == (ch + 1)); - } - Qbounds_as_DW* qbounds_dst = (Qbounds_as_DW*)(&qbvh_node->lower_x[0]); - *qbounds_dst = qbounds; - return; -} - -GRL_INLINE struct AABB refitReduce2Boxes(struct AABB A, struct AABB B) -{ - AABB_extend(&A, &B); - // to make it work for TLAS node masks change to this: - // A.lower.w = as_float(as_uint(A.lower.w) | as_uint(B.lower.w)); - A.lower.w = as_float(0xABBADE00u); - return A; -} - -GRL_INLINE void refitReduceNodePrev( - uint prevIdx, - uint leadChildIdx, - uint numChildren, - struct AABB* globalBox, - struct AABB* reduceBox, - uint depth, - uint NodeIndex) -{ - uint8_t childIgnored = (prevIdx - leadChildIdx); - -# if REFIT_DEBUG_CHECKS - bool err = false; - if ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u) - { - printf("refitReduceNode6 (loc_id %d): prev (used as child %d) not updated! NodeIndex %d, child nodeIdx %d at depth %d\n", - get_local_id(0), - childIgnored, - NodeIndex, - prevIdx, - depth); - err = true; - } - - if ((as_uint(globalBox[NodeIndex].lower.w) & 0xFFFFFF00) == 0xABBADE00u) - { - printf("refitReduceNode6 (loc_id %d): dst node already updated. NodeIndex %d depth %d\n", - get_local_id(0), - NodeIndex, - depth); - } - - bool fail = false; - for (uint k = 0; (k < numChildren) && !err; ++k) { - if (k != childIgnored) { - if ((as_uint(globalBox[leadChildIdx + k].lower.w) & 0xFFFFFF00) != 0xABBADE00u) { - printf("refitReduceNode6 (loc_id %d): child %d not updated! use prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n", - get_local_id(0), - k, - prevIdx - leadChildIdx, - NodeIndex, - leadChildIdx + k, - depth); - fail = true; - } - } - } - err |= fail; -# endif - - // for each child 3 bits contains load index - const uint32_t indicesEncoded = - (1 << 0) + - (2 << 3) + - (3 << 6) + - (4 << 9) + - (5 << 12) + - (0 << 15) + - (1 << 18) + - (2 << 21) + - (3 << 24) + - (4 << 27); - // 1,2,3,4,5 - - - uint32_t indicesEncodedShifted = indicesEncoded >> (childIgnored * 3); - - struct AABB* childAABB = globalBox + leadChildIdx; - struct AABB temp = childAABB[indicesEncodedShifted & 7]; - indicesEncodedShifted >>= 3; - struct AABB* nextChild = childAABB + (indicesEncodedShifted & 7); - struct AABB backlog = temp; - - for (uint child = 2; child < numChildren; child++) - { - temp = *nextChild; - *reduceBox = refitReduce2Boxes(*reduceBox, backlog); - indicesEncodedShifted >>= 3; - nextChild = childAABB + (indicesEncodedShifted & 7); - backlog = temp; - } - - *reduceBox = refitReduce2Boxes(*reduceBox, backlog); - -#if REFIT_DEBUG_CHECKS - for (uint k = 0; (k < numChildren) && !err; ++k) { - if (k != childIgnored) { - if (!AABB_subset(&globalBox[leadChildIdx + k], reduceBox)) { - printf("refitReduceNode6 (loc_id %d): child AABB %d/%d reduction went wrong! 
skipped prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n", - get_local_id(0), - k, numChildren, - prevIdx - leadChildIdx, - NodeIndex, - leadChildIdx + k, - depth); - - err = true; - } - } - } - if (!err && ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)) { - printf("refitReduceNode6: havent set the 0xABBADEXXu marker in result node %d at depth %d!\n", - NodeIndex, - depth); - } -#endif -} - - -GRL_INLINE uint hash_local_id() -{ - return get_sub_group_local_id() * get_num_sub_groups() + get_sub_group_id(); -} - -//=============================================================== -// -// Core update function -// -//=============================================================== -GRL_INLINE bool refit_treelet_by_single_group( - global struct AABB* bbox, - local Treelet_by_single_group_locals* loc, - uniform global BVHBase* pBvh, - uniform RefitTreelet trltDsc, - bool encodeQnodes, - bool isTipTreelet) -{ - BackPointers* backpointers = BVHBase_GetBackPointers(pBvh); - InternalNode* internalNodes = BVHBase_GetInternalNodes(pBvh); - uint local_id = get_local_id(0); - StartPoint* startPoints = BVHBase_GetRefitStartPoints(pBvh) + trltDsc.startpoint_offset; - - // special case for single path treelets, TODO rewrite it as subgroups based - if (trltDsc.numStartpoints == 1) { - if (local_id == 0) { - RefitTreeletTrivial desc = *((RefitTreeletTrivial*)& trltDsc); - uint innerNodeIdx = desc.theOnlyNodeIndex; - uint numChildren = desc.numChildrenOfTheNode; - uint childIndex = desc.childrenOffsetOfTheNode; - uint maxDepth = desc.maxDepth; - - uint prevIdx = childIndex; - struct AABB myBox = bbox[childIndex]; - struct AABB prevAABB; - uint backpointer = maxDepth > 0 ? *InnerNode_GetBackPointer(backpointers, innerNodeIdx) : 0; - InternalNode* curNode = internalNodes + innerNodeIdx; - uint currDepth = 0; - - while (1) - { - prevAABB = myBox; - if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); } - - if (!encodeQnodes) { myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); } - - if (++currDepth > maxDepth) { break; } - - if (encodeQnodes) { - InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); - } -#if !REFIT_DEBUG_CHECKS - else -#endif - { storeAABBToL1(myBox, &bbox[innerNodeIdx]); } - - prevIdx = innerNodeIdx; - innerNodeIdx = BackPointer_GetParentIndex(backpointer); - backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); - numChildren = BackPointer_GetNumChildren(backpointer); - curNode = internalNodes + innerNodeIdx; - childIndex = innerNodeIdx + curNode->childOffset; - } - - if (isTipTreelet) { - AABB3f reduced3f = AABB3fFromAABB(myBox); - pBvh->Meta.bounds = reduced3f; - } - else { - storeAABBToL3(myBox, &bbox[innerNodeIdx]); - } - - if (encodeQnodes || isTipTreelet) { - InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); - } - -#if REFIT_VERBOSE_LOG - printf("single node treelet: storing node idx %d \n", innerNodeIdx); -#endif - } - - return local_id == 0; - } - - local uint* loc_startpoints = loc->startpoints; - - -#if REFIT_DEBUG_CHECKS - if ((trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM)) { - if(local_id == 0) printf("out of SLM space, trltDsc.depthSub_NUM_STARTPOINTS_IN_SLM > 0\n"); - return local_id == 0; - } -#endif - - uint SLMedStartpointsOffset = trltDsc.numStartpoints - trltDsc.numNonTrivialStartpoints; - - /*===================================================================== - first phase where we update startpoints nodes only - ----------------------------------------------------------------------*/ - for (uint startpoint_i = local_id; startpoint_i < trltDsc.numStartpoints; startpoint_i += get_local_size(0)) { - uint startpoint = (uint)intel_sub_group_block_read_ui((global uint*)(startPoints + startpoint_i)); - uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint); - uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); - if (startpoint_i >= SLMedStartpointsOffset) { - uint idx = startpoint_i - SLMedStartpointsOffset; - loc_startpoints[idx] = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint); - } - - uint numChildren = BackPointer_GetNumChildren(backpointer); - InternalNode* curNode = internalNodes + innerNodeIdx; - uint childIndex = innerNodeIdx + curNode->childOffset; - - uint prevIdx = childIndex; - struct AABB myBox = bbox[childIndex]; - struct AABB prevAABB = myBox; - -# if REFIT_DEBUG_CHECKS - if (numChildren == 0) { - printf("this node has no chidren!\n", 0); - AABB_init(&myBox); - } -# endif - - if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); } - myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); - -#if REFIT_VERBOSE_LOG - printf("init phase: at depth 0 storing node idx %d \n", innerNodeIdx); -#endif - storeAABBToL1(myBox, &bbox[innerNodeIdx]); - - if (encodeQnodes) { - InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); - } - } - - uniform uint CurrPeeledDepth = 1; - uniform uint numStartpoints = trltDsc.numNonTrivialStartpoints; - uint nextFloorStartpoint = hash_local_id(); - - uint depthOnionEnd = trltDsc.depthLess64; - if (get_local_size(0) == 128) { depthOnionEnd = trltDsc.depthLess128; } - if (get_local_size(0) == 256) { depthOnionEnd = trltDsc.depthLess256; } - - /*===================================================================== - second phase, we update horizontally untill - we reach number of active path below grou size - ----------------------------------------------------------------------*/ - while (CurrPeeledDepth < depthOnionEnd) { - mem_fence_workgroup_default(); - - work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group); - uint start = nextFloorStartpoint; - nextFloorStartpoint = numStartpoints; - - for (uint startpoint_i = start; startpoint_i < numStartpoints; startpoint_i += get_local_size(0)) { - uint startpoint = loc_startpoints[startpoint_i]; - uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint); - uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); - - if (StartPoint_GetDepth(startpoint) > CurrPeeledDepth) { - StartPoint newSP = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint); - loc_startpoints[startpoint_i] = newSP; - nextFloorStartpoint = min(nextFloorStartpoint, startpoint_i); - } - - InternalNode* curNode = internalNodes + innerNodeIdx; - uint childIndex = innerNodeIdx + curNode->childOffset; - uint numChildren = BackPointer_GetNumChildren(backpointer); - - uint prevIdx = childIndex; - struct AABB myBox = bbox[childIndex]; - struct AABB prevAABB = myBox; - refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx); - - myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); - -#if REFIT_VERBOSE_LOG - printf("onion: startpoint %d at depth %d storing node idx %d \n", startpoint_i, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx); -#endif - storeAABBToL1(myBox, &bbox[innerNodeIdx]); - if (encodeQnodes) { - InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); - } - } - CurrPeeledDepth++; - } - - uint startpoint_idx = nextFloorStartpoint; - bool active = startpoint_idx < numStartpoints; - - work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group); - StartPoint startpoint = loc_startpoints[startpoint_idx]; - - struct AABB myBox; - uint prevIdx = 0; - uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint); - - /*===================================================================== - last phase, each thread just continues path to its end - - only thread that computes the longest path leaves prematurely - (thats why while condition isn't <=) the code for finalizing root of treelet - is special and hendled afterwards - - TODO: with proper assigning of paths to lanes we should reach only three - active lanes per physical thread quite soon for this subgroups could be used - ----------------------------------------------------------------------*/ - bool prevActive = active; - while (CurrPeeledDepth < trltDsc.maxDepth) { - uint backpointer; - uint childIndex; - InternalNode* curNode = internalNodes + innerNodeIdx; - if (active) { - childIndex = innerNodeIdx + curNode->childOffset; - backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); - } else if(prevActive){ - mem_fence_workgroup_default(); - } - - prevActive = active; - - work_group_barrier(0, memory_scope_work_group); - //printf("Start node %d at depth %d, innerNodeIdx %d dying! \n", StartPoint_GetNodeIdx(startpoint), CurrPeeledDepth, innerNodeIdx); - if (active) { - -#if REFIT_DEBUG_CHECKS - if (CurrPeeledDepth > StartPoint_GetDepth(startpoint)) - { - printf("uppath: startpoint %d at depth %d shouldn't be active!\n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth); - } -#endif - if (prevIdx == 0) { - myBox = bbox[childIndex]; - prevIdx = childIndex; - } - uint numChildren = BackPointer_GetNumChildren(backpointer); - - struct AABB prevAABB = myBox; - refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx); - myBox.upper.w = encodeQnodes ? 
0 : as_float(numChildren + (childIndex << 4)); -#if REFIT_VERBOSE_LOG - printf("uppath: startpoint %d at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx); -#endif - active = CurrPeeledDepth < StartPoint_GetDepth(startpoint); - - if (encodeQnodes) { -#if !REFIT_DEBUG_CHECKS - if (!active) -#endif - { storeAABBToL1(myBox, &bbox[innerNodeIdx]); } - InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); - } else { - storeAABBToL1(myBox, &bbox[innerNodeIdx]); - } - - prevIdx = innerNodeIdx; - innerNodeIdx = BackPointer_GetParentIndex(backpointer); - } - - CurrPeeledDepth++; - } - - { - uint backpointer; - uint childIndex; - InternalNode* curNode = internalNodes + innerNodeIdx; - if (active) { - childIndex = innerNodeIdx + curNode->childOffset; - backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx); - } else if(prevActive) { - mem_fence_workgroup_default(); - } - - work_group_barrier(0, memory_scope_work_group); - - /*===================================================================== - final step, is special processing of root, - its different, since its box is transfered cross group (written to L3) - or is root of whole tree and hence fill global box in bvh MD - TODO: this should be done in SG as only one thread is active - ----------------------------------------------------------------------*/ - if (active) { - if (prevIdx == 0) { - myBox = bbox[childIndex]; - prevIdx = childIndex; - } - uint numChildren = BackPointer_GetNumChildren(backpointer); - struct AABB prevAABB = myBox; - refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx); - myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4)); - -#if REFIT_VERBOSE_LOG - printf("root: startpoint %d at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx/*,WeReInSIMD*/); -#endif - if (isTipTreelet) { - AABB3f reduced3f = AABB3fFromAABB(myBox); - pBvh->Meta.bounds = reduced3f; - InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); - } else { - storeAABBToL3(myBox, &bbox[innerNodeIdx]); - if (encodeQnodes) { - InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox); - } - } - } - } - - return active; -} - - -////////////////////////////////////////////////////////////////////////////////////// -// -// Internal nodes enocding as a separate dispatch -// -// - -// encode qnodes as a separate pass -GRL_INLINE void post_refit_encode_qnode_tree_per_group( - global struct AABB* bbox_scratch, - global struct BVHBase* bvh) -{ - uint numInnerNodes = BVHBase_GetNumInternalNodes(bvh); - InternalNode* internalNodes = BVHBase_GetInternalNodes(bvh); - - for (uint nodeIdx = get_local_id(0) + 1 /*+1 because node 0 is already updated*/; nodeIdx < numInnerNodes; nodeIdx += get_local_size(0)) - { - struct AABB reduced = bbox_scratch[nodeIdx]; -# if REFIT_DEBUG_CHECKS - if ((as_uint(reduced.lower.w) & 0xFFFFFF00) != 0xABBADE00u) { - printf("qnode enc group: NodeIndex %d not updated! 
\n", nodeIdx); - return; - } - for (uint k = 0; k < (as_uint(reduced.upper.w) & 7); ++k) { - uint childIdx = (as_uint(reduced.upper.w) >> 4) + k; - if ((as_uint(bbox_scratch[childIdx].lower.w) & 0xFFFFFF00) != 0xABBADE00u) { - printf("qnode enc group: child not updated! NodeIndex %d, child nodeIdx %d \n", nodeIdx, childIdx); - return; - } - } -# endif - struct InternalNode* qbvh_node = internalNodes + nodeIdx; - uint childIndex = as_uint(reduced.upper.w) >> 4; - uint numChildren = as_uint(reduced.upper.w) & 7; - struct AABB* children = bbox_scratch + childIndex; - //InternalNode_setBounds(internalNodes + nodeIdx, bbox_scratch + (as_uint(reduced.upper.w) >> 4), as_uint(reduced.upper.w) & 7, &reduced); - InternalNode_setBounds_skip_prev(qbvh_node, 0, children, children, childIndex, numChildren, &reduced); - } -} - -////////////////////////////////////////////////////////////////////////////////////// -// -// Construction of treelets and paths -// -// - -// this is tiny bit tricky, when bottom-up thread haven't yet closed treelet this is number of startpoints that are under the node -// when thread closed treelets it the data is starts to be treelet ID -typedef uint TreeletNodeData; - -typedef struct TreeletsOpenNodeInfo { - // bool isTreeletRoot; // : 1 - short maxDepth; // : 14 - uint numStartpoints;// : 16 -} TreeletsOpenNodeInfo; - -typedef struct TreeletsClosedNodeInfo { - // bool isTreeletRoot; // : 1 - uint treeletId; // : 31 (when treelet is closed) -} TreeletsClosedNodeInfo; - -GRL_INLINE TreeletNodeData ClearTreeletRoot(TreeletNodeData D) -{ - return D & ((1u << 31u) - 1u); -} - -GRL_INLINE uint isTreeletRoot(TreeletNodeData E) -{ - return E >> 31; -} - -GRL_INLINE uint getNumStartpoints(TreeletNodeData E) -{ - return E & ((1 << 16) - 1); -} - -GRL_INLINE uint getMaxDepth(TreeletNodeData E) -{ - return (E >> 16) & ((1 << 14) - 1); -} - -// single startpoint treelet -GRL_INLINE uint isTrivialTreeletRoot(TreeletNodeData E) -{ - return (E >> 31) && (getMaxDepth(E) == 0); -} - -GRL_INLINE TreeletNodeData SetTipStartpoint(TreeletNodeData D) -{ - return ClearTreeletRoot(D) | (1 << 30); -} - -GRL_INLINE TreeletNodeData SetTreeletRoot(TreeletNodeData D) -{ - return D | (1 << 31); -} - -GRL_INLINE TreeletsOpenNodeInfo DecodeOpenInfo(TreeletNodeData E) -{ - TreeletsOpenNodeInfo I; - I.maxDepth = getMaxDepth(E); - I.numStartpoints = getNumStartpoints(E); - return I; -} - -GRL_INLINE TreeletNodeData EncodeOpenInfo(TreeletsOpenNodeInfo I, bool isRoot) -{ - TreeletNodeData D = isRoot ? (1 << 31) : 0; - D |= (I.maxDepth & ((1 << 14) - 1)) << 16; - D |= I.numStartpoints & ((1 << 16) - 1); - return D; -} - -GRL_INLINE TreeletsClosedNodeInfo DecodeClosedInfo(TreeletNodeData E) -{ - TreeletsClosedNodeInfo I; - I.treeletId = E & ((1u << 31u) - 1u); - return I; -} - -GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(TreeletsClosedNodeInfo I) -{ - TreeletNodeData D = (1u << 31u); // closed is always a root! - D |= I.treeletId & ((1u << 31u) - 1u); - return D; -} - -GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(uint treeletId) -{ - TreeletNodeData D = (1 << 31); // closed is always a root! 
- D |= treeletId & ((1u << 31u) - 1u); - return D; -} - -GRL_INLINE void chk_close_Treelet( - RefitTreelet* TreeletDescsArr, - TreeletNodeData* nodeTreeletDataArr, - uint* StartPointBuffer, - uint* currStartpoint, - TreeletNodeData nodeData, - TreeletsOpenNodeInfo* nodeOpenInfo, - uint nodeIdx, - uint* treeletDescIdx) -{ - if (isTreeletRoot(nodeData)) - { - TreeletNodeData encoded = 0; - if (nodeOpenInfo->numStartpoints == 1) - { - encoded = ClearTreeletRoot(SetTipStartpoint(nodeData)); - } - else - { - RefitTreelet RTdesc; - RTdesc.startpoint_offset = *currStartpoint; - *currStartpoint += nodeOpenInfo->numStartpoints; - RTdesc.numStartpoints = nodeOpenInfo->numStartpoints; - RTdesc.maxDepth = nodeOpenInfo->maxDepth; - TreeletDescsArr[*treeletDescIdx] = RTdesc; - encoded = EncodeClosedInfo(*treeletDescIdx); - *treeletDescIdx = *treeletDescIdx + 1; - TreeletsOpenNodeInfo infoDefault = { 0, 0 }; - *nodeOpenInfo = infoDefault; - } - - nodeTreeletDataArr[nodeIdx] = encoded; - } - // printf("close_Treelet %d, nodeOpenInfo.numStartpoints %d, RTdesc.maxDepth %d, RTdesc.startpoint_offset %d\n", treeletDescIdx, nodeOpenInfo.numStartpoints, RTdesc.maxDepth, RTdesc.startpoint_offset); -} - - -// TreeletNodeData* treelets holds per node property, after running this some of them are marked as treelet root -GRL_INLINE void treelet_bottom_up_mark_treelets( - global struct BVHBase* bvh, - global InternalNode* internalNodes, - global StartPoint* scratch_startpoints, - uint curNodeIndex, - BackPointers* backPointers, - global TreeletNodeData* treelets, - uint refitTreeletsDataStart, - uint* startpointAlloc) -{ - TreeletsOpenNodeInfo currInfo; - currInfo.maxDepth = 0; - currInfo.numStartpoints = 1; - - global RefitTreelet* treeletDescs = (global RefitTreelet*) (((global char*)bvh) + (refitTreeletsDataStart * 64)); - - treelets[curNodeIndex] = EncodeOpenInfo(currInfo, true); - - /* the start node got already processed, thus go to its parent node */ - uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex); - curNodeIndex = parentPointer >> 6; - - bool isInTip = false; - while (curNodeIndex != 0x03FFFFFF) - { - uint numChildrenTotal = 0; - // numChildrenTotal and parentPointer gets updated... - // atomic trickery, on backpointers, only the last one thread enters up - { - /* increment refit counter that counts refitted children of current node */ - global uint* pCurrentBackpointer = (global uint*)InnerNode_GetBackPointer(backPointers, curNodeIndex); - mem_fence_gpu_invalidate(); - parentPointer = 1 + atomic_inc_global(pCurrentBackpointer); - - /* if all children got refitted, then continue */ - const uint numChildrenRefitted = (parentPointer >> 0) & 0x7; - numChildrenTotal = (parentPointer >> 3) & 0x7; - - if (numChildrenRefitted != numChildrenTotal) - return; - - /* reset refit counter for next refit */ - *pCurrentBackpointer = (parentPointer & 0xfffffff8); - } - - /* get children treelets */ - global struct InternalNode* node = internalNodes + curNodeIndex; - uint childrenIndices = curNodeIndex + node->childOffset; - global TreeletNodeData* childrenTreelets = treelets + childrenIndices; - - // yeah, it is possible we are pulling trash here, but we wont use it. 
- // this is for the sake of one non control flow spoiled data pull - TreeletNodeData dataCh0 = childrenTreelets[0]; TreeletNodeData dataCh1 = childrenTreelets[1]; - TreeletNodeData dataCh2 = childrenTreelets[2]; TreeletNodeData dataCh3 = childrenTreelets[3]; - TreeletNodeData dataCh4 = childrenTreelets[4]; TreeletNodeData dataCh5 = childrenTreelets[5]; - - // zero out the potential trash - if (numChildrenTotal < 3) dataCh2 = 0; - if (numChildrenTotal < 4) dataCh3 = 0; - if (numChildrenTotal < 5) dataCh4 = 0; - if (numChildrenTotal < 6) dataCh5 = 0; - - TreeletsOpenNodeInfo infoCh0 = DecodeOpenInfo(dataCh0); - TreeletsOpenNodeInfo infoCh1 = DecodeOpenInfo(dataCh1); - TreeletsOpenNodeInfo infoCh2 = DecodeOpenInfo(dataCh2); - TreeletsOpenNodeInfo infoCh3 = DecodeOpenInfo(dataCh3); - TreeletsOpenNodeInfo infoCh4 = DecodeOpenInfo(dataCh4); - TreeletsOpenNodeInfo infoCh5 = DecodeOpenInfo(dataCh5); - - uint numChildrenBeingRoots = isTreeletRoot(dataCh0) + isTreeletRoot(dataCh1) + isTreeletRoot(dataCh2) + isTreeletRoot(dataCh3) + isTreeletRoot(dataCh4) + isTreeletRoot(dataCh5); - // see if we should merge the trees, if not then we should move to tip. - currInfo.numStartpoints = infoCh0.numStartpoints + infoCh1.numStartpoints + infoCh2.numStartpoints + infoCh3.numStartpoints + infoCh4.numStartpoints + infoCh5.numStartpoints; - - bool isTipStartpoint = false; - if (!isInTip) - { - // TODO: threshold could be a dynamic parameter based on the number of actual inner nodes - bool mergeTreelets = ((currInfo.numStartpoints > 0) && (currInfo.numStartpoints < TREELET_NUM_STARTPOINTS)); - bool allChildrenRootsCurrently = numChildrenTotal == numChildrenBeingRoots; - if (mergeTreelets && allChildrenRootsCurrently) - { - childrenTreelets[0] = ClearTreeletRoot(dataCh0); - childrenTreelets[1] = ClearTreeletRoot(dataCh1); // -1 will be recognised then as this is not a treelet root. 
- if (numChildrenTotal > 2) childrenTreelets[2] = ClearTreeletRoot(dataCh2); - if (numChildrenTotal > 3) childrenTreelets[3] = ClearTreeletRoot(dataCh3); - if (numChildrenTotal > 4) childrenTreelets[4] = ClearTreeletRoot(dataCh4); - if (numChildrenTotal > 5) childrenTreelets[5] = ClearTreeletRoot(dataCh5); - } - else - { - isInTip = true; - isTipStartpoint = allChildrenRootsCurrently; - } - } - - // close any roots underneath - if (isInTip && numChildrenBeingRoots) - { - uint trivialRoots = isTrivialTreeletRoot(dataCh0) + isTrivialTreeletRoot(dataCh1) + isTrivialTreeletRoot(dataCh2) + - isTrivialTreeletRoot(dataCh3) + isTrivialTreeletRoot(dataCh4) + isTrivialTreeletRoot(dataCh5); - - uint treeletId = 0; - uint bottomStartpointSpace = 0; - - uint startpointsFromTiptree = trivialRoots; - - if (trivialRoots) isTipStartpoint = false; - - if (numChildrenBeingRoots > trivialRoots) - { - startpointsFromTiptree += // startpoint ONLY from tiptree - (1 - isTreeletRoot(dataCh0)) * infoCh0.numStartpoints + - (1 - isTreeletRoot(dataCh1)) * infoCh1.numStartpoints + - (1 - isTreeletRoot(dataCh2)) * infoCh2.numStartpoints + - (1 - isTreeletRoot(dataCh3)) * infoCh3.numStartpoints + - (1 - isTreeletRoot(dataCh4)) * infoCh4.numStartpoints + - (1 - isTreeletRoot(dataCh5)) * infoCh5.numStartpoints; - - treeletId = atomic_add_global((global uint*)BVHBase_GetRefitTreeletCntPtr(bvh), numChildrenBeingRoots - trivialRoots); - bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints - startpointsFromTiptree); - } - - currInfo.numStartpoints = startpointsFromTiptree; - - chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh0, &infoCh0, childrenIndices + 0, &treeletId); - chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh1, &infoCh1, childrenIndices + 1, &treeletId); - chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh2, &infoCh2, childrenIndices + 2, &treeletId); - chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh3, &infoCh3, childrenIndices + 3, &treeletId); - chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh4, &infoCh4, childrenIndices + 4, &treeletId); - chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh5, &infoCh5, childrenIndices + 5, &treeletId); - } - - if (isTipStartpoint) - { - currInfo.maxDepth = 0; - currInfo.numStartpoints = 1; - } - else - { - // reduce max depth and number of startpoint underneath - currInfo.maxDepth = max(max(max(infoCh0.maxDepth, infoCh1.maxDepth), - max(infoCh2.maxDepth, infoCh3.maxDepth)), - max(infoCh4.maxDepth, infoCh5.maxDepth)) + 1; - } - - treelets[curNodeIndex] = EncodeOpenInfo( - currInfo, - !isInTip /*mark marged treelet as an new root iff we are in bottom we */); - - /* make parent node the current node */ - curNodeIndex = parentPointer >> 6; - } - - uint treeletId = *BVHBase_GetRefitTreeletCntPtr(bvh); - - uint bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints); - - treelets[0] = EncodeClosedInfo(treeletId); - RefitTreelet tipTreeletDesc; - tipTreeletDesc.startpoint_offset = bottomStartpointSpace; - tipTreeletDesc.numStartpoints = currInfo.numStartpoints; - tipTreeletDesc.maxDepth = currInfo.maxDepth; - - treeletDescs[treeletId] = tipTreeletDesc; - - uint realNumberOfTreelets = treeletId + 1; - // intentionally we set less by 1, 
because this number is used in num groups for dispatch which is number of bottom treelets - // so substract 1. Except single treelet tree which is should stay 1. - uint numStartingTreelets = (treeletId == 0) ? 1 : treeletId; - - *BVHBase_GetRefitTreeletCntPtr(bvh) = numStartingTreelets; - - uint treeletDescSpaceIn64B = (realNumberOfTreelets * sizeof(RefitTreelet) + 63) >> 6; - uint startpointSpaceIn64B = ((bottomStartpointSpace + currInfo.numStartpoints) * sizeof(StartPoint) + 63) >> 6; - bvh->refitStartPointDataStart = refitTreeletsDataStart + treeletDescSpaceIn64B; - bvh->BVHDataEnd = refitTreeletsDataStart +treeletDescSpaceIn64B + startpointSpaceIn64B; - *startpointAlloc = 0; -} - - -GRL_INLINE void find_refit_treelets( - global struct BVHBase* bvh, - global TreeletNodeData* treelets, - global uint* scratchStartpoints, - global uint* startpointAlloc) -{ - /* get pointer to inner nodes and back pointers */ - uniform global InternalNode* inner_nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh); - - /* construct range of nodes that each work group will process */ - uniform const uint numInnerNodes = BVHBase_numNodes(bvh); - - varying ushort lane = get_sub_group_local_id(); - varying uint global_id = get_local_id(0) + get_group_id(0) * get_local_size(0); - - uint numBackpointers = BVHBase_GetNumInternalNodes(bvh); - - // align to 64B and divide - uint treeletOffsetIn64B = ((numBackpointers * sizeof(uint)) + 63) >> 6; - - uint refitTreeletsDataStart = bvh->backPointerDataStart + treeletOffsetIn64B; - if (global_id == 0) - { - bvh->refitTreeletsDataStart = refitTreeletsDataStart; - } - - global struct InternalNode* curNode = &inner_nodes[global_id]; - - varying ushort has_startpoint = 0; - if (global_id < numInnerNodes) { - if ((curNode->nodeType != BVH_INTERNAL_NODE)) - { - has_startpoint = 1; - } - } - - if (has_startpoint == 0) - return; - - treelet_bottom_up_mark_treelets( - bvh, - inner_nodes, - scratchStartpoints, - global_id, - BVHBase_GetBackPointers(bvh), - treelets, - refitTreeletsDataStart, - startpointAlloc); -} - -GRL_INLINE void assign_refit_startpoints_to_treelets( - global struct BVHBase* bvh, - global TreeletNodeData* treelets, - global uint* scratchStartpoints) -{ - /* get pointer to inner nodes and back pointers */ - uniform global struct InternalNode* inner_nodes = (global struct InternalNode*) BVHBase_GetInternalNodes(bvh); - - /* construct range of nodes that each work group will process */ - uniform const uint numInnerNodes = BVHBase_numNodes(bvh); - - varying ushort lane = get_sub_group_local_id(); - varying uint starPointNode = get_local_id(0) + get_group_id(0) * get_local_size(0); - varying uint curNodeIndex = starPointNode; - global struct InternalNode* curNode = &inner_nodes[curNodeIndex]; - - varying ushort is_startpoint = 0; - - if (curNodeIndex < numInnerNodes) - { - if ((curNode->nodeType != BVH_INTERNAL_NODE)) - { - is_startpoint = 1; - } - } - - if (is_startpoint == 0) - { - return; - } - - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - - RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh); - uint numTreelets = *BVHBase_GetRefitTreeletCntPtr(bvh); - if (numTreelets > 1) numTreelets++; - - uint myDepthWhenDead = 0; - uint startpointsBeforeMe = 0; - bool dead = false; - - uint prevNodeIndex = 0x03FFFFFF; - - while (curNodeIndex != 0x03FFFFFF) - { - TreeletNodeData nodeData = treelets[curNodeIndex]; - - uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex); - uint numChildren = 
BackPointer_GetNumChildren(parentPointer); - - // this is counterpart of atomic based entrance decision. - // the alive path is the longest, if two are equal take the one that came through child with smaller index. - if (prevNodeIndex != 0x03FFFFFF) - { - uint leadChildOfCur = curNodeIndex + inner_nodes[curNodeIndex].childOffset; - uint childEnd = numChildren + leadChildOfCur; - - uint longestPath = 0; - uint longestPathChildIdx = leadChildOfCur; - - for (uint child = leadChildOfCur; child < childEnd; child++) - { - TreeletNodeData childData = treelets[child]; - if (!isTreeletRoot(childData)) - { - TreeletsOpenNodeInfo childinfo = DecodeOpenInfo(childData); - if (longestPath <= childinfo.maxDepth) { - longestPathChildIdx = child; - longestPath = childinfo.maxDepth + 1; - } - - if (child < prevNodeIndex) - { - // also count how many startpoints are there before me (used to place startpoint in proper slot) - startpointsBeforeMe += childinfo.numStartpoints; - } - } - } - - if (!dead && prevNodeIndex != longestPathChildIdx) - { - dead = true; - //printf("starPointNode %d dies in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead); - } - - if (!dead) // this "if" is not an "else" to abouve as we might be dead before and comming through the same child index - { - myDepthWhenDead = longestPath; - // it is a startpoint - //printf("starPointNode %d in node %d lives up, its myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead); - } - - if (starPointNode == (uint)-1) { - // we just entered upper treelet as treelet if we are alive, we can be a new startpoint in new treelet - if (dead) - { - //printf("starPointNode %d disappears in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead); - // and we are dead, so we are not a startpoint of tip, - // so we must disappear to not be added as a startpoint. - return; - } - else - { - // it is a startpoint - //printf("starPointNode %d in node %d becoming its new startpoint\n", starPointNode, curNodeIndex); - starPointNode = curNodeIndex; - } - } - } - - if (isTreeletRoot(nodeData)) - { - TreeletsClosedNodeInfo info = DecodeClosedInfo(nodeData); - RefitTreelet treeletDesc = treeletDescs[info.treeletId]; - uint startpointSlot = treeletDesc.startpoint_offset + startpointsBeforeMe; - scratchStartpoints[startpointSlot] = (starPointNode << 6) + (myDepthWhenDead & ((1 << 6) - 1)); - - //printf("Adding to treeletID %d at root %d startpoint %d StartNodeIdx %d, depth %d\n", info.treeletId, curNodeIndex, startpointSlot, starPointNode, myDepthWhenDead); - - if (dead) return; - myDepthWhenDead = 0; - startpointsBeforeMe = 0; - starPointNode = (uint)-1; - } - - /* make parent node the current node */ - prevNodeIndex = curNodeIndex; - curNodeIndex = BackPointer_GetParentIndex(parentPointer); - //if(!dead) - //printf("starPointNode %d move from node %d to %d\n", starPointNode, prevNodeIndex, curNodeIndex); - } -} - -const uint FINALIZE_TREELETS_SLM_DEPTHS_SPACE = 32; - -GRL_INLINE void finalize_treelets_in_groups( - global struct BVHBase* bvh, - global uint* scratchStartpoints, - local uint* depths) -{ - uint numTreeletsExecuted = *BVHBase_GetRefitTreeletCntPtr(bvh); - - uint local_id = get_local_id(0); - - uint numTreelets = (numTreeletsExecuted > 1) ? 
numTreeletsExecuted + 1 : numTreeletsExecuted; - - RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh); - - for (uint treeletId = get_group_id(0); treeletId < numTreelets; treeletId += numTreeletsExecuted) - { - if (treeletId == numTreeletsExecuted && treeletId != 0) { work_group_barrier(CLK_LOCAL_MEM_FENCE); } - - RefitTreelet treeletDesc = treeletDescs[treeletId]; - StartPoint* srcStartpoints = scratchStartpoints + treeletDesc.startpoint_offset; - if (treeletDesc.numStartpoints <= 1) - { - // for smaller latency we store 1 element treelets as RefitTreeletTrivial, - // this happens most of the time for tip treelet - if (local_id == 0) - { - RefitTreeletTrivial tr = { 0, treeletDesc.numStartpoints, 0, treeletDesc.maxDepth, 0 }; - if (treeletDesc.numStartpoints == 1) - { - StartPoint sp = srcStartpoints[0]; - - tr.theOnlyNodeIndex = StartPoint_GetNodeIdx(sp); - uint backpointer = *InnerNode_GetBackPointer(BVHBase_GetBackPointers(bvh), tr.theOnlyNodeIndex); - tr.numChildrenOfTheNode = BackPointer_GetNumChildren(backpointer); - tr.childrenOffsetOfTheNode = BVHBase_GetInternalNodes(bvh)[tr.theOnlyNodeIndex].childOffset + tr.theOnlyNodeIndex; - } - RefitTreeletTrivial* trivial = (RefitTreeletTrivial*)(treeletDescs + treeletId); - *trivial = tr; -#if REFIT_VERBOSE_LOG - printf("treelet trivial %d {\n theOnlyNodeIndex = %d;\n numStartpoints = %d;\n childrenOffsetOfTheNode = %d;\n maxDepth =%d;\n numChildrenOfTheNode = %d;\n}\n", - treeletId, - tr.theOnlyNodeIndex, - tr.numStartpoints, - tr.childrenOffsetOfTheNode, - tr.maxDepth, - tr.numChildrenOfTheNode); -#endif - } - } - else - { -#define SKIP_PATHS_SORTING 0 -#if SKIP_PATHS_SORTING - StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset; - for (uint startpointID = local_id; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0)) - { - dstStartpoints[startpointID] = srcStartpoints[startpointID]; - } -#else - //if (local_id == 0) { printf("treelet %d, numStartpoints = %d\n", treeletId, numStartpoints); } - - if (local_id <= treeletDesc.maxDepth) { - depths[local_id] = 0; - // printf("initializing slm treelet %d, depths[%d] = 0\n", treeletId, local_id); - } - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - uint loopSize = ((treeletDesc.numStartpoints + (get_sub_group_size() - 1)) / get_sub_group_size()) * get_sub_group_size(); - - // collect histogram of how many paths of given length we have - - // keep count of depth 0 - uint val = 0; - - // optimize: we will load Startpoint only once to - uint S_c[8]; - // optimize: keep accumulated numbers in registers to limit number of atomic ops - uint D_c[8] = { 0 }; - - uint cached_threshold = 8 * get_local_size(0); - cached_threshold = min(cached_threshold, treeletDesc.numStartpoints); - - uint loop_turn = 0; - uint sgid = get_sub_group_local_id(); - - for (uint startpointID = local_id+ cached_threshold; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0)) - { - uint dstSlot = StartPoint_GetDepth(srcStartpoints[startpointID]); - atomic_inc((volatile local uint*) (depths + dstSlot)); - } - - uint HistogramSG = 0; - if (treeletDesc.maxDepth < 8) - { - for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) - { - StartPoint S = srcStartpoints[startpointID]; - S_c[loop_turn++] = S; - uint dstSlot = StartPoint_GetDepth(S); - D_c[dstSlot]++; - } - - for (uint d = 0; d <= treeletDesc.maxDepth; d++) - { - val = sub_group_reduce_add(D_c[d]); - if (sgid == d) - { - 
HistogramSG = val; - } - } - if (sgid <= treeletDesc.maxDepth && HistogramSG != 0) - { - atomic_add((volatile local uint*) (depths + sgid), HistogramSG); - } - } - else - { - for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) - { - StartPoint S = srcStartpoints[startpointID]; - S_c[loop_turn++] = S; - uint dstSlot = StartPoint_GetDepth(S); - atomic_inc((volatile local uint*) (depths + dstSlot)); - } - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - -#if REFIT_VERBOSE_LOG - if (local_id == 0) - { - for (uint d = 0; d <= treeletDesc.maxDepth; d++) - { - printf("treelet %d depths[%d] = %d\n", treeletId, d, depths[d]); - } - } -#endif - - if (treeletDesc.maxDepth < get_sub_group_size()) - { - if (get_sub_group_id() == 0) - { - - uint cntOfDepth = 0; - if (sgid <= treeletDesc.maxDepth) { - cntOfDepth = depths[sgid]; - } - uint pref_sum = sub_group_scan_exclusive_add(cntOfDepth); - depths[sgid] = pref_sum; - - uint numLeft = treeletDesc.numStartpoints - (pref_sum); - uint depthLess64 = (numLeft < 64 ) ? (uint)sgid : (uint)treeletDesc.maxDepth; - uint depthLess128 = (numLeft < 128) ? (uint)sgid : (uint)treeletDesc.maxDepth; - uint depthLess256 = (numLeft < 256) ? (uint)sgid : (uint)treeletDesc.maxDepth; - - // filling data for thread 0 who will save this to mem - treeletDesc.depthLess64 = sub_group_reduce_min(depthLess64); - treeletDesc.depthLess128 = sub_group_reduce_min(depthLess128); - treeletDesc.depthLess256 = sub_group_reduce_min(depthLess256); - treeletDesc.numNonTrivialStartpoints = treeletDesc.numStartpoints - cntOfDepth; - - if (sgid == 0) { - treeletDescs[treeletId] = treeletDesc; -#if REFIT_VERBOSE_LOG - printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; \n maxDepth = %d;\n depthLess64 = %d;\n depthLess128 = %d;\n depthLess256 = %d;\n}\n", - treeletId, - treeletDesc.startpoint_offset, - treeletDesc.numStartpoints, - treeletDesc.numNonTrivialStartpoints, - treeletDesc.maxDepth, - treeletDesc.depthLess64, - treeletDesc.depthLess128, - treeletDesc.depthLess256); -#endif - } - } - } - else if (local_id <= treeletDesc.maxDepth) { - uint thisdepthcount = depths[local_id]; - treeletDesc.depthLess64 = 0; - treeletDesc.depthLess128 = 0; - treeletDesc.depthLess256 = 0; - uint numLeft = treeletDesc.numStartpoints; - uint pref_sum = 0; - - for (uint d = 0; d < local_id; d++) - { - uint depthCnt = depths[d]; - if (numLeft > 64) { treeletDesc.depthLess64 = d + 1; } - if (numLeft > 128) { treeletDesc.depthLess128 = d + 1; } - if (numLeft > 256) { treeletDesc.depthLess256 = d + 1; } - pref_sum += depthCnt; - numLeft -= depthCnt; - if (d == 0) { treeletDesc.numNonTrivialStartpoints = numLeft; } - } - - if (local_id == treeletDesc.maxDepth) - { - treeletDescs[treeletId] = treeletDesc; -#if REFIT_VERBOSE_LOG - printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; maxDepth = %d;\n depthLess64 = %d; depthLess128 = %d; depthLess256 = %d;\n}\n", - treeletId, - treeletDesc.startpoint_offset, - treeletDesc.numStartpoints, - treeletDesc.numNonTrivialStartpoints, - treeletDesc.maxDepth, - treeletDesc.depthLess64, - treeletDesc.depthLess128, - treeletDesc.depthLess256); -#endif - } - } - - StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset; - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - loop_turn = 0; - if (treeletDesc.maxDepth < 8) - { - uint prefixSG = 0; - - // make prefixSG keep interval for paths with sglid 
depth that is separated out for sg. - if (sgid <= treeletDesc.maxDepth && HistogramSG != 0) - { - prefixSG = atomic_add((volatile local uint*) (depths + sgid), HistogramSG); - } - - // from now on all sgs run independently - - // make D_c keep offset interval that is separated out for given lane - for (uint d = 0; d <= treeletDesc.maxDepth; d++) - { - uint thisDPrefixSg = sub_group_broadcast(prefixSG, d); - uint thisLaneCount = D_c[d]; - uint laneOffset = sub_group_scan_exclusive_add(thisLaneCount); - D_c[d] = laneOffset + thisDPrefixSg; - } - - for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) - { - StartPoint S = S_c[loop_turn++]; - uint d = StartPoint_GetDepth(S); - uint dstSlot = D_c[d]++; - dstStartpoints[dstSlot] = S; - } - } - else - { - for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0)) - { - StartPoint S = S_c[loop_turn++]; - uint d = StartPoint_GetDepth(S); - uint dstSlot = atomic_inc((volatile local uint*) (depths + d)); - dstStartpoints[dstSlot] = S; - } - } - - for (uint srcStartpointID = local_id+ cached_threshold; srcStartpointID < treeletDesc.numStartpoints; srcStartpointID += get_local_size(0)) - { - StartPoint S = srcStartpoints[srcStartpointID]; - uint d = StartPoint_GetDepth(srcStartpoints[srcStartpointID]); - uint dstSlot = atomic_inc((volatile local uint*) (depths+ d)); - dstStartpoints[dstSlot] = S; - } -#endif //skip sorting - } - } -} diff --git a/src/intel/vulkan/grl/gpu/bvh_copy.cl b/src/intel/vulkan/grl/gpu/bvh_copy.cl deleted file mode 100644 index 6e76f195095..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_copy.cl +++ /dev/null @@ -1,763 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "api_interface.h" -#include "d3d12.h" -#include "common.h" -#include "mem_utils.h" -#include "misc_shared.h" - -#define offsetof(TYPE, ELEMENT) ((size_t)&(((TYPE *)0)->ELEMENT)) - -GRL_INLINE -uint GroupCountForCopySize(uint size) -{ - return (size >> 8) + 4; -} - -GRL_INLINE -uint GroupCountForCopy(BVHBase* base) -{ - return GroupCountForCopySize(base->Meta.allocationSize); -} - -GRL_INLINE void copyInstanceDescs(InstanceDesc* instances, D3D12_RAYTRACING_INSTANCE_DESC* descs, uint64_t numInstances) -{ - for (uint64_t instanceIndex = get_local_id(0); instanceIndex < numInstances; instanceIndex += get_local_size(0)) - { - for (uint row = 0; row < 3; row++) - { - for (uint column = 0; column < 4; column++) - { - D3D12_set_transform(&descs[instanceIndex], row, column, InstanceDesc_get_transform(&instances[instanceIndex], row, column)); - } - } - D3D12_set_instanceID(&descs[instanceIndex], InstanceDesc_get_instanceID(&instances[instanceIndex])); - D3D12_set_InstanceMask(&descs[instanceIndex], InstanceDesc_get_InstanceMask(&instances[instanceIndex])); - D3D12_set_InstanceContributionToHitGroupIndex(&descs[instanceIndex], InstanceDesc_get_InstanceContributionToHitGroupIndex(&instances[instanceIndex])); - D3D12_set_InstanceFlags(&descs[instanceIndex], InstanceDesc_get_InstanceFlags(&instances[instanceIndex])); - D3D12_set_AccelerationStructure(&descs[instanceIndex], InstanceDesc_get_AccelerationStructure(&instances[instanceIndex])); - } -} - -GRL_INLINE void createGeoDescs(GeoMetaData* geoMetaData, D3D12_RAYTRACING_GEOMETRY_DESC* descs, uint64_t numGeos, const uint64_t dataBufferStart) -{ - if (get_local_id(0) == 0) - { - uint64_t previousGeoDataBufferEnd = dataBufferStart; - for (uint64_t geoIndex = 0; geoIndex < 
numGeos; geoIndex += 1) - { - D3D12_set_Type(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Type)); - D3D12_set_Flags(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Flags)); - if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES) - { - // Every triangle is stored separately - uint64_t vertexBufferSize = 9 * sizeof(float) * geoMetaData[geoIndex].PrimitiveCount; - D3D12_set_triangles_Transform(&descs[geoIndex], 0); - D3D12_set_triangles_IndexFormat(&descs[geoIndex], INDEX_FORMAT_NONE); - D3D12_set_triangles_VertexFormat(&descs[geoIndex], VERTEX_FORMAT_R32G32B32_FLOAT); - D3D12_set_triangles_IndexCount(&descs[geoIndex], 0); - D3D12_set_triangles_VertexCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount * 3); - D3D12_set_triangles_IndexBuffer(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd); - D3D12_set_triangles_VertexBuffer_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd); - D3D12_set_triangles_VertexBuffer_StrideInBytes(&descs[geoIndex], 3 * sizeof(float)); - previousGeoDataBufferEnd += vertexBufferSize; - } - else - { - D3D12_set_procedurals_AABBCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount); - D3D12_set_procedurals_AABBs_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd); - D3D12_set_procedurals_AABBs_StrideInBytes(&descs[geoIndex], sizeof(D3D12_RAYTRACING_AABB)); - previousGeoDataBufferEnd += sizeof(D3D12_RAYTRACING_AABB) * geoMetaData[geoIndex].PrimitiveCount; - } - } - } -} - -GRL_INLINE void copyIndiciesAndVerticies(D3D12_RAYTRACING_GEOMETRY_DESC* desc, QuadLeaf* quad) -{ - float* vertices = (float*)D3D12_get_triangles_VertexBuffer_StartAddress(desc); - uint64_t firstTriangleIndex = quad->primIndex0; - uint64_t numTriangles = QuadLeaf_IsSingleTriangle(quad) ? 
1 : 2; - - vertices[firstTriangleIndex * 9] = quad->v[0][0]; - vertices[firstTriangleIndex * 9 + 1] = quad->v[0][1]; - vertices[firstTriangleIndex * 9 + 2] = quad->v[0][2]; - - vertices[firstTriangleIndex * 9 + 3] = quad->v[1][0]; - vertices[firstTriangleIndex * 9 + 4] = quad->v[1][1]; - vertices[firstTriangleIndex * 9 + 5] = quad->v[1][2]; - - vertices[firstTriangleIndex * 9 + 6] = quad->v[2][0]; - vertices[firstTriangleIndex * 9 + 7] = quad->v[2][1]; - vertices[firstTriangleIndex * 9 + 8] = quad->v[2][2]; - - if (numTriangles == 2) - { - uint64_t secondTriangleIndex = firstTriangleIndex + QuadLeaf_GetPrimIndexDelta(quad); - uint32_t packed_indices = QuadLeaf_GetSecondTriangleIndices(quad); - for( size_t i=0; i<3; i++ ) - { - uint32_t idx = packed_indices & 3 ; packed_indices >>= 2; - for( size_t j=0; j<3; j++ ) - vertices[secondTriangleIndex * 9 + i * 3 + j] = quad->v[idx][j]; - } - } -} - -GRL_INLINE -void storeProceduralDesc( - struct AABB procAABB, - uint32_t primId, - D3D12_RAYTRACING_GEOMETRY_DESC* geoDesc) -{ - D3D12_RAYTRACING_AABB* proceduralDescs = (D3D12_RAYTRACING_AABB*)D3D12_get_procedurals_AABBs_StartAddress(geoDesc); - D3D12_set_raytracing_aabb(&proceduralDescs[primId], &procAABB); -} - -GRL_INLINE -void copyDataFromLProcedurals( - BVHBase* base, - D3D12_RAYTRACING_GEOMETRY_DESC* descs) -{ - unsigned numProcedurals = BVHBase_GetNumProcedurals(base); - InternalNode* innerNodes = BVHBase_GetInternalNodes(base); - unsigned numInnerNodes = BVHBase_GetNumInternalNodes(base); - - if (BVHBase_GetNumProcedurals(base) > 0) //< there's no point entering here if there are no procedurals - { - - // iterate on all inner nodes to identify those with procedural children, we have to take aabbs from them - for (uint32_t nodeI = get_local_id(0); nodeI < numInnerNodes; nodeI += get_local_size(0)) - { - InternalNode* innerNode = innerNodes + nodeI; - - if (innerNode->nodeType == NODE_TYPE_PROCEDURAL) - { - float* origin = innerNode->lower; - - global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer((struct QBVHNodeN*)innerNode); - - for (uint k = 0; k < 6; k++) - { - if (InternalNode_IsChildValid(innerNode, k)) - { - struct AABB3f qbounds = { - (float)(innerNode->lower_x[k]), (float)(innerNode->lower_y[k]), (float)(innerNode->lower_z[k]), - (float)(innerNode->upper_x[k]), (float)(innerNode->upper_y[k]), (float)(innerNode->upper_z[k]) }; - - struct AABB dequantizedAABB; - - dequantizedAABB.lower[0] = origin[0] + bitShiftLdexp(qbounds.lower[0], innerNode->exp_x - 8); - dequantizedAABB.lower[1] = origin[1] + bitShiftLdexp(qbounds.lower[1], innerNode->exp_y - 8); - dequantizedAABB.lower[2] = origin[2] + bitShiftLdexp(qbounds.lower[2], innerNode->exp_z - 8); - dequantizedAABB.upper[0] = origin[0] + bitShiftLdexp(qbounds.upper[0], innerNode->exp_x - 8); - dequantizedAABB.upper[1] = origin[1] + bitShiftLdexp(qbounds.upper[1], innerNode->exp_y - 8); - dequantizedAABB.upper[2] = origin[2] + bitShiftLdexp(qbounds.upper[2], innerNode->exp_z - 8); - - dequantizedAABB = conservativeAABB(&dequantizedAABB); - /* extract geomID and primID from leaf */ - const uint startPrim = QBVHNodeN_startPrim((struct QBVHNodeN*) innerNode, k); - const uint geomID = ProceduralLeaf_geomIndex(leaf); - const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf! 
- - storeProceduralDesc(dequantizedAABB, primID, descs + geomID); - } - /* advance leaf pointer to next child */ - leaf += QBVHNodeN_blockIncr((struct QBVHNodeN*)innerNode, k); - } - - } - else if (innerNode->nodeType == NODE_TYPE_MIXED) { ERROR(); } - else {/* do nothing for other internal node types, they can't have procedural child (directly)*/; } - } - } -} - -GRL_INLINE -void copyDataFromQuadLeaves(BVHBase* base, - D3D12_RAYTRACING_GEOMETRY_DESC* descs) -{ - QuadLeaf* quads = BVHBase_GetQuadLeaves(base); - uint64_t numQuads = BVHBase_GetNumQuads(base); - for (uint64_t quadIdx = get_local_id(0); quadIdx < numQuads; quadIdx += get_local_size(0)) - { - uint64_t descIdx = PrimLeaf_GetGeoIndex(&quads[quadIdx].leafDesc); - copyIndiciesAndVerticies(&descs[descIdx], &quads[quadIdx]); - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel clone_indirect(global char* dest, - global char* src) -{ - BVHBase* base = (BVHBase*)src; - uint64_t bvhSize = base->Meta.allocationSize; - - uint numGroups = GroupCountForCopy(base); - CopyMemory(dest, src, bvhSize, numGroups); -} - -GRL_INLINE void compactT(global char* dest, global char* src, uint64_t compactedSize, uint skipCopy, uint groupCnt) -{ - global BVHBase* baseSrc = (global BVHBase*)src; - global BVHBase* baseDest = (global BVHBase*)dest; - - uint32_t offset = sizeof(BVHBase); - uint32_t numNodes = BVHBase_GetNumInternalNodes(baseSrc); - uint32_t nodeSize = numNodes * sizeof(InternalNode); - offset += nodeSize; - - int quadChildFix = baseSrc->quadLeafStart; - int procChildFix = baseSrc->proceduralDataStart; - int instChildFix = baseSrc->instanceLeafStart; - - // serialization already copies part of bvh base so skip this part - CopyMemory(dest + skipCopy, src + skipCopy, sizeof(BVHBase) - skipCopy, groupCnt); - baseDest->Meta.allocationSize = compactedSize; - - if (baseSrc->Meta.instanceCount) - { - const uint32_t instLeafsSize = BVHBase_GetNumHWInstanceLeaves(baseSrc) * sizeof(HwInstanceLeaf); - CopyMemory(dest + offset, (global char*)BVHBase_GetHWInstanceLeaves(baseSrc), instLeafsSize, groupCnt); - const uint instanceLeafStart = (uint)(offset / 64); - baseDest->instanceLeafStart = instanceLeafStart; - instChildFix -= instanceLeafStart; - offset += instLeafsSize; - baseDest->instanceLeafEnd = (uint)(offset / 64); - } - if (baseSrc->Meta.geoCount) - { - const uint quadLeafsSize = BVHBase_GetNumQuads(baseSrc) * sizeof(QuadLeaf); - if (quadLeafsSize) - { - CopyMemory(dest + offset, (global char*)BVHBase_GetQuadLeaves(baseSrc), quadLeafsSize, groupCnt); - const uint quadLeafStart = (uint)(offset / 64); - baseDest->quadLeafStart = quadLeafStart; - quadChildFix -= quadLeafStart; - offset += quadLeafsSize; - baseDest->quadLeafCur = (uint)(offset / 64); - } - - const uint procLeafsSize = BVHBase_GetNumProcedurals(baseSrc) * sizeof(ProceduralLeaf); - if (procLeafsSize) - { - CopyMemory(dest + offset, (global char*)BVHBase_GetProceduralLeaves(baseSrc), procLeafsSize, groupCnt); - const uint proceduralDataStart = (uint)(offset / 64); - baseDest->proceduralDataStart = proceduralDataStart; - procChildFix -= proceduralDataStart; - offset += procLeafsSize; - baseDest->proceduralDataCur = (uint)(offset / 64); - } - } - // copy nodes with fixed child offsets - global uint* nodeDest = (global uint*)(dest + sizeof(BVHBase)); - global InternalNode* nodeSrc = (global InternalNode*)BVHBase_GetInternalNodes(baseSrc); - // used in mixed case - 
char* instanceLeavesBegin = (char*)BVHBase_GetHWInstanceLeaves(baseSrc); - char* instanceLeavesEnd = (char*)BVHBase_GetHWInstanceLeaves_End(baseSrc); - uint localId = get_sub_group_local_id(); - for (uint i = get_group_id(0); i < numNodes; i += groupCnt) - { - uint nodePart = CacheLineSubgroupRead((const global char*)&nodeSrc[i]); - char nodeType = as_char4(sub_group_broadcast(nodePart, offsetof(InternalNode, nodeType) / 4))[0]; - if (localId * 4 == offsetof(InternalNode, childOffset)) - { - int childOffset = as_int(nodePart); - if (nodeType == NODE_TYPE_MIXED) - { - char* childPtr = (char*)&nodeSrc[i] + 64 * childOffset; - if (childPtr > instanceLeavesBegin && childPtr < instanceLeavesEnd) - nodePart = as_int(childOffset - instChildFix); - } - else if (nodeType == NODE_TYPE_INSTANCE) - nodePart = as_int(childOffset - instChildFix); - else if (nodeType == NODE_TYPE_QUAD) - nodePart = as_int(childOffset - quadChildFix); - else if (nodeType == NODE_TYPE_PROCEDURAL) - nodePart = as_int(childOffset - procChildFix); - } - nodeDest[i * 16 + localId] = nodePart; - } - - if (baseSrc->Meta.instanceCount) - { - const uint32_t instanceDescSize = baseSrc->Meta.instanceCount * sizeof(InstanceDesc); - CopyMemory(dest + offset, src + baseSrc->Meta.instanceDescsStart, instanceDescSize, groupCnt); - baseDest->Meta.instanceDescsStart = offset; - offset += instanceDescSize; - } - if (baseSrc->Meta.geoCount) - { - const uint32_t geoMetaSize = baseSrc->Meta.geoCount * sizeof(GeoMetaData); - CopyMemory(dest + offset, src + baseSrc->Meta.geoDescsStart, geoMetaSize, groupCnt); - baseDest->Meta.geoDescsStart = offset; - offset += (geoMetaSize + 63) & ~63; // align to 64 - } - - uint backPointerDataStart = offset / 64; - uint refitTreeletsDataStart = backPointerDataStart; - uint refitStartPointDataStart = backPointerDataStart; - uint dataEnd = backPointerDataStart; - uint fatLeafTableStart = dataEnd; - uint fatLeafCount = baseSrc->fatLeafCount; - uint innerTableStart = dataEnd; - uint innerCount = baseSrc->innerCount; - - uint quadLeftoversCountNewAtomicUpdate = baseSrc->quadLeftoversCountNewAtomicUpdate; - uint quadTableSizeNewAtomicUpdate = baseSrc->quadTableSizeNewAtomicUpdate; - uint quadIndicesDataStart = dataEnd; - - if (BVHBase_HasBackPointers(baseSrc)) - { -#if 0 // - const uint oldbackpontersDataStart = baseSrc->backPointerDataStart; - const uint shift = oldbackpontersDataStart - backPointerDataStart; - const uint refitStructsSize = ((BVHBase_GetRefitStructsDataSize(baseSrc)) + 63) & ~63; - - CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), refitStructsSize, groupCnt); - - refitTreeletsDataStart = baseSrc->refitTreeletsDataStart - shift; - refitStartPointDataStart = baseSrc->refitStartPointDataStart - shift; - dataEnd = baseSrc->BVHDataEnd - shift; -#else // compacting version - const uint backpointersSize = ((numNodes*sizeof(uint)) + 63) & ~63; - CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), backpointersSize, groupCnt); - offset += backpointersSize; - - refitTreeletsDataStart = offset / 64; - refitStartPointDataStart = offset / 64; - - // TODO: remove treelets from .... everywhere - const uint treeletExecutedCnt = *BVHBase_GetRefitTreeletCntPtr(baseSrc); - - if (treeletExecutedCnt) - { - const uint treeletCnt = treeletExecutedCnt > 1 ? 
treeletExecutedCnt + 1 : 1; - - refitTreeletsDataStart = offset / 64; - const uint treeletsSize = ((treeletCnt * sizeof(RefitTreelet)) + 63) & ~63; - RefitTreelet* destTreelets = (RefitTreelet*)(dest + offset); - RefitTreelet* srcTreelets = BVHBase_GetRefitTreeletDescs(baseSrc); - - uint numThreads = groupCnt * get_local_size(0); - uint globalID = (get_group_id(0) * get_local_size(0)) + get_local_id(0); - - for (uint i = globalID; i < treeletCnt; i += numThreads) - { - RefitTreelet dsc = srcTreelets[i]; - RefitTreeletTrivial* trivial_dsc = (RefitTreeletTrivial*)&dsc; - if (trivial_dsc->numStartpoints == 1 && trivial_dsc->childrenOffsetOfTheNode > numNodes) { - trivial_dsc->childrenOffsetOfTheNode -= quadChildFix; - } - destTreelets[i] = dsc; - } - - offset += treeletsSize; - - refitStartPointDataStart = offset / 64; - const uint startPointsSize = (BVHBase_GetRefitStartPointsSize(baseSrc) + 63) & ~63; - CopyMemory(dest + offset, (global char*)BVHBase_GetRefitStartPoints(baseSrc), startPointsSize, groupCnt); - offset += startPointsSize; - dataEnd = offset / 64; - } - - uint fatleafEntriesSize = ((fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63); - fatLeafTableStart = offset / 64; - if (fatleafEntriesSize) { - CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), fatleafEntriesSize, groupCnt); - } - offset += fatleafEntriesSize; - - // New atomic update - if(baseSrc->quadIndicesDataStart > baseSrc->backPointerDataStart) - { - uint numQuads = BVHBase_GetNumQuads(baseSrc); - uint quadTableMainBufferSize = (numQuads + 255) & ~255; - uint quadLeftoversSize = (quadLeftoversCountNewAtomicUpdate + 255) & ~255; - uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63); - if (quadTableEntriesSize) { - CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), quadTableEntriesSize, groupCnt); - } - offset += quadTableEntriesSize; - - uint quadIndicesDataSize = ((numQuads * sizeof(QuadDataIndices) + 63) & ~63); - quadIndicesDataStart = offset / 64; - if (quadIndicesDataSize) { - CopyMemory(dest + offset, (global char*)BVHBase_GetQuadDataIndicesTable(baseSrc), quadIndicesDataSize, groupCnt); - } - offset += quadIndicesDataSize; - } - - uint innerEntriesSize = ((innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63); - innerTableStart = offset / 64; - if (innerEntriesSize) { - CopyMemory(dest + offset, (global char*)BVHBase_GetInnerNodeTable(baseSrc), innerEntriesSize, groupCnt); - } - offset += innerEntriesSize; - - dataEnd = offset / 64; -#endif - } - - baseDest->backPointerDataStart = backPointerDataStart; - baseDest->refitTreeletsDataStart = refitTreeletsDataStart; - baseDest->refitStartPointDataStart = refitStartPointDataStart; - baseDest->fatLeafTableStart = fatLeafTableStart ; - baseDest->fatLeafCount = fatLeafCount; - baseDest->innerTableStart = innerTableStart; - baseDest->innerCount = innerCount; - - baseDest->quadLeftoversCountNewAtomicUpdate = quadLeftoversCountNewAtomicUpdate; - baseDest->quadTableSizeNewAtomicUpdate = quadTableSizeNewAtomicUpdate; - baseDest->quadIndicesDataStart = quadIndicesDataStart; - baseDest->BVHDataEnd = dataEnd; -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -void kernel compact(global char* dest, - global char* src, - uint groupCnt) -{ - uint64_t compactedSize = compute_compacted_size((BVHBase*)src); - compactT(dest, src, compactedSize, 0, groupCnt); -} - -// 
set serialization header along all lanes, each lane will get one dword of header plus 64bit reminding data -GRL_INLINE -unsigned prepare_header( - uint64_t headerSize, - uint64_t instancePtrSize, - uint64_t numInstances, - uint64_t bvhSize, - uint8_t* driverID, - uint64_t reminder) -{ - - unsigned loc_id = get_sub_group_local_id(); - - uint64_t SerializedSizeInBytesIncludingHeader = headerSize + instancePtrSize * numInstances + bvhSize; - uint64_t DeserializedSizeInBytes = bvhSize; - uint64_t InstanceHandleCount = numInstances; - - char bvh_magic_str[] = BVH_MAGIC_MACRO; - uint* bvh_magic_uint = (uint*)bvh_magic_str; - - unsigned headerTempLanePiece; - if (loc_id < 4) { headerTempLanePiece = *((unsigned*)&driverID[4*loc_id]); } - else if (loc_id == 4) { headerTempLanePiece = bvh_magic_uint[0]; } - else if (loc_id == 5) { headerTempLanePiece = bvh_magic_uint[1]; } - else if (loc_id == 6) { headerTempLanePiece = bvh_magic_uint[2]; } - else if (loc_id == 7) { headerTempLanePiece = bvh_magic_uint[3]; } - else if (loc_id == 8) { headerTempLanePiece = (uint)SerializedSizeInBytesIncludingHeader; } - else if (loc_id == 9) { headerTempLanePiece = (uint)(SerializedSizeInBytesIncludingHeader >> 32ul); } - else if (loc_id == 10) { headerTempLanePiece = (uint)DeserializedSizeInBytes; } - else if (loc_id == 11) { headerTempLanePiece = (uint)(DeserializedSizeInBytes >> 32ul); } - else if (loc_id == 12) { headerTempLanePiece = (uint)InstanceHandleCount; } - else if (loc_id == 13) { headerTempLanePiece = (uint)(InstanceHandleCount >> 32ul); } - else if (loc_id == 14) { headerTempLanePiece = (uint)reminder; } - else if (loc_id == 15) { headerTempLanePiece = (uint)(reminder >> 32ul); } - - return headerTempLanePiece; -} - - - - -GRL_INLINE -void serializeT( - global byte_align64B* dest, - global byte_align64B* src, - global uint8_t* driverID, - uint groups_count) -{ - SerializationHeader* header = (SerializationHeader*)dest; - BVHBase* base = (BVHBase*)src; - - const uint headerSize = sizeof(SerializationHeader); - const uint numInstances = base->Meta.instanceCount; - const uint instancePtrSize = sizeof(gpuva_t); - const uint compactedSize = compute_compacted_size(base); - uint local_id = get_sub_group_local_id(); - - // this is not 64byte aligned :( - const uint offsetToBvh = headerSize + instancePtrSize * numInstances; - - global InstanceDesc* src_instances = 0; - - if (numInstances) { - src_instances = (global InstanceDesc*)((uint64_t)base + base->Meta.instanceDescsStart); - } - - // effectively this part should end up as one 64B aligned 64B write - if (get_group_id(0) == groups_count - 1) - { - Block64B headerPlus; - - // we patch the missing piece with instance or bhv beginning (TRICK A and B) - // we assume header is 56B. - global uint64_t* srcPiece = (numInstances != 0) ? 
&src_instances[0].AccelerationStructureGPUVA : (global uint64_t*)src; - - unsigned headerTemp; - - headerTemp = prepare_header( - headerSize, - instancePtrSize, - numInstances, - compactedSize, - driverID, - *srcPiece); - - CacheLineSubgroupWrite((global byte_align64B*)dest, headerTemp); - } - - if (numInstances > 0) - { - uint instancesOffset = headerSize; - uint aligned_instance_ptrs_offset = ((instancesOffset + 63) >> 6) << 6; - uint unaligned_prefixing_instance_cnt = (aligned_instance_ptrs_offset - instancesOffset) >> 3; - unaligned_prefixing_instance_cnt = min(unaligned_prefixing_instance_cnt, numInstances); - - global uint64_t* dst_instances = (global uint64_t*)(dest + instancesOffset); - - // we've copied first instance onto a header, (see TRICK A) - // now we have only instances start at aligned memory - uint numAlignedInstances = numInstances - unaligned_prefixing_instance_cnt; - dst_instances += unaligned_prefixing_instance_cnt; - src_instances += unaligned_prefixing_instance_cnt; - - if (numAlignedInstances) - { - // each 8 instances form a cacheline - uint numCachelines = numAlignedInstances >> 3; //qwords -> 64Bs - // qwords besides multiple of 8; - uint startReminder = numAlignedInstances & ~((1 << 3) - 1); - uint numreminder = numAlignedInstances & ((1 << 3) - 1); - - uint task_id = get_group_id(0); - - while (task_id < numCachelines) - { - uint src_id = task_id * 8 + (local_id >> 1); - uint* src_uncorected = (uint*)& src_instances[src_id].AccelerationStructureGPUVA; - uint* src = ((local_id & 1) != 0) ? src_uncorected + 1 : src_uncorected; - uint data = *src; - - global char* dst = (global byte_align64B*)(dst_instances + (8 * task_id)); - CacheLineSubgroupWrite(dst, data); - task_id += groups_count; - } - - if (task_id == numCachelines && local_id < 8 && numreminder > 0) - { - // this should write full cacheline - - uint index = startReminder + local_id; - // data will be taken from instances for lanes (local_id < numreminder) - // copy srcbvh beginning as uint64_t for remaining lanes (TRICK B) - global uint64_t* srcData = (local_id < numreminder) ? 
- &src_instances[index].AccelerationStructureGPUVA : - ((global uint64_t*)src) + (local_id - numreminder); - dst_instances[index] = *srcData; - } - } - } - - // the parts above copied unaligned dst beginning of bvh (see TRICK B) - uint32_t unalignedPartCopiedElsewhere = (64u - (offsetToBvh & (64u - 1u)))&(64u - 1u); - - compactT(dest + offsetToBvh, src, compactedSize, unalignedPartCopiedElsewhere, groups_count); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -void kernel serialize_indirect( - global char* dest, - global char* src, - global uint8_t* driverID) -{ - BVHBase* base = (BVHBase*)src; - uint groups_count = GroupCountForCopy(base); - serializeT(dest, src, driverID, groups_count); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -void kernel serialize_for_input_dump_indirect( - global struct OutputBatchPtrs* batchPtrs, - global dword* dstOffset, - global char* src, - global uint8_t* driverID) -{ - BVHBase* base = (BVHBase*)src; - uint groups_count = GroupCountForCopy(base); - global char* dest = (global char*)(batchPtrs->dataStart + *dstOffset); - dest += (sizeof(OutputData) + 127) & ~127; - serializeT(dest, src, driverID, groups_count); -} - -GRL_INLINE -void deserializeT( - global char* dest, - global char* src, - unsigned groupCnt) -{ - SerializationHeader* header = (SerializationHeader*)src; - - const uint64_t headerSize = sizeof(struct SerializationHeader); - const uint64_t instancePtrSize = sizeof(gpuva_t); - const uint64_t numInstances = header->InstanceHandleCount; - const uint64_t offsetToBvh = headerSize + instancePtrSize * numInstances; - const uint64_t bvhSize = header->DeserializedSizeInBytes; - - if (numInstances) - { - const bool instances_mixed_with_inner_nodes = false; - if (instances_mixed_with_inner_nodes) - { - // not implemented ! - // copy each node with 64byte granularity if node is instance, patch it mid-copy - } - else - { - BVHBase* srcBvhBase = (BVHBase*)(src + offsetToBvh); - - // numHWInstances can be bigger (because of rebraiding) or smaller (because of inactive instances) than - // numInstances (count of pointers and descriptors). 
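Review note, not part of the deleted hunk: the serialize/deserialize kernels above juggle three regions — the SerializationHeader (assumed to be 56 bytes by the "we assume header is 56B" comment), one 8-byte GPU VA (gpuva_t) per instance, and the compacted BVH — and every offset they compute falls out of that layout, including the fact that offsetToBvh is generally not 64-byte aligned (hence TRICK A/B). A minimal, self-contained C sketch of that arithmetic follows; the constants and names are illustrative stand-ins, not GRL definitions.

    /* Editorial sketch: offset arithmetic assumed by serializeT/deserializeT. */
    #include <stdint.h>
    #include <stdio.h>

    #define HEADER_SIZE        56u  /* assumed sizeof(SerializationHeader) */
    #define INSTANCE_PTR_SIZE   8u  /* assumed sizeof(gpuva_t): one 64-bit GPU VA */

    /* Header, then one GPU VA per instance, then the BVH payload. */
    static uint64_t offset_to_bvh(uint64_t num_instances)
    {
        return HEADER_SIZE + INSTANCE_PTR_SIZE * num_instances;
    }

    /* Total blob size, i.e. SerializedSizeInBytesIncludingHeader. */
    static uint64_t serialized_size(uint64_t num_instances, uint64_t compacted_bvh_size)
    {
        return offset_to_bvh(num_instances) + compacted_bvh_size;
    }

    int main(void)
    {
        /* e.g. a TLAS with 3 instances and a 4 KiB compacted BVH */
        printf("bvh offset %llu, total %llu\n",
               (unsigned long long)offset_to_bvh(3),
               (unsigned long long)serialized_size(3, 4096));
        return 0;
    }

With that layout in mind, deserializeT continues below by locating the HW instance leaf range inside the embedded BVH.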
- uint offsetToHwInstances = srcBvhBase->instanceLeafStart << 6; - uint numHwInstances = (srcBvhBase->instanceLeafEnd - srcBvhBase->instanceLeafStart) >> 1; - - // - // instances are in separate memory intervals - // copy all the other data simple way - // - uint nodesEnd = srcBvhBase->Meta.instanceDescsStart; - // copy before instance leafs - CopyMemory(dest, (global char*)(src + offsetToBvh), offsetToHwInstances, groupCnt); - - uint offsetPostInstances = srcBvhBase->instanceLeafEnd << 6; - uint instanceDescStart = srcBvhBase->Meta.instanceDescsStart; - uint sizePostInstances = instanceDescStart - offsetPostInstances; - // copy after instance leafs before instance desc - CopyMemory(dest + offsetPostInstances, (global char*)(src + offsetToBvh + offsetPostInstances), sizePostInstances, groupCnt); - - uint instanceDescEnd = instanceDescStart + numInstances * sizeof(InstanceDesc); - uint sizePostInstanceDescs = bvhSize - instanceDescEnd; - // copy after instance desc - CopyMemory(dest + instanceDescEnd, (global char*)(src + offsetToBvh + instanceDescEnd), sizePostInstanceDescs, groupCnt); - - global gpuva_t* newInstancePtrs = (global gpuva_t*)(src + headerSize); - global InstanceDesc* dstDesc = (global InstanceDesc*)(dest + instanceDescStart); - global InstanceDesc* srcDesc = (global InstanceDesc*)(src + offsetToBvh + instanceDescStart); - - // copy and patch instance descriptors - for (uint64_t instanceIndex = get_group_id(0); instanceIndex < numInstances; instanceIndex += groupCnt) - { - InstanceDesc desc = srcDesc[instanceIndex]; - uint64_t newInstancePtr = newInstancePtrs[instanceIndex]; - desc.AccelerationStructureGPUVA = newInstancePtr; // patch it with new ptr; - - dstDesc[instanceIndex] = desc; - } - - // copy and patch hw instance leafs - global HwInstanceLeaf* dstInstleafs = (global HwInstanceLeaf*)(dest + offsetToHwInstances); - global HwInstanceLeaf* srcInstleafs = (global HwInstanceLeaf*)(src + offsetToBvh + offsetToHwInstances); - - for (uint hwLeafIndex = get_group_id(0); hwLeafIndex < numHwInstances; hwLeafIndex += groupCnt) - { - // pull the instance from srcBVH - HwInstanceLeaf tmpInstleaf = srcInstleafs[hwLeafIndex]; - - uint swInstanceIndex = HwInstanceLeaf_GetInstanceIndex(&tmpInstleaf); - uint64_t childBvhPtr = (uint64_t)newInstancePtrs[swInstanceIndex]; - uint64_t originalBvhPtr = (uint64_t)HwInstanceLeaf_GetBVH(&tmpInstleaf); - - HwInstanceLeaf_SetBVH(&tmpInstleaf, childBvhPtr); - uint64_t startNode = HwInstanceLeaf_GetStartNode(&tmpInstleaf); - - if (startNode != 0) { - uint64_t rootNodeOffset = startNode - originalBvhPtr; - HwInstanceLeaf_SetStartNode(&tmpInstleaf, childBvhPtr + rootNodeOffset); - } - - dstInstleafs[hwLeafIndex] = tmpInstleaf; - } - } - } - else - { - CopyMemory(dest, (global char*)(src + offsetToBvh), bvhSize, groupCnt); - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel deserialize_indirect( - global char* dest, - global char* src) -{ - SerializationHeader* header = (SerializationHeader*)src; - const uint64_t bvhSize = header->DeserializedSizeInBytes; - unsigned groupCnt = GroupCountForCopySize(bvhSize); - deserializeT(dest, src, groupCnt); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel dxr_decode(global char* dest, - global char* src) -{ - - DecodeHeader* header = (DecodeHeader*)dest; - BVHBase* base = (BVHBase*)src; - - uint32_t numGeos = 
base->Meta.geoCount; - uint32_t numInstances = base->Meta.instanceCount; - - if (numInstances > 0) - { - header->Type = TOP_LEVEL; - header->NumDesc = numInstances; - - D3D12_RAYTRACING_INSTANCE_DESC* instanceDesc = (D3D12_RAYTRACING_INSTANCE_DESC*)(dest + sizeof(DecodeHeader)); - copyInstanceDescs((InstanceDesc*)((uint64_t)base + (uint64_t)base->Meta.instanceDescsStart), - instanceDesc, - numInstances); - } - else if (numGeos > 0) - { - header->Type = BOTTOM_LEVEL; - header->NumDesc = numGeos; - - D3D12_RAYTRACING_GEOMETRY_DESC* geomDescs = (D3D12_RAYTRACING_GEOMETRY_DESC*)(dest + sizeof(DecodeHeader)); - uint64_t data = (uint64_t)geomDescs + sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) * numGeos; - createGeoDescs((GeoMetaData*)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart), - geomDescs, - numGeos, - data); - - work_group_barrier(CLK_GLOBAL_MEM_FENCE); - - copyDataFromQuadLeaves(base, - geomDescs); - - copyDataFromLProcedurals(base, - geomDescs); - } - else - { - header->Type = BOTTOM_LEVEL; - header->NumDesc = 0; - } -} diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.cl b/src/intel/vulkan/grl/gpu/bvh_debug.cl deleted file mode 100644 index bce75fec3ff..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_debug.cl +++ /dev/null @@ -1,208 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// @file bvh_debug.cl -// -// @brief routines to do basic integrity checks -// -// Notes: -// - -#include "GRLGen12.h" -#include "intrinsics.h" -#include "libs/lsc_intrinsics.h" -#include "GRLGen12IntegrityChecks.h" -#include "api_interface.h" - -#define ERROR_PRINTF 0 -GRL_INLINE bool commit_err( - global uint* some_null, - global BVHBase* bvh, - global ERROR_INFO* err_info_slot, - ERROR_INFO err) -{ - if (err.type != error_t_no_error) { - uint expected = error_t_no_error; - atomic_compare_exchange_global(&err_info_slot->type, &expected, err.type); - if (expected == error_t_no_error) - { - err_info_slot->offset_in_BVH = err.offset_in_BVH; - err_info_slot->when = err.when; - err_info_slot->reserved = 0xAAACCAAA; - mem_fence_evict_to_memory(); -#if ERROR_PRINTF - printf("bvh = 0x%llX, err.type = %X, err.offset_in_BVH = %d\n", bvh, err.type, err.offset_in_BVH); -#else - // This is to trigger PF. Note we have to write directly to memory. - // If write would stay in L3 it won't give a PF untill this will get evicted to mem. 
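Review note, not part of the deleted hunk: commit_err above implements a "first error wins" latch — only the thread that flips the error slot away from error_t_no_error gets to fill in the details, and every later error is dropped. A portable C11 sketch of that pattern, with illustrative names, is below.

    /* Editorial sketch: the first-error-wins latch used by commit_err. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { ERROR_T_NO_ERROR = 0 };

    struct error_info {
        _Atomic uint32_t type;
        uint32_t offset_in_bvh;
        uint32_t when;
    };

    /* Only the caller whose compare-exchange succeeds records its details,
     * so the slot always describes the first failure observed. */
    static bool commit_first_error(struct error_info *slot,
                                   uint32_t type, uint32_t offset, uint32_t when)
    {
        uint32_t expected = ERROR_T_NO_ERROR;

        if (type == ERROR_T_NO_ERROR)
            return false;
        if (!atomic_compare_exchange_strong(&slot->type, &expected, type))
            return false;               /* another error already latched */
        slot->offset_in_bvh = offset;
        slot->when = when;
        return true;
    }

    int main(void)
    {
        struct error_info slot = { ERROR_T_NO_ERROR, 0, 0 };
        printf("first commit: %d\n", commit_first_error(&slot, 7, 128, 1));  /* 1 */
        printf("second commit: %d\n", commit_first_error(&slot, 9, 256, 2)); /* 0 */
        return 0;
    }

The store_uint_L1UC_L3UC() call that follows in the deleted kernel is the uncached write the comment above describes: it dereferences the intentionally null pointer so the fault surfaces immediately instead of whenever the line would eventually be evicted from L3.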
- store_uint_L1UC_L3UC(some_null, 0, 0x0EEE0000 + err.type); -#endif - return true; - } - } - return false; -} - -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel check_tree_topology( - global uint* some_null, - global BVHBase* bvh, - global ERROR_INFO* err, - uint phase) -{ - uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); - - if (err->type != error_t_no_error) return; - - uint dummy1, dummy2, dummy3; - ERROR_INFO reterr = check_tree_topology_helper(bvh, globalID, &dummy1, &dummy2, &dummy3, false); - if (reterr.type == error_t_no_error) - { - reterr = check_backpointers(bvh, globalID); - } - if (reterr.type == error_t_no_error) - { - reterr = validate_atomic_update_structs(bvh, globalID); - } - reterr.when = phase; - commit_err(some_null, bvh, err, reterr); -} - -GRL_INLINE bool IsValid48bPtr(qword ptr) -{ - qword CANONIZED_BITS = 0xFFFFul << 48ul; - qword canonized_part = ptr & CANONIZED_BITS; - bool isIt = ptr != 0 && ( - canonized_part == 0 || canonized_part == CANONIZED_BITS); - return isIt; -} - -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel check_geos_before_quad_update( - global BVHBase* bvh, //dest bvh - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - global uint* some_null, - global ERROR_INFO* err, - uint phase, - uint numGeos, - uint numThreads) -{ - uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); - - if (err->type != error_t_no_error) return; - - // first check sanity of geos - ERROR_INFO geo_insanity_error = { error_t_input_geo_insane, 0 }; - - for (uint ID = globalID; ID < numGeos; ID += numThreads * get_sub_group_size()) - { - bool IsSane = IsValid48bPtr((qword)(qword)geomDesc); - - if (IsSane) { - GRL_RAYTRACING_GEOMETRY_DESC geo = geomDesc[globalID]; - IsSane = geo.Type < NUM_GEOMETRY_TYPES; - if (IsSane) { - if (geo.Type == GEOMETRY_TYPE_TRIANGLES) { - if (geo.Desc.Triangles.IndexFormat >= INDEX_FORMAT_END) { - IsSane = false; - } - else - { - if (geo.Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE && geo.Desc.Triangles.IndexCount > 2) - { - IsSane = (geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END) && - IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) && - IsValid48bPtr((qword)geo.Desc.Triangles.pIndexBuffer); - } - else if (geo.Desc.Triangles.VertexCount > 2) - { - IsSane = - geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END&& - IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) != 0; - } - } - } - } - } - - geo_insanity_error.offset_in_BVH = ID; - geo_insanity_error.when = phase; - if (!IsSane) { - commit_err(some_null, bvh, err, geo_insanity_error); - } - return; - } -} - -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel check_geos_vs_quads( - global BVHBase* bvh, - global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, - global uint* some_null, - global ERROR_INFO* err, - uint phase, - uint numGeos, - uint numThreads) -{ - uint numQuads = BVHBase_GetNumQuads(bvh); - - QuadLeaf* quads = BVHBase_GetQuadLeaves(bvh); - - uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); - uint qoffset = bvh->quadLeafStart; - - if (err->type != error_t_no_error) return; - - ERROR_INFO theErr = { error_t_no_error, 0 }; - - for (uint ID = globalID; ID < numQuads; ID += numThreads * get_sub_group_size()) - { - ERROR_INFO quadErr = { error_t_quad_leaf_broken, qoffset + ID, phase }; - - QuadLeaf quad = quads[ID]; - - uint geoIdx = PrimLeaf_GetGeoIndex(&quad.leafDesc); - - if (geoIdx > numGeos) { commit_err(some_null, bvh, err, quadErr); return; } - - uint numPrimsInGeo = 
geomDesc[geoIdx].Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE ? - geomDesc[geoIdx].Desc.Triangles.IndexCount / 3 : - geomDesc[geoIdx].Desc.Triangles.VertexCount / 3; - - if(quad.primIndex0 >= numPrimsInGeo) { - commit_err(some_null, bvh, err, quadErr); - return; - } - - if(!QuadLeaf_IsSingleTriangle(&quad) && - (quad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&quad) >= numPrimsInGeo)) - { - commit_err(some_null, bvh, err, quadErr); - return; - } - } -} - -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel check_instances_linked_bvhs( - global uint* some_null, - global BVHBase* bvh, - global ERROR_INFO* err, - uint phase) -{ - if (err->type != error_t_no_error) return; - - uint instanceLeafStart = bvh->instanceLeafStart; - uint instanceLeafEnd = bvh->instanceLeafEnd; - uint numInstances = (instanceLeafEnd - instanceLeafStart) / 2; - - uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); - - ERROR_INFO reterr = check_instances_linked_bvhs_helper(bvh, globalID, /*touchBlas*/true); - reterr.when = phase; - commit_err(some_null, bvh, err, reterr); -} diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.grl b/src/intel/vulkan/grl/gpu/bvh_debug.grl deleted file mode 100644 index 28008ab09ce..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_debug.grl +++ /dev/null @@ -1,107 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module bvh_on_gpu_checks; - -kernel_module debug_kernels ("bvh_debug.cl") -{ - links lsc_intrinsics; - kernel opencl_check_tree_topology < kernelFunction="check_tree_topology">; - kernel opencl_check_instances_linked_bvhs < kernelFunction="check_instances_linked_bvhs">; - kernel opencl_check_geos_before_quad_update < kernelFunction="check_geos_before_quad_update">; - kernel opencl_check_geos_vs_quads < kernelFunction="check_geos_vs_quads">; -} - - -metakernel debug_checks_prepare_const_regs() -{ - define cRoundingSIMD REG4; - define cInit0 REG5; - define cShiftForSIMD REG3; - cRoundingSIMD = (16-1); - cShiftForSIMD = 4; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; -} - -metakernel debug_checks_bvh_topology( - qword some_null_ptr, - qword bvh, - qword bvh_inner_nodes_end, - qword error_struct, - dword when, - dword bvh_inner_nodes_start_value ) -{ - define cRoundingSIMD REG4; - define cShiftForSIMD REG3; - REG1.lo = load_dword(bvh_inner_nodes_end); - REG0 = bvh_inner_nodes_start_value; - REG1.hi = 0; - REG2 = REG1 - REG0; - REG2 = REG2 + cRoundingSIMD; - REG2 = REG2 >> cShiftForSIMD; - - DISPATCHDIM_X = REG2.lo; - - dispatch_indirect opencl_check_tree_topology args( - some_null_ptr, - bvh, - error_struct, - when); -} - -metakernel debug_check_instances_linked_bvhs( - qword some_null_ptr, - qword bvh, - qword error_struct, - dword numHWThreads, - dword when) -{ - dispatch opencl_check_instances_linked_bvhs(numHWThreads,1,1) args( - some_null_ptr, - bvh, - error_struct, - when); -} - -metakernel debug_check_geos_before_quad_update( - qword bvh, - qword geos, - qword some_null_ptr, - qword error_struct, - dword when, - dword numGeos, - dword numHWThreads ) -{ - dispatch opencl_check_geos_before_quad_update(numHWThreads,1,1) args( - bvh, - geos, - some_null_ptr, - error_struct, - when, - numGeos, - numHWThreads ); -} - -metakernel debug_check_geos_vs_quads( - qword bvh, - qword geos, - qword some_null_ptr, - qword error_struct, - dword when, - dword numGeos, - dword numHWThreads ) -{ - dispatch opencl_check_geos_vs_quads(numHWThreads,1,1) args( - bvh, - geos, - some_null_ptr, - error_struct, - when, - numGeos, 
- numHWThreads ); -} diff --git a/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl b/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl deleted file mode 100644 index 4fa222b53eb..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl +++ /dev/null @@ -1,97 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "api_interface.h" -#include "d3d12.h" -#include "common.h" - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) void kernel compacted_size(global char *bvh_mem, - global char *postbuild_info) -{ - BVHBase *base = (BVHBase *)bvh_mem; - PostbuildInfoCompactedSize *postbuildInfoCompacted = (PostbuildInfoCompactedSize *)postbuild_info; - - postbuildInfoCompacted->CompactedSizeInBytes = compute_compacted_size(base); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) void kernel current_size(global char *bvh_mem, - global char *postbuild_info) -{ - - BVHBase *base = (BVHBase *)bvh_mem; - PostbuildInfoCurrentSize *postbuildInfoCurrent = (PostbuildInfoCurrentSize *)postbuild_info; - - postbuildInfoCurrent->CurrentSizeInBytes = base->Meta.allocationSize; -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) void kernel serialized_size(global char *bvh_mem, - global char *postbuild_info) -{ - - BVHBase *base = (BVHBase *)bvh_mem; - PostbuildInfoSerializationDesc *postbuildInfoSerialization = (PostbuildInfoSerializationDesc *)postbuild_info; - - uint64_t headerSize = sizeof(SerializationHeader); - uint64_t numInstances = base->Meta.instanceCount; - - postbuildInfoSerialization->SerializedSizeInBytes = sizeof(SerializationHeader) + - numInstances * sizeof(gpuva_t) + - compute_compacted_size(base); - //base->Meta.allocationSize; - postbuildInfoSerialization->NumBottomLevelAccelerationStructurePointers = numInstances; -} - -void countTrianglesAndProcedurals(GeoMetaData *geoMetaData, - uint64_t numGeos, - uint64_t *numTriangles, - uint64_t *numProcedurals) -{ - uint64_t numTrianglesLoc = 0; - uint64_t numProceduralsLoc = 0; - - for (uint64_t geoIndex = get_local_id(0); geoIndex < numGeos; geoIndex += get_local_size(0)) - { - if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES) - { - *numTriangles += geoMetaData[geoIndex].PrimitiveCount; - } - else - { - *numProcedurals += geoMetaData[geoIndex].PrimitiveCount; - } - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) void kernel decoded_size(global char *bvh_mem, - global char *postbuild_info) -{ - BVHBase *base = (BVHBase *)bvh_mem; - PostbuildInfoToolsVisualizationDesc *postbuildInfoDecoded = (PostbuildInfoToolsVisualizationDesc *)postbuild_info; - - uint64_t numTriangles = 0; - uint64_t numProcedurals = 0; - countTrianglesAndProcedurals((GeoMetaData *)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart), - base->Meta.geoCount, - &numTriangles, - &numProcedurals); - uint64_t numInstances = base->Meta.instanceCount; - uint64_t numDescs = base->Meta.geoCount; - uint64_t headerSize = sizeof(DecodeHeader); - uint64_t descsSize = numDescs * sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) + - numInstances * sizeof(D3D12_RAYTRACING_INSTANCE_DESC); - - // Each triangle is stored separately - 3 vertices (9 floats) per triangle - uint64_t triangleDataSize = 9 * sizeof(float); - uint64_t proceduralDataSize = sizeof(D3D12_RAYTRACING_AABB); - uint64_t geoDataSize = numTriangles * triangleDataSize + numProcedurals * proceduralDataSize; - - 
postbuildInfoDecoded->DecodedSizeInBytes = headerSize + descsSize + geoDataSize; -} diff --git a/src/intel/vulkan/grl/gpu/bvh_rebraid.cl b/src/intel/vulkan/grl/gpu/bvh_rebraid.cl deleted file mode 100644 index ab0f891acee..00000000000 --- a/src/intel/vulkan/grl/gpu/bvh_rebraid.cl +++ /dev/null @@ -1,1683 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "AABB.h" -#include "GRLGen12.h" -#include "api_interface.h" -#include "common.h" -#include "qbvh6.h" - -#define MAX_SPLITS_PER_INSTANCE 64 -#define NUM_REBRAID_BINS 32 - -#define NUM_CHILDREN 6 -#define MAX_NODE_OFFSET 65535 // can't open nodes whose offsets exceed this - -// OCL/DPC++ *SHOULD* have a uniform keyword... but they dont... so I'm making my own -#define uniform -#define varying - -#define SGPRINT_UNIFORM(fmt,val) {sub_group_barrier(CLK_LOCAL_MEM_FENCE); if( get_sub_group_local_id() == 0 ) { printf(fmt,val); }} - -#define SGPRINT_6x(prefix,fmt,type,val) {\ - type v0 = sub_group_broadcast( val, 0 );\ - type v1 = sub_group_broadcast( val, 1 );\ - type v2 = sub_group_broadcast( val, 2 );\ - type v3 = sub_group_broadcast( val, 3 );\ - type v4 = sub_group_broadcast( val, 4 );\ - type v5 = sub_group_broadcast( val, 5 );\ - sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ - if( get_sub_group_local_id() == 0 ) { \ - printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \ - v0,v1,v2,v3,v4,v5);}} - - -#define SGPRINT_16x(prefix,fmt,type,val) {\ - type v0 = sub_group_broadcast( val, 0 );\ - type v1 = sub_group_broadcast( val, 1 );\ - type v2 = sub_group_broadcast( val, 2 );\ - type v3 = sub_group_broadcast( val, 3 );\ - type v4 = sub_group_broadcast( val, 4 );\ - type v5 = sub_group_broadcast( val, 5 );\ - type v6 = sub_group_broadcast( val, 6 );\ - type v7 = sub_group_broadcast( val, 7 );\ - type v8 = sub_group_broadcast( val, 8 );\ - type v9 = sub_group_broadcast( val, 9 );\ - type v10 = sub_group_broadcast( val, 10 );\ - type v11 = sub_group_broadcast( val, 11 );\ - type v12 = sub_group_broadcast( val, 12 );\ - type v13 = sub_group_broadcast( val, 13 );\ - type v14 = sub_group_broadcast( val, 14 );\ - type v15 = sub_group_broadcast( val, 15 );\ - sub_group_barrier(CLK_LOCAL_MEM_FENCE); \ - if( get_sub_group_local_id() == 0 ) { \ - printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \ - fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \ - v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}} - -#if 1 -#define GRL_ATOMIC_INC(addr) atomic_add(addr, 1); -#else -#define GRL_ATOMIC_INC(addr) atomic_inc(addr); -#endif - -#if 0 -#define LOOP_TRIPWIRE_INIT uint _loop_trip=0; - -#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name) \ - _loop_trip++;\ - if ( _loop_trip > max_iterations )\ - {\ - printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!!\n" );\ - printf( name"\n");\ - break;\ - } -#else - -#define LOOP_TRIPWIRE_INIT -#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name) - -#endif - - - -typedef struct SGHeap -{ - uint32_t key_value; - bool lane_mask; -} SGHeap; - -GRL_INLINE void SGHeap_init(uniform SGHeap *h) -{ - h->lane_mask = false; - h->key_value = 0xbaadf00d; -} - -GRL_INLINE bool SGHeap_full(uniform SGHeap *h) -{ - return sub_group_all(h->lane_mask); -} -GRL_INLINE bool SGHeap_empty(uniform SGHeap *h) -{ - return sub_group_all(!h->lane_mask); -} - -GRL_INLINE bool SGHeap_get_lane_mask(uniform SGHeap *h) -{ - return h->lane_mask; -} -GRL_INLINE uint16_t SGHeap_get_lane_values(uniform SGHeap *h) -{ - return (h->key_value & 0xffff); -} - -GRL_INLINE ushort isolate_lowest_bit( ushort m ) -{ - return m & 
~(m - 1); -} - - -// lane i receives the index of the ith set bit in mask. -GRL_INLINE ushort subgroup_bit_rank( uniform ushort mask ) -{ - varying ushort lane = get_sub_group_local_id(); - ushort idx = 16; - for ( uint i = 0; i < NUM_CHILDREN; i++ ) - { - ushort lo = isolate_lowest_bit( mask ); - mask = mask ^ lo; - idx = (lane == i) ? lo : idx; - } - - return ctz( idx ); -} - -// push a set of elements spread across a subgroup. Return mask of elements that were not pushed -GRL_INLINE uint16_t SGHeap_vectorized_push(uniform SGHeap *h, varying uint16_t key, varying uint16_t value, uniform ushort push_mask) -{ - -#if 0 // an attempt to make this algorithm branchless - varying uint key_value = (((uint)key) << 16) | ((uint)value); - uniform ushort free_mask = intel_sub_group_ballot( !h->lane_mask ); - - varying ushort free_slot_idx = subgroup_bit_prefix_exclusive( free_mask ); // for each heap slot, what is its position in a compacted list of free slots (prefix sum) - varying ushort push_idx = subgroup_bit_prefix_exclusive( push_mask ); // for each lane, what is its position in a compacted list of pushing lanes (prefix sum) - - uniform ushort num_pushes = min( popcount( free_mask ), popcount( push_mask ) ); - - varying ushort push_index = subgroup_bit_rank( push_mask ); // lane i gets the index of the i'th set bit in push_mask - - varying uint shuffled = intel_sub_group_shuffle( key_value, intel_sub_group_shuffle( push_index, free_slot_idx ) ); - varying bool pushed = false; - if ( !h->lane_mask && free_slot_idx < num_pushes ) - { - h->lane_mask = true; - h->key_value = shuffled; - pushed = true; - } - - return push_mask & intel_sub_group_ballot( push_idx >= num_pushes ); -#else - - varying uint lane = get_sub_group_local_id(); - - varying uint key_value = (((uint)key) << 16) | ((uint)value); - uniform ushort free_mask = intel_sub_group_ballot(!h->lane_mask); - - // TODO_OPT: Look for some clever way to remove this loop - while (free_mask && push_mask) - { - // insert first active child into first available lane - uniform uint child_id = ctz(push_mask); - uniform uint victim_lane = ctz(free_mask); - uniform uint kv = sub_group_broadcast( key_value, child_id ); - if (victim_lane == lane) - { - h->lane_mask = true; - h->key_value = kv; - } - push_mask ^= (1 << child_id); - free_mask ^= (1 << victim_lane); - } - - return push_mask; - -#endif -} - -// push an item onto a heap that is full except for one slot -GRL_INLINE void SGHeap_push_and_fill(uniform SGHeap *h, uniform uint16_t key, uniform uint16_t value) -{ - uniform uint32_t key_value = (((uint)key) << 16) | value; - if (!h->lane_mask) - { - h->lane_mask = true; - h->key_value = key_value; // only one lane will be active at this point - } -} - -// pop the min item from a full heap -GRL_INLINE void SGHeap_full_pop_min(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out) -{ - varying uint lane = get_sub_group_local_id(); - uniform uint kv = sub_group_reduce_min(h->key_value); - if (h->key_value == kv) - h->lane_mask = false; - - *key_out = (kv >> 16); - *value_out = (kv & 0xffff); -} - -// pop the max item from a heap -GRL_INLINE void SGHeap_pop_max(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out) -{ - uniform uint lane = get_sub_group_local_id(); - uniform uint kv = sub_group_reduce_max(h->lane_mask ? 
h->key_value : 0); - if (h->key_value == kv) - h->lane_mask = false; - - *key_out = (kv >> 16); - *value_out = (kv & 0xffff); -} - -GRL_INLINE void SGHeap_printf( SGHeap* heap ) -{ - uint key = heap->key_value >> 16; - uint value = heap->key_value & 0xffff; - - if ( get_sub_group_local_id() == 0) - printf( "HEAP: \n" ); - SGPRINT_16x( " mask: ", "%6u ", bool, heap->lane_mask ); - SGPRINT_16x( " key : ", "0x%04x ", uint, key ); - SGPRINT_16x( " val : ", "0x%04x ", uint, value ); - -} - -GRL_INLINE float transformed_aabb_halfArea(float3 lower, float3 upper, const float *Transform) -{ - // Compute transformed extent per 'transform_aabb'. Various terms cancel - float3 Extent = upper - lower; - float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]); - float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]); - float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]); - - return (ex * ey) + (ey * ez) + (ex * ez); -} - -GRL_INLINE uint16_t quantize_area(float relative_area) -{ - // clamp relative area at 0.25 (1/4 of root area) - // and apply a non-linear distribution because most things in real scenes are small - relative_area = pow(min(1.0f, relative_area * 4.0f), 0.125f); - return convert_ushort_rtn( relative_area * 65535.0f ); -} - -GRL_INLINE varying uint16_t SUBGROUP_get_child_areas(uniform InternalNode *n, - uniform const float *Transform, - uniform float relative_area_scale) -{ - varying uint16_t area; - varying uint16_t lane = get_sub_group_local_id(); - varying int exp_x = n->exp_x; - varying int exp_y = n->exp_y; - varying int exp_z = n->exp_z; - - { - // decode the AABB positions. Lower in the bottom 6 lanes, upper in the top - uniform uint8_t *px = &n->lower_x[0]; - uniform uint8_t *py = &n->lower_y[0]; - uniform uint8_t *pz = &n->lower_z[0]; - - varying float fx = convert_float(px[lane]); - varying float fy = convert_float(py[lane]); - varying float fz = convert_float(pz[lane]); - fx = n->lower[0] + bitShiftLdexp(fx, exp_x - 8); - fy = n->lower[1] + bitShiftLdexp(fy, exp_y - 8); - fz = n->lower[2] + bitShiftLdexp(fz, exp_z - 8); - - // transform the AABBs to world space - varying float3 lower = (float3)(fx, fy, fz); - varying float3 upper = intel_sub_group_shuffle(lower, lane + 6); - - { - - // TODO_OPT: This is only utilizing 6 lanes. - // We might be able to do better by vectorizing the calculation differently - float a1 = transformed_aabb_halfArea( lower, upper, Transform ); - float a2 = a1 * relative_area_scale; - area = quantize_area( a2 ); - } - } - - return area; -} - - - -GRL_INLINE ushort get_child_area( - InternalNode* n, - ushort child, - const float* Transform, - float relative_area_scale ) -{ - uint16_t area; - uint16_t lane = get_sub_group_local_id(); - int exp_x = n->exp_x; - int exp_y = n->exp_y; - int exp_z = n->exp_z; - - // decode the AABB positions. 
Lower in the bottom 6 lanes, upper in the top - uint8_t* px = &n->lower_x[0]; - uint8_t* py = &n->lower_y[0]; - uint8_t* pz = &n->lower_z[0]; - - float3 lower, upper; - lower.x = convert_float( n->lower_x[child] ); - lower.y = convert_float( n->lower_y[child] ); - lower.z = convert_float( n->lower_z[child] ); - upper.x = convert_float( n->upper_x[child] ); - upper.y = convert_float( n->upper_y[child] ); - upper.z = convert_float( n->upper_z[child] ); - - lower.x = bitShiftLdexp( lower.x, exp_x - 8 ); // NOTE: the node's 'lower' field cancels out, so don't add it - lower.y = bitShiftLdexp( lower.y, exp_y - 8 ); // see transform_aabb_halfArea - lower.z = bitShiftLdexp( lower.z, exp_z - 8 ); - upper.x = bitShiftLdexp( upper.x, exp_x - 8 ); - upper.y = bitShiftLdexp( upper.y, exp_y - 8 ); - upper.z = bitShiftLdexp( upper.z, exp_z - 8 ); - - float a1 = transformed_aabb_halfArea( lower, upper, Transform ); - float a2 = a1 * relative_area_scale; - area = quantize_area( a2 ); - - return area; -} - - -GRL_INLINE varying int SUBGROUP_get_child_offsets(uniform InternalNode *n) -{ - varying uint lane = get_sub_group_local_id(); - varying uint child = (lane < NUM_CHILDREN) ? lane : 0; - - varying uint block_incr = InternalNode_GetChildBlockIncr( n, child ); - - //varying uint prefix = sub_group_scan_exclusive_add( block_incr ); - varying uint prefix; - if ( NUM_CHILDREN == 6 ) - { - prefix = block_incr + intel_sub_group_shuffle_up( 0u, block_incr, 1u ); - prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 2 ); - prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 4 ); - prefix = prefix - block_incr; - } - - return n->childOffset + prefix; -} - - -// compute the maximum number of leaf nodes that will be produced given 'num_splits' node openings -GRL_INLINE uint get_num_nodes(uint num_splits, uint max_children) -{ - // each split consumes one node and replaces it with N nodes - // there is initially one node - // number of nodes is thus: N*s + 1 - s ==> (N-1)*s + 1 - return (max_children - 1) * num_splits + 1; -} - -// compute the number of node openings that can be performed given a fixed extra node budget -GRL_INLINE uint get_num_splits(uint num_nodes, uint max_children) -{ - // inverse of get_num_nodes: x = (n-1)s + 1 - // s = (x-1)/(n-1) - if (num_nodes == 0) - return 0; - - return (num_nodes - 1) / (max_children - 1); -} - -GRL_INLINE uint get_rebraid_bin_index(uint16_t quantized_area, uint NUM_BINS) -{ - // arrange bins in descending order by size - float relative_area = quantized_area * (1.0f/65535.0f); - relative_area = 1.0f - relative_area; // arrange bins largest to smallest - size_t bin = round(relative_area * (NUM_BINS - 1)); - return bin; -} - -GRL_INLINE global InternalNode *get_node(global BVHBase *base, int incr) -{ - global char *ptr = (((global char *)base) + BVH_ROOT_NODE_OFFSET); // NOTE: Assuming this will be hoisted out of inner loops - - return (global InternalNode *)(ptr + incr * 64); -} - -GRL_INLINE bool is_aabb_valid(float3 lower, float3 upper) -{ - return all(isfinite(lower)) && - all(isfinite(upper)) && - all(lower <= upper); -} - -GRL_INLINE bool is_node_openable(InternalNode *n) -{ - // TODO_OPT: Optimize me by fetching dwords instead of looping over bytes - // TODO: OPT: Pre-compute openability and pack into the pad byte next to the nodeType field?? 
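Review note, not part of the deleted hunk: before the openability check continues, it is worth spelling out the split-budget arithmetic encoded by get_num_nodes()/get_num_splits() above, since the rebraid binning below distributes a fixed node budget with it. A minimal C sketch with a worked example for the 6-wide (NUM_CHILDREN) nodes follows; names are illustrative.

    /* Editorial sketch: node-budget arithmetic behind get_num_nodes/get_num_splits. */
    #include <assert.h>
    #include <stdio.h>

    /* Each split consumes one openable node and adds max_children nodes, so
     * starting from a single root: nodes(s) = (max_children - 1) * s + 1. */
    static unsigned nodes_for_splits(unsigned num_splits, unsigned max_children)
    {
        return (max_children - 1) * num_splits + 1;
    }

    /* Inverse: how many splits fit in a fixed node budget (rounded down). */
    static unsigned splits_for_nodes(unsigned num_nodes, unsigned max_children)
    {
        return num_nodes == 0 ? 0 : (num_nodes - 1) / (max_children - 1);
    }

    int main(void)
    {
        const unsigned width = 6;                       /* NUM_CHILDREN */
        unsigned splits = splits_for_nodes(64, width);  /* (64 - 1) / 5 = 12 */
        assert(splits == 12);
        assert(nodes_for_splits(splits, width) == 61);  /* 5 * 12 + 1 <= 64 */
        printf("budget 64 nodes -> %u splits -> %u nodes used\n",
               splits, nodes_for_splits(splits, width));
        return 0;
    }

The body of is_node_openable() resumes below.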
- bool openable = n->nodeType == NODE_TYPE_INTERNAL; - if ( openable ) - { - for ( uint i = 0; i < NUM_CHILDREN; i++ ) - { - bool valid = InternalNode_IsChildValid( n, i ); - uint childType = InternalNode_GetChildType( n, i ); - openable = openable & (!valid || (childType == NODE_TYPE_INTERNAL)); - } - } - - return openable; -} - - -GRL_INLINE bool SUBGROUP_can_open_root( - uniform global BVHBase *bvh_base, - uniform const struct GRL_RAYTRACING_INSTANCE_DESC* instance - ) -{ - if (bvh_base == 0 || GRL_get_InstanceMask(instance) == 0) - return false; - - // TODO_OPT: SG-vectorize this AABB test - uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); - uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); - if (!is_aabb_valid(root_lower, root_upper)) - return false; - - uniform global InternalNode *node = get_node(bvh_base, 0); - if ( node->nodeType != NODE_TYPE_INTERNAL ) - return false; - - varying bool openable = true; - varying uint lane = get_sub_group_local_id(); - if (lane < NUM_CHILDREN) - { - varying uint childType = InternalNode_GetChildType(node, lane); - varying bool valid = InternalNode_IsChildValid(node, lane); - openable = childType == NODE_TYPE_INTERNAL || !valid; - } - - return sub_group_all(openable); -} - - - -GRL_INLINE -varying uint2 -SUBGROUP_count_instance_splits(uniform global struct AABB3f *geometry_bounds, - uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance) -{ - uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure; - if (!SUBGROUP_can_open_root(bvh_base, instance)) - return (uint2)(0, 0); - - uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds); - uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); - uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); - - uniform uint16_t quantized_area = quantize_area(transformed_aabb_halfArea(root_lower, root_upper, instance->Transform) * relative_area_scale); - uniform uint16_t node_offs = 0; - - uniform SGHeap heap; - uniform uint num_splits = 0; - - SGHeap_init(&heap); - varying uint sg_split_counts_hi = 0; // cross-subgroup bin counters - varying uint sg_split_counts_lo = 0; - - uniform global InternalNode* node_array = get_node( bvh_base, 0 ); - - LOOP_TRIPWIRE_INIT; - - while (1) - { - uniform global InternalNode* node = node_array + node_offs; - - // count this split - uniform uint bin = get_rebraid_bin_index(quantized_area, NUM_REBRAID_BINS); - varying uint lane = get_sub_group_local_id(); - - sg_split_counts_hi += ((lane + 16) == bin) ? 1 : 0; - sg_split_counts_lo += (lane == bin) ? 
1 : 0; - - // open this node and push all of its openable children to heap - varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node); - varying bool sg_openable = 0; - if (lane < NUM_CHILDREN & sg_offs <= MAX_NODE_OFFSET ) - if (InternalNode_IsChildValid(node, lane)) - sg_openable = is_node_openable( node_array + sg_offs); - - uniform uint openable_children = intel_sub_group_ballot(sg_openable); - - if ( openable_children ) - { - varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale ); - - if ( !SGHeap_full( &heap ) ) - { - openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children ); - } - - while ( openable_children ) - { - // pop min element - uniform uint16_t min_area; - uniform uint16_t min_offs; - SGHeap_full_pop_min( &heap, &min_area, &min_offs ); - - // eliminate all children smaller than heap minimum - openable_children &= intel_sub_group_ballot( sg_area > min_area ); - - if ( openable_children ) - { - // if any children survived, - // kick out heap minimum and replace with first child.. otherwise we will re-push the minimum - uniform uint child_id = ctz( openable_children ); - openable_children ^= (1 << child_id); - min_area = sub_group_broadcast( sg_area, child_id ); - min_offs = sub_group_broadcast( sg_offs, child_id ); - } - - // re-insert onto heap - SGHeap_push_and_fill( &heap, min_area, min_offs ); - - // repeat until all children are accounted for. It is possible - // for multiple children to fit in the heap, because heap minimum is now changed and we need to recompute it - } - } - - num_splits++; - if (num_splits == MAX_SPLITS_PER_INSTANCE) - break; - - if (SGHeap_empty(&heap)) - break; - - // get next node from heap - SGHeap_pop_max(&heap, &quantized_area, &node_offs); - - LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_count_splits" ); - - } - - return (uint2)(sg_split_counts_lo, sg_split_counts_hi); -} - -typedef struct RebraidBuffers -{ - global uint *bin_split_counts; // [num_bins] - global uint *bin_instance_counts; // [num_bins] - global uint *instance_bin_counts; // num_intances * num_bins -} RebraidBuffers; - -GRL_INLINE RebraidBuffers cast_rebraid_buffers(global uint *scratch, uint instanceID) -{ - RebraidBuffers b; - b.bin_split_counts = scratch; - b.bin_instance_counts = scratch + NUM_REBRAID_BINS; - b.instance_bin_counts = scratch + (2 + instanceID) * NUM_REBRAID_BINS; - return b; -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// Compute AABB -// Dispatch one work item per instance -/////////////////////////////////////////////////////////////////////////////////////////// - -GRL_INLINE void rebraid_compute_AABB( - global struct BVHBase* bvh, - global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance) -{ - // don't open null rtas - global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure; - - struct AABB new_primref; - if (bvh_base != 0) - { - float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); - float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); - const float *Transform = instance->Transform; - - if (is_aabb_valid(root_lower, root_upper)) - { - new_primref = AABBfromAABB3f(transform_aabb(root_lower, root_upper, Transform)); - } - else - { - // degenerate instance which might be updated to be non-degenerate - // use AABB position to guide BVH construction - // - new_primref.lower.x = Transform[3]; - new_primref.lower.y = Transform[7]; - new_primref.lower.z = Transform[11]; - 
new_primref.upper = new_primref.lower; - } - } - else - { - AABB_init(&new_primref); - } - - struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref); - - if (get_sub_group_local_id() == 0) - { - AABB3f_atomic_merge_global_lu(&bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz ); - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -rebraid_computeAABB_DXR_instances( - global struct BVHBase* bvh, - global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances) -{ - const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); - rebraid_compute_AABB(bvh, instances + instanceID); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -rebraid_computeAABB_DXR_instances_indirect( - global struct BVHBase* bvh, - global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, - global struct IndirectBuildRangeInfo const * const indirect_data) -{ - const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); - instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) - (((global char*)instances) + indirect_data->primitiveOffset); - rebraid_compute_AABB(bvh, instances + instanceID); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -rebraid_computeAABB_DXR_instances_pointers( - global struct BVHBase* bvh, - global void *instances_in) -{ - global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = - (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; - - const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); - rebraid_compute_AABB(bvh, instances[instanceID]); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -rebraid_computeAABB_DXR_instances_pointers_indirect( - global struct BVHBase* bvh, - global void *instances_in, - global struct IndirectBuildRangeInfo const * const indirect_data) -{ - instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset; - global const struct GRL_RAYTRACING_INSTANCE_DESC **instances = - (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in; - - const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0); - rebraid_compute_AABB(bvh, instances[instanceID]); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// Init scratch: Dispatch one work group -/////////////////////////////////////////////////////////////////////////////////////////// - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(64, 1, 1))) void kernel rebraid_init_scratch(global uint *scratch) -{ - scratch[get_local_id(0) + get_group_id(0)*get_local_size(0)] = 0; -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) void kernel rebraid_chase_instance_pointers(global struct GRL_RAYTRACING_INSTANCE_DESC *instances_out, - global void *instance_buff) -{ - global const struct GRL_RAYTRACING_INSTANCE_DESC **instances_in = - (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instance_buff; - - instances_out[get_local_id(0)] = *instances_in[get_local_id(0)]; -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) -void kernel rebraid_chase_instance_pointers_indirect( - global struct 
GRL_RAYTRACING_INSTANCE_DESC* instances_out, - global void* instance_buff, - global struct IndirectBuildRangeInfo const* const indirect_data) -{ - instance_buff = ((global char*)instance_buff) + indirect_data->primitiveOffset; - global const struct GRL_RAYTRACING_INSTANCE_DESC** - instances_in = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instance_buff; - - instances_out[get_local_id(0)] = *instances_in[get_local_id(0)]; -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// Count splits -/////////////////////////////////////////////////////////////////////////////////////////// - -GRL_INLINE void DEBUG_SUBGROUP_print_split_counts( uniform uint instanceID, varying uint split_counts_lo, varying uint split_counts_hi ) -{ - uniform uint vals[32] = { - sub_group_broadcast( split_counts_lo, 0 ), sub_group_broadcast( split_counts_lo, 1 ), - sub_group_broadcast( split_counts_lo, 2 ), sub_group_broadcast( split_counts_lo, 3 ), - sub_group_broadcast( split_counts_lo, 4 ), sub_group_broadcast( split_counts_lo, 5 ), - sub_group_broadcast( split_counts_lo, 6 ), sub_group_broadcast( split_counts_lo, 7 ), - sub_group_broadcast( split_counts_lo, 8 ), sub_group_broadcast( split_counts_lo, 9 ), - sub_group_broadcast( split_counts_lo, 10 ), sub_group_broadcast( split_counts_lo, 11 ), - sub_group_broadcast( split_counts_lo, 12 ), sub_group_broadcast( split_counts_lo, 13 ), - sub_group_broadcast( split_counts_lo, 14 ), sub_group_broadcast( split_counts_lo, 15 ), - - sub_group_broadcast( split_counts_hi, 0 ), sub_group_broadcast( split_counts_hi, 1 ), - sub_group_broadcast( split_counts_hi, 2 ), sub_group_broadcast( split_counts_hi, 3 ), - sub_group_broadcast( split_counts_hi, 4 ), sub_group_broadcast( split_counts_hi, 5 ), - sub_group_broadcast( split_counts_hi, 6 ), sub_group_broadcast( split_counts_hi, 7 ), - sub_group_broadcast( split_counts_hi, 8 ), sub_group_broadcast( split_counts_hi, 9 ), - sub_group_broadcast( split_counts_hi, 10 ), sub_group_broadcast( split_counts_hi, 11 ), - sub_group_broadcast( split_counts_hi, 12 ), sub_group_broadcast( split_counts_hi, 13 ), - sub_group_broadcast( split_counts_hi, 14 ), sub_group_broadcast( split_counts_hi, 15 ) - }; - - if ( get_sub_group_local_id() == 0 ) - { - printf( - "Instance: %4u " - "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u " - "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u \n" - , - instanceID, - vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7], - vals[8], vals[9], vals[10], vals[11], vals[12], vals[13], vals[14], vals[15], - vals[16], vals[17], vals[18], vals[19], vals[20], vals[21], vals[22], vals[23], - vals[24], vals[25], vals[26], vals[27], vals[28], vals[29], vals[30], vals[31] - ); - } -} - -GRL_INLINE void do_rebraid_count_splits_SG( - uniform global struct BVHBase* bvh, - uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, - uniform global uint *rebraid_scratch) -{ - uniform const uint instanceID = get_sub_group_global_id(); - uniform RebraidBuffers buffers = cast_rebraid_buffers(rebraid_scratch,instanceID); - - varying uint lane = get_sub_group_local_id(); - varying uint2 splits = SUBGROUP_count_instance_splits(&bvh->Meta.bounds, instances + instanceID); - varying uint split_counts_lo = splits.x; - varying uint split_counts_hi = splits.y; - - // write this instance's per-bin counts - global uint* counts = buffers.instance_bin_counts; - intel_sub_group_block_write2( counts, splits ); - - // update the per-bin 
split and instance counters - if (split_counts_lo > 0) - { - atomic_add(&buffers.bin_split_counts[lane], split_counts_lo); - GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane]); - } - if (split_counts_hi > 0) - { - atomic_add(&buffers.bin_split_counts[lane + 16], split_counts_hi); - GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane + 16]); - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -rebraid_count_splits_SG( - uniform global struct BVHBase* bvh, - uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, - uniform global uint *rebraid_scratch) -{ - do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -rebraid_count_splits_SG_indirect( - uniform global struct BVHBase* bvh, - uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances, - uniform global uint *rebraid_scratch, - global struct IndirectBuildRangeInfo const * const indirect_data) -{ - instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) - (((global char*)instances) + indirect_data->primitiveOffset); - do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch); -} - - -#define HEAP_SIZE 16 -#define COUNT_SPLITS_WG_SIZE 16 - -struct SLMHeapNode -{ - short offs; - ushort area; -}; - -struct SLMHeap -{ - struct SLMHeapNode nodes[HEAP_SIZE]; - ushort size; - ushort min_key; -}; - -GRL_INLINE bool SLMHeapNode_Greater( struct SLMHeapNode a, struct SLMHeapNode b ) -{ - return a.area > b.area; -} - -GRL_INLINE ushort SLMHeapNode_UnpackKey( struct SLMHeapNode a ) -{ - return a.area; -} - -GRL_INLINE void SLMHeapNode_Unpack( struct SLMHeapNode a, ushort* area_out, short* offs_out ) -{ - *area_out = a.area; - *offs_out = a.offs; -} - -GRL_INLINE struct SLMHeapNode SLMHeapNode_Pack( ushort area, short offs ) -{ - struct SLMHeapNode n; - n.offs = offs; - n.area = area; - return n; -} - - -GRL_INLINE void SLMHeap_Init( struct SLMHeap* heap ) -{ - heap->size = 0; - heap->min_key = 0xffff; -} - -GRL_INLINE bool SLMHeap_empty( struct SLMHeap* heap ) -{ - return heap->size == 0; -} - -GRL_INLINE bool SLMHeap_full( struct SLMHeap* heap ) -{ - return heap->size == HEAP_SIZE; -} - - -GRL_INLINE void SLMHeap_push( struct SLMHeap* heap, ushort area, short offs ) -{ - ushort insert_pos; - if ( SLMHeap_full( heap ) ) - { - ushort current_min_key = heap->min_key; - if ( area <= current_min_key ) - return; // don't push stuff that's smaller than the current minimum - - // search for the minimum element - // The heap is laid out in level order, so it is sufficient to search only the last half - ushort last_leaf = HEAP_SIZE - 1; - ushort first_leaf = (last_leaf / 2) + 1; - - // as we search, keep track of what the new min-key will be so we can cull future pushes - ushort new_min_key = area; - ushort min_pos = 0; - - do - { - ushort idx = first_leaf++; - - ushort current_key = SLMHeapNode_UnpackKey( heap->nodes[idx] ); - bool found_min_pos = (min_pos == 0) && (current_key == current_min_key); - - if ( found_min_pos ) - min_pos = idx; - else - new_min_key = min( current_key, new_min_key ); - - } while ( first_leaf != last_leaf ); - - heap->min_key = new_min_key; - insert_pos = min_pos; - } - else - { - insert_pos = heap->size++; - heap->min_key = min( area, heap->min_key ); - } - - heap->nodes[insert_pos] = SLMHeapNode_Pack( area, offs ); - - // heap-up - while ( insert_pos 
) - { - ushort parent = insert_pos / 2; - - struct SLMHeapNode parent_node = heap->nodes[parent]; - struct SLMHeapNode current_node = heap->nodes[insert_pos]; - if ( SLMHeapNode_Greater( parent_node, current_node ) ) - break; - - heap->nodes[insert_pos] = parent_node; - heap->nodes[parent] = current_node; - insert_pos = parent; - } - -} - -bool SLMHeap_pop_max( struct SLMHeap* heap, ushort* area_out, short* offs_out ) -{ - if ( SLMHeap_empty( heap ) ) - return false; - - SLMHeapNode_Unpack( heap->nodes[0], area_out, offs_out ); - - // heap down - ushort size = heap->size; - ushort idx = 0; - do - { - ushort left = 2 * idx + 1; - ushort right = 2 * idx + 2; - if ( left >= size ) - break; - - if ( right >= size ) - { - heap->nodes[idx] = heap->nodes[left]; - break; - } - - struct SLMHeapNode left_node = heap->nodes[left]; - struct SLMHeapNode right_node = heap->nodes[right]; - bool go_left = SLMHeapNode_Greater( left_node, right_node ); - heap->nodes[idx] = go_left ? left_node : right_node; - idx = go_left ? left : right; - - } while ( 1 ); - - heap->size = size - 1; - return true; -} - -void SLMHeap_Print( struct SLMHeap* heap ) -{ - printf( " size=%u min=%u {", heap->size, heap->min_key ); - for ( uint i = 0; i < heap->size; i++ ) - printf( "%04x:%04x", heap->nodes[i].area, heap->nodes[i].offs ); -} - - -GRL_INLINE bool can_open_root( - global struct BVHBase* bvh_base, - const struct GRL_RAYTRACING_INSTANCE_DESC* instance - ) -{ - float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds ); - float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds ); - if ( !is_aabb_valid( root_lower, root_upper ) || GRL_get_InstanceMask(instance) == 0 ) - return false; - - global InternalNode* node = get_node( bvh_base, 0 ); - if ( node->nodeType != NODE_TYPE_INTERNAL ) - return false; - - return is_node_openable( node ); -} - - -GRL_INLINE void count_instance_splits( - global struct AABB3f* geometry_bounds, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance, - local ushort* bin_split_counts, - local struct SLMHeap* heap -) -{ - global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure; - - SLMHeap_Init( heap ); - - float relative_area_scale = 1.0f / AABB3f_halfArea( geometry_bounds ); - float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds ); - float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds ); - - ushort quantized_area = quantize_area( transformed_aabb_halfArea( root_lower, root_upper, instance->Transform ) * relative_area_scale ); - short node_offs = 0; - ushort num_splits = 0; - - global InternalNode* node_array = get_node( bvh_base, 0 ); - - while ( 1 ) - { - global InternalNode* node = node_array + node_offs; - - // count this split - uint bin = get_rebraid_bin_index( quantized_area, NUM_REBRAID_BINS ); - bin_split_counts[bin]++; - - // open this node and push children to heap - - // TODO_OPT: Restructure this control flow to prevent differnet lanes from skipping different loop iterations and diverging - // TODO_OPT: Precompute openability masks in BLAS nodes at build time... 
one bit for self and N bits for each child - int offs = node->childOffset; - for ( ushort i = 0; i < NUM_CHILDREN; i++ ) - { - if ( InternalNode_IsChildValid( node, i ) ) - { - if ( offs >= SHRT_MIN && offs <= SHRT_MAX ) - { - if ( is_node_openable( node_array + offs ) ) - { - ushort area = get_child_area( node, i, instance->Transform, relative_area_scale ); - SLMHeap_push( heap, area, (short)offs ); - } - } - } - offs += InternalNode_GetChildBlockIncr( node, i ); - } - - num_splits++; - if ( num_splits == MAX_SPLITS_PER_INSTANCE ) - break; - - if ( !SLMHeap_pop_max( heap, &quantized_area, &node_offs ) ) - break; - } - -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( COUNT_SPLITS_WG_SIZE, 1, 1 )) ) -void kernel -rebraid_count_splits( - global struct BVHBase* bvh_base, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, - global uint* rebraid_scratch, - uint num_instances - ) -{ - local struct SLMHeap heap[COUNT_SPLITS_WG_SIZE]; - local ushort split_counts[COUNT_SPLITS_WG_SIZE][NUM_REBRAID_BINS]; - - // initialize stuff - // TODO_OPT: transpose this and subgroup-vectorize it so that - // block-writes can be used - for ( uint i = 0; i < NUM_REBRAID_BINS; i++ ) - split_counts[get_local_id( 0 )][i] = 0; - - - // count splits for this thread's instance - uniform uint base_instance = get_group_id( 0 ) * get_local_size( 0 ); - uint instanceID = base_instance + get_local_id( 0 ); - - if ( instanceID < num_instances ) - { - global BVHBase* bvh_base = (global BVHBase*)instances[instanceID].AccelerationStructure; - if ( can_open_root( bvh_base, &instances[instanceID] ) ) - { - count_instance_splits( &bvh_base->Meta.bounds, - &instances[instanceID], - &split_counts[get_local_id( 0 )][0], - &heap[get_local_id(0)] ); - } - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID ); - - - // reduce bins - for ( uint bin = get_local_id( 0 ); bin < NUM_REBRAID_BINS; bin += get_local_size( 0 ) ) - { - // TODO_OPT: There's probably a better way to arrange this computation - uint bin_split_count = 0; - uint bin_instance_count = 0; - for ( uint i = 0; i < COUNT_SPLITS_WG_SIZE; i++ ) - { - uint s = split_counts[i][bin]; - bin_split_count += s; - bin_instance_count += (s > 0) ? 
1 : 0; - } - - if ( bin_split_count > 0 ) - { - atomic_add( &buffers.bin_split_counts[bin], bin_split_count ); - atomic_add( &buffers.bin_instance_counts[bin], bin_instance_count ); - } - } - - // write out bin counts for each instance - for ( uniform uint i = get_sub_group_id(); i < COUNT_SPLITS_WG_SIZE; i += get_num_sub_groups() ) - { - uniform uint iid = base_instance + i; - if ( iid > num_instances ) - break; - - global uint* instance_bin_counts = cast_rebraid_buffers( rebraid_scratch, iid ).instance_bin_counts; - - for ( uniform ushort j = 0; j < NUM_REBRAID_BINS; j += get_sub_group_size() ) - { - uint count = split_counts[i][j + get_sub_group_local_id() ]; - intel_sub_group_block_write( instance_bin_counts + j, count ); - } - } - -} - - - - -/////////////////////////////////////////////////////////////////////////////////////////// -// Build PrimRefs -/////////////////////////////////////////////////////////////////////////////////////////// - -GRL_INLINE uint get_instance_split_count(RebraidBuffers buffers, uint instanceID, uint available_splits) -{ - global uint* instance_desired_split_count = buffers.instance_bin_counts; - global uint *bin_split_counts = buffers.bin_split_counts; - global uint *bin_instance_counts = buffers.bin_instance_counts; - - uint total_splits = 0; - uint remaining_available_splits = available_splits; - uint max_bin = 0; - uint desired_splits_this_bin = 0; - uint instance_splits = 0; - - do - { - // stop when we reach a level where we can't satisfy the demand - desired_splits_this_bin = instance_desired_split_count[max_bin]; - uint total_bin_splits = bin_split_counts[max_bin]; - - if (total_bin_splits > remaining_available_splits) - break; - - // we have enough budget to give all instances everything they want at this level, so do it - remaining_available_splits -= total_bin_splits; - instance_splits += desired_splits_this_bin; - desired_splits_this_bin = 0; - max_bin++; - - } while (max_bin < NUM_REBRAID_BINS); - - if (max_bin < NUM_REBRAID_BINS) - { - // we have more split demand than we have splits available. The current bin is the last one that gets any splits - // distribute the leftovers as evenly as possible to instances that want them - if (desired_splits_this_bin > 0) - { - // this instance wants splits. how many does it want? - uint desired_total = instance_splits + desired_splits_this_bin; - - // distribute to all instances as many as possible - uint count = bin_instance_counts[max_bin]; - uint whole = remaining_available_splits / count; - remaining_available_splits -= whole * count; - - // distribute remainder to lower numbered instances - size_t partial = (instanceID < remaining_available_splits) ? 1 : 0; - - // give the instance its share. 
- instance_splits += whole + partial; - instance_splits = min(instance_splits, desired_total); // don't give it more than it needs - } - } - - return instance_splits; -} - -GRL_INLINE void build_unopened_primref( - struct AABB3f* centroid_bounds, - global __const BVHBase *bvh_base, - global volatile uint *primref_counter, - global struct AABB *primref_buffer, - global __const float *Transform, - uint instanceID, - float matOverhead, - ushort instanceMask) -{ - float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds); - float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds); - - struct AABB primRef; - AABB_init( &primRef ); - - uint bvhoffset = (uint)BVH_ROOT_NODE_OFFSET; - if (is_aabb_valid(root_lower, root_upper) && instanceMask != 0) - { - primRef = AABBfromAABB3f(compute_xfm_bbox(Transform, BVHBase_GetRootNode(bvh_base), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &bvh_base->Meta.bounds, matOverhead)); - } - else - { - primRef.lower.x = Transform[3]; - primRef.lower.y = Transform[7]; - primRef.lower.z = Transform[11]; - primRef.upper.xyz = primRef.lower.xyz; - - instanceMask = 0; - bvhoffset = NO_NODE_OFFSET; - } - - primRef.lower.w = as_float(instanceID | (instanceMask << 24)); - primRef.upper.w = as_float(bvhoffset); - - float3 centroid = primRef.lower.xyz + primRef.upper.xyz; - centroid_bounds->lower[0] = centroid.x; - centroid_bounds->upper[0] = centroid.x; - centroid_bounds->lower[1] = centroid.y; - centroid_bounds->upper[1] = centroid.y; - centroid_bounds->lower[2] = centroid.z; - centroid_bounds->upper[2] = centroid.z; - - uint place = GRL_ATOMIC_INC(primref_counter); - primref_buffer[place] = primRef; -} - -GRL_INLINE void build_opened_primrefs( - varying bool lane_mask, - varying uint offset, - varying InternalNode* node, - varying struct AABB3f* centroid_bounds, - uniform global BVHBase *bvh_base, - uniform volatile global uint *primref_counter, - uniform global struct AABB *primref_buffer, - uniform uint instanceID, - uniform const float *Transform, - uniform float matOverhead, - varying ushort instanceMask) -{ - // TODO_OPT: This function is often called with <= 6 active lanes - // If lanes are sparse, consider jumping to a sub-group vectorized variant... 
- - if (lane_mask) - { - varying uint place = GRL_ATOMIC_INC(primref_counter); - - struct AABB box = AABBfromAABB3f(compute_xfm_bbox(Transform, node, XFM_BOX_NOT_REFINED_CLIPPED, &bvh_base->Meta.bounds, matOverhead)); - - box.lower.w = as_float(instanceID | (instanceMask << 24)); - box.upper.w = as_float(offset * 64 + (uint)BVH_ROOT_NODE_OFFSET); - primref_buffer[place] = box; - - AABB3f_extend_point( centroid_bounds, box.lower.xyz + box.upper.xyz ); - } -} - - -GRL_INLINE void SUBGROUP_open_nodes( - uniform global struct AABB3f *geometry_bounds, - uniform uint split_limit, - uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance, - uniform uint instanceID, - uniform volatile global uint *primref_counter, - uniform global struct AABB *primref_buffer, - varying struct AABB3f* centroid_bounds, - float transformOverhead) -{ - uniform SGHeap heap; - SGHeap_init(&heap); - - uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds); - uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure; - - uniform uint16_t node_offs = 0; - varying uint lane = get_sub_group_local_id(); - - uniform InternalNode* node_array = get_node( bvh_base, 0 ); - - LOOP_TRIPWIRE_INIT; - - while ( 1 ) - { - uniform InternalNode *node = node_array + node_offs; - - varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node); - varying bool sg_valid = false; - varying bool sg_openable = false; - if (lane < NUM_CHILDREN) - { - sg_valid = InternalNode_IsChildValid(node, lane); - if (sg_valid && (sg_offs <= MAX_NODE_OFFSET)) - { - sg_openable = is_node_openable( node_array + sg_offs); - } - } - - uniform uint16_t valid_children = intel_sub_group_ballot(sg_valid); - uniform uint16_t openable_children = intel_sub_group_ballot(sg_openable); - uniform uint16_t unopenable_children = valid_children & (~openable_children); - - if ( openable_children ) - { - varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale ); - - // try to push all openable children to the heap - if ( !SGHeap_full( &heap ) ) - { - openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children ); - } - - // we have more openable children than will fit in the heap - // process these one by one. - // TODO: Try re-writing with sub_group_any() and see if compiler does a better job - while ( openable_children ) - { - // pop min element - uniform uint16_t min_area; - uniform uint16_t min_offs; - SGHeap_full_pop_min( &heap, &min_area, &min_offs ); - - // eliminate all children smaller than heap minimum. 
- // mark eliminated children as unopenable - varying uint culled_children = openable_children & intel_sub_group_ballot( sg_area <= min_area ); - unopenable_children ^= culled_children; - openable_children &= ~culled_children; - - if ( openable_children ) - { - // if any children survived the purge - // find the first such child and swap its offset for the one from the heap - // - uniform uint child_id = ctz( openable_children ); - uniform uint16_t old_min_offs = min_offs; - min_area = sub_group_broadcast( sg_area, child_id ); - min_offs = sub_group_broadcast( sg_offs, child_id ); - - if ( lane == child_id ) - sg_offs = old_min_offs; - - openable_children ^= (1 << child_id); - unopenable_children ^= (1 << child_id); - } - - SGHeap_push_and_fill( &heap, min_area, min_offs ); - - } - } - - if (unopenable_children) - { - varying bool sg_create_primref = ((1 << lane) & unopenable_children); - build_opened_primrefs(sg_create_primref, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance)); - } - - --split_limit; - if (split_limit == 0) - { - // split limit exceeded - // create primrefs for all remaining openable nodes in heap - varying bool sg_mask = SGHeap_get_lane_mask(&heap); - sg_offs = SGHeap_get_lane_values(&heap); - build_opened_primrefs(sg_mask, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance)); - - break; - } - - - // NOTE: the heap should never be empty. If it is, the instance was given too many splits. - - // get next node from heap - uint16_t quantized_area; - SGHeap_pop_max(&heap, &quantized_area, &node_offs); - - LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_build_primrefs" ); - - } -} - - -#define OPEN_QUEUE_SIZE 256 -#define OPEN_QUEUE_NUM_SGS 16 - -typedef struct OpenQueueEntry -{ - uint instanceID; - ushort num_splits; -} OpenQueueEntry; - -typedef struct OpenQueue -{ - uint num_produced; - uint num_consumed; - OpenQueueEntry Q[OPEN_QUEUE_SIZE]; -} OpenQueue; - -uniform uint SUBGROUP_GetNextQueueEntry( local OpenQueue* queue ) -{ - uint next = 0; - if ( get_sub_group_local_id() == 0 ) - next = GRL_ATOMIC_INC( &queue->num_consumed ); - return sub_group_broadcast( next, 0 ); -} - - -GRL_INLINE void do_rebraid_build_primrefs( - local struct AABB3f* SLM_CentroidBounds, - local OpenQueue* SLM_Q, - global struct Globals* globals, - global struct BVHBase* base, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, - global uint* rebraid_scratch, - global struct AABB* primref_buffer, - uint extra_primref_count, - uint num_instances) -{ - varying uint instanceID = get_sub_group_size() * get_sub_group_global_id() + get_sub_group_local_id(); - - uniform volatile global uint* primref_counter = &globals->numPrimitives; - uniform RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID ); - uniform uint available_splits = get_num_splits( extra_primref_count, NUM_CHILDREN ); - - - - varying struct AABB3f centroidBounds; - AABB3f_init( ¢roidBounds ); - - if ( get_local_id( 0 ) == 0 ) - { - SLM_Q->num_produced = 0; - SLM_Q->num_consumed = 0; - AABB3f_init( SLM_CentroidBounds ); - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - // assign splits to unopened instances. 
Build primrefs for unsplit instances in vectorized form - varying uint num_splits = 0; - if ( instanceID < num_instances ) - { - num_splits = get_instance_split_count( buffers, instanceID, available_splits ); - if ( num_splits == 0 ) - { - varying global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID; - varying global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure; - if ( bvh_base != 0 ) - { - build_unopened_primref( ¢roidBounds, bvh_base, primref_counter, primref_buffer, instance->Transform, instanceID, 0.0f, GRL_get_InstanceMask(instance)); - } - } - else - { - // defer opened instances - uint place = GRL_ATOMIC_INC( &SLM_Q->num_produced ); - SLM_Q->Q[place].instanceID = instanceID; - SLM_Q->Q[place].num_splits = (ushort)num_splits; - } - } - - barrier( CLK_LOCAL_MEM_FENCE ); - - // if there were opened instances, process them, one per subgroup - uniform uint num_produced = SLM_Q->num_produced; - uniform uint next = SUBGROUP_GetNextQueueEntry( SLM_Q ); - - while ( next < num_produced ) - { - uniform uint instanceID = SLM_Q->Q[next].instanceID; - uniform uint num_splits = SLM_Q->Q[next].num_splits; - - uniform global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID; - - float transformOverhead = -#if FINE_TRANSFORM_NODE_BOX - transformation_bbox_surf_overhead(instance->Transform); -#else - 0.0f; -#endif - - SUBGROUP_open_nodes( - &base->Meta.bounds, - num_splits, - instance, - instanceID, - primref_counter, - primref_buffer, - ¢roidBounds, - transformOverhead); - - next = SUBGROUP_GetNextQueueEntry( SLM_Q ); - } - - // reduce the centroid bounds AABB - struct AABB3f reduced = AABB3f_sub_group_reduce( ¢roidBounds ); - if ( get_sub_group_local_id() == 0 ) - AABB3f_atomic_merge_localBB_nocheck( SLM_CentroidBounds, &reduced ); - - barrier( CLK_LOCAL_MEM_FENCE ); - - if( get_local_id(0) == 0 ) - { - atomic_min( (global float*) (&globals->centroidBounds.lower) + 0, SLM_CentroidBounds->lower[0] ); - atomic_min( (global float*) (&globals->centroidBounds.lower) + 1, SLM_CentroidBounds->lower[1] ); - atomic_min( (global float*) (&globals->centroidBounds.lower) + 2, SLM_CentroidBounds->lower[2] ); - atomic_max( (global float*) (&globals->centroidBounds.upper) + 0, SLM_CentroidBounds->upper[0] ); - atomic_max( (global float*) (&globals->centroidBounds.upper) + 1, SLM_CentroidBounds->upper[1] ); - atomic_max( (global float*) (&globals->centroidBounds.upper) + 2, SLM_CentroidBounds->upper[2] ); - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -void kernel rebraid_build_primrefs( - global struct Globals* globals, - global struct BVHBase* base, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, - global uint* rebraid_scratch, - global struct AABB* primref_buffer, - uint extra_primref_count, - uint num_instances) -{ - local struct AABB3f SLM_CentroidBounds; - local OpenQueue SLM_Q; - do_rebraid_build_primrefs( - &SLM_CentroidBounds, - &SLM_Q, - globals, - base, - instance_buffer, - rebraid_scratch, - primref_buffer, - extra_primref_count, - num_instances); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) -void kernel rebraid_build_primrefs_indirect( - global struct Globals* globals, - global struct BVHBase* base, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, - 
global uint* rebraid_scratch, - global struct AABB* primref_buffer, - global struct IndirectBuildRangeInfo const * const indirect_data, - uint extra_primref_count ) -{ - local struct AABB3f SLM_CentroidBounds; - local OpenQueue SLM_Q; - - instance_buffer = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) - (((global char*)instance_buffer) + indirect_data->primitiveOffset); - - do_rebraid_build_primrefs( - &SLM_CentroidBounds, - &SLM_Q, - globals, - base, - instance_buffer, - rebraid_scratch, - primref_buffer, - extra_primref_count, - indirect_data->primitiveCount); -} - - -/////////////////////////////////////////////////////////////////////////////////////////// -// Misc -/////////////////////////////////////////////////////////////////////////////////////////// - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -ISA_TEST(global InternalNode *n, global uint *out, global float *xform, float scale) -{ - - out[get_sub_group_local_id()] = InternalNode_IsChildValid(n, get_sub_group_local_id()); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( (reqd_work_group_size( 1, 1, 1 )) ) void kernel -DEBUG_PRINT( - global struct Globals* globals, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer, - global uint* rebraid_scratch, - global struct AABB* primref_buffer, - dword num_extra, - dword input_instances ) -{ -#if 0 - // validate primrefs - if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) - { - uint refs = globals->numPrimitives; - for ( uint i = 0; i < refs; i++ ) - { - if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) || - any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) || - any( isnan(primref_buffer[i].lower.xyz) ) || - any( isnan(primref_buffer[i].upper.xyz) ) ) - { - struct AABB box = primref_buffer[i]; - printf( "BAD BOX: %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ), - box.lower.x, box.lower.y, box.lower.z, - box.upper.x, box.upper.y, box.upper.z, - as_uint( box.lower.w ) ); - } - - const uint instIndex = PRIMREF_instanceID(&primref_buffer[i]); // TODO: Refactor me. 
We should not be using struct AABB for primRefs - const uint rootByteOffset = as_uint( primref_buffer[i].upper.w ); // It should be struct PrimRef - if ( instIndex >= input_instances ) - printf( "BAD INSTANCE INDEX: %u", i ); - else - { - global struct BVHBase* blas = (global struct BVHBase*)instance_buffer[instIndex].AccelerationStructure; - if ( blas ) - { - struct InternalNode* start = BVHBase_GetInternalNodes( blas ); - struct InternalNode* end = BVHBase_GetInternalNodesEnd( blas ); - - InternalNode* entryPoint = (struct InternalNode*)((char*)instance_buffer[instIndex].AccelerationStructure + rootByteOffset); - if ( entryPoint < start || entryPoint >= end ) - printf( "BAD ENTRYPOINT: %u\n", i ); - if ( (rootByteOffset & 63) != 0 ) - printf( "MISALIGNED ENTRYPOInt: %u\n", i ); - - } - } - } - } -#endif -#if 0 - if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) - printf( "REBRAIDED: %u\n", globals->numPrimitives ); - - // print instance bin information - if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) - { - printf( "REBRAIDED: %u\n", globals->numPrimitives ); - for( uint i=0; i<231; i++ ) - { - RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch,i ); - printf( " ID:%4u ", i ); - for ( uint j = 0; j < NUM_REBRAID_BINS; j++ ) - { - global uint* count = buffers.instance_bin_counts; - printf( " %2u ", count[j] ); - } - printf( "\n" ); - } - } -#endif -#if 0 - if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 ) - { - printf( "Instances: %u\n", globals->numPrimitives ); - - for ( uint i = 0; i < globals->numPrimitives; i++ ) - { - if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) || - any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) ) - { - struct AABB box = primref_buffer[i]; - printf( " %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ), - box.lower.x, box.lower.y, box.lower.z, - box.upper.x, box.upper.y, box.upper.z, - as_uint( box.lower.w ) ); - } - - } - } -#endif -} - diff --git a/src/intel/vulkan/grl/gpu/common.h b/src/intel/vulkan/grl/gpu/common.h deleted file mode 100644 index 5fa0e117ae4..00000000000 --- a/src/intel/vulkan/grl/gpu/common.h +++ /dev/null @@ -1,429 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "shared.h" -#include "intrinsics.h" -#include "AABB.h" -#include "AABB3f.h" -#include "qbvh6.h" - -/* ====== BVH_BUILDER config ====== */ - -__constant const float cfg_intCost = 4.0f; -__constant const float cfg_travCost = 1.0f; -__constant const uint cfg_minLeafSize = BVH_LEAF_N_MIN; -__constant const uint cfg_maxLeafSize = BVH_LEAF_N_MAX; -__constant const uint cfg_maxDepth = BUILDRECORD_STACK_SIZE; - -#define ENABLE_CONVERSION_CHECKS 0 - -#ifdef ENABLE_BIG_REG_ANNOTATION -#define GRL_ANNOTATE_BIG_REG_REQ __attribute__((annotate("num-thread-per-eu 4"))) -#else -#define GRL_ANNOTATE_BIG_REG_REQ -#endif - -#ifdef ENABLE_IGC_DO_NOT_SPILL -#define GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((annotate("igc-do-not-spill"))) -#else -#define GRL_ANNOTATE_IGC_DO_NOT_SPILL -#endif - -#define ERROR() - -/* =================================================================================================================================================== */ -/* =================================================================================================================================================== */ -/* 
=================================================================================================================================================== */ -/* =================================================================================================================================================== */ - -GRL_INLINE unsigned int getNumLeafPrims(unsigned int offset) -{ - return (offset & 0x7) - 3; -} - -GRL_INLINE unsigned int getLeafOffset(unsigned int offset) -{ - return offset & (~0x7); -} - -GRL_INLINE float4 triangleNormal(const float4 v0, const float4 v1, const float4 v2) -{ - const float4 a = v1 - v0; - const float4 b = v2 - v0; - return cross(a, b); -} - -GRL_INLINE float areaTriangle(const float4 v0, const float4 v1, const float4 v2) -{ - const float4 normal = triangleNormal(v0, v1, v2); - return length((float3)(normal.x, normal.y, normal.z)) * 0.5f; -} - -GRL_INLINE float det2(const float2 a, const float2 b) -{ - return a.x * b.y - a.y * b.x; -} - -GRL_INLINE float areaProjectedTriangle(const float4 v0, const float4 v1, const float4 v2) -{ - const float xy = 0.5f * fabs(det2(v1.xy - v0.xy, v2.xy - v0.xy)); - const float yz = 0.5f * fabs(det2(v1.yz - v0.yz, v2.yz - v0.yz)); - const float zx = 0.5f * fabs(det2(v1.zx - v0.zx, v2.zx - v0.zx)); - return xy + yz + zx; -} - -typedef struct Block64B { - char data[64]; -} Block64B __attribute__((aligned(64))); - -typedef char byte_align64B __attribute__((aligned(64))); - -/* ====================================================================== */ -/* ============================== GLOBALS =============================== */ -/* ====================================================================== */ - -GRL_INLINE bool Globals_OnFinish(global struct Globals *globals) -{ - /* last active HW thread ? 
*/ - if (get_local_id(0) == 0) - { - const uint sync = atomic_add(&globals->sync, 1); - if (sync + 1 == get_num_groups(0)) - { - globals->sync = 0; - return true; - } - } - return false; -} - -GRL_INLINE uint BlockAllocator_BytesUsed(struct BlockAllocator *p) -{ - return p->cur - p->start; -}; - -GRL_INLINE uint BlockAllocator_Alloc(__global struct BlockAllocator *p, const uint size) -{ - return atomic_add(&p->cur, size); -} - -GRL_INLINE uint BlockAllocator_Alloc_Single(__global struct BlockAllocator *p, const uint size) -{ - uint offset = 0; - if (get_sub_group_local_id() == 0) - offset = atomic_add(&p->cur, size); - return sub_group_broadcast(offset, 0); -} - -// node allocation returns an offset from beginning of BVH to allocated node -// in multiples of 64B -GRL_INLINE uint allocate_inner_nodes(global struct BVHBase* base, uint num_nodes ) -{ - return atomic_add_global( &base->nodeDataCur, num_nodes ); -} -GRL_INLINE uint allocate_procedural_leaves(global struct BVHBase* base, uint num_nodes) -{ - return atomic_add_global(&base->proceduralDataCur, num_nodes); -} - -GRL_INLINE uint allocate_quad_leaves(global struct BVHBase* base, uint num_nodes) -{ - return atomic_add_global(&base->quadLeafCur, num_nodes); -} - -#if 0 -GRL_INLINE uint alloc_node_mem(global struct Globals *globals, const uint size) -{ - const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ - return BlockAllocator_Alloc(&globals->node_mem_allocator, aligned_size); -} - -GRL_INLINE uint alloc_single_node_mem(global struct Globals *globals, const uint size) -{ - const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ - return BlockAllocator_Alloc_Single(&globals->node_mem_allocator, aligned_size); -} - -GRL_INLINE uint alloc_quad_leaf_mem(global struct Globals *globals, const uint size) -{ - const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ - return BlockAllocator_Alloc(&globals->quad_mem_allocator, aligned_size); -} - -GRL_INLINE uint alloc_procedural_leaf_mem(global struct Globals *globals, const uint size) -{ - const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */ - return BlockAllocator_Alloc(&globals->procedural_mem_allocator, aligned_size); -} -#endif - -GRL_INLINE global struct BuildRecord *getBuildRecords(char *bvh_mem, struct Globals *globals) -{ - return (global struct BuildRecord *)(bvh_mem + globals->build_record_start); -} - -/* ======================================================================= */ -/* ============================== TRIANGLE =============================== */ -/* ======================================================================= */ - -/*GRL_INLINE void printTriangle(struct Triangle *t) -{ - printf("vtx[0] %d vtx[1] %d vtx[2] %d primID %d geomID %d \n",t->vtx[0],t->vtx[1],t->vtx[2],t->primID,t->geomID); - }*/ - -/* ==================================================================== */ -/* ============================== SPLIT =============================== */ -/* ==================================================================== */ - -GRL_INLINE void printSplit(struct Split *split) -{ - printf("split sah %f dim %d pos %d \n", split->sah, split->dim, split->pos); -} - -/* ========================================================================== */ -/* ============================== BUILDRECORD =============================== */ -/* ========================================================================== */ - -GRL_INLINE void initBuildRecord(struct 
BuildRecord *buildRecord, uint start, uint end) -{ - AABB_init(&buildRecord->centroidBounds); - buildRecord->start = start; - buildRecord->end = end; -} - -GRL_INLINE void extendBuildRecord(struct BuildRecord *buildRecord, struct AABB *primref) -{ - AABB_extend_point(&buildRecord->centroidBounds, AABB_centroid2(primref)); -} - -GRL_INLINE uint getBuildRecursionDepth(struct BuildRecord *buildRecord) -{ - return as_uint(buildRecord->centroidBounds.upper.w); -} - -GRL_INLINE void setBuildRecursionDepth(struct BuildRecord *buildRecord, uint depth) -{ - buildRecord->centroidBounds.upper.w = as_float(depth); -} - -GRL_INLINE uint getNumPrimsBuildRecord(struct BuildRecord *buildRecord) -{ - return buildRecord->end - buildRecord->start; -} - -/* ========================================================================== */ -/* =================== BinaryMortonCodeHierarchy ============================= */ -/* ========================================================================== */ - -GRL_INLINE void BinaryMortonCodeHierarchy_init(struct BinaryMortonCodeHierarchy *record, uint start, uint end) -{ - record->range.start = start; - record->range.end = end; - record->leftChild = -1; - record->rightChild = -1; -// record->flag = 0; -} - -GRL_INLINE uint BinaryMortonCodeHierarchy_getNumPrimitives(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID) -{ - /* leaf case */ - if (nodeID & (uint)(1 << 31)) - return 1; - - /* inner node case*/ - else - return nodes[nodeID].range.end - nodes[nodeID].range.start + 1; -} - -GRL_INLINE struct BinaryMortonCodeHierarchy BinaryMortonCodeHierarchy_getEntry(global struct BinaryMortonCodeHierarchy* nodes, uint nodeID) -{ - struct BinaryMortonCodeHierarchy entry; - - if (nodeID & (uint)(1 << 31)) { - /* leaf case */ - uint rangeStart = nodeID ^ (uint)(1 << 31); - BinaryMortonCodeHierarchy_init(&entry, rangeStart, rangeStart); - } - else { - /* inner node case*/ - entry = nodes[nodeID]; - } - - return entry; -} - -GRL_INLINE uint BinaryMortonCodeHierarchy_getRangeStart(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID) -{ - /* leaf case */ - if (nodeID & (uint)(1 << 31)) - return nodeID ^ (uint)(1 << 31); - - /* inner node case*/ - else - return nodes[nodeID].range.start; -} - -/* ==================================================================== */ -/* ============================== RANGE =============================== */ -/* ==================================================================== */ - -GRL_INLINE void printRange(struct Range *range) -{ - printf("start %d end %d \n", range->start, range->end); -} - -GRL_INLINE bool equalRange(struct Range *range0, struct Range *range1) -{ - if (range0->start == range1->start && - range0->end == range1->end) - return true; - return false; -} - -GRL_INLINE uint getSizeRange(struct Range *range) -{ - return range->end - range->start; -} - -/* ==================================================================== */ -/* ========================= ProceduralLeaf =========================== */ -/* ==================================================================== */ - -#if 0 -struct ProceduralLeaf -{ - uint shaderIndex_geomMask; - uint geomIndex_flags; - uint N_last; - uint primIndex[13]; -}; -#endif - -GRL_INLINE uint ProceduralLeaf_geomIndex(global struct ProceduralLeaf *This) -{ - return This->leafDesc.geomIndex_flags & 0x1FFFFFFF; -} - -GRL_INLINE uint ProceduralLeaf_primIndex(global struct ProceduralLeaf *This, uint i) -{ - //assert(i < N); - return This->_primIndex[i]; -} - -/* 
==================================================================== */ -/* =========================== TrianglePair =========================== */ -/* ==================================================================== */ - -struct TrianglePair -{ - uint4 a; // indices of the 4 verts to store in the quad - uint3 lb; // index of the second triangle's verts in 'a' -}; - -GRL_INLINE struct TrianglePair TrianglePair_Constructor(uint3 tri0, uint primID0, uint3 tri1, uint primID1) -{ - struct TrianglePair q; - q.a.x = tri0.x; - q.a.y = tri0.y; - q.a.z = tri0.z; - q.a.w = tri0.z; - - uint3 b; - b.x = tri1.x; - b.y = tri1.y; - b.z = tri1.z; - - q.lb = (uint3)(3); - - q.lb.x = (b.x == q.a.x) ? 0 : q.lb.x; - q.lb.y = (b.y == q.a.x) ? 0 : q.lb.y; - q.lb.z = (b.z == q.a.x) ? 0 : q.lb.z; - - q.lb.x = (b.x == q.a.y) ? 1 : q.lb.x; - q.lb.y = (b.y == q.a.y) ? 1 : q.lb.y; - q.lb.z = (b.z == q.a.y) ? 1 : q.lb.z; - - q.lb.x = (b.x == q.a.z) ? 2 : q.lb.x; - q.lb.y = (b.y == q.a.z) ? 2 : q.lb.y; - q.lb.z = (b.z == q.a.z) ? 2 : q.lb.z; - - q.lb.x = (primID0 != primID1) ? q.lb.x : 0; - q.lb.y = (primID0 != primID1) ? q.lb.y : 0; - q.lb.z = (primID0 != primID1) ? q.lb.z : 0; - - q.a.w = (q.lb.x == 3) ? b.x : q.a.w; - q.a.w = (q.lb.y == 3) ? b.y : q.a.w; - q.a.w = (q.lb.z == 3) ? b.z : q.a.w; - - return q; -} - -GRL_INLINE float InstanceDesc_get_transform(const InstanceDesc *d, const uint32_t row, const uint32_t column) -{ - return d->Transform[row][column]; -} - -GRL_INLINE uint32_t InstanceDesc_get_instanceID(const InstanceDesc *d) -{ - return d->InstanceIDAndMask & (0x00FFFFFF); -} - -GRL_INLINE uint32_t InstanceDesc_get_InstanceMask(const InstanceDesc *d) -{ - return d->InstanceIDAndMask >> 24; -} - -GRL_INLINE uint32_t InstanceDesc_get_InstanceContributionToHitGroupIndex(const InstanceDesc *d) -{ - return d->InstanceContributionToHitGroupIndexAndFlags & ((1 << 24) - 1); -} - -GRL_INLINE uint32_t InstanceDesc_get_InstanceFlags(const InstanceDesc *d) -{ - return d->InstanceContributionToHitGroupIndexAndFlags >> 24; -} - -GRL_INLINE gpuva_t InstanceDesc_get_AccelerationStructure(const InstanceDesc *d) -{ - return d->AccelerationStructureGPUVA; -} - -GRL_INLINE void InstanceDesc_set_transform(InstanceDesc *d, const uint32_t row, const uint32_t column, float value) -{ - d->Transform[row][column] = value; -} - -GRL_INLINE void InstanceDesc_set_instanceID(InstanceDesc *d, const uint32_t id) -{ - d->InstanceIDAndMask &= 255 << 24; - d->InstanceIDAndMask |= id & ((1 << 24) - 1); -} - -GRL_INLINE void InstanceDesc_set_InstanceMask(InstanceDesc *d, const uint32_t mask) -{ - d->InstanceIDAndMask &= ((1 << 24) - 1); - d->InstanceIDAndMask |= mask << 24; -} - -GRL_INLINE void InstanceDesc_set_InstanceContributionToHitGroupIndex(InstanceDesc *d, const uint32_t contribution) -{ - d->InstanceContributionToHitGroupIndexAndFlags &= 255 << 24; - d->InstanceContributionToHitGroupIndexAndFlags |= contribution & ((1 << 24) - 1); -} - -GRL_INLINE void InstanceDesc_set_InstanceFlags(InstanceDesc *d, const uint32_t flags) -{ - d->InstanceContributionToHitGroupIndexAndFlags &= ((1 << 24) - 1); - d->InstanceContributionToHitGroupIndexAndFlags |= flags << 24; -} - -GRL_INLINE void InstanceDesc_set_AccelerationStructure(InstanceDesc *d, gpuva_t address) -{ - d->AccelerationStructureGPUVA = address; -} diff --git a/src/intel/vulkan/grl/gpu/copy.grl b/src/intel/vulkan/grl/gpu/copy.grl deleted file mode 100644 index 1bb500a4ea0..00000000000 --- a/src/intel/vulkan/grl/gpu/copy.grl +++ /dev/null @@ -1,129 +0,0 @@ -// -// Copyright (C) 
2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module copy; // In copy we assume output data structure to be DXR compatible - -kernel clone_indirect < source="bvh_copy.cl", kernelFunction="clone_indirect" > -kernel compact < source="bvh_copy.cl", kernelFunction="compact" > -kernel serialize_indirect < source="bvh_copy.cl", kernelFunction="serialize_indirect" > -kernel serialize_for_input_dump_indirect < source="bvh_copy.cl", kernelFunction="serialize_for_input_dump_indirect" > -kernel deserialize_indirect < source="bvh_copy.cl", kernelFunction="deserialize_indirect" > -kernel dxr_decode < source="bvh_copy.cl", kernelFunction="dxr_decode" > - -metakernel clone_indirect( - qword dest, - qword src, - qword srcBVHsizedwordAddr) -{ -// this has to be compatible with in kernel GroupCountForCopy(...) - define byteSize REG0; - define numGroupsRqd REG1; - define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255; - define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8; - define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4; - byteSize = load_dword(srcBVHsizedwordAddr); - numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; - numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; - - DISPATCHDIM_X = numGroupsRqd.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect clone_indirect args( - dest, - src); -} - -metakernel compact( - qword dest, - qword src) -{ - dispatch compact(32,1,1) args( - dest, - src, - 32); -} - -metakernel serialize_indirect( - qword dest, - qword src, - qword driverID, - qword srcBVHsizedwordAddr) -{ - define byteSize REG0; - define numGroupsRqd REG1; - define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255; - define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8; - define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4; - byteSize = load_dword(srcBVHsizedwordAddr); - numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; - numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; - DISPATCHDIM_X = numGroupsRqd.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect serialize_indirect args( - dest, - src, - driverID); -} - -metakernel serialize_for_input_dump_indirect( - qword batchPtrs, - qword dstOffset, - qword src, - qword driverID, - qword srcBVHsizedwordAddr) -{ - define byteSize REG0; - define numGroupsRqd REG1; - define BYTE_PER_GROUP_CHUNK_SHIFT REG2; BYTE_PER_GROUP_CHUNK_SHIFT = 8; - define REMINDER_NUM_GROUPS REG3; REMINDER_NUM_GROUPS = 4; - byteSize = load_dword(srcBVHsizedwordAddr); - numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; - numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; - DISPATCHDIM_X = numGroupsRqd.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect serialize_for_input_dump_indirect args( - batchPtrs, - dstOffset, - src, - driverID); -} - -metakernel deserialize_indirect( - qword dest, - qword src, - qword srcBVHsizedwordAddr) -{ - define byteSize REG0; - define numGroupsRqd REG1; - define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255; - define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8; - define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4; - byteSize = load_dword(srcBVHsizedwordAddr); - numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT; - numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS; - DISPATCHDIM_X = numGroupsRqd.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect deserialize_indirect args( - dest, - src); -} - -metakernel 
dxr_decode( - qword dest, - qword src) -{ - dispatch dxr_decode(1,1,1) args( - dest, - src); -} diff --git a/src/intel/vulkan/grl/gpu/d3d12.h b/src/intel/vulkan/grl/gpu/d3d12.h deleted file mode 100644 index 32a7654eac5..00000000000 --- a/src/intel/vulkan/grl/gpu/d3d12.h +++ /dev/null @@ -1,525 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once -#include "GRLStructs.h" -#include "shared.h" - -typedef global void *D3D12_GPU_VIRTUAL_ADDRESS; -typedef void *ID3D12StateObjectPrototype; - -enum DXGI_FORMAT -{ - DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_R32G32B32A32_TYPELESS, - DXGI_FORMAT_R32G32B32A32_FLOAT, - DXGI_FORMAT_R32G32B32A32_UINT, - DXGI_FORMAT_R32G32B32A32_SINT, - DXGI_FORMAT_R32G32B32_TYPELESS, - DXGI_FORMAT_R32G32B32_FLOAT, - DXGI_FORMAT_R32G32B32_UINT, - DXGI_FORMAT_R32G32B32_SINT, - DXGI_FORMAT_R16G16B16A16_TYPELESS, - DXGI_FORMAT_R16G16B16A16_FLOAT, - DXGI_FORMAT_R16G16B16A16_UNORM, - DXGI_FORMAT_R16G16B16A16_UINT, - DXGI_FORMAT_R16G16B16A16_SNORM, - DXGI_FORMAT_R16G16B16A16_SINT, - DXGI_FORMAT_R32G32_TYPELESS, - DXGI_FORMAT_R32G32_FLOAT, - DXGI_FORMAT_R32G32_UINT, - DXGI_FORMAT_R32G32_SINT, - DXGI_FORMAT_R32G8X24_TYPELESS, - DXGI_FORMAT_D32_FLOAT_S8X24_UINT, - DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS, - DXGI_FORMAT_X32_TYPELESS_G8X24_UINT, - DXGI_FORMAT_R10G10B10A2_TYPELESS, - DXGI_FORMAT_R10G10B10A2_UNORM, - DXGI_FORMAT_R10G10B10A2_UINT, - DXGI_FORMAT_R11G11B10_FLOAT, - DXGI_FORMAT_R8G8B8A8_TYPELESS, - DXGI_FORMAT_R8G8B8A8_UNORM, - DXGI_FORMAT_R8G8B8A8_UNORM_SRGB, - DXGI_FORMAT_R8G8B8A8_UINT, - DXGI_FORMAT_R8G8B8A8_SNORM, - DXGI_FORMAT_R8G8B8A8_SINT, - DXGI_FORMAT_R16G16_TYPELESS, - DXGI_FORMAT_R16G16_FLOAT, - DXGI_FORMAT_R16G16_UNORM, - DXGI_FORMAT_R16G16_UINT, - DXGI_FORMAT_R16G16_SNORM, - DXGI_FORMAT_R16G16_SINT, - DXGI_FORMAT_R32_TYPELESS, - DXGI_FORMAT_D32_FLOAT, - DXGI_FORMAT_R32_FLOAT, - DXGI_FORMAT_R32_UINT, - DXGI_FORMAT_R32_SINT, - DXGI_FORMAT_R24G8_TYPELESS, - DXGI_FORMAT_D24_UNORM_S8_UINT, - DXGI_FORMAT_R24_UNORM_X8_TYPELESS, - DXGI_FORMAT_X24_TYPELESS_G8_UINT, - DXGI_FORMAT_R8G8_TYPELESS, - DXGI_FORMAT_R8G8_UNORM, - DXGI_FORMAT_R8G8_UINT, - DXGI_FORMAT_R8G8_SNORM, - DXGI_FORMAT_R8G8_SINT, - DXGI_FORMAT_R16_TYPELESS, - DXGI_FORMAT_R16_FLOAT, - DXGI_FORMAT_D16_UNORM, - DXGI_FORMAT_R16_UNORM, - DXGI_FORMAT_R16_UINT, - DXGI_FORMAT_R16_SNORM, - DXGI_FORMAT_R16_SINT, - DXGI_FORMAT_R8_TYPELESS, - DXGI_FORMAT_R8_UNORM, - DXGI_FORMAT_R8_UINT, - DXGI_FORMAT_R8_SNORM, - DXGI_FORMAT_R8_SINT, - DXGI_FORMAT_A8_UNORM, - DXGI_FORMAT_R1_UNORM, - DXGI_FORMAT_R9G9B9E5_SHAREDEXP, - DXGI_FORMAT_R8G8_B8G8_UNORM, - DXGI_FORMAT_G8R8_G8B8_UNORM, - DXGI_FORMAT_BC1_TYPELESS, - DXGI_FORMAT_BC1_UNORM, - DXGI_FORMAT_BC1_UNORM_SRGB, - DXGI_FORMAT_BC2_TYPELESS, - DXGI_FORMAT_BC2_UNORM, - DXGI_FORMAT_BC2_UNORM_SRGB, - DXGI_FORMAT_BC3_TYPELESS, - DXGI_FORMAT_BC3_UNORM, - DXGI_FORMAT_BC3_UNORM_SRGB, - DXGI_FORMAT_BC4_TYPELESS, - DXGI_FORMAT_BC4_UNORM, - DXGI_FORMAT_BC4_SNORM, - DXGI_FORMAT_BC5_TYPELESS, - DXGI_FORMAT_BC5_UNORM, - DXGI_FORMAT_BC5_SNORM, - DXGI_FORMAT_B5G6R5_UNORM, - DXGI_FORMAT_B5G5R5A1_UNORM, - DXGI_FORMAT_B8G8R8A8_UNORM, - DXGI_FORMAT_B8G8R8X8_UNORM, - DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM, - DXGI_FORMAT_B8G8R8A8_TYPELESS, - DXGI_FORMAT_B8G8R8A8_UNORM_SRGB, - DXGI_FORMAT_B8G8R8X8_TYPELESS, - DXGI_FORMAT_B8G8R8X8_UNORM_SRGB, - DXGI_FORMAT_BC6H_TYPELESS, - DXGI_FORMAT_BC6H_UF16, - DXGI_FORMAT_BC6H_SF16, - DXGI_FORMAT_BC7_TYPELESS, - DXGI_FORMAT_BC7_UNORM, - DXGI_FORMAT_BC7_UNORM_SRGB, - DXGI_FORMAT_AYUV, - DXGI_FORMAT_Y410, - 
DXGI_FORMAT_Y416, - DXGI_FORMAT_NV12, - DXGI_FORMAT_P010, - DXGI_FORMAT_P016, - DXGI_FORMAT_420_OPAQUE, - DXGI_FORMAT_YUY2, - DXGI_FORMAT_Y210, - DXGI_FORMAT_Y216, - DXGI_FORMAT_NV11, - DXGI_FORMAT_AI44, - DXGI_FORMAT_IA44, - DXGI_FORMAT_P8, - DXGI_FORMAT_A8P8, - DXGI_FORMAT_B4G4R4A4_UNORM, - DXGI_FORMAT_P208, - DXGI_FORMAT_V208, - DXGI_FORMAT_V408, - DXGI_FORMAT_FORCE_UINT -}; - -typedef enum D3D12_RAYTRACING_GEOMETRY_FLAGS -{ - D3D12_RAYTRACING_GEOMETRY_FLAG_NONE = 0, - D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE = 0x1, - D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2 -} D3D12_RAYTRACING_GEOMETRY_FLAGS; - -typedef enum D3D12_RAYTRACING_GEOMETRY_TYPE -{ - D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES = 0, - D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS = (D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES + 1) -} D3D12_RAYTRACING_GEOMETRY_TYPE; - -typedef enum D3D12_RAYTRACING_INSTANCE_FLAGS -{ - D3D12_RAYTRACING_INSTANCE_FLAG_NONE = 0, - D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1, - D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2, - D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_OPAQUE = 0x4, - D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8 -} D3D12_RAYTRACING_INSTANCE_FLAGS; - -typedef struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE -{ - D3D12_GPU_VIRTUAL_ADDRESS StartAddress; - unsigned long StrideInBytes; -} D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE; - -typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE -{ - D3D12_GPU_VIRTUAL_ADDRESS StartAddress; - unsigned long SizeInBytes; -} D3D12_GPU_VIRTUAL_ADDRESSRANGE; - -typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE -{ - D3D12_GPU_VIRTUAL_ADDRESS StartAddress; - unsigned long SizeInBytes; - unsigned long StrideInBytes; -} D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE; - -typedef struct D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC -{ - D3D12_GPU_VIRTUAL_ADDRESS Transform; - enum DXGI_FORMAT IndexFormat; - enum DXGI_FORMAT VertexFormat; - unsigned int IndexCount; - unsigned int VertexCount; - D3D12_GPU_VIRTUAL_ADDRESS IndexBuffer; - struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE VertexBuffer; -} D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC; - -typedef struct D3D12_RAYTRACING_AABB -{ - float MinX; - float MinY; - float MinZ; - float MaxX; - float MaxY; - float MaxZ; -} D3D12_RAYTRACING_AABB; - -GRL_INLINE void D3D12_set_raytracing_aabb(D3D12_RAYTRACING_AABB* dest, struct AABB* source) -{ - dest->MinX = source->lower.x; - dest->MinY = source->lower.y; - dest->MinZ = source->lower.z; - dest->MaxX = source->upper.x; - dest->MaxY = source->upper.y; - dest->MaxZ = source->upper.z; -} - -typedef struct D3D12_RAYTRACING_GEOMETRY_AABBS_DESC -{ - unsigned long AABBCount; - D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE AABBs; -} D3D12_RAYTRACING_GEOMETRY_AABBS_DESC; - -typedef struct D3D12_RAYTRACING_GEOMETRY_DESC -{ - D3D12_RAYTRACING_GEOMETRY_TYPE Type; - D3D12_RAYTRACING_GEOMETRY_FLAGS Flags; - //unsigned int ShaderIndex : 24; // extension - //unsigned int Mask : 8; // extension - //unsigned int ShaderIndex_Mask; // extension - union { - D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC Triangles; - D3D12_RAYTRACING_GEOMETRY_AABBS_DESC AABBs; - }; -} D3D12_RAYTRACING_GEOMETRY_DESC; - -GRL_INLINE void D3D12_set_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_TYPE type) -{ - geomDesc->Type = type; -} - -GRL_INLINE D3D12_RAYTRACING_GEOMETRY_TYPE D3D12_get_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->Type; -} - -GRL_INLINE void D3D12_set_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, 
D3D12_RAYTRACING_GEOMETRY_FLAGS flags) -{ - geomDesc->Flags = flags; -} - -GRL_INLINE D3D12_RAYTRACING_GEOMETRY_FLAGS D3D12_get_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->Flags; -} - -GRL_INLINE void D3D12_set_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS transform) -{ - geomDesc->Triangles.Transform = transform; -} - -GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->Triangles.Transform; -} - -GRL_INLINE void D3D12_set_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, IndexFormat format) -{ - switch (format) - { - case INDEX_FORMAT_NONE: - geomDesc->Triangles.IndexFormat = DXGI_FORMAT_UNKNOWN; - break; - case INDEX_FORMAT_R16_UINT: - geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R16_UINT; - break; - case INDEX_FORMAT_R32_UINT: - geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R32_UINT; - break; - } -} - -GRL_INLINE IndexFormat D3D12_get_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - switch (geomDesc->Triangles.IndexFormat) - { - case DXGI_FORMAT_R16_UINT: - return INDEX_FORMAT_R16_UINT; - case DXGI_FORMAT_R32_UINT: - return INDEX_FORMAT_R32_UINT; - case DXGI_FORMAT_UNKNOWN: - default: - return INDEX_FORMAT_NONE; - } -} - -GRL_INLINE void D3D12_set_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, VertexFormat format) -{ - switch (format) - { - case VERTEX_FORMAT_R32G32_FLOAT: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32_FLOAT; - break; - case VERTEX_FORMAT_R32G32B32_FLOAT: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32B32_FLOAT; - break; - case VERTEX_FORMAT_R16G16_FLOAT: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_FLOAT; - break; - case VERTEX_FORMAT_R16G16B16A16_FLOAT: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_FLOAT; - break; - case VERTEX_FORMAT_R16G16_SNORM: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_SNORM; - break; - case VERTEX_FORMAT_R16G16B16A16_SNORM: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_SNORM; - break; - case VERTEX_FORMAT_R16G16B16A16_UNORM: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_UNORM; - break; - case VERTEX_FORMAT_R16G16_UNORM: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_UNORM; - break; - case VERTEX_FORMAT_R10G10B10A2_UNORM: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R10G10B10A2_UNORM; - break; - case VERTEX_FORMAT_R8G8B8A8_UNORM: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_UNORM; - break; - case VERTEX_FORMAT_R8G8_UNORM: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_UNORM; - break; - case VERTEX_FORMAT_R8G8B8A8_SNORM: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_SNORM; - break; - case VERTEX_FORMAT_R8G8_SNORM: - geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_SNORM; - break; - } -} - -GRL_INLINE VertexFormat D3D12_get_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - switch(geomDesc->Triangles.VertexFormat) - { - case DXGI_FORMAT_R32G32_FLOAT: - return VERTEX_FORMAT_R32G32_FLOAT; - case DXGI_FORMAT_R32G32B32_FLOAT: - return VERTEX_FORMAT_R32G32B32_FLOAT; - case DXGI_FORMAT_R16G16_FLOAT: - return VERTEX_FORMAT_R16G16_FLOAT; - case DXGI_FORMAT_R16G16B16A16_FLOAT: - return VERTEX_FORMAT_R16G16B16A16_FLOAT; - case DXGI_FORMAT_R16G16_SNORM: - return VERTEX_FORMAT_R16G16_SNORM; - case DXGI_FORMAT_R16G16B16A16_SNORM: - return VERTEX_FORMAT_R16G16B16A16_SNORM; - case DXGI_FORMAT_R16G16B16A16_UNORM: - 
return VERTEX_FORMAT_R16G16B16A16_UNORM; - case DXGI_FORMAT_R16G16_UNORM: - return VERTEX_FORMAT_R16G16_UNORM; - case DXGI_FORMAT_R10G10B10A2_UNORM: - return VERTEX_FORMAT_R10G10B10A2_UNORM; - case DXGI_FORMAT_R8G8B8A8_UNORM: - return VERTEX_FORMAT_R8G8B8A8_UNORM; - case DXGI_FORMAT_R8G8_UNORM: - return VERTEX_FORMAT_R8G8_UNORM; - case DXGI_FORMAT_R8G8B8A8_SNORM: - return VERTEX_FORMAT_R8G8B8A8_SNORM; - case DXGI_FORMAT_R8G8_SNORM: - return VERTEX_FORMAT_R8G8_SNORM; - default: - return VERTEX_FORMAT_R32G32_FLOAT; - } -} - -GRL_INLINE void D3D12_set_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count) -{ - geomDesc->Triangles.IndexCount = count; -} - -GRL_INLINE unsigned int D3D12_get_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->Triangles.IndexCount; -} - -GRL_INLINE void D3D12_set_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count) -{ - geomDesc->Triangles.VertexCount = count; -} - -GRL_INLINE unsigned int D3D12_get_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->Triangles.VertexCount; -} - -GRL_INLINE void D3D12_set_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS buffer) -{ - geomDesc->Triangles.IndexBuffer = buffer; -} - -GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->Triangles.IndexBuffer; -} - -GRL_INLINE void D3D12_set_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address) -{ - geomDesc->Triangles.VertexBuffer.StartAddress = address; -} - -GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->Triangles.VertexBuffer.StartAddress; -} - -GRL_INLINE void D3D12_set_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride) -{ - geomDesc->Triangles.VertexBuffer.StrideInBytes = stride; -} - -GRL_INLINE unsigned long D3D12_get_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->Triangles.VertexBuffer.StrideInBytes; -} - -GRL_INLINE void D3D12_set_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long count) -{ - geomDesc->AABBs.AABBCount = count; -} - -GRL_INLINE unsigned long D3D12_get_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->AABBs.AABBCount; -} - -GRL_INLINE void D3D12_set_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address) -{ - geomDesc->AABBs.AABBs.StartAddress = address; -} - -GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->AABBs.AABBs.StartAddress; -} - -GRL_INLINE void D3D12_set_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride) -{ - geomDesc->AABBs.AABBs.StrideInBytes = stride; -} - -GRL_INLINE unsigned long D3D12_get_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc) -{ - return geomDesc->AABBs.AABBs.StrideInBytes; -} - -typedef struct D3D12_RAYTRACING_INSTANCE_DESC -{ - float Transform[12]; - // unsigned int InstanceID : 24; - // unsigned int InstanceMask : 8; - uint32_t DW0; - // unsigned int InstanceContributionToHitGroupIndex : 24; - // unsigned int Flags : 8; - uint32_t DW1; - global char 
*AccelerationStructure; -} D3D12_RAYTRACING_INSTANCE_DESC; - -GRL_INLINE float D3D12_get_transform(const D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column) -{ - return d->Transform[row * 4 + column]; -} - -GRL_INLINE uint32_t D3D12_get_instanceID(const D3D12_RAYTRACING_INSTANCE_DESC *d) -{ - return d->DW0 & ((1 << 24) - 1); -} - -GRL_INLINE uint32_t D3D12_get_InstanceMask(const D3D12_RAYTRACING_INSTANCE_DESC *d) -{ - return d->DW0 >> 24; -} - -GRL_INLINE uint32_t D3D12_get_InstanceContributionToHitGroupIndex(const D3D12_RAYTRACING_INSTANCE_DESC *d) -{ - return d->DW1 & ((1 << 24) - 1); -} - -GRL_INLINE uint32_t D3D12_get_InstanceFlags(const D3D12_RAYTRACING_INSTANCE_DESC *d) -{ - return d->DW1 >> 24; -} - -GRL_INLINE gpuva_t D3D12_get_AccelerationStructure(const D3D12_RAYTRACING_INSTANCE_DESC *d) -{ - return (gpuva_t)d->AccelerationStructure; -} - -GRL_INLINE void D3D12_set_transform(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column, float value) -{ - d->Transform[row * 4 + column] = value; -} - -GRL_INLINE void D3D12_set_instanceID(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t id) -{ - d->DW0 &= 255 << 24; - d->DW0 |= id & ((1 << 24) - 1); -} - -GRL_INLINE void D3D12_set_InstanceMask(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t mask) -{ - d->DW0 &= ((1 << 24) - 1); - d->DW0 |= mask << 24; -} - -GRL_INLINE void D3D12_set_InstanceContributionToHitGroupIndex(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t contribution) -{ - d->DW1 &= 255 << 24; - d->DW1 |= contribution & ((1 << 24) - 1); -} - -GRL_INLINE void D3D12_set_InstanceFlags(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t flags) -{ - d->DW1 &= ((1 << 24) - 1); - d->DW1 |= flags << 24; -} - -GRL_INLINE void D3D12_set_AccelerationStructure(D3D12_RAYTRACING_INSTANCE_DESC *d, gpuva_t address) -{ - d->AccelerationStructure = (global char*)address; -} diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl deleted file mode 100644 index d37adbbbb2b..00000000000 --- a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl +++ /dev/null @@ -1,59 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "api_interface.h" - -__attribute__((reqd_work_group_size(1, 1, 1))) void kernel CopyGeom( - global struct Geo *src, - global struct Geo *dst, - global float4 *vec, - global ushort *indices, - dword step) -{ - src = src + get_group_id(0); - dst = dst + get_group_id(0); - dst->Flags = src->Flags; - dst->Type = src->Type; - if (src->Type == GEOMETRY_TYPE_PROCEDURAL) - { - dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride; - dst->Desc.Procedural.AABBCount = src->Desc.Procedural.AABBCount; - dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride; - } - else - { - dst->Desc.Triangles.pTransformBuffer = src->Desc.Triangles.pTransformBuffer; - if (step == 0) - return; - dst->Desc.Triangles.IndexCount = src->Desc.Triangles.IndexCount; - if (step == 1) - return; - dst->Desc.Triangles.VertexCount = src->Desc.Triangles.VertexCount; - if (step == 2) - return; - dst->Desc.Triangles.IndexFormat = src->Desc.Triangles.IndexFormat; - if (step == 3) - return; - dst->Desc.Triangles.pIndexBuffer = src->Desc.Triangles.pIndexBuffer; - if (step == 4) - return; - dst->Desc.Triangles.pVertexBuffer = src->Desc.Triangles.pVertexBuffer; - if (step == 5) - return; - dst->Desc.Triangles.VertexBufferByteStride = 
src->Desc.Triangles.VertexBufferByteStride; - - dst->Desc.Triangles.VertexFormat = src->Desc.Triangles.VertexFormat; - - for (uint t = 0; t * 3 < dst->Desc.Triangles.IndexCount; t++) - { - uint3 tri = GRL_load_triangle(src, t); - vec[t * 3] = GRL_load_vertex(src, tri[0]); - vec[t * 3 + 1] = GRL_load_vertex(src, tri[1]); - vec[t * 3 + 2] = GRL_load_vertex(src, tri[2]); - } - } -} diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl deleted file mode 100644 index 3779439c54b..00000000000 --- a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl +++ /dev/null @@ -1,27 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module api_interface_verify; - -kernel copy_geom < source="grl_api_interface_verify.cl", kernelFunction="CopyGeom" > - -metakernel ifc0_copy( - qword src, - qword dst, - qword vec, - qword srcIndices, - dword numGroups, - dword step) -{ - dispatch copy_geom(numGroups,1,1) args( - src, - dst, - vec, - srcIndices, - step - ); -} diff --git a/src/intel/vulkan/grl/gpu/input_dump.cl b/src/intel/vulkan/grl/gpu/input_dump.cl deleted file mode 100644 index f668f053f1f..00000000000 --- a/src/intel/vulkan/grl/gpu/input_dump.cl +++ /dev/null @@ -1,723 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "api_interface.h" -#include "common.h" -#include "d3d12.h" -#include "mem_utils.h" -#include "misc_shared.h" - -/// Align value to 128 -/// -/// @param value vale to align -/// @return aligned value -GRL_INLINE ulong AlignTo128(ulong value) { return ((value + 127) / 128) * 128; } - -GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs) { - return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch))); -} - -/// Finds max used byte in vertex buffer -/// -/// @param indexBuffPtr pointer to index buffer -/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers -/// @param IndexCount number of indices in index buffer -/// @param IndexFormat index format -/// @param VertexCount number of vertices in vertex buffer -/// @param VertexBufferByteStride vertex buffer byte stride -__attribute__((reqd_work_group_size(256, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel find_max_used_byte_in_buff( - global void* indexBuffPtr, - global uint* vertexBufferUsedByteEnd, - dword IndexCount, - dword IndexFormat, - dword VertexCount, - qword VertexBufferByteStride) -{ - local uint sgMax[16]; - uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0); - - if (IndexFormat != INDEX_FORMAT_NONE) - { - uint endByte = 0; - if (glob_id < IndexCount) - { - if (IndexFormat == INDEX_FORMAT_R16_UINT) - { - global ushort* indexBuffPtrShort = (global ushort*) indexBuffPtr; - endByte = indexBuffPtrShort[glob_id]; - } - else - { - global uint* indexBuffPtrUint = (global uint*) indexBuffPtr; - endByte = indexBuffPtrUint[glob_id]; - } - } - - endByte = sub_group_reduce_max(endByte); - - if (get_sub_group_local_id() == 0) { sgMax[get_sub_group_id()] = endByte; } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_sub_group_id() == 0) - { - endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]); - if (get_sub_group_local_id() == 0) - { - endByte = min(endByte, VertexCount); - if (endByte < VertexCount && IndexCount != 0) - ++endByte; - endByte *= (dword)VertexBufferByteStride; - atomic_max(vertexBufferUsedByteEnd, endByte); - } - } - } - else 
if (glob_id == 0) - { - uint endByte = VertexCount * VertexBufferByteStride; - atomic_max(vertexBufferUsedByteEnd, endByte); - } -} - -/// Allocates buffer for vertices -/// -/// @param batchPtrs batch pointers struct -/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers -/// @param vertexBufferOffset pointer to offsets to vertex buffers -/// @param numVertexBuffers number of vertex buffers -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel allocate_linear_offsets_for_vertex_buffers( - global InputBatchPtrs* batchPtrs, - global uint* vertexBufferUsedByteEnd, - global uint* vertexBufferOffset, - dword numVertexBuffers) -{ - uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id(); - - if (glob_id < numVertexBuffers) - { - uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]); - uint position = atomic_add_global( &batchPtrs->vertexBuffersSize, numBytes); - vertexBufferOffset[glob_id] = position; - } -} - -/// Sets the dst data space for input dump of this batch -/// -/// @param inputDumpMainBuffer pointer to main dump buffer -/// @param batchPtrs batch pointers struct -/// @param nonVertexSize size of non vertex data -/// @param batchIdPtr pointer to batch id -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel allocate_data_space_for_inputs( - global DebugBufferHeader* inputDumpMainBuffer, - global InputBatchPtrs* batchPtrs, - uint nonVertexSize, - global qword* batchIdPtr) -{ - if (get_sub_group_local_id() == 0) - { - uint vertexBufferSize = batchPtrs->vertexBuffersSize; - uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize; - - if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2)) - { - inputDumpMainBuffer->overflow = 1; - batchPtrs->dumpDst = 0; - batchPtrs->globalDumpBuffer = 0; - batchPtrs->nonVertexDataStart = 0; - batchPtrs->totalSize = 0; - return; - } - - dword prevHead = inputDumpMainBuffer->gpuHead; - dword newHead; - bool circled; - - do - { - circled = false; - newHead = prevHead + sizeOfThisBatch; - dword bufferBegin = prevHead; - if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize) - { - circled = true; - newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch; - bufferBegin = inputDumpMainBuffer->headStart; - } - dword bufferEnd = newHead + sizeof(InputBatch); - - uint tail; - uint tail2 = 7; - bool wait; - do - { - wait = true; - tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0); - - // dead code, workaround so IGC won't move tail load out of loop - if (tail > inputDumpMainBuffer->totalSize) - { - store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2); - tail2 = tail; - } - - if( prevHead >= tail ) - { - //colision example: - // ----------T=======H------------ - // -------B=====E----------------- - // - if((bufferEnd < tail) || (bufferBegin >= prevHead)) - { - wait = false; - } - } - else - { - //colision example: - // ==========H-------T============ - // B==============E--------------- - // caution: we will never have H circled completely so that H == T - if((bufferEnd < tail) && (bufferBegin >= prevHead)) - { - wait = false; - } - } - } while (wait); - } while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead)); - - if (circled) - { - global InputBatch* endBufferOp = (global 
InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead); - endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER; - prevHead = inputDumpMainBuffer->headStart; - } - - global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead; - batchPtrs->dumpDst = (qword)thisBatchDump; - batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer; - batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize); - batchPtrs->totalSize = sizeOfThisBatch; - - global InputBatch* batchOp = (global InputBatch*) thisBatchDump; - batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH; - batchOp->header.opHeader.endOfData = sizeOfThisBatch; - batchOp->vertexBufferDataSize = vertexBufferSize; - batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize; - batchOp->batchId = *batchIdPtr; - } -} - -/// Sets the dst data space for output dump of this batch -/// -/// @param outputDumpMainBuffer pointer to main dump buffer -/// @param batchPtrs batch pointers struct -/// @param batchIdPtr pointer to batch id -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel allocate_data_space_for_outputs( - global DebugBufferHeader* outputDumpMainBuffer, - global OutputBatchPtrs* batchPtrs, - global qword* batchIdPtr) -{ - if (get_sub_group_local_id() == 0) - { - uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize; - - if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2)) - { - outputDumpMainBuffer->overflow = 1; - batchPtrs->dumpDst = 0; - batchPtrs->dataStart = 0; - batchPtrs->totalSize = 0; - return; - } - - dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead)); - dword newHead; - bool circled; - - do - { - //mem_fence_gpu_invalidate(); - //prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead)); - circled = false; - newHead = prevHead + sizeOfThisBatch; - dword bufferBegin = prevHead; - if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize) - { - circled = true; - newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch; - bufferBegin = outputDumpMainBuffer->headStart; - } - dword bufferEnd = newHead + sizeof(OutputBatch); - - uint tail; - uint tail2 = 7; - bool wait; - do - { - wait = true; - tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0); - - // dead code, workaround so IGC won't move tail load out of loop - if (tail > outputDumpMainBuffer->totalSize) - { - store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2); - tail2 = tail; - } - - if( prevHead >= tail ) - { - //colision example: - // ----------T=======H------------ - // -------B=====E----------------- - // - if((bufferEnd < tail) || (bufferBegin >= prevHead)) - { - wait = false; - } - } - else - { - //colision example: - // ==========H-------T============ - // B==============E--------------- - // caution: we will never have H circled completely so that H == T - if((bufferEnd < tail) && (bufferBegin >= prevHead)) - { - wait = false; - } - } - } while (wait); - } while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead)); - - if (circled) - { - global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead); - endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER; - prevHead = outputDumpMainBuffer->headStart; - } - - 
global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead; - batchPtrs->dumpDst = (qword)thisBatchDump; - batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch))); - batchPtrs->totalSize = sizeOfThisBatch; - - global OutputBatch* batchOp = (global OutputBatch*) thisBatchDump; - batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH; - batchOp->header.opHeader.endOfData = sizeOfThisBatch; - batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch)); - batchOp->batchId = *batchIdPtr; - } -} - -/// Calculates sum of output sizes -/// -/// @param pbi pointer to post build infos -/// @param destOffset offset in dest buffer -/// @param numOutputs number of outputs -/// @param batchPtrs batch pointers struct -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel calc_outputs_data_size( - global PostbuildInfoSerializationDesc* pbi, - global dword* destOffsets, - qword numOutputs, - global OutputBatchPtrs* batchPtrs) -{ - uint offset = 0; - for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH) - { - uint size = 0; - if (i < numOutputs) - { - size = AlignTo128(pbi[i].SerializedSizeInBytes); - size += AlignTo128(sizeof(OutputData)); - destOffsets[i] = offset + sub_group_scan_exclusive_add(size); - } - offset += sub_group_reduce_add(size); - } - if (get_sub_group_local_id() == 0) - batchPtrs->dataSize = offset; -} - -/// Adds output data operation to batch -/// -/// @param batchPtrs batch pointers struct -/// @param destOffset offset in dest buffer -/// @param src pointer to source bvh -/// @param pbi pointer to post build info -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel write_output_data_op( - global OutputBatchPtrs* batchPtrs, - global dword* destOffset, - qword src, - global PostbuildInfoSerializationDesc* pbi) -{ - if (batchPtrs->dataStart == 0) - return; - - global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset); - out->header.operationType = OUTPUT_DUMP_OP_DATA; - out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes); - out->srcBvhPtr = src; -} - -/// Writes indices and transform or procedurals data -/// -/// @param batchPtrs batch pointers struct -/// @param srcDesc description of source geometry -/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer -/// @param dstDescOffset offset to dest geo desc -/// @param dstDataOffset offset to dest geo data -/// @param numThreads number of threads -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel write_geo_data( - global InputBatchPtrs* batchPtrs, - global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc, - global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers, - global uint* pVertexBufferSize, - qword dstDescOffset, - qword dstDataOffset, - dword numThreads) -{ - if (batchPtrs->dumpDst == 0) return; - - uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); - - GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc; - - global char* dstDataPtr = (global char*)( - batchPtrs->nonVertexDataStart + dstDataOffset); - - global char* srcDataPtr; - global char* dstTransform; - uint bytesToCopy = 0; - - if (geoDescToStore.Type == 
GEOMETRY_TYPE_TRIANGLES) - { - uint sizeOfMatrix = 0; - - if (geoDescToStore.Desc.Triangles.pTransformBuffer) - { - sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float)); - if (glob_id < 12) - { - global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer; - global float* matrixDst = (global float*)dstDataPtr; - matrixDst[glob_id] = matrixSrc[glob_id]; - if (glob_id == 0) - { - geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer; - } - } - } - - dstDataPtr += sizeOfMatrix; - srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer; - - bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount); - - if (bytesToCopy && (glob_id == 0)) - { - qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers); - // for this we remember offset relative to global debug buffer - geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer; - geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer; - geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride; - } - else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE && geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0) - { - if (geoDescToStore.Desc.Triangles.pVertexBuffer) - { - qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers); - // for this we remember offset relative to global debug buffer - geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer; - } - } - else if (glob_id == 0) - { - geoDescToStore.Desc.Triangles.IndexCount = 0; - geoDescToStore.Desc.Triangles.VertexCount = 0; - geoDescToStore.Desc.Triangles.pVertexBuffer = 0; - geoDescToStore.Desc.Triangles.pIndexBuffer = 0; - } - } - else - { - srcDataPtr = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA; - bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount); - if (glob_id == 0) - { - geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer; - } - } - - if (bytesToCopy) - { - CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads); - } - - if (glob_id == 0) - { - global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr = (global GRL_RAYTRACING_GEOMETRY_DESC*)( - batchPtrs->nonVertexDataStart + dstDescOffset); - *dstDescPtr = geoDescToStore; - } -} - -/// Adds build operation to batch -/// -/// @param batchPtrs batch pointers struct -/// @param buildOpOffset offset in dst buffer -/// @param srcBvh address of src bvh (in case of update) -/// @param dstBvhAddr address of dest bvh buffer -/// @param offsetToEnd offset to end of this operation -/// @param flags build flags -/// @param numGeometries number of geometries in build -/// @param numInstances number of instances in build -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel write_input_build_op( - global InputBatchPtrs* batchPtrs, - qword buildOpOffset, - qword srcBvh, - qword dstBvhAddr, - dword offsetToEnd, - dword flags, - dword numGeometries, - dword numInstances, - dword instArrayOfPtrs) -{ - uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); - if (batchPtrs->dumpDst == 0 || 
glob_id != 0) return; - - global InputBuild* buildOp = (global InputBuild*)( - batchPtrs->nonVertexDataStart + buildOpOffset); - buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD; - buildOp->header.endOfData = offsetToEnd; - buildOp->dstBvhPtr = dstBvhAddr; - buildOp->srcBvhPtr = srcBvh; - buildOp->flags = flags; - buildOp->numGeos = numGeometries; - buildOp->numInstances = numInstances; - buildOp->instArrayOfPtrs = instArrayOfPtrs; -} - -/// Copies instance description -/// -/// @param batchPtrs batch pointers struct -/// @param instanceDescArr inst desc source -/// @param offset ptr to offset in dst buffer -/// @param numInstances number of instances to copy -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel -copy_instance_descriptors_array( - global InputBatchPtrs* batchPtrs, - global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr, - qword offset, - dword numInstances) -{ - uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); - if (batchPtrs->dumpDst == 0) return; - - global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC* )( - batchPtrs->nonVertexDataStart + offset); - - if (glob_id < numInstances) - { - dst[glob_id] = instanceDescArr[glob_id]; - } -} - -/// Copies instance description, array of pointers version -/// -/// @param batchPtrs batch pointers struct -/// @param pInstanceDescPtrsArr inst desc source -/// @param offset ptr to offset in dst buffer -/// @param numInstances number of instances to copy -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel -copy_instance_descriptors_array_of_ptrs( - global InputBatchPtrs* batchPtrs, - global qword* pInstanceDescPtrsArr, - qword offset, - dword numInstances) -{ - uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); - if (batchPtrs->dumpDst == 0) return; - - // save gpuva of instance descs for debug - global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset); - - global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)( - batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset); - global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr = (global GRL_RAYTRACING_INSTANCE_DESC **)pInstanceDescPtrsArr; - - if (glob_id < numInstances) - { - gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id]; - dst[glob_id] = *(instanceDescPtrsArr[glob_id]); - } -} - -/// Adds copy operation to batch -/// -/// @param batchPtrs batch pointers struct -/// @param offset ptr to offset in dst buffer -/// @param src copy source pointer -/// @param dst copy destination pointer -/// @param copyOpType copy type -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel insert_copy_op( - global InputBatchPtrs* batchPtrs, - qword offset, - global void* src, - global void* dst, - uint copyOpType) -{ - uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id(); - if (batchPtrs->dumpDst == 0 || glob_id != 0) return; - - global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset); - - copyOp->header.operationType = copyOpType; - copyOp->header.endOfData = AlignTo128(sizeof(InputCopy)); - copyOp->srcBvhPtr = (qword)src; - copyOp->dstBvhPtr = (qword)dst; -} - -/// Copies vertex 
buffer -/// -/// @param batchPtrs batch pointers struct -/// @param src input buffer -/// @param offset ptr to offset in dst buffer -/// @param size ptr to number of bytes to copy -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_vertex_data( - global InputBatchPtrs* batchPtrs, - global const char* src, - global const uint* offset, - global const uint* size) -{ - if (batchPtrs->dumpDst == 0) return; - - global char *dst = (global char *)(GetVertexBuffersStart(batchPtrs) + *offset); - uint numGroups = (*size >> 6) + 1; - CopyMemory(dst, src, *size, numGroups); -} - -/// Generate unique batch id -/// -/// @param batchIds array of unique batch ids -/// @param index index of batch id to generate -__attribute__((reqd_work_group_size(1, 1, 1))) -void kernel generate_unique_batch_id(global unsigned long *batchIds, unsigned int index) { - global unsigned int *counterPtrs = (global unsigned int *)batchIds; - atomic_add(&counterPtrs[index * 2 + 1], 1); - batchIds[index] |= (unsigned long)index; -} - -/// Sets batch as ready to read and moves cpuHead forward, inputs case -/// -/// @param batchPtrs batch pointers struct -/// @param dumpMainBuffer pointer to main dump buffer -__attribute__((reqd_work_group_size(1, 1, 1))) -void kernel finish_batch_dump_inputs( - global InputBatchPtrs* batchPtrs, - global DebugBufferHeader* dumpMainBuffer) -{ - if (batchPtrs->dumpDst == 0) - return; - - global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst; - - dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer); - - dword seven = 7; - while (true) - { - dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0); - if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop - { - store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven); - currentHead = seven; - } - - if (currentHead == myDstOffset) - { - mem_fence_evict_to_memory(); - dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData; - break; - } - else if (myDstOffset == dumpMainBuffer->headStart) - { - global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead); - if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER) - { - mem_fence_evict_to_memory(); - dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData; - break; - } - } - } -} - -/// Sets batch as ready to read and moves cpuHead forward, outputs case -/// -/// @param batchPtrs batch pointers struct -/// @param dumpMainBuffer pointer to main dump buffer -__attribute__((reqd_work_group_size(1, 1, 1))) -void kernel finish_batch_dump_outputs( - global OutputBatchPtrs* batchPtrs, - global DebugBufferHeader* dumpMainBuffer) -{ - if (batchPtrs->dumpDst == 0) - return; - - global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst; - - dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer); - - dword seven = 7; - while (true) - { - dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0); - if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop - { - store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven); - currentHead = seven; - } - - if (currentHead == myDstOffset) - { - mem_fence_evict_to_memory(); - dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData; - 
break; - } - else if (myDstOffset == dumpMainBuffer->headStart) - { - global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead); - if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER) - { - mem_fence_evict_to_memory(); - dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData; - break; - } - } - } -} diff --git a/src/intel/vulkan/grl/gpu/input_dump.grl b/src/intel/vulkan/grl/gpu/input_dump.grl deleted file mode 100644 index 7cc6e60a95d..00000000000 --- a/src/intel/vulkan/grl/gpu/input_dump.grl +++ /dev/null @@ -1,252 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module input_dump; - -kernel_module input_dumper("input_dump.cl") -{ - links lsc_intrinsics; - - kernel opencl_kernel_find_max_used_byte_in_buff < kernelFunction="find_max_used_byte_in_buff" >; - kernel opencl_kernel_allocate_linear_offsets_for_vertex_buffers < kernelFunction="allocate_linear_offsets_for_vertex_buffers" >; - kernel opencl_kernel_allocate_data_space_for_inputs < kernelFunction="allocate_data_space_for_inputs" >; - kernel opencl_kernel_allocate_data_space_for_outputs < kernelFunction="allocate_data_space_for_outputs" >; - kernel opencl_kernel_calc_outputs_data_size < kernelFunction="calc_outputs_data_size" >; - kernel opencl_kernel_write_output_data_op < kernelFunction="write_output_data_op" >; - kernel opencl_kernel_write_geo_data < kernelFunction="write_geo_data" >; - kernel opencl_kernel_write_input_build_op < kernelFunction="write_input_build_op" >; - kernel opencl_kernel_copy_instance_descriptors_array < kernelFunction="copy_instance_descriptors_array" >; - kernel opencl_kernel_copy_instance_descriptors_array_of_ptrs < kernelFunction="copy_instance_descriptors_array_of_ptrs" >; - kernel opencl_kernel_insert_copy_op < kernelFunction="insert_copy_op" >; - kernel opencl_kernel_copy_vertex_data < kernelFunction="copy_vertex_data" >; - kernel opencl_kernel_generate_unique_batch_id < kernelFunction="generate_unique_batch_id" >; - kernel opencl_kernel_finish_batch_dump_inputs < kernelFunction="finish_batch_dump_inputs" >; - kernel opencl_kernel_finish_batch_dump_outputs < kernelFunction="finish_batch_dump_outputs" >; -} - - -metakernel find_max_used_byte_in_buff( - qword indexBuffPtr, - qword vertexBufferUsedByteEnd, - dword IndexCount, - dword IndexFormat, - dword VertexCount, - qword VertexBufferByteStride, - dword numPhysThreads) -{ - dispatch opencl_kernel_find_max_used_byte_in_buff(numPhysThreads, 1, 1) args( - indexBuffPtr, - vertexBufferUsedByteEnd, - IndexCount, - IndexFormat, - VertexCount, - VertexBufferByteStride); -} - -metakernel allocate_linear_offsets_for_vertex_buffers( - qword batchPtrs, - qword m_VertexBufferUsedByteEnd, - qword m_VertexBufferOffset, - dword numVertexBuffers, - dword numPhysThreads) -{ - dispatch opencl_kernel_allocate_linear_offsets_for_vertex_buffers(numPhysThreads, 1, 1) args( - batchPtrs, - m_VertexBufferUsedByteEnd, - m_VertexBufferOffset, - numVertexBuffers); -} - -metakernel allocate_data_space_for_inputs( - qword inputDumpMainBuffer, - qword batchPtrs, - dword nonVertexSize, - qword batchIdPtr) -{ - dispatch opencl_kernel_allocate_data_space_for_inputs(1, 1, 1) args( - inputDumpMainBuffer, - batchPtrs, - nonVertexSize, - batchIdPtr); -} - -metakernel allocate_data_space_for_outputs( - qword inputDumpMainBuffer, - qword batchPtrs, - qword batchIdPtr) -{ - dispatch opencl_kernel_allocate_data_space_for_outputs(1, 1, 1) 
args( - inputDumpMainBuffer, - batchPtrs, - batchIdPtr); -} - -metakernel calc_outputs_data_size( - qword pbi, - qword destOffsets, - qword numOutputs, - qword batchPtrs) -{ - dispatch opencl_kernel_calc_outputs_data_size(1, 1, 1) args( - pbi, - destOffsets, - numOutputs, - batchPtrs); -} - -metakernel write_output_data_op( - qword batchPtrs, - qword destOffset, - qword src, - qword pbi) -{ - dispatch opencl_kernel_write_output_data_op(1, 1, 1) args( - batchPtrs, - destOffset, - src, - pbi); -} - -metakernel write_geo_data( - qword batchPtrs, - qword srcDesc, - qword pVertexBufferOffsetInLinearisedUniqueVertexBuffers, - qword pVertexBufferSize, - qword dstDescOffset, - qword dstDataOffset, - dword numThreads) -{ - dispatch opencl_kernel_write_geo_data(numThreads, 1, 1) args( - batchPtrs, - srcDesc, - pVertexBufferOffsetInLinearisedUniqueVertexBuffers, - pVertexBufferSize, - dstDescOffset, - dstDataOffset, - numThreads); -} - -metakernel write_input_build_op( - qword batchPtrs, - qword buildOpOffset, - qword srcBvh, - qword dstBvhAddr, - dword offsetToEnd, - dword flags, - dword numGeometries, - dword numInstances, - dword instArrayOfPtrs) - -{ - dispatch opencl_kernel_write_input_build_op(1, 1, 1) args( - batchPtrs, - buildOpOffset, - srcBvh, - dstBvhAddr, - offsetToEnd, - flags, - numGeometries, - numInstances, - instArrayOfPtrs); -} - -metakernel copy_instance_descriptors_array( - qword batchPtrs, - qword instanceDescArr, - qword offset, - dword numInstances, - dword numPhysThreads) -{ - dispatch opencl_kernel_copy_instance_descriptors_array(numPhysThreads, 1, 1) args( - batchPtrs, - instanceDescArr, - offset, - numInstances); -} - -metakernel copy_instance_descriptors_array_of_ptrs( - qword batchPtrs, - qword instanceDescArrPtrs, - qword offset, - dword numInstances, - dword numPhysThreads) -{ - dispatch opencl_kernel_copy_instance_descriptors_array_of_ptrs(numPhysThreads, 1, 1) args( - batchPtrs, - instanceDescArrPtrs, - offset, - numInstances); -} - -metakernel insert_copy_op( - qword batchPtrs, - qword offset, - qword src, - qword dst, - dword type) -{ - dispatch opencl_kernel_insert_copy_op(1, 1, 1) args( - batchPtrs, - offset, - src, - dst, - type); -} - -metakernel copy_vertex_data( - qword desc, - qword src, - qword offset, - qword size) -{ - define byteSize REG0; - define numGroupsRqd REG1; - define shift REG2; - define minimum REG3; - - shift = 6; - minimum = 1; - byteSize = load_dword(size); - numGroupsRqd = byteSize >> shift; - numGroupsRqd = numGroupsRqd + minimum; - DISPATCHDIM_X = numGroupsRqd.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_copy_vertex_data args( - desc, - src, - offset, - size); -} - -metakernel generate_unique_batch_id( - qword batchIds, - dword batchIndex) -{ - dispatch opencl_kernel_generate_unique_batch_id(1, 1, 1) args( - batchIds, - batchIndex); -} - -metakernel finish_batch_dump_inputs( - qword batchPtrs, - qword dumpMainBuffer) -{ - dispatch opencl_kernel_finish_batch_dump_inputs(1, 1, 1) args( - batchPtrs, - dumpMainBuffer); -} - -metakernel finish_batch_dump_outputs( - qword batchPtrs, - qword dumpMainBuffer) -{ - dispatch opencl_kernel_finish_batch_dump_outputs(1, 1, 1) args( - batchPtrs, - dumpMainBuffer); -} diff --git a/src/intel/vulkan/grl/gpu/instance.h b/src/intel/vulkan/grl/gpu/instance.h deleted file mode 100644 index e463a01dc90..00000000000 --- a/src/intel/vulkan/grl/gpu/instance.h +++ /dev/null @@ -1,183 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT 
-// -// - -#pragma once - -#include "shared.h" -#include "affinespace.h" -#include "api_interface.h" -#include "qbvh6.h" -#include "libs/lsc_intrinsics.h" - -GRL_INLINE uint32_t HwInstanceLeafPart1_getInstanceIndex(struct HwInstanceLeaf *I) -{ - return I->part1.instanceIndex; -} - -GRL_INLINE void encodeDW0_HwInstanceLeafPart0( - uint32_t shaderIndex, - uint32_t geomMask, - uint4 *dst) -{ - (*dst).x = (shaderIndex & ((1 << 24) - 1)) | - (geomMask << 24); -} - -GRL_INLINE void encodeDW1_HwInstanceLeafPart0( - uint32_t instanceContributionToHitGroupIndex, - uint32_t notProcedural, - uint32_t geomFlags, - uint4* dst) -{ - (*dst).y = (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) | - ((notProcedural & 1) << (24 + 5)) | - ((geomFlags & 3) << (24 + 5 + 1)); -} - -GRL_INLINE void encodeDW2DW3_HwInstanceLeafPart0( - uint64_t rootNodePtr, - uint32_t instFlags, - uint4* dst) -{ - uint64_t flags = instFlags; - uint DW2 = (uint)rootNodePtr; - uint DW3 = ((uint)(rootNodePtr >> 32ul) & 0xffff); - DW3 |= flags << 16ull; - (*dst).z = DW2; - (*dst).w = DW3; -} - -GRL_INLINE void HwInstanceLeafPart0_setDW0(struct HwInstanceLeaf *I, - uint32_t shaderIndex, - uint32_t geomMask) -{ - I->part0.DW0 = - (shaderIndex & ((1 << 24) - 1)) | - (geomMask << 24); -} - -GRL_INLINE void HwInstanceLeafPart0_setDW1(struct HwInstanceLeaf *I, - uint32_t instanceContributionToHitGroupIndex, - uint32_t notProcedural, - uint32_t geomFlags) -{ - I->part0.DW1 = - (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) | - ((notProcedural & 1) << (24 + 5)) | - ((geomFlags & 3) << (24 + 5 + 1)); -} - -GRL_INLINE void HwInstanceLeafPart1_setDW0DW1(struct HwInstanceLeaf *I, - global char *pBvhPtr) -{ - I->part1.DW0_DW1 = ((uint64_t)pBvhPtr) & (((uint64_t)1 << 48) - 1); -} - -GRL_INLINE void HwInstanceLeafPart0_setDW2DW3(struct HwInstanceLeaf *I, - uint64_t rootNodePtr, - uint32_t instFlags) -{ - uint64_t flags = instFlags; - flags = flags << 48ull; - uint64_t ptr = rootNodePtr & 0x0000ffffffffffff; - I->part0.DW2_DW3 = ptr + flags; -} - -GRL_INLINE void HwInstanceLeaf_Constructor(global struct HwInstanceLeaf* leaf, - global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc, - uint instanceIndex, - uint rootNodeByteOffset, - uint instanceMask) -{ - global uint4* InstanceLeaf_4DWparts = (global uint4*) (leaf); - - struct AffineSpace3f obj2world = AffineSpace3f_load_row_major(instDesc->Transform); - - qword accStructPtr = (qword)instDesc->AccelerationStructure; - uint4 p1_DW0_3 = (uint4)( - (uint)accStructPtr, - (uint)(accStructPtr >> (uint64_t)32), - GRL_get_instanceID(instDesc), - instanceIndex); - - struct AffineSpace3f world2obj = AffineSpace3f_invert(obj2world); - - store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 0 /*part1 + 0DW*/, p1_DW0_3); - - uint4 p1_DW4_7 = (uint4)( - as_uint(obj2world.l.vx.x), - as_uint(obj2world.l.vx.y), - as_uint(obj2world.l.vx.z), - as_uint(obj2world.l.vy.x)); - - store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 1 /*part1 + 4DW*/, p1_DW4_7); - - uint4 p1_DW8_11 = (uint4)( - as_uint(obj2world.l.vy.y), - as_uint(obj2world.l.vy.z), - as_uint(obj2world.l.vz.x), - as_uint(obj2world.l.vz.y)); - - store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 2 /*part1 + 8DW*/, p1_DW8_11); - - - uint4 p1_DW12_15 = (uint4)( - as_uint(obj2world.l.vz.z), - as_uint(world2obj.p.x), - as_uint(world2obj.p.y), - as_uint(world2obj.p.z)); - - store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 3 /*part1 + 12DW*/, p1_DW12_15); - - - uint hit_group_index = GRL_get_InstanceContributionToHitGroupIndex(instDesc); - global struct 
BVHBase* bvh = (global struct BVHBase*)instDesc->AccelerationStructure; - - uint4 p0_DW0_3; - - encodeDW0_HwInstanceLeafPart0( - hit_group_index, - instanceMask, - &p0_DW0_3); - - encodeDW1_HwInstanceLeafPart0( - hit_group_index, // for HW instance leaf, this field is used to offset the hit-group index - 1, // disable opaque culling.. Necessary for SW instancing.. don't-care for HW instancing - 0, - &p0_DW0_3); - - encodeDW2DW3_HwInstanceLeafPart0( - rootNodeByteOffset == NO_NODE_OFFSET ? 0 : ((uint64_t)bvh) + rootNodeByteOffset, // offset NO_NODE_OFFSET is for degenerated instance, put null as root pointer - GRL_get_InstanceFlags(instDesc), - &p0_DW0_3); - - store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 0 /*part0 + 0DW*/, p0_DW0_3); - - uint4 p0_DW4_7 = (uint4)( - as_uint(world2obj.l.vx.x), - as_uint(world2obj.l.vx.y), - as_uint(world2obj.l.vx.z), - as_uint(world2obj.l.vy.x)); - - store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 1 /*part0 + 4DW*/, p0_DW4_7); - - uint4 p0_DW8_11 = (uint4)( - as_uint(world2obj.l.vy.y), - as_uint(world2obj.l.vy.z), - as_uint(world2obj.l.vz.x), - as_uint(world2obj.l.vz.y)); - - store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 2 /*part0 + 8DW*/, p0_DW8_11); - - uint4 p0_DW12_15 = (uint4)( - as_uint(world2obj.l.vz.z), - as_uint(obj2world.p.x), - as_uint(obj2world.p.y), - as_uint(obj2world.p.z)); - - store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 3 /*part0 + 12DW*/, p0_DW12_15); -} diff --git a/src/intel/vulkan/grl/gpu/intrinsics.h b/src/intel/vulkan/grl/gpu/intrinsics.h deleted file mode 100644 index 0dff3147d8a..00000000000 --- a/src/intel/vulkan/grl/gpu/intrinsics.h +++ /dev/null @@ -1,581 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -// TODO: AABB_work_group_reduce is super slow, remove !!! - -#pragma cl_intel_subgroups : enable -#pragma cl_khr_fp16 : enable -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - - -uint intel_sub_group_ballot(bool valid); - -// atom_min -float __attribute__((overloadable)) atom_min(volatile __global float *p, float val); -float __attribute__((overloadable)) atom_min(volatile __local float *p, float val); -float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val); -float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val); -// atom_max -float __attribute__((overloadable)) atom_max(volatile __global float *p, float val); -float __attribute__((overloadable)) atom_max(volatile __local float *p, float val); -float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val); -float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val); -// atom_cmpxchg -float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val); -float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val); -float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val); -float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val); - - - -inline uint subgroup_single_atomic_add(global uint *p, uint val) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const int v = subgroupLocalID == 0 ? 
atomic_add(p, val) : 0; - return sub_group_broadcast(v, 0); -} - -inline float halfarea(const float3 d) -{ - return fma(d.x, (d.y + d.z), d.y * d.z); -} - -inline float area(const float3 d) -{ - return halfarea(d) * 2.0f; -} - -inline uint maxDim(const float3 a) -{ - const float3 b = fabs(a); - const bool b_x_y = b.x > b.y; - const float cur_max = b_x_y ? b.x : b.y; - const uint cur_idx = b_x_y ? 0 : 1; - const bool b_x_y_z = b.z > cur_max; - return b_x_y_z ? 2 : cur_idx; -} - -inline uint3 sortByMaxDim(const float3 a) -{ - const uint kz = maxDim(a); - const uint _kx = (kz + 1) % 3; - const uint _ky = (_kx + 1) % 3; - const bool kz_pos = a[kz] >= 0.0f; - const uint kx = kz_pos ? _ky : _kx; - const uint ky = kz_pos ? _kx : _ky; - return (uint3)(kx, ky, kz); -} - -inline uint4 sort4_ascending(const uint4 dist) -{ - const uint a0 = dist.s0; - const uint a1 = dist.s1; - const uint a2 = dist.s2; - const uint a3 = dist.s3; - const uint b0 = min(a0, a2); - const uint b1 = min(a1, a3); - const uint b2 = max(a0, a2); - const uint b3 = max(a1, a3); - const uint c0 = min(b0, b1); - const uint c1 = max(b0, b1); - const uint c2 = min(b2, b3); - const uint c3 = max(b2, b3); - const uint d0 = c0; - const uint d1 = min(c1, c2); - const uint d2 = max(c1, c2); - const uint d3 = c3; - return (uint4)(d0, d1, d2, d3); -} - -__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6}; -__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4}; -__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6}; -__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0}; -__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5}; -__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6}; -__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6}; - -__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1}; -__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1}; -__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1}; - -__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1}; - -inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask) -{ - const uint a1 = intel_sub_group_shuffle(a0, shuffleMask); - const uint a_min = min(a0, a1); - const uint a_max = max(a0, a1); - return select(a_max, a_min, selectMask); -} - -inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask) -{ - const uint a1 = intel_sub_group_shuffle(a0, shuffleMask); - const uint a_min = min(a0, a1); - const uint a_max = max(a0, a1); - return select(a_min, a_max, selectMask); -} - -inline uint sort8_descending(const uint aa) -{ - const unsigned int slotID = get_sub_group_local_id() % 8; - const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]); - const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]); - const uint dd = compare_exchange_descending(cc, shuffleC[slotID], selAA[slotID]); - const uint ee = compare_exchange_descending(dd, shuffleD[slotID], selF0[slotID]); - const uint ff = compare_exchange_descending(ee, shuffleE[slotID], selCC[slotID]); - const uint gg = compare_exchange_descending(ff, shuffleF[slotID], selAA[slotID]); - return gg; -} - -inline uint sort8_ascending(const uint aa) -{ - const unsigned int slotID = get_sub_group_local_id() % 8; - const uint bb = compare_exchange_ascending(aa, shuffleA[slotID], selAA[slotID]); - const uint cc = compare_exchange_ascending(bb, shuffleB[slotID], selCC[slotID]); - const uint dd = compare_exchange_ascending(cc, shuffleC[slotID], 
selAA[slotID]); - const uint ee = compare_exchange_ascending(dd, shuffleD[slotID], selF0[slotID]); - const uint ff = compare_exchange_ascending(ee, shuffleE[slotID], selCC[slotID]); - const uint gg = compare_exchange_ascending(ff, shuffleF[slotID], selAA[slotID]); - return gg; -} - -inline uint sort4_descending(const uint aa) -{ - const unsigned int slotID = get_sub_group_local_id() % 8; - const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]); - const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]); - const uint dd = compare_exchange_descending(cc, shuffleG[slotID], selGG[slotID]); - return dd; -} - -inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask) -{ - const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask); - const ulong a_min = min(a0, a1); - const ulong a_max = max(a0, a1); - return select(a_max, a_min, (ulong)selectMask); -} - -inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask) -{ - const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask); - const ulong a_min = min(a0, a1); - const ulong a_max = max(a0, a1); - return select(a_min, a_max, (ulong)selectMask); -} - -inline ulong sort8_ascending_ulong(const ulong aa) -{ - const unsigned int slotID = get_sub_group_local_id() % 8; - const ulong bb = compare_exchange_ascending_ulong(aa, shuffleA[slotID], selAA[slotID]); - const ulong cc = compare_exchange_ascending_ulong(bb, shuffleB[slotID], selCC[slotID]); - const ulong dd = compare_exchange_ascending_ulong(cc, shuffleC[slotID], selAA[slotID]); - const ulong ee = compare_exchange_ascending_ulong(dd, shuffleD[slotID], selF0[slotID]); - const ulong ff = compare_exchange_ascending_ulong(ee, shuffleE[slotID], selCC[slotID]); - const ulong gg = compare_exchange_ascending_ulong(ff, shuffleF[slotID], selAA[slotID]); - return gg; -} - -inline uint bitInterleave3D(const uint4 in) -{ - uint x = in.x, y = in.y, z = in.z; - x = (x | (x << 16)) & 0x030000FF; - x = (x | (x << 8)) & 0x0300F00F; - x = (x | (x << 4)) & 0x030C30C3; - x = (x | (x << 2)) & 0x09249249; - - y = (y | (y << 16)) & 0x030000FF; - y = (y | (y << 8)) & 0x0300F00F; - y = (y | (y << 4)) & 0x030C30C3; - y = (y | (y << 2)) & 0x09249249; - - z = (z | (z << 16)) & 0x030000FF; - z = (z | (z << 8)) & 0x0300F00F; - z = (z | (z << 4)) & 0x030C30C3; - z = (z | (z << 2)) & 0x09249249; - - return x | (y << 1) | (z << 2); -} - -inline uint bitInterleave4D(const uint4 in) -{ - uint x = in.x, y = in.y, z = in.z, w = in.w; - - x = x & 0x000000ff; - x = (x ^ (x << 16)) & 0x00c0003f; - x = (x ^ (x << 8)) & 0x00c03807; - x = (x ^ (x << 4)) & 0x08530853; - x = (x ^ (x << 2)) & 0x09090909; - x = (x ^ (x << 1)) & 0x11111111; - - y = y & 0x000000ff; - y = (y ^ (y << 16)) & 0x00c0003f; - y = (y ^ (y << 8)) & 0x00c03807; - y = (y ^ (y << 4)) & 0x08530853; - y = (y ^ (y << 2)) & 0x09090909; - y = (y ^ (y << 1)) & 0x11111111; - - z = z & 0x000000ff; - z = (z ^ (z << 16)) & 0x00c0003f; - z = (z ^ (z << 8)) & 0x00c03807; - z = (z ^ (z << 4)) & 0x08530853; - z = (z ^ (z << 2)) & 0x09090909; - z = (z ^ (z << 1)) & 0x11111111; - - w = w & 0x000000ff; - w = (w ^ (w << 16)) & 0x00c0003f; - w = (w ^ (w << 8)) & 0x00c03807; - w = (w ^ (w << 4)) & 0x08530853; - w = (w ^ (w << 2)) & 0x09090909; - w = (w ^ (w << 1)) & 0x11111111; - - return (x | (y << 1) | (z << 2) | (w << 3)); -} - -inline ulong ulong_bitInterleave4D(const uint4 in) -{ - ulong x = in.x, y = in.y, z = in.z, w = in.w; - - x = x & 
0x0000ffff; - x = (x ^ (x << 32)) & 0x0000f800000007ff; - x = (x ^ (x << 16)) & 0x0000f80007c0003f; - x = (x ^ (x << 8)) & 0x00c0380700c03807; - x = (x ^ (x << 4)) & 0x0843084308430843; - x = (x ^ (x << 2)) & 0x0909090909090909; - x = (x ^ (x << 1)) & 0x1111111111111111; - - y = y & 0x0000ffff; - y = (y ^ (y << 32)) & 0x0000f800000007ff; - y = (y ^ (y << 16)) & 0x0000f80007c0003f; - y = (y ^ (y << 8)) & 0x00c0380700c03807; - y = (y ^ (y << 4)) & 0x0843084308430843; - y = (y ^ (y << 2)) & 0x0909090909090909; - y = (y ^ (y << 1)) & 0x1111111111111111; - - z = z & 0x0000ffff; - z = (z ^ (z << 32)) & 0x0000f800000007ff; - z = (z ^ (z << 16)) & 0x0000f80007c0003f; - z = (z ^ (z << 8)) & 0x00c0380700c03807; - z = (z ^ (z << 4)) & 0x0843084308430843; - z = (z ^ (z << 2)) & 0x0909090909090909; - z = (z ^ (z << 1)) & 0x1111111111111111; - - w = w & 0x0000ffff; - w = (w ^ (w << 32)) & 0x0000f800000007ff; - w = (w ^ (w << 16)) & 0x0000f80007c0003f; - w = (w ^ (w << 8)) & 0x00c0380700c03807; - w = (w ^ (w << 4)) & 0x0843084308430843; - w = (w ^ (w << 2)) & 0x0909090909090909; - w = (w ^ (w << 1)) & 0x1111111111111111; - - return (x | (y << 1) | (z << 2) | (w << 3)); -} - -inline uint bitCompact(uint x) -{ - x &= 0x09249249; - x = (x ^ (x >> 2)) & 0x030c30c3; - x = (x ^ (x >> 4)) & 0x0300f00f; - x = (x ^ (x >> 8)) & 0xff0000ff; - x = (x ^ (x >> 16)) & 0x000003ff; - return x; -} - -inline uint3 bitCompact3D(const uint in) -{ - const uint x = bitCompact(x >> 0); - const uint y = bitCompact(y >> 1); - const uint z = bitCompact(z >> 2); - return (uint3)(x, y, z); -} - -inline uint convertToPushIndices8(uint ID) -{ - const unsigned int slotID = get_sub_group_local_id(); - uint index = 0; - for (uint i = 0; i < 8; i++) - { - const uint mask = intel_sub_group_ballot(ID == i); - const uint new_index = ctz(mask); - index = i == slotID ? new_index : index; - } - return index; -} - -inline uint convertToPushIndices16(uint ID) -{ - const unsigned int slotID = get_sub_group_local_id(); - uint index = 0; - for (uint i = 0; i < 16; i++) - { - const uint mask = intel_sub_group_ballot(ID == i); - const uint new_index = ctz(mask); - index = i == slotID ? 
new_index : index; - } - return index; -} - -#define FLOAT_EXPONENT_MASK (0x7F800000) // used to be EXPONENT_MASK -#define FLOAT_MANTISSA_MASK (0x007FFFFF) // used to be MANTISSA_MASK -#define FLOAT_NEG_ONE_EXP_MASK (0x3F000000) -#define FLOAT_BIAS (127) -#define FLOAT_MANTISSA_BITS (23) - -inline float3 frexp_vec3(float3 len, int3* exp) -{ - float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK)); - mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f))); - mant = copysign(mant, len); - *exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1)); - return mant; -} - - -#ifndef uniform -#define uniform -#endif - -#ifndef varying -#define varying -#endif - -uint get_sub_group_global_id() -{ - return get_sub_group_id() + get_num_sub_groups() * get_group_id( 0 ); -} - -// each lane contains the number of 1 bits below the corresponding position in 'mask' -uint subgroup_bit_prefix_exclusive(uniform uint mask) -{ - varying ushort lane = get_sub_group_local_id(); - varying uint lane_mask = (1 << lane) - 1; - varying uint m = mask & lane_mask; - return popcount(m); -} - -uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx ) -{ - varying uint lane_mask = (1 << lane_idx) - 1; - varying uint m = mask & lane_mask; - return popcount(m); -} - - -uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx) -{ - return (uint3)(sub_group_broadcast(v.x,idx), - sub_group_broadcast(v.y,idx), - sub_group_broadcast(v.z,idx)); -} - -float3 sub_group_broadcast_float3(float3 v, uniform ushort idx) -{ - return (float3)(sub_group_broadcast(v.x, idx), - sub_group_broadcast(v.y, idx), - sub_group_broadcast(v.z, idx)); -} - -float3 sub_group_reduce_min_float3(float3 v) -{ - return (float3)(sub_group_reduce_min(v.x), - sub_group_reduce_min(v.y), - sub_group_reduce_min(v.z) ); -} -float3 sub_group_reduce_max_float3(float3 v) -{ - return (float3)(sub_group_reduce_max(v.x), - sub_group_reduce_max(v.y), - sub_group_reduce_max(v.z)); -} - -float3 sub_group_shuffle_float3(float3 v, uniform ushort idx) -{ - return (float3)(intel_sub_group_shuffle(v.x, idx), - intel_sub_group_shuffle(v.y, idx), - intel_sub_group_shuffle(v.z, idx)); -} -uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx) -{ - return (uint3)( intel_sub_group_shuffle(v.x, idx), - intel_sub_group_shuffle(v.y, idx), - intel_sub_group_shuffle(v.z, idx)); -} - - -inline uchar sub_group_reduce_or_N6(uchar val) -{ - val = val | intel_sub_group_shuffle_down(val, val, 4); - val = val | intel_sub_group_shuffle_down(val, val, 2); - val = val | intel_sub_group_shuffle_down(val, val, 1); - return sub_group_broadcast(val, 0); -} - -inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val) -{ - uint SIMD8_id = get_sub_group_local_id() / 8; - val = val | intel_sub_group_shuffle_down(val, val, 4); - val = val | intel_sub_group_shuffle_down(val, val, 2); - val = val | intel_sub_group_shuffle_down(val, val, 1); - - return intel_sub_group_shuffle(val, SIMD8_id * 8); -} - - -inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p ) -{ - return atomic_fetch_add_explicit( (volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group ); -} - -inline __attribute__((overloadable)) int atomic_inc_local(local int* p) -{ - return atomic_fetch_add_explicit( (volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group); -} - -inline __attribute__((overloadable)) uint 
atomic_dec_local(local uint* p) -{ - return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group); -} - -inline __attribute__((overloadable)) int atomic_dec_local(local int* p) -{ - return atomic_fetch_sub_explicit((volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group); -} - -inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n) -{ - return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); -} - -inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n ) -{ - return atomic_fetch_sub_explicit( (volatile local atomic_int*) p, n, memory_order_relaxed, memory_scope_work_group); -} - -inline uint atomic_add_local( local uint* p, uint n ) -{ - return atomic_fetch_add_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); -} - -inline uint atomic_xor_local(local uint* p, uint n) -{ - return atomic_fetch_xor_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); -} - -inline uint atomic_or_local(local uint* p, uint n) -{ - return atomic_fetch_or_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); -} - -inline uint atomic_min_local(local uint* p, uint n) -{ - return atomic_fetch_min_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); -} - -inline uint atomic_max_local(local uint* p, uint n) -{ - return atomic_fetch_max_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); -} - - - - -inline uint atomic_inc_global( global uint* p ) -{ - return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device); -} - -inline uint atomic_dec_global(global uint* p) -{ - return atomic_fetch_sub_explicit( (volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device); -} - -inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired) -{ - return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*) p, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device); -} - -inline uint atomic_add_global( global uint* p, uint n ) -{ - return atomic_fetch_add_explicit( (volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device); -} - -inline uint atomic_sub_global(global uint* p, uint n) -{ - return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device); -} - -inline uint atomic_or_global(global uint* p, uint n) -{ - return atomic_fetch_or_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device); -} - - -inline uint atomic_inc_global_acquire(global uint* p) -{ - return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_acquire, memory_scope_device); -} - - -inline uint atomic_inc_global_release(global uint* p) -{ - return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device); -} -inline uint atomic_dec_global_release(global uint* p) -{ - return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device); -} - -inline uint generic_atomic_add(uint* p, uint val) -{ - if (to_global(p) != NULL) - return atomic_add_global(to_global(p), val); - if 
(to_local(p) != NULL) - return atomic_add_local(to_local(p), val); - return 0; -} - -inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n ) -{ - n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) ); - n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) ); - n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) ); - return sub_group_broadcast( n, 0 ); -} - -inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n ) -{ - n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) ); - n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) ); - n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) ); - return sub_group_broadcast( n, 0 ); -} - -inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n) -{ - n = max(n, intel_sub_group_shuffle_down(n, n, 4)); - n = max(n, intel_sub_group_shuffle_down(n, n, 2)); - n = max(n, intel_sub_group_shuffle_down(n, n, 1)); - return intel_sub_group_shuffle(n, (get_sub_group_local_id() / 8) * 8);//sub_group_broadcast(n, 0); -} - -inline uint generic_atomic_inc(uint* p) -{ - if (to_global(p) != NULL) - return atomic_inc_global(to_global(p)); - if (to_local(p) != NULL) - return atomic_inc(to_local(p)); - return 0; -} - - -// Built-in GRL function which, if called in a kernel body, will force the kernel -// to be compiled to the minimum SIMD width supported by the platform -void GRL_UseMinimumSIMDWidth(); \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/libs/libraries.grl b/src/intel/vulkan/grl/gpu/libs/libraries.grl deleted file mode 100644 index 1d6c0d2c6c5..00000000000 --- a/src/intel/vulkan/grl/gpu/libs/libraries.grl +++ /dev/null @@ -1,13 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -library lsc_intrinsics -{ - default "lsc_intrinsics.cl" ; - fallback "lsc_intrinsics_fallback.cl"; -} - diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl deleted file mode 100644 index 03a76ba36f1..00000000000 --- a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl +++ /dev/null @@ -1,1033 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// LSC Cache options -// Load message caching control -enum LSC_LDCC { - LSC_LDCC_DEFAULT, - LSC_LDCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached - LSC_LDCC_L1UC_L3C, // Override to L1 uncached and L3 cached - LSC_LDCC_L1C_L3UC, // Override to L1 cached and L3 uncached - LSC_LDCC_L1C_L3C, // Override to L1 cached and L3 cached - LSC_LDCC_L1S_L3UC, // Override to L1 streaming load and L3 uncached - LSC_LDCC_L1S_L3C, // Override to L1 streaming load and L3 cached - LSC_LDCC_L1IAR_L3C, // Override to L1 invalidate-after-read, and L3 cached -}; - -// Store message caching control (also used for atomics) -enum LSC_STCC { - LSC_STCC_DEFAULT, - LSC_STCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached - LSC_STCC_L1UC_L3WB, // Override to L1 uncached and L3 written back - LSC_STCC_L1WT_L3UC, // Override to L1 written through and L3 uncached - LSC_STCC_L1WT_L3WB, // Override to L1 written through and L3 written back - LSC_STCC_L1S_L3UC, // Override to L1 streaming and L3 uncached - LSC_STCC_L1S_L3WB, // Override to L1 streaming and L3 written back - LSC_STCC_L1WB_L3WB, // Override to L1 written through and L3 written back -}; - -// LSC Loads - -// Global address space -uint __builtin_IB_lsc_load_global_uchar_to_uint (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); 
//D8U32 -uint __builtin_IB_lsc_load_global_ushort_to_uint(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32 -uint __builtin_IB_lsc_load_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1 -uint2 __builtin_IB_lsc_load_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2 -uint3 __builtin_IB_lsc_load_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3 -uint4 __builtin_IB_lsc_load_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4 -uint8 __builtin_IB_lsc_load_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8 -ulong __builtin_IB_lsc_load_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1 -ulong2 __builtin_IB_lsc_load_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2 -ulong3 __builtin_IB_lsc_load_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3 -ulong4 __builtin_IB_lsc_load_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4 -ulong8 __builtin_IB_lsc_load_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8 - -// Local address space -uint __builtin_IB_lsc_load_local_uchar_to_uint( const __local uchar *base, int immElemOff); //D8U32 -uint __builtin_IB_lsc_load_local_ushort_to_uint(const __local ushort *base, int immElemOff); //D16U32 -uint __builtin_IB_lsc_load_local_uint (const __local uint *base, int immElemOff); //D32V1 -uint2 __builtin_IB_lsc_load_local_uint2 (const __local uint2 *base, int immElemOff); //D32V2 -uint3 __builtin_IB_lsc_load_local_uint3 (const __local uint3 *base, int immElemOff); //D32V3 -uint4 __builtin_IB_lsc_load_local_uint4 (const __local uint4 *base, int immElemOff); //D32V4 -uint8 __builtin_IB_lsc_load_local_uint8 (const __local uint8 *base, int immElemOff); //D32V8 -ulong __builtin_IB_lsc_load_local_ulong (const __local ulong *base, int immElemOff); //D64V1 -ulong2 __builtin_IB_lsc_load_local_ulong2(const __local ulong2 *base, int immElemOff); //D64V2 -ulong3 __builtin_IB_lsc_load_local_ulong3(const __local ulong3 *base, int immElemOff); //D64V3 -ulong4 __builtin_IB_lsc_load_local_ulong4(const __local ulong4 *base, int immElemOff); //D64V4 -ulong8 __builtin_IB_lsc_load_local_ulong8(const __local ulong8 *base, int immElemOff); //D64V8 - -// LSC Stores - -// Global address space -void __builtin_IB_lsc_store_global_uchar_from_uint (__global uchar *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D8U32 -void __builtin_IB_lsc_store_global_ushort_from_uint(__global ushort *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D16U32 -void __builtin_IB_lsc_store_global_uint (__global uint *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D32V1 -void __builtin_IB_lsc_store_global_uint2 (__global uint2 *base, int immElemOff, uint2 val, enum LSC_STCC cacheOpt); //D32V2 -void __builtin_IB_lsc_store_global_uint3 (__global uint3 *base, int immElemOff, uint3 val, enum LSC_STCC cacheOpt); //D32V3 -void __builtin_IB_lsc_store_global_uint4 (__global uint4 *base, int immElemOff, uint4 val, enum LSC_STCC cacheOpt); //D32V4 -void __builtin_IB_lsc_store_global_uint8 (__global uint8 *base, int immElemOff, uint8 val, enum LSC_STCC cacheOpt); //D32V8 -void __builtin_IB_lsc_store_global_ulong (__global ulong *base, int immElemOff, ulong val, enum 
LSC_STCC cacheOpt); //D64V1 -void __builtin_IB_lsc_store_global_ulong2(__global ulong2 *base, int immElemOff, ulong2 val, enum LSC_STCC cacheOpt); //D64V2 -void __builtin_IB_lsc_store_global_ulong3(__global ulong3 *base, int immElemOff, ulong3 val, enum LSC_STCC cacheOpt); //D64V3 -void __builtin_IB_lsc_store_global_ulong4(__global ulong4 *base, int immElemOff, ulong4 val, enum LSC_STCC cacheOpt); //D64V4 -void __builtin_IB_lsc_store_global_ulong8(__global ulong8 *base, int immElemOff, ulong8 val, enum LSC_STCC cacheOpt); //D64V8 - -// Local address space -void __builtin_IB_lsc_store_local_uchar_from_uint (__local uchar *base, int immElemOff, uint val); //D8U32 -void __builtin_IB_lsc_store_local_ushort_from_uint(__local ushort *base, int immElemOff, uint val); //D16U32 -void __builtin_IB_lsc_store_local_uint (__local uint *base, int immElemOff, uint val); //D32V1 -void __builtin_IB_lsc_store_local_uint2 (__local uint2 *base, int immElemOff, uint2 val); //D32V2 -void __builtin_IB_lsc_store_local_uint3 (__local uint3 *base, int immElemOff, uint3 val); //D32V3 -void __builtin_IB_lsc_store_local_uint4 (__local uint4 *base, int immElemOff, uint4 val); //D32V4 -void __builtin_IB_lsc_store_local_uint8 (__local uint8 *base, int immElemOff, uint8 val); //D32V8 -void __builtin_IB_lsc_store_local_ulong (__local ulong *base, int immElemOff, ulong val); //D64V1 -void __builtin_IB_lsc_store_local_ulong2(__local ulong2 *base, int immElemOff, ulong2 val); //D64V2 -void __builtin_IB_lsc_store_local_ulong3(__local ulong3 *base, int immElemOff, ulong3 val); //D64V3 -void __builtin_IB_lsc_store_local_ulong4(__local ulong4 *base, int immElemOff, ulong4 val); //D64V4 -void __builtin_IB_lsc_store_local_ulong8(__local ulong8 *base, int immElemOff, ulong8 val); //D64V8 - -// LSC prefetching - -// LSC Pre-Fetch Load functions with CacheControls -// Global address space -void __builtin_IB_lsc_prefetch_global_uchar (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32 -void __builtin_IB_lsc_prefetch_global_ushort(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32 -void __builtin_IB_lsc_prefetch_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1 -void __builtin_IB_lsc_prefetch_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2 -void __builtin_IB_lsc_prefetch_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3 -void __builtin_IB_lsc_prefetch_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4 -void __builtin_IB_lsc_prefetch_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8 -void __builtin_IB_lsc_prefetch_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1 -void __builtin_IB_lsc_prefetch_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2 -void __builtin_IB_lsc_prefetch_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3 -void __builtin_IB_lsc_prefetch_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4 -void __builtin_IB_lsc_prefetch_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8 - -// LSC Fence support - -// FS - Fence Scope -enum LSC_FS { - LSC_FS_THREAD_GROUP, - LSC_FS_LOCAL, - LSC_FS_TILE, - LSC_FS_GPU, - LSC_FS_GPUs, - LSC_FS_SYSTEM_RELEASE, - LSC_FS_SYSTEM_ACQUIRE -}; 
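For context on how these hints were meant to be used: the L1/L3 suffixes let a kernel match the caching policy to its access pattern, e.g. streaming (L1S) for data touched exactly once versus cached (L1C/L3C) for data re-read across the dispatch, and the library exports one wrapper per (type, policy) pair so GRL kernels can pick a policy by name without referencing the IGC builtins directly. A minimal consumer-side sketch, assuming the exported wrappers declared further down in this library; the kernel name and buffers are illustrative only, not part of the removed code:

// Hypothetical kernel built against lsc_intrinsics.h: the input is streamed
// through L1 (used once), the output is written through L1 and written back to L3.
kernel void scale_counts(global uint* src, global uint* dst, uint count)
{
    int i = (int)get_global_id(0);
    if (i >= (int)count)
        return;

    uint v = load_uint_L1S_L3C(src, i);      // streaming load: do not pollute L1
    store_uint_L1WT_L3WB(dst, i, v * 2u);    // write-through L1, write-back L3
}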
- -// FT - Fence Type -enum LSC_FT { - LSC_FT_DEFAULT, - LSC_FT_EVICT, - LSC_FT_INVALIDATE, - LSC_FT_DISCARD, - LSC_FT_CLEAN, - LSC_FT_L3 -}; - -// LSC Fence functions -void __builtin_IB_lsc_fence_global_untyped(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGM -void __builtin_IB_lsc_fence_global_untyped_cross_tile(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGML -void __builtin_IB_lsc_fence_global_typed(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - TGM -void __builtin_IB_lsc_fence_local(); // Mem Port - SLM - -// Exported functions - -// LSC Loads -// uchar -uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset) -{ - return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3UC); -} - -uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset) -{ - return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3C); -} - -uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset) -{ - return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3UC); -} - -uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset) -{ - return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3C); -} - -uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset) -{ - return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3UC); -} - -uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset) -{ - return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3C); -} - -uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset) -{ - return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// ushort -uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset) -{ - return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3UC); -} - -uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset) -{ - return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3C); -} - -uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset) -{ - return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3UC); -} - -uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset) -{ - return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3C); -} - -uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset) -{ - return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3UC); -} - -uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset) -{ - return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3C); -} - -uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset) -{ - return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// uint -uint load_uint_L1UC_L3UC(global uint* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3UC); -} - -uint load_uint_L1UC_L3C(global uint* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3C); -} - -uint load_uint_L1C_L3UC(global uint* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3UC); -} - -uint load_uint_L1C_L3C(global uint* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3C); -} - -uint load_uint_L1S_L3UC(global uint* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3UC); -} - -uint load_uint_L1S_L3C(global uint* it, int 
offset) -{ - return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3C); -} - -uint load_uint_L1IAR_L3C(global uint* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// uint2 -uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3UC); -} - -uint2 load_uint2_L1UC_L3C(global uint2* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3C); -} - -uint2 load_uint2_L1C_L3UC(global uint2* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3UC); -} - -uint2 load_uint2_L1C_L3C(global uint2* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3C); -} - -uint2 load_uint2_L1S_L3UC(global uint2* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3UC); -} - -uint2 load_uint2_L1S_L3C(global uint2* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3C); -} - -uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// uint3 -uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3UC); -} - -uint3 load_uint3_L1UC_L3C(global uint3* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3C); -} - -uint3 load_uint3_L1C_L3UC(global uint3* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3UC); -} - -uint3 load_uint3_L1C_L3C(global uint3* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3C); -} - -uint3 load_uint3_L1S_L3UC(global uint3* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3UC); -} - -uint3 load_uint3_L1S_L3C(global uint3* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3C); -} - -uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// uint4 -uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3UC); -} - -uint4 load_uint4_L1UC_L3C(global uint4* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3C); -} - -uint4 load_uint4_L1C_L3UC(global uint4* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3UC); -} - -uint4 load_uint4_L1C_L3C(global uint4* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3C); -} - -uint4 load_uint4_L1S_L3UC(global uint4* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3UC); -} - -uint4 load_uint4_L1S_L3C(global uint4* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3C); -} - -uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// uint8 -uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3UC); -} - -uint8 load_uint8_L1UC_L3C(global uint8* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3C); -} - -uint8 load_uint8_L1C_L3UC(global uint8* 
it, int offset) -{ - return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3UC); -} - -uint8 load_uint8_L1C_L3C(global uint8* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3C); -} - -uint8 load_uint8_L1S_L3UC(global uint8* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3UC); -} - -uint8 load_uint8_L1S_L3C(global uint8* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3C); -} - -uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset) -{ - return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// ulong -ulong load_ulong_L1UC_L3UC(global ulong* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3UC); -} - -ulong load_ulong_L1UC_L3C(global ulong* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3C); -} - -ulong load_ulong_L1C_L3UC(global ulong* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3UC); -} - -ulong load_ulong_L1C_L3C(global ulong* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3C); -} - -ulong load_ulong_L1S_L3UC(global ulong* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3UC); -} - -ulong load_ulong_L1S_L3C(global ulong* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3C); -} - -ulong load_ulong_L1IAR_L3C(global ulong* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// ulong2 -ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3UC); -} - -ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3C); -} - -ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3UC); -} - -ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3C); -} - -ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3UC); -} - -ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3C); -} - -ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// ulong3 -ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3UC); -} - -ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3C); -} - -ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3UC); -} - -ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3C); -} - -ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3UC); -} - -ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong3(it, offset, 
LSC_LDCC_L1S_L3C); -} - -ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// ulong4 -ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3UC); -} - -ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3C); -} - -ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3UC); -} - -ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3C); -} - -ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3UC); -} - -ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3C); -} - -ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// ulong8 -ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3UC); -} - -ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3C); -} - -ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3UC); -} - -ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3C); -} - -ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3UC); -} - -ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3C); -} - -ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset) -{ - return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1IAR_L3C); -} - -// LSC Stores -// uchar -void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// ushort -void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int 
offset, uint value) -{ - __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// uint -void store_uint_L1UC_L3UC(global uint* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_uint_L1UC_L3WB(global uint* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_uint_L1WT_L3UC(global uint* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_uint_L1WT_L3WB(global uint* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_uint_L1S_L3UC(global uint* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_uint_L1S_L3WB(global uint* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_uint_L1WB_L3WB(global uint* it, int offset, uint value) -{ - __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// uint2 -void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value) -{ - __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value) -{ - __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value) -{ - __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value) -{ - __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value) -{ - __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value) -{ - __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value) -{ - __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// uint3 -void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value) -{ - __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void 
store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value) -{ - __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value) -{ - __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value) -{ - __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value) -{ - __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value) -{ - __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value) -{ - __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// uint4 -void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value) -{ - __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value) -{ - __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value) -{ - __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value) -{ - __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value) -{ - __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value) -{ - __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value) -{ - __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// uint8 -void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value) -{ - __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value) -{ - __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value) -{ - __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value) -{ - __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value) -{ - __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value) -{ - __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value) -{ - __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// ulong -void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value) -{ - __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value) -{ - __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value) -{ - 
__builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value) -{ - __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value) -{ - __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value) -{ - __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value) -{ - __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// ulong2 -void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value) -{ - __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value) -{ - __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value) -{ - __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value) -{ - __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value) -{ - __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value) -{ - __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value) -{ - __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// ulong3 -void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value) -{ - __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value) -{ - __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value) -{ - __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value) -{ - __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value) -{ - __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value) -{ - __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value) -{ - __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// ulong4 -void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value) -{ - __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value) -{ - __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value) -{ - __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value) -{ - 
__builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value) -{ - __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value) -{ - __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value) -{ - __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// ulong8 -void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value) -{ - __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3UC); -} - -void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value) -{ - __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3WB); -} - -void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value) -{ - __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3UC); -} - -void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value) -{ - __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3WB); -} - -void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value) -{ - __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3UC); -} - -void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value) -{ - __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3WB); -} - -void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value) -{ - __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WB_L3WB); -} - -// LSC Fence support -void mem_fence_gpu_default() -{ - __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_DEFAULT); -} - -void mem_fence_workgroup_default() -{ - __builtin_IB_lsc_fence_global_untyped(LSC_FS_THREAD_GROUP, LSC_FT_DEFAULT); -} - -void mem_fence_gpu_invalidate() -{ - // NOTE: 'FS_TILE' is used here to avoid DG2 HW bug where L3 is needlessly flushed on a 'GPU' scope fence - __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_INVALIDATE); -} - -void mem_fence_gpu_evict() -{ - __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_EVICT); -} - -void mem_fence_evict_to_memory() -{ - __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_EVICT); - __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_L3); -} diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h deleted file mode 100644 index a12dac00e77..00000000000 --- a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h +++ /dev/null @@ -1,207 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// LSC Loads -uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset); -uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset); -uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset); -uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset); -uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset); -uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset); -uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset); - -uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset); -uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset); -uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset); -uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset); -uint 
load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset); -uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset); -uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset); - -uint load_uint_L1UC_L3UC(global uint* it, int offset); -uint load_uint_L1UC_L3C(global uint* it, int offset); -uint load_uint_L1C_L3UC(global uint* it, int offset); -uint load_uint_L1C_L3C(global uint* it, int offset); -uint load_uint_L1S_L3UC(global uint* it, int offset); -uint load_uint_L1S_L3C(global uint* it, int offset); -uint load_uint_L1IAR_L3C(global uint* it, int offset); - -uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset); -uint2 load_uint2_L1UC_L3C(global uint2* it, int offset); -uint2 load_uint2_L1C_L3UC(global uint2* it, int offset); -uint2 load_uint2_L1C_L3C(global uint2* it, int offset); -uint2 load_uint2_L1S_L3UC(global uint2* it, int offset); -uint2 load_uint2_L1S_L3C(global uint2* it, int offset); -uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset); - -uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset); -uint3 load_uint3_L1UC_L3C(global uint3* it, int offset); -uint3 load_uint3_L1C_L3UC(global uint3* it, int offset); -uint3 load_uint3_L1C_L3C(global uint3* it, int offset); -uint3 load_uint3_L1S_L3UC(global uint3* it, int offset); -uint3 load_uint3_L1S_L3C(global uint3* it, int offset); -uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset); - -uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset); -uint4 load_uint4_L1UC_L3C(global uint4* it, int offset); -uint4 load_uint4_L1C_L3UC(global uint4* it, int offset); -uint4 load_uint4_L1C_L3C(global uint4* it, int offset); -uint4 load_uint4_L1S_L3UC(global uint4* it, int offset); -uint4 load_uint4_L1S_L3C(global uint4* it, int offset); -uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset); - -uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset); -uint8 load_uint8_L1UC_L3C(global uint8* it, int offset); -uint8 load_uint8_L1C_L3UC(global uint8* it, int offset); -uint8 load_uint8_L1C_L3C(global uint8* it, int offset); -uint8 load_uint8_L1S_L3UC(global uint8* it, int offset); -uint8 load_uint8_L1S_L3C(global uint8* it, int offset); -uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset); - -ulong load_ulong_L1UC_L3UC(global ulong* it, int offset); -ulong load_ulong_L1UC_L3C(global ulong* it, int offset); -ulong load_ulong_L1C_L3UC(global ulong* it, int offset); -ulong load_ulong_L1C_L3C(global ulong* it, int offset); -ulong load_ulong_L1S_L3UC(global ulong* it, int offset); -ulong load_ulong_L1S_L3C(global ulong* it, int offset); -ulong load_ulong_L1IAR_L3C(global ulong* it, int offset); - -ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset); -ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset); -ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset); -ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset); -ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset); -ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset); -ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset); - -ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset); -ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset); -ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset); -ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset); -ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset); -ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset); -ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset); - -ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset); -ulong4 
load_ulong4_L1UC_L3C(global ulong4* it, int offset); -ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset); -ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset); -ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset); -ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset); -ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset); - -ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset); -ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset); -ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset); -ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset); -ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset); -ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset); -ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset); - -// LSC Stores -void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value); -void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value); -void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value); -void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value); -void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value); -void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value); -void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value); - -void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value); -void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value); -void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value); -void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value); -void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value); -void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value); -void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value); - -void store_uint_L1UC_L3UC(global uint* it, int offset, uint value); -void store_uint_L1UC_L3WB(global uint* it, int offset, uint value); -void store_uint_L1WT_L3UC(global uint* it, int offset, uint value); -void store_uint_L1WT_L3WB(global uint* it, int offset, uint value); -void store_uint_L1S_L3UC(global uint* it, int offset, uint value); -void store_uint_L1S_L3WB(global uint* it, int offset, uint value); -void store_uint_L1WB_L3WB(global uint* it, int offset, uint value); - -void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value); -void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value); -void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value); -void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value); -void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value); -void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value); -void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value); - -void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value); -void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value); -void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value); -void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value); -void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value); -void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value); -void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value); - -void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value); -void store_uint4_L1UC_L3WB(global 
uint4* it, int offset, uint4 value); -void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value); -void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value); -void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value); -void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value); -void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value); - -void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value); -void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value); -void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value); -void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value); -void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value); -void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value); -void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value); - -void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value); -void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value); -void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value); -void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value); -void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value); -void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value); -void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value); - -void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value); -void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value); -void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value); -void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value); -void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value); -void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value); -void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value); - -void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value); -void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value); -void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value); -void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value); -void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value); -void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value); -void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value); - -void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value); -void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value); -void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value); -void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value); -void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value); -void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value); -void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value); - -void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value); -void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value); -void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value); -void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value); -void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value); -void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value); -void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value); - -// LSC Fence support -void 
mem_fence_gpu_default(); -void mem_fence_workgroup_default(); -void mem_fence_gpu_invalidate(); -void mem_fence_gpu_evict(); -void mem_fence_evict_to_memory(); diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl deleted file mode 100644 index 2217618c7c5..00000000000 --- a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl +++ /dev/null @@ -1,898 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// LSC Loads -// uchar -uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset) -{ - return (uint)(it[offset]); -} - -// ushort -uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset) -{ - return (uint)(it[offset]); -} - -uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset) -{ - return (uint)(it[offset]); -} - -// uint -uint load_uint_L1UC_L3UC(global uint* it, int offset) -{ - return it[offset]; -} - -uint load_uint_L1UC_L3C(global uint* it, int offset) -{ - return it[offset]; -} - -uint load_uint_L1C_L3UC(global uint* it, int offset) -{ - return it[offset]; -} - -uint load_uint_L1C_L3C(global uint* it, int offset) -{ - return it[offset]; -} - -uint load_uint_L1S_L3UC(global uint* it, int offset) -{ - return it[offset]; -} - -uint load_uint_L1S_L3C(global uint* it, int offset) -{ - return it[offset]; -} - -uint load_uint_L1IAR_L3C(global uint* it, int offset) -{ - return it[offset]; -} - -// uint2 -uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset) -{ - return it[offset]; -} - -uint2 load_uint2_L1UC_L3C(global uint2* it, int offset) -{ - return it[offset]; -} - -uint2 load_uint2_L1C_L3UC(global uint2* it, int offset) -{ - return it[offset]; -} - -uint2 load_uint2_L1C_L3C(global uint2* it, int offset) -{ - return it[offset]; -} - -uint2 load_uint2_L1S_L3UC(global uint2* it, int offset) -{ - return it[offset]; -} - -uint2 load_uint2_L1S_L3C(global uint2* it, int offset) -{ - return it[offset]; -} - -uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset) -{ - return it[offset]; -} - -// uint3 -uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset) -{ - return it[offset]; -} - -uint3 load_uint3_L1UC_L3C(global uint3* it, int offset) -{ - return it[offset]; -} - -uint3 load_uint3_L1C_L3UC(global uint3* it, int offset) -{ - return it[offset]; -} - -uint3 load_uint3_L1C_L3C(global uint3* it, int offset) -{ - return it[offset]; -} 
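These fallback bodies deliberately drop the cache-control suffix: on compilers without the __builtin_IB_lsc_* intrinsics every variant reduces to an ordinary load or store, so GRL call sites compile unchanged whichever implementation libraries.grl selects. A hedged illustration of a call site that is valid against both files (the helper and its arguments are made up for this example):

// Same source works with lsc_intrinsics.cl (LSC D32V4 load with L1C/L3C hints)
// and with this fallback (a plain vector load).
inline uint4 read_node_data(global uint4* nodes, int i)
{
    return load_uint4_L1C_L3C(nodes, i);
}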
- -uint3 load_uint3_L1S_L3UC(global uint3* it, int offset) -{ - return it[offset]; -} - -uint3 load_uint3_L1S_L3C(global uint3* it, int offset) -{ - return it[offset]; -} - -uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset) -{ - return it[offset]; -} - -// uint4 -uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset) -{ - return it[offset]; -} - -uint4 load_uint4_L1UC_L3C(global uint4* it, int offset) -{ - return it[offset]; -} - -uint4 load_uint4_L1C_L3UC(global uint4* it, int offset) -{ - return it[offset]; -} - -uint4 load_uint4_L1C_L3C(global uint4* it, int offset) -{ - return it[offset]; -} - -uint4 load_uint4_L1S_L3UC(global uint4* it, int offset) -{ - return it[offset]; -} - -uint4 load_uint4_L1S_L3C(global uint4* it, int offset) -{ - return it[offset]; -} - -uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset) -{ - return it[offset]; -} - -// uint8 -uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset) -{ - return it[offset]; -} - -uint8 load_uint8_L1UC_L3C(global uint8* it, int offset) -{ - return it[offset]; -} - -uint8 load_uint8_L1C_L3UC(global uint8* it, int offset) -{ - return it[offset]; -} - -uint8 load_uint8_L1C_L3C(global uint8* it, int offset) -{ - return it[offset]; -} - -uint8 load_uint8_L1S_L3UC(global uint8* it, int offset) -{ - return it[offset]; -} - -uint8 load_uint8_L1S_L3C(global uint8* it, int offset) -{ - return it[offset]; -} - -uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset) -{ - return it[offset]; -} - -// ulong -ulong load_ulong_L1UC_L3UC(global ulong* it, int offset) -{ - return it[offset]; -} - -ulong load_ulong_L1UC_L3C(global ulong* it, int offset) -{ - return it[offset]; -} - -ulong load_ulong_L1C_L3UC(global ulong* it, int offset) -{ - return it[offset]; -} - -ulong load_ulong_L1C_L3C(global ulong* it, int offset) -{ - return it[offset]; -} - -ulong load_ulong_L1S_L3UC(global ulong* it, int offset) -{ - return it[offset]; -} - -ulong load_ulong_L1S_L3C(global ulong* it, int offset) -{ - return it[offset]; -} - -ulong load_ulong_L1IAR_L3C(global ulong* it, int offset) -{ - return it[offset]; -} - -// ulong2 -ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset) -{ - return it[offset]; -} - -ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset) -{ - return it[offset]; -} - -ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset) -{ - return it[offset]; -} - -ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset) -{ - return it[offset]; -} - -ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset) -{ - return it[offset]; -} - -ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset) -{ - return it[offset]; -} - -ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset) -{ - return it[offset]; -} - -// ulong3 -ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset) -{ - return it[offset]; -} - -ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset) -{ - return it[offset]; -} - -ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset) -{ - return it[offset]; -} - -ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset) -{ - return it[offset]; -} - -ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset) -{ - return it[offset]; -} - -ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset) -{ - return it[offset]; -} - -ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset) -{ - return it[offset]; -} - -// ulong4 -ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset) -{ - return it[offset]; -} - -ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset) -{ - return it[offset]; -} - 
-ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset) -{ - return it[offset]; -} - -ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset) -{ - return it[offset]; -} - -ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset) -{ - return it[offset]; -} - -ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset) -{ - return it[offset]; -} - -ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset) -{ - return it[offset]; -} - -// ulong8 -ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset) -{ - return it[offset]; -} - -ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset) -{ - return it[offset]; -} - -ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset) -{ - return it[offset]; -} - -ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset) -{ - return it[offset]; -} - -ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset) -{ - return it[offset]; -} - -ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset) -{ - return it[offset]; -} - -ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset) -{ - return it[offset]; -} - -// LSC Stores -// uchar -void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value) -{ - it[offset] = (uchar)(value); -} - -void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value) -{ - it[offset] = (uchar)(value); -} - -void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value) -{ - it[offset] = (uchar)(value); -} - -void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value) -{ - it[offset] = (uchar)(value); -} - -void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value) -{ - it[offset] = (uchar)(value); -} - -void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value) -{ - it[offset] = (uchar)(value); -} - -void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value) -{ - it[offset] = (uchar)(value); -} - -// ushort -void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value) -{ - it[offset] = (ushort)(value); -} - -void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value) -{ - it[offset] = (ushort)(value); -} - -void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value) -{ - it[offset] = (ushort)(value); -} - -void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value) -{ - it[offset] = (ushort)(value); -} - -void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value) -{ - it[offset] = (ushort)(value); -} - -void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value) -{ - it[offset] = (ushort)(value); -} - -void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value) -{ - it[offset] = (ushort)(value); -} - -// uint -void store_uint_L1UC_L3UC(global uint* it, int offset, uint value) -{ - it[offset] = value; -} - -void store_uint_L1UC_L3WB(global uint* it, int offset, uint value) -{ - it[offset] = value; -} - -void store_uint_L1WT_L3UC(global uint* it, int offset, uint value) -{ - it[offset] = value; -} - -void store_uint_L1WT_L3WB(global uint* it, int offset, uint value) -{ - it[offset] = value; -} - -void store_uint_L1S_L3UC(global uint* it, int offset, uint value) -{ - it[offset] = value; -} - -void store_uint_L1S_L3WB(global uint* it, int offset, uint value) -{ - it[offset] = value; -} - -void store_uint_L1WB_L3WB(global uint* it, int offset, uint value) -{ - it[offset] = value; -} - -// uint2 -void store_uint2_L1UC_L3UC(global uint2* it, int 
offset, uint2 value) -{ - it[offset] = value; -} - -void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value) -{ - it[offset] = value; -} - -void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value) -{ - it[offset] = value; -} - -void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value) -{ - it[offset] = value; -} - -void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value) -{ - it[offset] = value; -} - -void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value) -{ - it[offset] = value; -} - -void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value) -{ - it[offset] = value; -} - -// uint3 -void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value) -{ - it[offset] = value; -} - -void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value) -{ - it[offset] = value; -} - -void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value) -{ - it[offset] = value; -} - -void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value) -{ - it[offset] = value; -} - -void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value) -{ - it[offset] = value; -} - -void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value) -{ - it[offset] = value; -} - -void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value) -{ - it[offset] = value; -} - -// uint4 -void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value) -{ - it[offset] = value; -} - -void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value) -{ - it[offset] = value; -} - -void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value) -{ - it[offset] = value; -} - -void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value) -{ - it[offset] = value; -} - -void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value) -{ - it[offset] = value; -} - -void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value) -{ - it[offset] = value; -} - -void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value) -{ - it[offset] = value; -} - -// uint8 -void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value) -{ - it[offset] = value; -} - -void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value) -{ - it[offset] = value; -} - -void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value) -{ - it[offset] = value; -} - -void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value) -{ - it[offset] = value; -} - -void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value) -{ - it[offset] = value; -} - -void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value) -{ - it[offset] = value; -} - -void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value) -{ - it[offset] = value; -} - -// ulong -void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value) -{ - it[offset] = value; -} - -void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value) -{ - it[offset] = value; -} - -void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value) -{ - it[offset] = value; -} - -void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value) -{ - it[offset] = value; -} - -void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value) -{ - it[offset] = value; -} - -void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value) -{ - it[offset] = value; -} - -void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value) -{ - it[offset] = value; -} - -// ulong2 -void 
store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value) -{ - it[offset] = value; -} - -void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value) -{ - it[offset] = value; -} - -void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value) -{ - it[offset] = value; -} - -void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value) -{ - it[offset] = value; -} - -void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value) -{ - it[offset] = value; -} - -void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value) -{ - it[offset] = value; -} - -void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value) -{ - it[offset] = value; -} - -// ulong3 -void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value) -{ - it[offset] = value; -} - -void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value) -{ - it[offset] = value; -} - -void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value) -{ - it[offset] = value; -} - -void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value) -{ - it[offset] = value; -} - -void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value) -{ - it[offset] = value; -} - -void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value) -{ - it[offset] = value; -} - -void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value) -{ - it[offset] = value; -} - -// ulong4 -void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value) -{ - it[offset] = value; -} - -void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value) -{ - it[offset] = value; -} - -void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value) -{ - it[offset] = value; -} - -void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value) -{ - it[offset] = value; -} - -void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value) -{ - it[offset] = value; -} - -void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value) -{ - it[offset] = value; -} - -void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value) -{ - it[offset] = value; -} - -// ulong8 -void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value) -{ - it[offset] = value; -} - -void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value) -{ - it[offset] = value; -} - -void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value) -{ - it[offset] = value; -} - -void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value) -{ - it[offset] = value; -} - -void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value) -{ - it[offset] = value; -} - -void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value) -{ - it[offset] = value; -} - -void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value) -{ - it[offset] = value; -} - -// LSC Fence support -void mem_fence_gpu_default() -{ - write_mem_fence(CLK_GLOBAL_MEM_FENCE); -} - -void mem_fence_workgroup_default() -{ - write_mem_fence( CLK_GLOBAL_MEM_FENCE ); -} - -void mem_fence_gpu_invalidate() -{ - read_mem_fence(CLK_GLOBAL_MEM_FENCE); -} - -void mem_fence_gpu_evict() -{ - read_mem_fence(CLK_GLOBAL_MEM_FENCE); -} - -void mem_fence_evict_to_memory() -{ - mem_fence(CLK_GLOBAL_MEM_FENCE); -} diff --git a/src/intel/vulkan/grl/gpu/mem_utils.h b/src/intel/vulkan/grl/gpu/mem_utils.h deleted file mode 100644 index b57a25279fd..00000000000 --- a/src/intel/vulkan/grl/gpu/mem_utils.h +++ /dev/null 
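One behavioral difference between the two implementations above is the fences: the native file maps each helper onto an untyped LSC fence with an explicit scope and flush type (mostly tile scope; the source notes a DG2 workaround for the invalidate variant), while the fallback collapses all of them onto CLK_GLOBAL_MEM_FENCE read/write fences. A minimal sketch of the cross-workgroup pattern these helpers support, assuming lsc_intrinsics.h; the function and buffer names are illustrative:

// Producer side: make a freshly written value visible beyond this workgroup.
void publish_count(global uint* counts, int slot, uint n)
{
    store_uint_L1WT_L3WB(counts, slot, n);   // write-through L1, write-back L3
    mem_fence_gpu_evict();                   // native: tile-scope EVICT; fallback: read_mem_fence
}

// Consumer side (another workgroup): discard stale cache lines before reading.
uint read_count(global uint* counts, int slot)
{
    mem_fence_gpu_invalidate();              // native: tile-scope INVALIDATE; fallback: read_mem_fence
    return load_uint_L1C_L3C(counts, slot);
}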
@@ -1,161 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "shared.h" - -/// Write cache line to global memory -/// Assumes subgroup_size is 16 -/// -/// @param dst 64 bytes aligned output pointer -/// @param val value to write -GRL_INLINE void CacheLineSubgroupWrite(global char* dst, uint val) -{ - global uint* addrAligned = (global uint*)(global uint16*)dst; - intel_sub_group_block_write(addrAligned, val); -} - -/// Read cache line from global memory -/// Assumes subgroup_size is 16 -/// -/// @param src 64 bytes aligned input pointer -/// @return uint read from memory -GRL_INLINE uint CacheLineSubgroupRead(const global char* src) -{ - const global uint* addrAligned = (const global uint*)(global uint16*)src; - return intel_sub_group_block_read(addrAligned); -} - -/// Copy cache line -/// Assumes subgroup_size is 16 -/// -/// @param dst 64 bytes aligned output pointer -/// @param src input pointer -GRL_INLINE void CopyCacheLine(global char* dst, const global char* src) -{ - global const uint* usrc = (global const uint*) (src); - - uint data = intel_sub_group_block_read(usrc); - CacheLineSubgroupWrite(dst, data); -} - -/// Fast memory copy -/// -/// @param dst output pointer -/// @param src input pointer -/// @param size number of bytes to copy -/// @param numGroups number of groups that execute this function -GRL_INLINE void CopyMemory(global char* dst, const global char* src, uint size, uint numGroups) -{ - const uint CACHELINE_SIZE = 64; - - uint globalID = get_local_size(0) * get_group_id(0) + get_local_id(0); - - // this part copies cacheline per physical thread one write. starting from dst aligned up to cacheline. - // it copies laso reminder - { - uint alignAdd = ((uint)(uint64_t)dst) & (CACHELINE_SIZE - 1); - alignAdd = (CACHELINE_SIZE - alignAdd) & (CACHELINE_SIZE - 1); - - if (size > alignAdd) - { - uint alignedBytesCount = size - alignAdd; - uint alignedDWsCount = alignedBytesCount >> 2; - global uint* dstAlignedPart = (global uint*)(dst + alignAdd); - global uint* srcAlignedPart = (global uint*)(src + alignAdd); - - for (uint id = globalID; id < alignedDWsCount; id += get_local_size(0) * numGroups) - { - dstAlignedPart[id] = srcAlignedPart[id]; - } - - if (globalID < alignedBytesCount - (alignedDWsCount << 2)) - { - global uint8_t* dstByteRem = (global uint8_t*)(dstAlignedPart + alignedDWsCount); - global uint8_t* srcByteRem = (global uint8_t*)(srcAlignedPart + alignedDWsCount); - dstByteRem[globalID] = srcByteRem[globalID]; - } - } - } - - // copy to dst below aligned up to chacheline - { - uint misalignmentBytesSize = (4 - (((uint)dst) & /*bytes in DW*/3)) & 3; - if (misalignmentBytesSize) - { - if (globalID < misalignmentBytesSize) - { - dst[globalID] = src[globalID]; - } - dst += misalignmentBytesSize; - src += misalignmentBytesSize; - } - - uint misalignmentDWSize = (CACHELINE_SIZE - (((uint)dst) & (CACHELINE_SIZE - 1))) & (CACHELINE_SIZE - 1); - if (misalignmentDWSize) - { - if (globalID < (misalignmentDWSize >> 2)) - { - ((global uint*)dst)[globalID] = ((global uint*)src)[globalID]; - } - } - } -} - -#define CACHELINE_SIZE 64 -#define CACHELINE_PER_BLOCK 4 -#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK; - -GRL_INLINE -global const char *getInstanceDataToCopy(global const char *array, global const uint64_t *arrayOfPtrs, const uint byteOffset) -{ - if (array != NULL) - { - return array + byteOffset; - } - else - { - return (global char *)arrayOfPtrs[byteOffset >> 6]; - } -} - -// assummed: 
-// dst is always 64 bytes alligned -// size is always multiply of 64 bytes (size of InstanceDesc is always 64 bytes) -GRL_INLINE -void copyInstances(global char *dst, global const char *array, global const uint64_t *arrayOfPtrs, const uint64_t size, const uint numGroups) -{ - uint taskId = get_group_id(0); - - uint blockedSize = (size) & (~(BLOCK_SIZE - 1)); - - uint cachelinedTailOffset = blockedSize; - uint cachelinedTailSize = (size - cachelinedTailOffset) & (~(CACHELINE_SIZE - 1)); - - uint tailCacheLines = cachelinedTailSize >> 6; // divide by CACHELINE_SIZE - uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups - 1))); - if (reversedTaskId < tailCacheLines) - { - uint byteOffset = cachelinedTailOffset + (reversedTaskId * CACHELINE_SIZE); - global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset); - CopyCacheLine(dst + byteOffset, src); - } - - uint numBlocks = blockedSize >> 8; - while (taskId < numBlocks) - { - uint byteOffset = (taskId * BLOCK_SIZE); - - for (uint cl = 0; cl < CACHELINE_PER_BLOCK; cl++) - { - global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset); - CopyCacheLine(dst + byteOffset, src); - byteOffset += CACHELINE_SIZE; - } - - taskId += numGroups; - } -} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/misc.cl b/src/intel/vulkan/grl/gpu/misc.cl deleted file mode 100644 index d32c8267b73..00000000000 --- a/src/intel/vulkan/grl/gpu/misc.cl +++ /dev/null @@ -1,367 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "api_interface.h" -#include "common.h" -#include "instance.h" -#include "misc_shared.h" -#include "mem_utils.h" - -#define DBG(x) -#define ENABLE_CHECKS 0 - -#define CACHELINE_SIZE 64 -#define CACHELINE_PER_BLOCK 4 -#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK; - -GRL_INLINE -uint32_t getGeomDescPrimitiveCountAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index) -{ - return (uint32_t)GRL_get_primitive_count(&geomDesc[index]); -} - -GRL_INLINE -uint32_t getGeomDescTypeAndFlagsAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index) -{ - return (uint32_t)GRL_get_Type(&geomDesc[index]) | - (((uint32_t)GRL_get_Flags(&geomDesc[index])) << 16); -} - -GRL_INLINE -uint64_t getGeomDescAsUint64t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index) -{ - return (uint64_t)getGeomDescPrimitiveCountAsUint32t(geomDesc, index) | - (((uint64_t)getGeomDescTypeAndFlagsAsUint32t(geomDesc, index)) << 32); -} - -// assummed: -// dst is always 64 bytes alligned -GRL_INLINE -void copyGeoMetaData(global char* dst, global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t size, uint numGroups) -{ - uint taskId = get_group_id(0); - uint localId = get_sub_group_local_id(); - - uint cachelinedSize = (size) & (~(CACHELINE_SIZE-1)); - - uint reminderOffset = cachelinedSize; - uint reminderQWSize = (size - reminderOffset) >> 3; - - uint tailCacheLines = cachelinedSize >> 6; // divide by CACHELINE_SIZE - uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups-1))); - if (reversedTaskId == tailCacheLines && localId < reminderQWSize) - { - uint reminderOffsetQW = reminderOffset >> 3; - global uint64_t* dstQW = (global uint64_t*)(dst); - dstQW[localId + reminderOffsetQW] = getGeomDescAsUint64t(geomDesc, localId + reminderOffsetQW); - } - - uint numCacheLines = cachelinedSize >> 6; - while (taskId < numCacheLines) - { - uint byteOffset = taskId * CACHELINE_SIZE; - uint geoIdFromOffset = 
(byteOffset >> 3) + (localId >> 1); - - uint32_t data = 0; - if (localId & 1) - { - data = getGeomDescTypeAndFlagsAsUint32t(geomDesc, geoIdFromOffset); - } - else - { - data = getGeomDescPrimitiveCountAsUint32t(geomDesc, geoIdFromOffset); - } - CacheLineSubgroupWrite(dst + byteOffset, data); - - taskId += numGroups; - } -} - -GRL_INLINE -uint groupCountForInstancesCopySize(uint size) -{ - return (size >> 8) + 3; -} - -GRL_INLINE -uint groupCountForGeoMetaDataCopySize(uint size) -{ - return (size >> 6) + 1; -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_instances(global char* dest, global char* instancesArray, uint64_t size) -{ - // global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); - copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_instances_indirect(global char* dest, global char* instancesArray, global const struct IndirectBuildRangeInfo* const indirect_data) -{ - uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); - instancesArray += indirect_data->primitiveOffset; - uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; - if (tid == 0) - { - struct BVHBase* bvh = (struct BVHBase*)dest; - bvh->Meta.instanceCount = indirect_data->primitiveCount; - } - copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_instance_ptrs(global char* dest, global uint64_t* arrayOfPtrs, uint64_t size) -{ - //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); - copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_instance_ptrs_indirect(global char* dest, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data) -{ - uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); - arrayOfPtrs += indirect_data->primitiveOffset; - uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; - if (tid == 0) - { - struct BVHBase* bvh = (struct BVHBase*)dest; - bvh->Meta.instanceCount = indirect_data->primitiveCount; - } - copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_instances_base_ptr(global BVHBase* bvh, global char* instancesArray, uint64_t size) -{ - global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); - copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_instances_base_ptr_indirect(global BVHBase* bvh, global char* 
instancesArray, global struct IndirectBuildRangeInfo const * const indirect_data) -{ - global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart); - uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); - instancesArray += indirect_data->primitiveOffset; - copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size)); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_instance_ptrs_base_ptr(global BVHBase* bvh, global uint64_t* arrayOfPtrs, uint64_t size) -{ - global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart); - copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_instance_ptrs_base_ptr_indirect(global BVHBase* bvh, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data) -{ - global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart); - uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc); - arrayOfPtrs += indirect_data->primitiveOffset; - copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size)); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel copy_geo_meta_data(global char* dest, global char* src, uint64_t size) -{ - //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.geoDescsStart); - global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc = (global GRL_RAYTRACING_GEOMETRY_DESC *)((unsigned long)src); - copyGeoMetaData(dest, geomDesc, size, groupCountForGeoMetaDataCopySize(size)); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__( ( reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 ) ) ) -__attribute__( ( intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH ) ) ) -void kernel copy_geo_descs_indirect_build(global char* dest, global char* src, global struct IndirectBuildRangeInfo const * const indirect_data, uint numGeometries) -{ - uint32_t gid = get_local_id(0) + get_group_id(0) * get_local_size(0); - if (gid < numGeometries) { - global GRL_RAYTRACING_GEOMETRY_DESC* dstDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(dest); - global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(src); - - GRL_RAYTRACING_GEOMETRY_DESC geo = srcDesc[gid]; - - uint primitiveCount = indirect_data[gid].primitiveCount; - uint primitiveOffset = indirect_data[gid].primitiveOffset; - uint firstVertex = indirect_data[gid].firstVertex; - uint transformOffset = indirect_data[gid].transformOffset; - - if (srcDesc[gid].Type == GEOMETRY_TYPE_TRIANGLES) - { - if (geo.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE) - { - geo.Desc.Triangles.VertexCount = primitiveCount * 3; - geo.Desc.Triangles.pVertexBuffer += primitiveOffset - + firstVertex * geo.Desc.Triangles.VertexBufferByteStride; - } - else - { - geo.Desc.Triangles.IndexCount = primitiveCount * 3; - geo.Desc.Triangles.pIndexBuffer += primitiveOffset; - geo.Desc.Triangles.pVertexBuffer += firstVertex * geo.Desc.Triangles.VertexBufferByteStride; - } - if (geo.Desc.Triangles.pTransformBuffer) { - geo.Desc.Triangles.pTransformBuffer += transformOffset; - } - } else { - // 
GEOMETRY_TYPE_PROCEDURAL - geo.Desc.Procedural.AABBCount = primitiveCount; - geo.Desc.Procedural.pAABBs_GPUVA += primitiveOffset; - } - - dstDesc[gid] = geo; - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) void kernel batched_init_globals(global struct BatchedInitGlobalsData *data) -{ - uint groupID = get_group_id(0); - - struct BatchedInitGlobalsData entry = data[groupID]; - - global struct Globals* globals = (global struct Globals*)entry.p_build_globals; - global char *bvh_mem = (global char*)entry.p_bvh_buffer; - uint numPrimitives = entry.numPrimitives; - uint numGeometries = entry.numGeometries; - uint numInstances = entry.numInstances; - uint instance_descs_start = entry.instance_descs_start; - uint geo_meta_data_start = entry.geo_meta_data_start; - uint node_data_start = entry.node_data_start; - uint quad_data_start = entry.leaf_data_start; - uint instance_data_start = entry.leaf_data_start; - uint procedural_data_start = entry.procedural_data_start; - uint back_pointer_start = entry.back_pointer_start; - uint build_record_start = entry.leaf_data_start; - uint totalBytes = entry.sizeTotal; - uint leafPrimType = entry.leafType; - uint leafSize = entry.leafSize; - - uint root_node_offset = node_data_start; - struct BVHBase *base = (struct BVHBase *)bvh_mem; - - base->Meta.instanceCount = numInstances; - base->Meta.geoCount = numGeometries; - base->Meta.instanceDescsStart = instance_descs_start; - base->Meta.geoDescsStart = geo_meta_data_start; - base->Meta.allocationSize = totalBytes; - // This doesnt work correctly - //ERROR_INFO initErr = { 0, 0, 0, 0xAAABBAAA }; - //base->Meta.errors = initErr; - base->Meta.errors.type = 0; - base->Meta.errors.offset_in_BVH = 0; //in 64B units - base->Meta.errors.when = 0; - base->Meta.errors.reserved = 0xAAABBAAA; - - base->nodeDataCur = node_data_start / 64; - base->quadLeafStart = quad_data_start / 64; - base->quadLeafCur = quad_data_start / 64; - base->instanceLeafStart = instance_data_start / 64; - base->instanceLeafEnd = instance_data_start / 64; - base->proceduralDataStart = procedural_data_start / 64; - base->proceduralDataCur = procedural_data_start / 64; - base->backPointerDataStart = back_pointer_start / 64; - base->refitTreeletsDataStart = totalBytes / 64; - base->refitStartPointDataStart = totalBytes / 64; - base->BVHDataEnd = totalBytes / 64; - base->refitTreeletCnt = 0; - base->refitTreeletCnt2 = 0; - base->rootNodeOffset = root_node_offset; - - base->fatLeafCount = 0; - base->fatLeafTableStart = entry.fatleaf_table_start / 64; - base->innerCount = 0; - base->innerTableStart = entry.innernode_table_start / 64; - base->quadLeftoversCountNewAtomicUpdate = 0; - base->quadTableSizeNewAtomicUpdate = 0; - base->quadIndicesDataStart = entry.quad_indices_data_start / 64; - - if (back_pointer_start != totalBytes) - { - BackPointers* back_pointers = BVHBase_GetBackPointers(base); - uint root_node_idx = root_node_offset - node_data_start; - global uint *root_node_backpointer = (global uint *)InnerNode_GetBackPointer(back_pointers,root_node_idx); - *root_node_backpointer = ((uint)-1) << 6; - } - - AABB3f_init(&base->Meta.bounds); - AABB_init(&globals->centroidBounds); - - globals->build_record_start = build_record_start; - - globals->numBuildRecords = 0; - globals->numBuildRecords_extended = 0; - globals->numPrimitives = numPrimitives; - globals->numSplittedPrimitives = 0; - globals->sync = 0; - globals->probThreshold = 0.0f; - globals->leafPrimType = leafPrimType; - globals->leafSize = leafSize; -} - - - 
-// This is temporary WA for mock in DXR -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel copy_mock(global char *dest, - global char *src, - uint32_t size) -{ - uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0); - uint32_t globalSize = get_num_groups(0) * get_local_size(0); - for (uint32_t i = globalId; i < size; i += globalSize) - { - dest[i] = src[i]; - } -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(32, 1, 1))) -void kernel mem_set(global char *dest, - dword byte, - dword size) -{ - uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0); - if (globalId < size) - { - dest[globalId] = (char)byte; - } -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(32, 1, 1))) -void kernel mem_set_size_ptr(global char *dest, - dword byte, - global qword* sizePtr) -{ - uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0); - if (globalId < *sizePtr) - { - dest[globalId] = (char)byte; - } -} diff --git a/src/intel/vulkan/grl/gpu/misc.grl b/src/intel/vulkan/grl/gpu/misc.grl deleted file mode 100644 index cb98534afb4..00000000000 --- a/src/intel/vulkan/grl/gpu/misc.grl +++ /dev/null @@ -1,278 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module misc; - -kernel_module misc("misc.cl") -{ - kernel opencl_kernel_batched_init_globals < kernelFunction="batched_init_globals" >; - kernel opencl_kernel_copy_instances < kernelFunction="copy_instances" >; - kernel opencl_kernel_copy_instances_indirect < kernelFunction="copy_instances_indirect" >; - kernel opencl_kernel_copy_instance_ptrs < kernelFunction="copy_instance_ptrs" >; - kernel opencl_kernel_copy_instance_ptrs_indirect < kernelFunction="copy_instance_ptrs_indirect" >; - kernel opencl_kernel_copy_instances_base_ptr < kernelFunction="copy_instances_base_ptr" >; - kernel opencl_kernel_copy_instances_base_ptr_indirect < kernelFunction="copy_instances_base_ptr_indirect" >; - kernel opencl_kernel_copy_instance_ptrs_base_ptr < kernelFunction="copy_instance_ptrs_base_ptr" >; - kernel opencl_kernel_copy_instance_ptrs_base_ptr_indirect < kernelFunction="copy_instance_ptrs_base_ptr_indirect" >; - kernel opencl_kernel_copy_geo_meta_data < kernelFunction="copy_geo_meta_data" >; - kernel opencl_kernel_copy_geo_descs_indirect_build < source="misc.cl", kernelFunction="copy_geo_descs_indirect_build" >; - kernel opencl_kernel_copy_mock < kernelFunction="copy_mock" >; - kernel opencl_kernel_memset < kernelFunction="mem_set" >; - kernel opencl_kernel_memset_size_ptr < kernelFunction="mem_set_size_ptr" >; -} - -import struct MKBuilderState "structs.grl"; -import struct MKSizeEstimate "structs.grl"; - - -metakernel batched_init_globals( - qword p_data, - dword numWgs) -{ - dispatch opencl_kernel_batched_init_globals(numWgs,1,1) args(p_data); -} - -metakernel copy_instances( - qword bvh_buffer, - qword instanceDescsBuffer, - qword totalSizeToCopy, - dword numThreads) -{ - dispatch opencl_kernel_copy_instances (numThreads, 1, 1) args( - bvh_buffer, - instanceDescsBuffer, - totalSizeToCopy); -} - -metakernel -copy_instances_indirect( qword bvh_buffer, qword instanceDescsBuffer, qword indirectBuildRangeInfo ) -{ - - define num_groups REG0; - define C_2 REG2; - define C_3 REG3; - - C_2 = 2; - C_3 = 3; - - // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions - // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 
- num_groups = load_dword( indirectBuildRangeInfo ); - num_groups = num_groups >> C_2; - num_groups = num_groups + C_3; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_copy_instances_indirect args( - bvh_buffer, - instanceDescsBuffer, - indirectBuildRangeInfo); -} - -metakernel copy_instance_ptrs( - qword bvh_buffer, - qword instanceDescPtrsBuffer, - qword totalSizeToCopy, - dword numThreads) -{ - dispatch opencl_kernel_copy_instance_ptrs (numThreads, 1, 1) args( - bvh_buffer, - instanceDescPtrsBuffer, - totalSizeToCopy); -} - -metakernel copy_instance_ptrs_indirect( - qword bvh_buffer, - qword instanceDescPtrsBuffer, - qword indirectBuildRangeInfo) -{ - define num_groups REG0; - define C_2 REG2; - define C_3 REG3; - - C_2 = 2; - C_3 = 3; - - // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions - // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 - num_groups = load_dword( indirectBuildRangeInfo ); - num_groups = num_groups >> C_2; - num_groups = num_groups + C_3; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_copy_instance_ptrs_indirect args( - bvh_buffer, - instanceDescPtrsBuffer, - indirectBuildRangeInfo); -} - -metakernel copy_instances_base_ptr( - qword bvh_buffer, - qword instanceDescsBuffer, - qword totalSizeToCopy, - dword numThreads) -{ - dispatch opencl_kernel_copy_instances_base_ptr (numThreads, 1, 1) args( - bvh_buffer, - instanceDescsBuffer, - totalSizeToCopy); -} - -metakernel copy_instances_base_ptr_indirect( - qword bvh_buffer, - qword instanceDescsBuffer, - qword indirectBuildRangeInfo) -{ - define num_groups REG0; - define C_2 REG2; - define C_3 REG3; - - C_2 = 2; - C_3 = 3; - - // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions - // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 - num_groups = load_dword( indirectBuildRangeInfo ); - num_groups = num_groups >> C_2; - num_groups = num_groups + C_3; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_copy_instances_base_ptr_indirect args( - bvh_buffer, - instanceDescsBuffer, - indirectBuildRangeInfo); -} - -metakernel copy_instance_ptrs_base_ptr( - qword bvh_buffer, - qword instanceDescPtrsBuffer, - qword totalSizeToCopy, - dword numThreads) -{ - dispatch opencl_kernel_copy_instance_ptrs_base_ptr (numThreads, 1, 1) args( - bvh_buffer, - instanceDescPtrsBuffer, - totalSizeToCopy); -} - -metakernel copy_instance_ptrs_base_ptr_indirect( - qword bvh_buffer, - qword instanceDescPtrsBuffer, - qword indirectBuildRangeInfo) -{ - define num_groups REG0; - define C_2 REG2; - define C_3 REG3; - - C_2 = 2; - C_3 = 3; - - // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions - // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3 - num_groups = load_dword( indirectBuildRangeInfo ); - num_groups = num_groups >> C_2; - num_groups = num_groups + C_3; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_copy_instance_ptrs_base_ptr_indirect args( - bvh_buffer, - instanceDescPtrsBuffer, - indirectBuildRangeInfo); -} - -metakernel copy_geo_descs( - qword private_dest, - qword transient_src, - qword indirectBuildRangeInfo, - dword numGeometries) -{ - - define num_groups (numGeometries + 16 - 1) / 16; - dispatch 
opencl_kernel_copy_geo_descs_indirect_build(num_groups, 1, 1) args( - private_dest, - transient_src, - indirectBuildRangeInfo, - numGeometries); -} - -metakernel copy_geo_meta_data( - qword bvh_buffer, - qword geomdesc_buffer, - qword totalSizeToCopy, - dword numThreads) -{ - dispatch opencl_kernel_copy_geo_meta_data (numThreads, 1, 1) args( - bvh_buffer, - geomdesc_buffer, - totalSizeToCopy); -} - - -const COPY_MOCK_GROUP_SIZE = 16; - -metakernel copy_mock( - qword dest, - qword src, - dword size) -{ - define num_groups (size + COPY_MOCK_GROUP_SIZE - 1) / COPY_MOCK_GROUP_SIZE; - dispatch opencl_kernel_copy_mock(num_groups, 1, 1) args( - dest, - src, - size); -} - -metakernel memset( - qword dest, - dword byte, - dword size) -{ - define num_groups (size + 32 - 1) / 32; - dispatch opencl_kernel_memset(num_groups, 1, 1) args( - dest, - byte, - size); -} - -metakernel memset_size_ptr( - qword dest, - dword byte, - qword sizePtr) -{ - define byteSize REG0; - define C_32 REG1; C_32 = 32; - define C_1 REG2; C_1 = 1; - define C_4 REG3; C_4 = 4; - define numGroupsRqd REG4; - - byteSize = load_dword(sizePtr); - - numGroupsRqd = byteSize + C_32; - numGroupsRqd = numGroupsRqd - C_1; - numGroupsRqd = numGroupsRqd >> C_4; - numGroupsRqd = numGroupsRqd >> C_1; - - DISPATCHDIM_X = numGroupsRqd.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_kernel_memset_size_ptr args( - dest, - byte, - sizePtr); -} diff --git a/src/intel/vulkan/grl/gpu/misc_legacy.cl b/src/intel/vulkan/grl/gpu/misc_legacy.cl deleted file mode 100644 index a464e89537c..00000000000 --- a/src/intel/vulkan/grl/gpu/misc_legacy.cl +++ /dev/null @@ -1,386 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "input_client_structs.h" -#include "common.h" -#include "instance.h" - -#define DBG(x) -#define ENABLE_CHECKS 0 - -/* - - This kernel implements a exclusive scan addition operation. The - implementation currently only uses one DSS. - - */ -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -parallel_scan_exclusive_add(global uint *input, - global uint *output, - const uint N) -{ - const uint j = get_local_id(0); - const uint J = get_local_size(0); - const uint BLOCKSIZE = (N + J - 1) / J; - const uint start = min((j + 0) * BLOCKSIZE, N); - const uint end = min((j + 1) * BLOCKSIZE, N); - - uint base = 0; - for (uint i = start; i < end; i++) - base += input[i]; - - base = work_group_scan_exclusive_add(base); - - uint accu = 0; - for (uint i = start; i < end; i++) - { - output[i] = base + accu; - accu += input[i]; - } -} - -/* - - This kernel implements a exclusive scan addition operation that can use the entire GPU. 
- - */ -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -parallel_scan_exclusive_add_phase0(global uint *input, - global uint *output, - global uint *prefix_sums, - const uint N) -{ - const uint local_size = get_local_size(0); - const uint numTasks = get_num_groups(0); - const uint groupID = get_group_id(0); - const uint localID = get_local_id(0); - const uint global_startID = (groupID + 0) * N / numTasks; - const uint global_endID = (groupID + 1) * N / numTasks; - - uint base = 0; - for (uint i = global_startID + localID; i < global_endID; i += local_size) - base += input[i]; - - base = work_group_reduce_add(base); - - if (localID == 0) - { - prefix_sums[groupID] = base; - printf("%d -> %d \n", groupID, base); - } -} - -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -parallel_scan_exclusive_add_phase1(global uint *input, - global uint *output, - global uint *prefix_sums, - const uint N) -{ - const uint local_size = get_local_size(0); - const uint numTasks = get_num_groups(0); - const uint groupID = get_group_id(0); - const uint localID = get_local_id(0); - const uint global_startID = (groupID + 0) * N / numTasks; - const uint global_endID = (groupID + 1) * N / numTasks; - const uint local_range = global_endID - global_startID; - - uint global_base = 0; - for (uint i = 0; i < groupID; i++) - global_base += prefix_sums[i]; - - const uint j = get_local_id(0); - const uint J = get_local_size(0); - const uint BLOCKSIZE = (local_range + J - 1) / J; - const uint startID = (j + 0) * local_range / J + global_startID; - const uint endID = (j + 1) * local_range / J + global_startID; - - uint base = 0; - for (uint i = startID; i < endID; i++) - base += input[i]; - - base = work_group_scan_exclusive_add(base); - - uint accu = 0; - for (uint i = startID; i < endID; i++) - { - output[i] = global_base + base + accu; - accu += input[i]; - } -} - -/* ========================================================================= */ -/* ============================== STATISTICS =============================== */ -/* ========================================================================= */ - -/* ====== STATS config ====== */ - -#define ENABLE_STAT_CHECKS 1 -#define DBG_STATS(x) - -__attribute__((reqd_work_group_size(256, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -printBVHStatistics(global struct Globals *globals, - global char *bvh_mem, - global struct StatStackEntry *global_stack0, - global struct StatStackEntry *global_stack1, - const uint presplit) -{ - const uint globalID = get_global_id(0); - const uint localID = get_local_id(0); - const uint local_size = get_local_size(0); - - struct BVHBase *base = (struct BVHBase *)bvh_mem; - const uint root = base->rootNodeOffset; - - local uint stack_items[2]; - local uint iterations; - - struct AABB root_aabb = getAABB_QBVHNodeN((global struct QBVHNodeN *)(bvh_mem + root)); - root_aabb = conservativeAABB(&root_aabb); - const float root_area = AABB_halfArea(&root_aabb); - - global struct QBVHNodeN *root_node = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset); - - if (root_node->type != BVH_INTERNAL_NODE) - { - const uint numChildren = getNumChildren_QBVHNodeN(root_node); - const uint current = root; - for (uint i = 0; i < numChildren; i++) - { - struct AABB aabb = extractAABB_QBVHNodeN(root_node, i); - const float area = AABB_halfArea(&aabb); - - global_stack0[i].node = current + root_node->offset * 64 + i * 
sizeof(struct Quad); - global_stack0[i].type = root_node->type; - global_stack0[i].area = area; - global_stack0[i].aabb = aabb; - global_stack0[i].depth = 0; - } - stack_items[0] = numChildren; - stack_items[1] = 0; - } - else - { - global_stack0[0].node = root; - global_stack0[0].type = root_node->type; - global_stack0[0].area = root_area; - global_stack0[0].aabb = root_aabb; - global_stack0[0].depth = 1; - stack_items[0] = 1; - stack_items[1] = 0; - } - - const uint maxInnerNodeOffset = globals->node_mem_allocator.cur; - const uint maxLeafNodeOffset = globals->quad_mem_allocator.cur; - - DBG_STATS(if (localID == 0) printf("diff %d \n", (globals->node_mem_allocator_cur - globals->node_mem_allocator_start) / 64)); - - iterations = 0; - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - float sah_nodes = 0.0f; - float sah_leaves = 0.0f; - uint leaves = 0; - uint inner_nodes = 0; - uint max_depth = 0; - uint leaf_items = 0; - uint inner_nodes_valid_children = 0; - - while (1) - { - work_group_barrier(CLK_GLOBAL_MEM_FENCE); - const uint buffer_index = (iterations % 2) == 0 ? 0 : 1; - global struct StatStackEntry *input_global_stack = buffer_index == 0 ? global_stack0 : global_stack1; - global struct StatStackEntry *output_global_stack = buffer_index == 0 ? global_stack1 : global_stack0; - - const uint local_stack_items = stack_items[buffer_index]; - stack_items[1 - buffer_index] = 0; - - DBG_STATS(if (globalID == 0) printf("iterations %d local_stack_items %d \n", iterations, local_stack_items)); - - if (local_stack_items == 0) - break; - //if (iterations == 5) break; - - work_group_barrier(CLK_GLOBAL_MEM_FENCE); - - if (globalID == 0) - iterations++; - - for (uint sindex = localID; sindex < local_stack_items; sindex += local_size) - { - - uint current = input_global_stack[sindex].node; - uint type = input_global_stack[sindex].type; - float current_area = input_global_stack[sindex].area; - struct AABB current_aabb = input_global_stack[sindex].aabb; - uint current_depth = input_global_stack[sindex].depth; - - //printf("localID %d sindex %d current %d type %d local_stack_items %d \n",localID,sindex,current,type,local_stack_items); - - max_depth = max(max_depth, current_depth); - - if (type == BVH_QUAD_NODE) - { - unsigned int prims = 1; //getNumLeafPrims(current); - if (prims > BVH_LEAF_N_MAX) - printf("too many items in leaf %d \n", prims); - unsigned int prims_offset = current; //getLeafOffset(current); - //printf("prims_offset %d \n",prims_offset); - - leaf_items += prims; - sah_leaves += current_area; - leaves++; -#if ENABLE_STAT_CHECKS == 1 - struct AABB leafAABB; - AABB_init(&leafAABB); - - global struct Quad *quads = (global struct Quad *)(bvh_mem + prims_offset); - //printf("prims_offset %d \n",prims_offset); - - for (uint i = 0; i < prims; i++) - { - struct AABB quadAABB = getAABB_Quad(&quads[i]); - AABB_extend(&leafAABB, &quadAABB); - } - - if (!presplit && !AABB_subset(&leafAABB, ¤t_aabb)) - { - printf("leaf error: current %d depth %d \n", current, current_depth); - AABB_print(¤t_aabb); - printf("leaf bounds: \n"); - AABB_print(&leafAABB); - } -#endif - } - else if (type == BVH_INTERNAL_NODE) - { - inner_nodes++; - sah_nodes += current_area; - global struct QBVHNodeN *nodeN = (global struct QBVHNodeN *)(bvh_mem + current); - - uint children = 0; - for (uint i = 0; i < BVH_NODE_N6; i++) - { - if (nodeN->qbounds.lower_x[i] > nodeN->qbounds.upper_x[i]) - break; - children++; - } - //printf("children %d \n",children); - -#if ENABLE_STAT_CHECKS == 1 - if (children > BVH_NODE_N6 || children == 
0) - { - printf("#children not in valid range: %d offset %d localID %d \n", children, current, localID); - printQBVHNodeN(nodeN); - } - - if (nodeN->offset > globals->totalAllocatedMem || (int)nodeN->offset < 0) - { - printf("offset error %d \n", nodeN->offset); - } -#endif - - uint children_offset = atomic_add(&stack_items[1 - buffer_index], children); - - for (uint i = 0; i < children; i++) - { - inner_nodes_valid_children++; - - struct AABB aabb = extractAABB_QBVHNodeN(nodeN, i); - const float area = AABB_halfArea(&aabb); - - aabb = conservativeAABB(&aabb); - -#if 0 // ENABLE_STAT_CHECKS == 1 // FIXME: not clear whether parent child property still holds !!!! - - // if (aabb.lower.x == (float)(INFINITY)) - // { - // printf("aabb inf error %d current %d nodeN %d \n",i, current, children); - // break; - // } - - - if (!presplit && !AABB_subset(&aabb,¤t_aabb)) - { - printf("Parent: current %d depth %d children %d \n",current, current_depth, children); - AABB_print(¤t_aabb); - printf("Child %d: \n",i); - AABB_print(&aabb); - } -#endif - - uint dest_index = children_offset + i; - if (nodeN->type == BVH_QUAD_NODE) - { - output_global_stack[dest_index].node = current + nodeN->offset * 64 + i * sizeof(struct Quad); - if (output_global_stack[dest_index].node >= maxLeafNodeOffset) - { - printf("stack leaf offset error %d %d current %d %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64); - } - } - else if (nodeN->type == BVH_INTERNAL_NODE) - { - output_global_stack[dest_index].node = (current + nodeN->offset * 64 + i * sizeof(struct QBVHNodeN)); - if (output_global_stack[dest_index].node >= maxInnerNodeOffset) - { - printf("stack inner node offset error %d %d current %d %d maxInnerNodeOffset %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64, maxInnerNodeOffset); - } - } - - output_global_stack[dest_index].type = nodeN->type; - output_global_stack[dest_index].area = area; - output_global_stack[dest_index].aabb = aabb; - output_global_stack[dest_index].depth = current_depth + 1; - //printf("global_stack[dest_index].node %d global_stack[dest_index].type %d \n",global_stack[dest_index].node,global_stack[dest_index].type); - } - } - } - } - - sah_nodes = work_group_reduce_add(sah_nodes); - sah_leaves = work_group_reduce_add(sah_leaves); - leaves = work_group_reduce_add(leaves); - inner_nodes = work_group_reduce_add(inner_nodes); - max_depth = work_group_reduce_max(max_depth); - leaf_items = work_group_reduce_add(leaf_items); - inner_nodes_valid_children = work_group_reduce_add(inner_nodes_valid_children); - - if (globalID == 0) - { - /* - sah_nodes *= 1.0f / root_area; - sah_leaves *= 1.0f / root_area; - float sah = sah_nodes + sah_leaves; - - const uint globalLeafMemAllocatorOffset = globals->quad_mem_allocator.start; - const uint totalAllocatedMem = globals->totalAllocatedMem; - - printf("BVH_NODE_N6 %d BVH_LEAF_N_MIN %d BVH_LEAF_N_MAX %d \n",BVH_NODE_N6,BVH_LEAF_N_MIN,BVH_LEAF_N_MAX); - float node_util = 100.0f * (float)inner_nodes_valid_children / (inner_nodes * BVH_NODE_N6); - float leaf_util = 100.0f * (float)leaf_items / (leaves); - printf("allocators: node %d -> %d ; leaf %d -> %d \n",globals->node_mem_allocator_cur,globals->node_mem_allocator_start,globals->leaf_mem_allocator_cur,globals->leaf_mem_allocator_start); - printf("inner nodes %d leaves %d sah %f sah_node %f sah_leaves %f max_depth %d leaf_items %d node util %f leaf util %f (%f) 
\n",inner_nodes,leaves,sah,sah_nodes,sah_leaves,max_depth,leaf_items,node_util,leaf_util,(float)leaf_items / leaves); - uint node_mem = globals->node_mem_allocator_cur; - uint max_node_mem = globalLeafMemAllocatorOffset; - float node_mem_ratio = 100.0f * (float)node_mem / max_node_mem; - - uint leaf_mem = globals->leaf_mem_allocator.cur - globalLeafMemAllocatorOffset; - uint max_leaf_mem = totalAllocatedMem - globalLeafMemAllocatorOffset; - float leaf_mem_ratio = 100.0f * (float)leaf_mem / max_leaf_mem; - - uint total_mem = node_mem + leaf_mem; - float total_mem_ratio = 100.0f * (float)total_mem / totalAllocatedMem; - - printf("used node memory %d (%f) / used leaf memory %d (%f) / total memory used %d (%f) / total memory allocated %d \n",node_mem, node_mem_ratio, leaf_mem, leaf_mem_ratio, total_mem, total_mem_ratio, totalAllocatedMem); - */ - } -} diff --git a/src/intel/vulkan/grl/gpu/misc_shared.h b/src/intel/vulkan/grl/gpu/misc_shared.h deleted file mode 100644 index 218f2fa4291..00000000000 --- a/src/intel/vulkan/grl/gpu/misc_shared.h +++ /dev/null @@ -1,196 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// -// This file contains structure definitions shared by GRL OCL kernels and host code -// - -#pragma once - -#include "GRLGen12.h" - -GRL_NAMESPACE_BEGIN(GRL) -GRL_NAMESPACE_BEGIN(RTAS) -GRL_NAMESPACE_BEGIN(MISC) - -struct BatchedInitGlobalsData -{ - qword p_build_globals; - qword p_bvh_buffer; - dword numPrimitives; - dword numGeometries; - dword numInstances; - dword instance_descs_start; - dword geo_meta_data_start; - dword node_data_start; - dword leaf_data_start; - dword procedural_data_start; - dword back_pointer_start; - dword sizeTotal; - dword leafType; - dword leafSize; - dword fatleaf_table_start; - dword innernode_table_start; - dword quad_indices_data_start; -}; - -/// Header of debug buffer -/// -/// Header is placed at the begining of debug buffer. -/// After header there is circullar buffer space -typedef struct DebugBufferHeader -{ - /// Offset to begin of buffer (after header) - dword headStart; - /// Offset to free memory in buffer (used by gpu) - dword gpuHead; - /// Offset to end of data in buffer that is ready to read (read on cpu, set on gpu, might be behind gpuHeader) - dword cpuHead; - /// Flag for buffer overflow - dword overflow; - /// Total size of buffer - dword totalSize; - /// Padding needed because otherwise GPU overrides tail with cacheline flush - dword pad[11]; - /// Offset to begin of data in buffer - dword tail; -} DebugBufferHeader; - -enum InputDumpOperationType -{ - INPUT_DUMP_OP_NOP, - INPUT_DUMP_OP_BATCH, - INPUT_DUMP_OP_BUILD, - INPUT_DUMP_OP_UPDATE, - INPUT_DUMP_OP_CLONE, - INPUT_DUMP_OP_COMPACT, - INPUT_DUMP_OP_SERIALIZE, - INPUT_DUMP_OP_DESERIALIZE, - INPUT_DUMP_OP_END_BUFFER -}; - -// each operation starts with the same header structure and looks like this - -// some defined struct { <-----------------start -// OpHeader -// .... struct type specific data -// } -// ... 
auxilary data of variable len -// <-------------------------------------- end - indicated by endOfData -typedef struct OpHeader -{ - dword operationType; - dword endOfData; // offset to end of this primitive -} OpHeader; - -// header for batch operations -typedef struct BatchOpHeader -{ - OpHeader opHeader; -} BatchOpHeader; - -// interpretation for operationType INPUT_DUMP_OP_BATCH -typedef struct InputBatch -{ - BatchOpHeader header; - qword batchId; - dword vertexBufferDataSize; - dword firstContainedOpOffset; - - // layout of batch is as below, each line is 128B aligned: - - // - // InputBatch <-------------------------------- start - // optional: batchVertexData - // InputBuildDesc/InputCopy <------------------ start + firstContainedOpOffset - // optional: extra data of above token - // InputBuildDesc/InputCopy - // optional: extra data of above token - // ... - // InputBuildDesc/InputCopy - // optional: extra data of above token - // <-------------------------------------------- end = start + endOfData -} InputBatch; - -// for operationType: -// INPUT_DUMP_OP_BUILD, -// INPUT_DUMP_OP_UPDATE, -// followed by auxilary data of variable len -typedef struct InputBuild -{ - OpHeader header; - qword srcBvhPtr; - qword dstBvhPtr; - dword flags; - dword numGeos; - dword numInstances; - dword instArrayOfPtrs; -} InputBuild; - -// for operationType: -// INPUT_DUMP_OP_CLONE, -// INPUT_DUMP_OP_COMPACT, -// INPUT_DUMP_OP_SERIALIZE, -// -// Not for INPUT_DUMP_OP_DESERIALIZE! -typedef struct InputCopy -{ - OpHeader header; - qword srcBvhPtr; - qword dstBvhPtr; -} InputCopy; - -// for INPUT_DUMP_OP_DESERIALIZE -// decode for debug tools follows this format -typedef struct InputDeserialize -{ - OpHeader header; - qword dstBvhPtr; -} InputDeserialize; - -typedef struct InputBatchPtrs -{ - qword dumpDst; - qword globalDumpBuffer; - qword nonVertexDataStart; - dword vertexBuffersSize; - dword totalSize; -} InputBatchPtrs; - -enum OutputDumpOperationType -{ - OUTPUT_DUMP_OP_NOP, - OUTPUT_DUMP_OP_BATCH, - OUTPUT_DUMP_OP_DATA, - OUTPUT_DUMP_OP_END_BUFFER -}; - -// interpretation for operationType OUTPUT_DUMP_OP_BATCH -typedef struct OutputBatch { - BatchOpHeader header; - qword batchId; - dword firstContainedOpOffset; -} OutputBatch; - -// interpretation for operationType OUTPUT_DUMP_OP_DATA -typedef struct OutputData -{ - OpHeader header; - qword srcBvhPtr; -} OutputData; - -typedef struct OutputBatchPtrs -{ - qword dumpDst; - qword dataStart; - dword dataSize; - dword totalSize; -} OutputBatchPtrs; - -GRL_NAMESPACE_END(MISC) -GRL_NAMESPACE_END(RTAS) -GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/morton/morton_common.h b/src/intel/vulkan/grl/gpu/morton/morton_common.h deleted file mode 100644 index 2beb7a1aff3..00000000000 --- a/src/intel/vulkan/grl/gpu/morton/morton_common.h +++ /dev/null @@ -1,245 +0,0 @@ -// -// Copyright (C) 2009-2022 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "common.h" - -#define MORTON_DEBUG_CHECKS 0 -#define MORTON_VERBOSE_LOG 0 - -GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift ) -{ -#if 0 // turn off, because current hierarchy build requires full sort - // Difference between max iterations needed for LSB sorting and - // number of iterations needed for LSB sorting without primIDs - // This indicates how many of first iterations would be skipped in LSB - return 8 - (8 - (shift >> 3)); -#else - return 0; -#endif -} - -typedef struct BuildRecordLocalMortonFlattener -{ - unsigned int leftChild; // global - unsigned int 
rightChild; // global - unsigned int rangeStart; // global - unsigned int local_parent_index__numItems; -} BuildRecordLocalMortonFlattener; - -// TODO: Currently sizeof UPerNodeData is 32, AABB struct allocates more data than needed and can be reduced -typedef union UPerNodeData { - float4 four_DWs; - BuildRecordLocalMortonFlattener buildRecord; - MortonFlattenedBoxlessNode boxlessNode; - struct AABB box; -} UPerNodeData; - -GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn) -{ - return bn.childOffset_type >> 6; -} - -GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn) -{ - return bn.childOffset_type & ((1<<6) -1); -} - -GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane) -{ - short lane_used = index % get_sub_group_size(); - short shift = (index / get_sub_group_size()) * get_sub_group_size(); - if (lane_used == lane) { - *arr |= (val << shift); - } -} - -GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane) -{ - short r = 0; - short lane_used = index % get_sub_group_size(); - short shift = (index / get_sub_group_size()) * get_sub_group_size(); - r = arr >> shift; - r = sub_group_broadcast(r, lane_used); - return r; -} - -GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst) -{ - if (lane < count) - { - dst[lane]=(ushort)(arr & 0xFFFF); - short hi_idx = lane + get_sub_group_size(); - if (hi_idx < count) { - dst[hi_idx] = (ushort)(arr >> 16); - } - } -} - - -GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane) -{ - if (lane < count) - { - *arr = src[lane]; - short hi_idx = lane + get_sub_group_size(); - if (hi_idx < count) { - *arr |= ((uint)(src[hi_idx])) << 16u; - } - } -} - -GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane) -{ - short lane_used = index % get_sub_group_size(); - short shift = (index / get_sub_group_size()) * get_sub_group_size(); - if (lane_used == lane) { - uint rem_val = (*arr) & (0xFFFF0000 >> shift); //calculate the ramaining other half in the uint - *arr = (val << shift) | rem_val; - } -} - -GRL_INLINE void SUBGROUP_refit_bottom_up_local( - uniform struct QBVHNodeN* globalNodeData, - uniform struct BackPointers* backPointers, - uniform uint treeletRootGlobalIndex, - uniform uint globalBaseForInternalNodes, - varying ushort lane, - uniform local union UPerNodeData* local_nodes, - varying uint sg_bu_startpoints, - uniform uint sg_bu_startpoints_cnt) -{ - if(sg_bu_startpoints_cnt == 0) - return; - - const uint head_lane = 0; - uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane); - - uniform uint prev_loc_index = 0; - uniform struct AABB child_aabb; // this carries reduced aabb between loop turns - - uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer; - - while (curNodeIndex != 0) - { - uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode); - uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode); - varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane; - - uint numChildren = BackPointer_GetNumChildren(backpointer); - if (child_loc_idx != prev_loc_index && - lane < numChildren) - { - child_aabb = local_nodes[child_loc_idx].box; - } - else if (lane >= numChildren) { - AABB_init(&child_aabb); - child_aabb.lower.w = as_float(0u); - } - - // TODO: perNode data could hold 7 dwords per node 
instead of 8 as long as we keep it in SLM - struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb); - reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 ); - - uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w)); - reduced_bounds.lower.w = as_float((uint)instMask); - uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0); - local uint* pbox = (local uint*)(local_nodes+ curNodeIndex); - if (lane < 8) - { - pbox[lane] = reduce_bounds_lane; - } - - uint global_node_idx = globalBaseForInternalNodes + curNodeIndex; - /* get bounds of all children from child nodes directly */ - struct QBVHNodeN* qnode = globalNodeData + global_node_idx; - subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false); - child_aabb = reduced_bounds; - uint parentIndex = BackPointer_GetParentIndex(backpointer); - - write_mem_fence(CLK_LOCAL_MEM_FENCE); - - if (lane == 0) - { - backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer)); - uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex; - uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3); - - /* set global back pointer */ - *InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer; - -#if MORTON_VERBOSE_LOG - printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n", - global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx); -#endif - } - - backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane); - prev_loc_index = curNodeIndex; - curNodeIndex = parentIndex; - - /* if all children got refitted, then continue */ - uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7; - uniform uint numChildrenTotal = (backpointer >> 3) & 0x7; - if (numChildrenRefitted != numChildrenTotal) - { - if(sg_bu_startpoints_cnt) - { - curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane); - backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer; - } - else - return; - } - } - - // process root of the treelet - { - -#if MORTON_DEBUG_CHECKS - if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n"); -#endif - - uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode); - varying uint child_loc_idx = lead_child_loc_offset + 0 + lane; - uint numChildren = BackPointer_GetNumChildren(backpointer); - - if (child_loc_idx != prev_loc_index && - lane < numChildren) - { - child_aabb = local_nodes[child_loc_idx].box; - } - else if (lane >= numChildren) { - AABB_init(&child_aabb); - child_aabb.lower.w = as_float(0u); - } - - // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM - uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w)); - uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode); - uint global_node_idx = treeletRootGlobalIndex; - uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset; - - /* get bounds of all children from child nodes directly */ - struct QBVHNodeN* qnode = globalNodeData + global_node_idx; - - subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, 
instMask, qnode, false); - - /* reset refit counter for next refit */ - if (lane == 0) - { - /* set global back pointer */ - *InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u); - - // TODO: Move AABBs to separate buffer, but for now communicate bottom-tip boxes through qnodes - -#if MORTON_VERBOSE_LOG - printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n", - curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt); -#endif - } - } -} diff --git a/src/intel/vulkan/grl/gpu/morton/phase0.cl b/src/intel/vulkan/grl/gpu/morton/phase0.cl deleted file mode 100644 index 2fa91c214e1..00000000000 --- a/src/intel/vulkan/grl/gpu/morton/phase0.cl +++ /dev/null @@ -1,400 +0,0 @@ -// -// Copyright (C) 2009-2022 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "libs/lsc_intrinsics.h" -#include "morton/morton_common.h" - -GRL_INLINE void SUBGROUP_create_node_phase0( - uniform global struct Globals* globals, - uniform global struct BinaryMortonCodeHierarchy* bnodes, - uniform global char* bvh_mem, - uniform global uint *global_refit_startpoints, - uniform uint rID, - uniform local uint* local_numRecords, - uniform local uint* local_QNodeOffset, - uniform global struct BuildRecordMorton* records, - uniform struct BuildRecordMorton current, - uniform local uint* local_startpoints_num) -{ - uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - uniform const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; - uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); - uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); - - varying ushort lane = get_sub_group_local_id(); - - /* initialize child array */ - uniform uint numChildren = 2; - varying struct BuildRecordMorton sg_children; - sg_children.items = 0; - sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild; - - if ( lane < numChildren ) - sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID ); - - /* fill QBVH6 node with up to 6 children */ - while ( numChildren < BVH_NODE_N6 ) - { - varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize; - if ( sub_group_all( sg_is_leaf ) ) - break; - - uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items ); - uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) ); - uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild ); - - varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild; - - if ( lane == numChildren || lane == bestChild ) - { - sg_children.nodeID = nodeID; - sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID ); - } - - numChildren++; - } - - const uint current_index = current.current_index; - struct QBVHNodeN* qnode = nodeData + current_index; - SUBGROUP_QBVHNodeN_setChildIncr1( qnode ); - - uniform uint global_offset; - uniform uint child_node_offset; - - // Check if all children will be roots for the local subgtrees in phase1. If so we keep the node ids to be later - // used in global refit after phase1 - varying uchar is_children_root = (lane < numChildren) ? 
(sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0; - uniform uchar children_roots_num = sub_group_reduce_add(is_children_root); - - if ( lane == 0 ) - { - child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren); - - /* create node, but to not set bounds yet as these get calculated during refit */ - QBVH6Node_set_type( qnode, BVH_INTERNAL_NODE ); - QBVH6Node_set_offset( qnode, (global struct QBVHNodeN*)(bvh_mem + child_node_offset) ); - /* set back pointers */ - uint backpointer = (current.parent_index << 6) | (numChildren << 3); - - global_offset = atomic_add_local( local_numRecords, numChildren - 1 ); - -#if MORTON_VERBOSE_LOG - printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n", - rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren); -#endif - - if(children_roots_num == numChildren) - { - uint startpoints_offset = atomic_inc_local( local_startpoints_num ); - global_refit_startpoints[startpoints_offset] = current_index; - } - else - { - backpointer += children_roots_num; - } - - *InnerNode_GetBackPointer(backPointers, current_index) = backpointer; - } - - child_node_offset = sub_group_broadcast( child_node_offset, 0 ); - global_offset = sub_group_broadcast( global_offset, 0 ); - - uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset); - - sg_children.current_index = childNodes - nodeData + lane; - sg_children.parent_index = current_index; - - if ( lane < numChildren ) - { - uint write_position = (lane == 0) ? rID : global_offset + lane - 1; - records[write_position] = sg_children; - } -} - - -GRL_INLINE void SUBGROUP_create_node_phase0_local_sync( - uniform global struct Globals* globals, - uniform global struct BinaryMortonCodeHierarchy* bnodes, - uniform global char* bvh_mem, - uniform uint rID, - uniform local uint* local_numRecords, - uniform local uint* local_QNodeOffset, - uniform global struct BuildRecordMorton* records, - uniform struct BuildRecordMorton current, - uniform local uint* local_p0_total, - uniform global struct MortonFlattenedBoxlessNode *boxless_nodes, - uniform uint nodeDataStart) -{ - uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - uniform const uint rootNodeOffset = bvh->rootNodeOffset; - uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); - uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); - - varying ushort lane = get_sub_group_local_id(); - - /* initialize child array */ - uniform uint numChildren = 2; - varying struct BuildRecordMorton sg_children; - sg_children.items = 0; - sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild; - - if ( lane < numChildren ) - sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID ); - - /* fill QBVH6 node with up to 6 children */ - while ( numChildren < BVH_NODE_N6 ) - { - varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize; - if ( sub_group_all( sg_is_leaf ) ) - break; - - uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items ); - uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) ); - uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild ); - - varying uint nodeID = (lane == bestChild) ? 
bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild; - - if ( lane == numChildren || lane == bestChild ) - { - sg_children.nodeID = nodeID; - sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID ); - } - - numChildren++; - } - - const uint current_index = current.current_index; - uniform uint global_offset; - uniform uint child_node_offset; - - // Check if all children will be roots for the local subgtrees in phase1. If so we keep the node ids to be later - // used in global refit after phase1 - varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0; - uniform uchar rootMask = sub_group_reduce_or_N6(is_children_root << lane); - uniform uchar children_roots_num = sub_group_reduce_add(is_children_root); - - if ( lane == 0 ) - { - child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren); - - /* Do not create qnodes here */ - uint backpointer = (current.parent_index << 6) | (numChildren << 3); - - global_offset = atomic_add_local( local_numRecords, numChildren - 1 ); - -#if MORTON_VERBOSE_LOG - printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, nodeDataStart: %d\n", - rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren, nodeDataStart); -#endif - - MortonFlattenedBoxlessNode flattened_node; - - if(children_roots_num != numChildren) - backpointer += children_roots_num; - - flattened_node.binary_hierarchy_index = (current_index << 6) | rootMask; - - uint loc_id = atomic_inc_local( local_p0_total ); - - flattened_node.childOffset_type = ((((child_node_offset - nodeDataStart * 64) / 64) - current_index) << 6) | BVH_INTERNAL_NODE; - flattened_node.backPointer = backpointer; - - //TODO: change this writes to L1WB or streaming - boxless_nodes[loc_id] = flattened_node; - - *InnerNode_GetBackPointer(backPointers, current_index) = backpointer; - } - - child_node_offset = sub_group_broadcast( child_node_offset, 0 ); - global_offset = sub_group_broadcast( global_offset, 0 ); - - uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset); - - sg_children.current_index = childNodes - nodeData + lane; - sg_children.parent_index = current_index; - - if ( lane < numChildren ) - { - uint write_position = (lane == 0) ? rID : global_offset + lane - 1; - records[write_position] = sg_children; - } -} - -/* - - In this phase a single large work group performs the construction of - the top of the BVH and creates a build record array. - - Two varians of this kernel: - 1. Refit with global synchronization - Used for big bvh, where number of allocated nodes will not fit - in SLM in phase2. Phase0 creates qnodes in bvh, and provides startpoints for bottom up phase - that is executed after phase1. This refit uses global synchronizations and mem_fence_gpu_invalidate - that is not effective. - 2. Refit with local synchronization - Flattened boxless nodes are passed via global memory, along with - number of created nodes. Phase0 does not create qnodes in bvh, it is done in phase2 during refit. - In phase2, flattened boxless nodes are moved to SLM, along with bounding boxes from phase1. - Refit is performed only with local synchronization. 
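   Both variants rely on the same back-pointer packing (written through
   InnerNode_GetBackPointer below); a sketch of the layout as the refit code
   reads it:

      bits [31:6]  index of the parent node
      bits [5:3]   number of children
      bits [2:0]   refit completion counter

   During a refit with global synchronization every finished child does an
   atomic increment on its parent's low bits, and only the subgroup that sees
   the counter reach the child count keeps walking upwards; the counter is
   cleared again (backpointer & ~7u) at the treelet root. Children that are
   phase1 subtree roots are either turned into refit startpoints (when all of
   a node's children are such roots) or pre-counted into the low bits, so that
   the later refit does not have to wait for them.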
- -*/ - -__attribute__((reqd_work_group_size(512, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel -parallel_build_phase0(global struct Globals *globals, - global struct BinaryMortonCodeHierarchy *bnodes, - global char *bvh_mem, - global uint *global_refit_startpoints) -{ - global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; - global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); - - /* a queue of build records in global memory */ - global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); - local uint local_numRecords; - local uint local_QNodeOffset; - local uint local_startpoints_num; - - /* initialize first build record */ - if (get_local_id(0) == 0) - { - /* allocate root node */ - uint root_node_offset = 64*bvh->nodeDataCur; - global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset); - - //assert(root_node_offset == 0); - records[0].nodeID = globals->binary_hierarchy_root; - records[0].items = globals->numPrimitives; - records[0].current_index = rootNode - nodeData; - records[0].parent_index = -1; - - local_numRecords = 1; - local_QNodeOffset = root_node_offset + 64; - local_startpoints_num = 0; - - mem_fence_workgroup_default(); - } - - uint num_records = 1; - - /* terminate when all subtrees are under size threshold */ - while(true) - { - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - /* all work items in the work group pick a subtree to build */ - for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() ) - { - /* small subtrees will get built in next phase */ - if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives - continue; - - /* create QBVH node */ - SUBGROUP_create_node_phase0(globals, bnodes, bvh_mem, global_refit_startpoints, ID, &local_numRecords, &local_QNodeOffset, - records, records[ID], &local_startpoints_num); - } - - work_group_barrier( CLK_LOCAL_MEM_FENCE ); - mem_fence_workgroup_default(); - uint old_num_records = num_records; - num_records = local_numRecords; - if( old_num_records == num_records ) - break; - - } - - /* remember number of build records for next phase */ - if (get_local_id( 0 ) == 0) - { - globals->numBuildRecords = local_numRecords; - globals->p0_created_num = local_startpoints_num; - bvh->nodeDataCur = local_QNodeOffset / 64; - -#if MORTON_VERBOSE_LOG - printf("PHASE_0: allocated %d nodes. 
globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num); -#endif - } -} - -__attribute__((reqd_work_group_size(512, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel -parallel_build_phase0_local_sync(global struct Globals *globals, - global struct BinaryMortonCodeHierarchy *bnodes, - global char *bvh_mem, - global struct MortonFlattenedBoxlessNode *boxless_nodes) -{ - global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; - global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); - uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64; - - /* a queue of build records in global memory */ - global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); - local uint local_numRecords; - local uint local_QNodeOffset; - local uint local_p0_total; - - /* initialize first build record */ - if (get_local_id(0) == 0) - { - /* allocate root node */ - uint root_node_offset = 64*bvh->nodeDataCur; - global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset); - - //assert(root_node_offset == 0); - records[0].nodeID = globals->binary_hierarchy_root; - records[0].items = globals->numPrimitives; - records[0].current_index = rootNode - nodeData; - records[0].parent_index = -1; - - local_numRecords = 1; - local_QNodeOffset = root_node_offset + 64; - local_p0_total = 0; - - mem_fence_workgroup_default(); - } - - uint num_records = 1; - - /* terminate when all subtrees are under size threshold */ - while(true) - { - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - /* all work items in the work group pick a subtree to build */ - for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() ) - { - /* small subtrees will get built in next phase */ - if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives - continue; - - /* create QBVH node */ - SUBGROUP_create_node_phase0_local_sync(globals, bnodes, bvh_mem, ID, &local_numRecords, &local_QNodeOffset, records, - records[ID], &local_p0_total, boxless_nodes, nodeDataStart); - } - - mem_fence_workgroup_default(); - work_group_barrier( CLK_LOCAL_MEM_FENCE ); - - uint old_num_records = num_records; - num_records = local_numRecords; - if( old_num_records == num_records ) - break; - - } - - /* remember number of build records for next phase */ - if (get_local_id( 0 ) == 0) - { - globals->numBuildRecords = local_numRecords; - bvh->nodeDataCur = local_QNodeOffset / 64; - - globals->p0_allocated_num = BVHBase_numNodes(bvh); - globals->p0_created_num = local_p0_total; - -#if MORTON_VERBOSE_LOG - printf("PHASE_0_LOCAL_SYNC: allocated %d nodes. 
globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->global_refit_startpoints); -#endif - } -} diff --git a/src/intel/vulkan/grl/gpu/morton/phase1.cl b/src/intel/vulkan/grl/gpu/morton/phase1.cl deleted file mode 100644 index 6a1dd2aa44b..00000000000 --- a/src/intel/vulkan/grl/gpu/morton/phase1.cl +++ /dev/null @@ -1,785 +0,0 @@ -// -// Copyright (C) 2009-2022 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "libs/lsc_intrinsics.h" -#include "morton/morton_common.h" - -// caution rec.local_parent_index__numItems needs to have high 16bits filled afterwards; -BuildRecordLocalMortonFlattener TranslateToLocalRecord(struct BinaryMortonCodeHierarchy srcRec) -{ - BuildRecordLocalMortonFlattener rec; - rec.leftChild = srcRec.leftChild; - rec.rightChild = srcRec.rightChild; - rec.rangeStart = srcRec.range.start; - rec.local_parent_index__numItems = (srcRec.range.end - srcRec.range.start) + 1; - return rec; -} - -GRL_INLINE BuildRecordLocalMortonFlattener MortonFlattenedBoxlessNode_reinterpret_as_BR(MortonFlattenedBoxlessNode boxless) -{ - BuildRecordLocalMortonFlattener rec; - rec.leftChild = boxless.binary_hierarchy_index; - rec.rightChild = boxless.childOffset_type; - rec.rangeStart = boxless.backPointer; - rec.local_parent_index__numItems = 0; - return rec; -} - -GRL_INLINE void SUBGROUP_create_boxless_node_phase1( - uniform global struct Globals* globals, - uniform global struct BinaryMortonCodeHierarchy* bnodes, - uniform global char* bvh_mem, - uniform BuildRecordLocalMortonFlattener currentRecord, - uniform uint currQnodeLocalId, //local index for flattened qnoode, don't mix this with nodeIndex that is in morton build record - uniform local uint* local_numRecords, - uniform uint tictoc, - uniform uint* sg_bu_startpoint_arr, - uniform uint* sg_bu_startpoint_cnt, - uniform uint parentOfRoot, - uniform bool processRoot, - uniform UPerNodeData* nodeData) -{ - varying ushort lane = get_sub_group_local_id(); - - /* initialize child array */ - uniform uint numChildren = 2; - varying struct BuildRecordLocalMortonFlattener sg_children; - sg_children.local_parent_index__numItems = 0; - - uint binary_hierarchy_child_idx = (lane == 0) ? currentRecord.leftChild : currentRecord.rightChild; - if (lane >= numChildren) binary_hierarchy_child_idx = 1 << 31; - - sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, binary_hierarchy_child_idx)); - - /* fill QBVH6 node with up to 6 children */ - while (numChildren < BVH_NODE_N6) - { - // we dont have to do "local_parent_index__numItems & 0xFFFF" because local_parent_index part is 0 here at this point - uint childNumItems = sg_children.local_parent_index__numItems; - varying bool sg_is_leaf = childNumItems <= cfg_minLeafSize; - if (sub_group_all(sg_is_leaf)) { break; } - - uniform uint bestItems = sub_group_reduce_max_N6(childNumItems); - uniform ushort bestChild = ctz(intel_sub_group_ballot(childNumItems == bestItems)); - varying uint leftOfBest = sg_children.leftChild; // val important only for (lane == bestChild), not valid for other lanes - uniform uint rightOfBest = sub_group_broadcast(sg_children.rightChild, bestChild); - - varying uint nodeID = (lane == bestChild) ? 
leftOfBest : rightOfBest; - - if (lane == numChildren || lane == bestChild) - { - sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, nodeID)); - } - - numChildren++; - } - - uniform uint global_offset; - uniform uint child_node_index; - - bool isFatleafChild = (sg_children.local_parent_index__numItems <= cfg_minLeafSize) && (lane < numChildren); - uint numFatleafChildren = popcount(intel_sub_group_ballot(isFatleafChild)); - - if (lane <= numChildren) { - uint writeIDX = 0; - - if (lane == numChildren) - { - /* create nodes in local structure, to be used later in the bottom up to create nodes in actual bvh */ - MortonFlattenedBoxlessNode flattened_node; - uint parentIDX; - - if (processRoot) - { - *local_numRecords = numChildren + 1; - child_node_index = 1; - writeIDX = 0; - flattened_node.binary_hierarchy_index = 0xFFFFFFFF; - flattened_node.childOffset_type = (1 << 6) | BVH_INTERNAL_NODE; - parentIDX = parentOfRoot; - } - else - { - uint shift = (16 * tictoc); - uint mask = 0xFFFF; - uint atomicAddVal = numChildren << shift; - child_node_index = atomic_add_local(local_numRecords, atomicAddVal); - sub_group_barrier(0); - writeIDX = currQnodeLocalId; - parentIDX = currentRecord.local_parent_index__numItems >> 16; - flattened_node.binary_hierarchy_index = 0xFFFFFFFF; - sub_group_barrier(0); - child_node_index = (child_node_index >> 16) + (child_node_index & mask); - flattened_node.childOffset_type = ((child_node_index - currQnodeLocalId) << 6) | BVH_INTERNAL_NODE; - } - -#if MORTON_VERBOSE_LOG - printf("wg %d: SUBGROUP_create_boxless_node_phase1: writeIDX %d, child_node_index %d - %d\n", get_group_id(0), writeIDX, child_node_index, child_node_index + numChildren); -#endif - flattened_node.backPointer = (parentIDX << 6) | (numChildren << 3) | numFatleafChildren; - sg_children = MortonFlattenedBoxlessNode_reinterpret_as_BR(flattened_node); - } - - child_node_index = sub_group_broadcast(child_node_index, numChildren); - - if (lane != numChildren) - { - writeIDX = child_node_index + lane; - sg_children.local_parent_index__numItems |= currQnodeLocalId << 16; - } - - nodeData[writeIDX].buildRecord = sg_children; - } - - if (numFatleafChildren == numChildren) { - uint arridx = *sg_bu_startpoint_cnt; - // GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane) - set_2xSG_arr_first_write(arridx, sg_bu_startpoint_arr, (ushort)currQnodeLocalId, lane); - *sg_bu_startpoint_cnt = arridx + 1; - } -} - -// TODO_OPT: Consider having phase 0 bucket the build records by number of primitives, and dispatch different variants -// of this kernel with different WG sizes. 
There are many records produced that generate only 1 or 2 subtrees, so 8 SGs is -// probably often wasted -GRL_INLINE void phase1_process_fatleaf( - uint globalBaseForInternalNodes, // for root node this is indexOfRoot - uint globalParent , // for root this should be parentOfRoot - bool isInstancePrimLeafType, // - uint leafPrimType, // - uint leafStride, // - global struct QBVHNodeN* nodeData, // per group - uint nodeDataStart, // - struct AABB* primref, // - BackPointers* backPointers, // - global struct MortonCodePrimitive* mc,// - uint nodesToLeafsGap, // - local union UPerNodeData* perNodeData,// - bool processRoot, // - short localNodeId, // - BuildRecordLocalMortonFlattener fatleafRecord, // per node - uint primID ) // -{ - uint lane = get_sub_group_local_id(); - uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF); - uniform uint mcID = fatleafRecord.rangeStart; - uint pseudolane = lane < numChildren ? lane : 0; - varying struct AABB sg_bounds = primref[primID]; - - uint local_parent_idx = (fatleafRecord.local_parent_index__numItems >> 16); - uint globalNodeId = globalBaseForInternalNodes + localNodeId; - uniform global struct QBVHNodeN* qnode = nodeData + globalNodeId; - - uint children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId; - - { - /* For all primitives in a fat leaf we store a back - * pointer. This way we can modify the fat leaf node at leaf construction time. */ - uint back_pointer = globalNodeId + nodeDataStart; - /* Store back pointer and primID inside morton code array to - * be later used by leaf creation. */ - mc[mcID + pseudolane].index_code = ((ulong)back_pointer) << 32 | (ulong)primID; - } - - struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&sg_bounds); - reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 ); - - uint8_t instMask; - if (isInstancePrimLeafType) - { - instMask = lane < numChildren ? PRIMREF_instanceMask(&sg_bounds) : 0; - subgroup_setInstanceQBVHNodeN(children_offset, &sg_bounds, numChildren, qnode, instMask); - instMask = sub_group_reduce_or_N6(instMask); - } - else - { - instMask = 0xFF; - subgroup_setQBVHNodeN_setFields_reduced_bounds(children_offset, leafPrimType, &sg_bounds, numChildren, instMask, qnode, false, reduce_bounds); - } - - reduce_bounds.lower.w = as_float((uint)instMask); - uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduce_bounds, 0); - local uint* boxUint = (local uint*)(perNodeData + localNodeId); - if (get_sub_group_size() == 8 || lane < 8) - { - boxUint[lane] = reduce_bounds_lane; - uint globalParentIdx; - if (processRoot) { - // for root, treeletRootGlobalIndex is index of rootsParent in global space - globalParentIdx = globalParent; - } - else { - // for non root, raw_parent_idx is in local space - globalParentIdx = (local_parent_idx > 0) ? 
(globalBaseForInternalNodes + local_parent_idx) : globalParent; - } - if (lane == 0) { - *InnerNode_GetBackPointer(backPointers, globalNodeId) = (globalParentIdx << 6) | (numChildren << 3); - } - } -} - -GRL_INLINE void perform_phase1(global struct Globals* globals, - global struct MortonCodePrimitive* mc, - global struct AABB* primref, - global struct BinaryMortonCodeHierarchy* bnodes, - global char* bvh_mem, - local union UPerNodeData* perNodeData, - local uint* local_records_head, - local uint* local_globalOffsetForNodes, - BuildRecordLocalMortonFlattener rootRecord, - uint treeletRootGlobalIndex, - uint parentOfRootIndex, - const uint leafPrimType, - bool isInstancePrimLeafType) -{ - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - varying ushort lane = get_sub_group_local_id(); - - // array that will keep 2x8 shorts indices - varying uint sg_fatleaf_array = 0x0; - uniform uint8_t sg_fatleaf_cnt = 0; - /* terminate when all subtrees are leaves */ - - uint subgroupId = get_sub_group_id(); - uint ID = subgroupId; - - uint sg_bu_startpoints = 0; - uniform uint sg_bu_startpoints_cnt = 0; - const uint shift_mask = globals->shift_mask; - - const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64; - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - global struct QBVHNodeN* nodeData = BVHBase_nodeData(bvh); - - uint* pLeafStart = (!isInstancePrimLeafType) ? &bvh->quadLeafStart : &bvh->instanceLeafStart; - uint leafStart = *pLeafStart; - uint leafStride = (!isInstancePrimLeafType) ? 1 : (sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode)); - uint nodesToLeafsGap = leafStart - nodeDataStart; - - if (ID == 0) - { - BuildRecordLocalMortonFlattener current = rootRecord; - - if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6) - { - *local_records_head = 1; -#if MORTON_DEBUG_CHECKS - if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n"); -#endif - BuildRecordLocalMortonFlattener fatleafRecord = current; - uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF); - uint pseudolane = lane < numChildren ? 
lane : 0; - uniform const uint mcID = fatleafRecord.rangeStart; - varying uint primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask); - - phase1_process_fatleaf( - treeletRootGlobalIndex, parentOfRootIndex, isInstancePrimLeafType, leafPrimType, leafStride, - nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData, - true, 0, fatleafRecord, primID); - } - else - { -#if MORTON_VERBOSE_LOG - if (get_local_id(0) == 0) { printf("wg %d perform_phase1: starting collapsing subtree with root at node %d \n", get_group_id(0), rootIndex); } -#endif - //printf("local_records_head = %d\n", *local_records_head); - SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, 0, &sg_bu_startpoints, &sg_bu_startpoints_cnt, parentOfRootIndex, true, perNodeData); - *local_globalOffsetForNodes = treeletRootGlobalIndex; - } - - ID += get_num_sub_groups(); - } - - uniform uint priv_records_tail = 1; - - /* wait for all work items to have updated local_records array */ - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - uniform uint priv_records_head = *local_records_head & 0xFFFF; - treeletRootGlobalIndex = *local_globalOffsetForNodes; // propagated from subgroup 1 - uniform uint priv_records_tail_prev = priv_records_tail; - uniform uint other_records_head = priv_records_head; - - uint ticToc = 1; - - if (priv_records_head == priv_records_tail) - { - return; - } - else - { - do - { - for (; ID < priv_records_head; ID += get_num_sub_groups()) - { - BuildRecordLocalMortonFlattener current = (perNodeData[ID].buildRecord); - - if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6) - { - set_2xSG_arr_first_write(sg_fatleaf_cnt++, &sg_fatleaf_array, ID, lane); -#if MORTON_VERBOSE_LOG - if (lane == 0)printf("wg %d, sg %d, perform_phase1: node ID %d is fatleaf \n", get_group_id(0), get_sub_group_id(), ID); -#endif -#if MORTON_DEBUG_CHECKS - if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n"); -#endif - } - else - { - SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, ticToc, &sg_bu_startpoints, &sg_bu_startpoints_cnt, 0, 0, perNodeData); - } - } - - priv_records_tail = priv_records_head; - /* wait for all work items to have updated local_records array */ - work_group_barrier(CLK_LOCAL_MEM_FENCE); - { - uint records_as_in_mem = *local_records_head; - priv_records_head = (records_as_in_mem >> (16 * ticToc)) & 0xFFFF; - uint other_records_head_temp = priv_records_head; - priv_records_head += other_records_head; - other_records_head = other_records_head_temp; - ticToc = ticToc ^ 1; -#if MORTON_VERBOSE_LOG - if(get_local_id(0) == 0)printf("wg %d, perform_phase1: priv_records_tail %d, priv_records_head %d, records_as_in_mem %x\n", get_group_id(0), get_sub_group_id(), priv_records_tail, priv_records_head, records_as_in_mem); -#endif - } - } while (priv_records_tail != priv_records_head); // get out of the loop if the tail reached the head - } - - bool atomicNodeAllocation = treeletRootGlobalIndex > 0; - bool atomicNodeAllocationProduce = (get_sub_group_id() + lane == 0) && atomicNodeAllocation; - uint singleTreeletBumpBVHnodeCnt = (!atomicNodeAllocation && (get_sub_group_id() + lane == 0)) ? 
nodeDataStart + priv_records_tail : 0; - - uniform uint globalBaseForInternalNodes = 0; - - // we distinguish multi treelet from single treelets here by looking on our treeletRootGlobalIndex - // if treelets root is whole tree root (treeletRootGlobalIndex==0) then we are the only treelet so - // there's no need to synchronize multiple treelets nodes allocations with atomics. - if (atomicNodeAllocationProduce) - { - *local_globalOffsetForNodes = allocate_inner_nodes(bvh, priv_records_tail - 1); - } - - // because, root is allocated elsewhere, and first node placed in global mem is node with local index 1 - // mapping local to global: - // local space global space - // [0] - treelet root [treeletRootGlobalIndex] - // ... possibly very long distance ... - // [1] - first non root [globalBaseForInternalNodes + 1] - this index is returned by atomic allocator above - // [2] - first [globalBaseForInternalNodes + 2] - // ... - // [numToAllocate] - last node [globalBaseForInternalNodes + 3] - if (atomicNodeAllocation) - { - work_group_barrier(CLK_LOCAL_MEM_FENCE); - globalBaseForInternalNodes = *local_globalOffsetForNodes -(nodeDataStart+1); - } - -#if MORTON_VERBOSE_LOG - if (get_local_id(0) == 0) { printf("wg %d perform_phase1: globalBaseForInternalNodes %d, num local nodes %d\n", get_group_id(0), globalBaseForInternalNodes, priv_records_tail - 1); } -#endif - - if (sg_fatleaf_cnt) - { - short localNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane); - //if (localNodeId >= MORTON_BUILDER_SUBTREE_THRESHOLD * 2) continue; - //if(local_startpoints_cnt > 1) return; - BuildRecordLocalMortonFlattener fatleafRecord = perNodeData[localNodeId].buildRecord; - - varying uint primID; - { - uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF); - uint pseudolane = lane < numChildren ? lane : 0; - uniform const uint mcID = fatleafRecord.rangeStart; - primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask); - } - - // process fatleafs, and store their boxes to SLM - // also put startpoints for bottom up - //uint fatleaf_cnt = *local_startpoints_cnt; - while (sg_fatleaf_cnt-- > 1) - { - short nextLocalNodeId = get_from_2xSG_arr(sg_fatleaf_cnt-1, sg_fatleaf_array, lane); - BuildRecordLocalMortonFlattener nextfatleafRecord = perNodeData[nextLocalNodeId].buildRecord; - varying uint nextPrimId; - - { - uint numChildren = (nextfatleafRecord.local_parent_index__numItems & 0xFFFF); - uint pseudolane = lane < numChildren ? lane : 0; - uniform const uint mcID = nextfatleafRecord.rangeStart; - nextPrimId = (uint)(mc[mcID + pseudolane].index_code & shift_mask); - } - - phase1_process_fatleaf( - globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride, - nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData, - false, localNodeId, fatleafRecord, primID); - - fatleafRecord = nextfatleafRecord; - localNodeId = nextLocalNodeId; - primID = nextPrimId; - } - - phase1_process_fatleaf( - globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride, - nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData, - false, localNodeId, fatleafRecord, primID); - } - -#if 0 - // put collected bottom-up startpoints to wg shared array to later distribute the work evenly accross the groups. 
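    /* This disabled path would stage the bottom-up startpoints in SLM and
     * re-balance them across subgroups. The live path above instead keeps
     * sg_fatleaf_array / sg_bu_startpoints entirely in registers: per the
     * 2xSG_arr helpers each lane packs two ushort local node ids into one
     * uint, which with the SIMD16 subgroup gives the 32-entry limit guarded
     * by the MORTON_DEBUG_CHECKS printouts above. */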
- { - ushort myStartpointWriteSite = 0; - - if (lane == 0) - { - myStartpointWriteSite = atomic_add_local((local uint*)local_startpoints_cnt, (ushort)sg_bu_startpoints_cnt); - } - myStartpointWriteSite = sub_group_broadcast(myStartpointWriteSite, 0); - - unpack_from_2xSG_arr(sg_bu_startpoints_cnt, sg_bu_startpoints, lane, local_startpoints_arr + myStartpointWriteSite); - } -#endif - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - // distribute bottom-up startpoints -#if 0 - { - short sp_count_to_divide = (*local_startpoints_cnt); - - //calculate the chunk for each sg. - sg_bu_startpoints_cnt = sp_count_to_divide / get_num_sub_groups(); - uint sg_bu_startpoints_cnt_reminder = sp_count_to_divide % get_num_sub_groups(); - - uint myReadSite = get_sub_group_id() * sg_bu_startpoints_cnt; - if (get_sub_group_id() < sg_bu_startpoints_cnt_reminder) { - //from the reminder elements if sg idx is < sg_bu_startpoints_cnt_reminder then sg gets one extra idx - // and all sgs before it also have one extra - myReadSite += get_sub_group_id(); - sg_bu_startpoints_cnt++; - } - else - { - // all reminder elements are consummed by previous sgs - myReadSite += sg_bu_startpoints_cnt_reminder; - } - - pack_from_2xSG_arr(local_startpoints_arr + myReadSite, sg_bu_startpoints_cnt, &sg_bu_startpoints, lane); - } -#endif - - SUBGROUP_refit_bottom_up_local(nodeData, backPointers, treeletRootGlobalIndex, globalBaseForInternalNodes, lane, perNodeData, sg_bu_startpoints, sg_bu_startpoints_cnt); - - if (singleTreeletBumpBVHnodeCnt) - { - bvh->nodeDataCur = singleTreeletBumpBVHnodeCnt; - } -} - -GRL_INLINE void update_empty_blas(global struct BVHBase* bvh, uint leafPrimType) -{ - if (get_sub_group_id() == 0 ) - { - global struct QBVHNodeN* qnode = BVHBase_nodeData(bvh); - BackPointers* backPointers = BVHBase_GetBackPointers(bvh); - - //set required fields to mark that blas is empty - uint k = (get_sub_group_local_id() < BVH_NODE_N6) ? get_sub_group_local_id() : 0; - qnode->type = leafPrimType; - qnode->instMask = 0; - qnode->qbounds.lower_x[k] = 0x80; - qnode->qbounds.upper_x[k] = 0; - - *InnerNode_GetBackPointer(backPointers, 0) = (((uint)-1) << 6); - } -} - -/* - - POSTSORT PHASE1: - Two kernels here, selected by MORTON_BUILDER_SUBTREE_THRESHOLD. - 1. parallel_build_phase1_Indirect_SG - record[0] is set to the subtree tip - 2. 
parallel_build_phase1_Indirect_global_root - record[0] is set to the bvh root (no phase2 needed afterwards) - -*/ - -__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -parallel_build_phase1_Indirect_SG( global struct Globals* globals, - global struct MortonCodePrimitive* mc, - global struct AABB* primref, - global struct BinaryMortonCodeHierarchy* bnodes, - global char* bvh_mem) -{ - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - const uint leafPrimType = globals->leafPrimType; - - //special case for empty blas - if(globals->numPrimitives == 0) - { - bvh->nodeDataCur = BVH_ROOT_NODE_OFFSET / 64 + 1; - update_empty_blas(bvh, leafPrimType); - return; - } - - local union UPerNodeData perNodeData[(MORTON_BUILDER_SUBTREE_THRESHOLD * 2) -1]; - local uint local_records_head; - // Two separate SLM variables for local_globalOffsetForNodes to remove one of the barriers - local uint local_globalOffsetForNodes, local_globalOffsetForNodes2; - - uint rootIndex = 0; - uint parentOfRoot = 0; - BuildRecordLocalMortonFlattener rootBuildRecord; - - /* add start build record to local stack */ - if (get_sub_group_id() == 0 ) - { - global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64 * bvh->quadLeafStart); - uint recordID = get_group_id(0); - struct BuildRecordMorton mortonGlobalRecord = records[recordID]; - - rootBuildRecord = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, mortonGlobalRecord.nodeID)); - - parentOfRoot = mortonGlobalRecord.parent_index; - rootIndex = mortonGlobalRecord.current_index; - -#if MORTON_VERBOSE_LOG - printf("P1_STARTPOINTS: current_index: %d, buildRecord.numItems: %d, buildRecord.binary_hierarchy_index: %d, buildRecord.local_parent_index: %d\n", - local_globalOffsetForNodes, buildRecord.numItems, buildRecord.binary_hierarchy_index, buildRecord.local_parent_index); -#endif - } - - if (leafPrimType == NODE_TYPE_INSTANCE) - { - perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, - &local_records_head, &local_globalOffsetForNodes, - rootBuildRecord, rootIndex, parentOfRoot, NODE_TYPE_INSTANCE, true); - } - else - { - perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, - &local_records_head, &local_globalOffsetForNodes, - rootBuildRecord, rootIndex, parentOfRoot, leafPrimType, false); - } - -} - -__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -parallel_build_phase1_Indirect_global_root( global struct Globals* globals, - global struct MortonCodePrimitive* mc, - global struct AABB* primref, - global struct BinaryMortonCodeHierarchy* bnodes, - global char* bvh_mem) -{ - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - const uint leafPrimType = globals->leafPrimType; - const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64; - - bvh->nodeDataCur = nodeDataStart + 1; - - //special case for empty blas - if(globals->numPrimitives == 0) - { - update_empty_blas(bvh, leafPrimType); - return; - } - - local union UPerNodeData perNodeData[MORTON_BUILDER_SUBTREE_THRESHOLD * 2 - 1]; - local uint local_records_head; - local uint local_globalOffsetForNodes; - - BuildRecordLocalMortonFlattener rootBuildRecord; - - if (get_sub_group_id() == 0 ) - { - struct BinaryMortonCodeHierarchy binaryNode = BinaryMortonCodeHierarchy_getEntry(bnodes, globals->binary_hierarchy_root); - - rootBuildRecord = TranslateToLocalRecord(binaryNode); - - local_globalOffsetForNodes = 0; - } 
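    /* The record consumed by perform_phase1() packs two values into
     * local_parent_index__numItems: the low 16 bits are the primitive count
     * of the subtree (set by TranslateToLocalRecord), the high 16 bits are
     * the local index of the parent flattened node, filled in later inside
     * perform_phase1 - hence the (x & 0xFFFF) and (x >> 16) accesses
     * throughout phase1. Here the record is the whole tree root, so its
     * parent is passed to perform_phase1() as (uint)-1. */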
- - if (leafPrimType == NODE_TYPE_INSTANCE) - { - perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, - &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, NODE_TYPE_INSTANCE, true); - } - else - { - perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData, - &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, leafPrimType, false); - - } -} - -#if 0 -GRL_INLINE void -DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals, - global struct MortonCodePrimitive* mc, - global struct AABB* primref, - global struct BinaryMortonCodeHierarchy* bnodes, - global char* bvh_mem, - uint startID, uint endID, - local uint* local_numRecords, - local uint* local_numRecordsOld, - local struct BuildRecordMorton* local_records -) -{ - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64); - - /* iterate over all subtrees this workgroup should build */ - for ( uint recordID = startID; recordID < endID; recordID++ ) - { - /* add start build record to local stack */ - if ( get_local_id( 0 ) == 0 ) - { - local_records[0] = records[recordID]; - *local_numRecords = 1; - *local_numRecordsOld = 0; - } - work_group_barrier( CLK_LOCAL_MEM_FENCE ); - - /* terminate when all subtrees are leaves */ - while ( *local_numRecords != *local_numRecordsOld ) - { - /* remember the old number of build records to detect later - * whether we are done */ - if ( get_local_id( 0 ) == 0 ) - { - *local_numRecordsOld = *local_numRecords; - } - work_group_barrier( CLK_LOCAL_MEM_FENCE ); - - /* all work items in the sub group pick a subtree to build */ - for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) ) - { - /* ignore small subtrees */ - if ( local_records[ID].items <= BVH_NODE_N6 ) - continue; - - /* create QBVH node */ - create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] ); - } - - /* wait for all work items to have updated local_records array */ - work_group_barrier( CLK_LOCAL_MEM_FENCE ); - } - - const uint shift_mask = globals->shift_mask; - const uint leafPrimType = globals->leafPrimType; - const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; - BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); - global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); - - /* create all fat leaf nodes and initiate refit */ - for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) ) - { - struct BuildRecordMorton current = local_records[ID]; - const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID ); - - global struct QBVHNodeN* qnode = nodeData + current.current_index; - - /* get bounds of all children of the fat leaf node */ - struct AABB bounds[BVH_NODE_N6]; - for ( uint i = 0; i < current.items; i++ ) - { - /* get primID and bounds of primitive */ - const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask); - bounds[i] = primref[primID]; - - /* For all primitives in a fat leaf we store a back - * pointer. This way we can modify the fat leaf node at leaf construction time. */ - const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem; - - /* Store back pointer and primID inside morton code array to - * be later used by leaf creation. 
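 * The same packing is used by phase1_process_fatleaf() in the live path:
 * after this write the sorted entry no longer holds a morton key but the
 * pair ((ulong)back_pointer << 32) | primID, i.e. the fat-leaf node index
 * in the upper 32 bits and the primitive reference id in the lower 32 bits,
 * which is exactly what leaf creation needs to patch the fat leaf later.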
*/ - mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID; - } - - /* update fat leaf node */ - QBVHNodeN_setType( qnode, leafPrimType ); - global void* offset; - if ( leafPrimType != BVH_INSTANCE_NODE ) - { - offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad ); - QBVHNodeN_setChildIncr1( qnode ); - } - else - { - offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf ); - QBVHNodeN_setChildIncr2( qnode ); - } - QBVH6Node_set_offset( qnode, offset ); - QBVHNodeN_setBounds( qnode, bounds, current.items ); - - /* set back pointers for fat leaf nodes */ - *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3); - - /* bottom up refit */ - refit_bottom_up( qnode, bvh, bounds, current.items ); - } - } -} - -/* - - This phase takes the build records calculated in phase0 as input and - finished the BVH construction for all these subtrees. - -*/ -__attribute__((reqd_work_group_size(8, 1, 1))) -old_parallel_build_phase1(global struct Globals *globals, - global struct MortonCodePrimitive *mc, - global struct AABB *primref, - global struct BinaryMortonCodeHierarchy *bnodes, - global char *bvh_mem) -{ - global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; - global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); - - /* a queue of build records */ - local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; - local uint local_numRecords; - local uint local_numRecordsOld; - - /* construct range of build records that each sub group will process */ - const uint numRecords = globals->numBuildRecords; - const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0); - const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0); - - DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); - -} - -__attribute__( (reqd_work_group_size( 8, 1, 1 )) ) -old_parallel_build_phase1_Indirect( global struct Globals* globals, - global struct MortonCodePrimitive* mc, - global struct AABB* primref, - global struct BinaryMortonCodeHierarchy* bnodes, - global char* bvh_mem ) -{ - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart); - - /* a queue of build records */ - local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; - local uint local_numRecords; - local uint local_numRecordsOld; - - /* construct range of build records that each sub group will process */ - const uint numRecords = globals->numBuildRecords; - uint startID = get_group_id( 0 ); - uint endID = startID + 1; - - DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); - -} -#endif diff --git a/src/intel/vulkan/grl/gpu/morton/phase2.cl b/src/intel/vulkan/grl/gpu/morton/phase2.cl deleted file mode 100644 index e82d22aaacf..00000000000 --- a/src/intel/vulkan/grl/gpu/morton/phase2.cl +++ /dev/null @@ -1,314 +0,0 @@ -// -// Copyright (C) 2009-2022 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "bvh_build_refit.h" -#include "libs/lsc_intrinsics.h" -#include "morton/morton_common.h" - -/* - - POSTSORT PHASE2: - Two kernels here, selected by 
MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD whish is set to very big value. - 1. parallel_build_phase2_refit - performs refit using global synchronization and mem_fence_gpu_invalidate. - This kernel should be used only for very big bvh, it is faster than non-SLM fallback - in parallel_build_phase2_refit_local. - 2. parallel_build_phase2_refit_local - should be used for most of the cases, we usually fit into SLM with the number of - nodes allocated in phase0, but there is also non-SLM fallback there, as the - decision on which kernel to run is based on the nodes estimates on the host - side. - -*/ - - -GRL_INLINE void refit_bottom_up_global_sync( - global char* bvh_mem, - global uint* global_refit_startpoints, - uniform uint nodeId, - uniform ushort lane) -{ - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - - BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); - global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); - - // Get the node idx that was put here in phase1 - const uint innerNodeIdx = global_refit_startpoints[nodeId]; - - // Get the qnode and backpointer - uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx; - uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx); - - varying struct AABB childrenAABB; // one child AABB per lane - AABB_init(&childrenAABB); - - uniform uint numChildren = (backPointer >> 3) & 0x7; - if(numChildren == 0) return; - - global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode ); - varying ushort child_idx = (lane < numChildren) ? lane : 0; - childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx ); - -#if MORTON_VERBOSE_LOG - if(lane == 0) - printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx); -#endif - - struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB ); - reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 ); - - subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane); - - uint children_mask = qnode_child[child_idx].instMask; - qnode->instMask = sub_group_reduce_or_N6(children_mask); - - SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 ); -} - -__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel -parallel_build_phase2_refit( global char* bvh_mem, - global uint* global_refit_startpoints ) -{ - refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0)); -} - - -GRL_INLINE void SUBGROUP_refit_bottom_up_global( - uniform global struct QBVHNodeN* globalNodeData, - uniform struct BackPointers* backPointers, - varying ushort lane, - varying uint curNodeIndex) -{ - uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex); - - const uint head_lane = 0; - uniform struct AABB child_aabb; // this carries reduced aabb between loop turns - - while (curNodeIndex != 0) - { - global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex; - global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode ); - uint numChildren = BackPointer_GetNumChildren(backpointer); - - varying ushort child_idx = (lane < numChildren) ? 
lane : 0; - child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx ); - - struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb); - reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane); - - /* get bounds of all children from child nodes directly */ - subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane); - - uchar childrenMask = qnode_child[child_idx].instMask; - qnode->instMask = sub_group_reduce_or_N6(childrenMask); - - uint parentIndex = BackPointer_GetParentIndex(backpointer); - - mem_fence_gpu_invalidate(); - - if (lane == 0) - { - backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex)); - - uint globalBackpointer = (parentIndex << 6) | (numChildren << 3); - - /* set global back pointer */ - *InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer; - -#if MORTON_VERBOSE_LOG - printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n", - curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x); -#endif - } - - backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane); - curNodeIndex = parentIndex; - - /* if all children got refitted, then continue */ - uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7; - uniform uint numChildrenTotal = (backpointer >> 3) & 0x7; - - if (numChildrenRefitted != numChildrenTotal) - return; - } - - // process root of the treelet - { - -#if MORTON_DEBUG_CHECKS - if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n"); -#endif - - global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData ); - uint numChildren = BackPointer_GetNumChildren(backpointer); - - varying ushort child_idx = (lane < numChildren) ? 
lane : 0; - child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx ); - - struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb); - reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane); - - /* get bounds of all children from child nodes directly */ - subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane); - - uchar childrenMask = qnode_child[child_idx].instMask; - globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask); - - /* reset refit counter for next refit */ - if (lane == 0) - { - /* set global back pointer */ - *InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u); - -#if MORTON_VERBOSE_LOG - printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n", - curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt); -#endif - } - } -} - - -// TODO: Check why 512 wg size has worse performance than 256 -__attribute__( (reqd_work_group_size( 512, 1, 1 )) ) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -parallel_build_phase2_refit_local( global struct Globals* globals, - global char* bvh_mem, - global struct MortonFlattenedBoxlessNode *boxless_nodes) -{ - // Number of nodes created in P0, to be refitted in this stage - uint p0_created_num = globals->p0_created_num; - - // Return immediately if host executed this kernel but there is nothing to do - if(p0_created_num == 0) - return; - - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - BackPointers* backPointers = BVHBase_GetBackPointers( bvh ); - global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); - varying ushort lane = get_sub_group_local_id(); - - // Hardcode SLM to max here as we do not know upfront how much mem will be needed - local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */ - - // Number of allocated nodes in phase0 (p0_created_num + children) - uint p0_allocated_num = globals->p0_allocated_num; - - // array that will keep 2x8 shorts indices - varying uint sg_fatleaf_array = 0x0; - uniform uint8_t sg_bu_startpoints_cnt = 0; - - // Determine if we can fit into SLM with all the nodes allocated in phase0, - // There are two paths here: - // 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse bottom up local, - // which does refit nad creates qnodes in bvh - // 2. If not fit into SLM, first create qnodes in bvh, and perform bottom up refit with global atomics synchronization. - // It is not performant to do so, keep it as a guardrail here. On the host side we do fallback - // to the old refit separated path, with wg_size 8 with better EU reuse. 
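    // A sketch of how phase0 packed the flattened boxless nodes decoded below
    // (see SUBGROUP_create_node_phase0_local_sync):
    //   binary_hierarchy_index = (node index << 6) | mask of children that are
    //                            phase1 subtree roots (the field is unused at
    //                            this point, hence the reuse)
    //   childOffset_type       = (offset of the first child, relative to this
    //                            node) << 6 | node type (BVH_INTERNAL_NODE)
    //   backPointer            = the usual (parent << 6) | (numChildren << 3)
    //                            back-pointer packing
    // so current_id, children_root_mask and numChildren below are just these
    // bit fields.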
- if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM) - { - for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() ) - { - MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID]; - uint current_id = boxless_node.binary_hierarchy_index >> 6; - - // Put the mask for the children that are subtree roots in the binary_hierarchy_index that is unused - uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F); - - if(lane == 0) - perNodeData[current_id].boxlessNode = boxless_node; - - // When no children are subtree roots, we are done and skip to the next iteration - if(children_root_mask == 0x0) - { - continue; - } - // When all children are subtree roots, put them to sg_fatleaf_array - else if(children_root_mask == 0x3F) - { - set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane); - } - - uniform global struct QBVHNodeN* qnode = nodeData + current_id; - - uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7; - uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node); - varying ushort child_idx = (lane < numChildren) ? lane : 0; - - varying struct AABB childrenAABB; // one child AABB per lane - AABB_init(&childrenAABB); - - uint lead_child_global_id = current_id + lead_child_offset; - - uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id; - childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx ); - - // Get only AABBs of children that are p1 subtree roots - bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx); - if(lane_active) - { - uint child_global_id = lead_child_global_id + child_idx; - perNodeData[child_global_id].box = childrenAABB; - perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask); - } - -#if MORTON_VERBOSE_LOG - if(lane == 0) - printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset); -#endif - } - - work_group_barrier(CLK_LOCAL_MEM_FENCE); - - SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt); - } - else - { - for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() ) - { - MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID]; - uint current_id = boxless_node.binary_hierarchy_index >> 6; - - // Put the mask for the children that are subtree roots in the binary_hierarchy_index that is unused - uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F); - uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7; - - uniform global struct QBVHNodeN* qnode = nodeData + current_id; - uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node); - uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node); - - SUBGROUP_QBVHNodeN_setChildIncr1( qnode ); - if(lane == 0) - { - QBVH6Node_set_type( qnode, nodeType ); - qnode->offset = lead_child_offset; - } - - // When no children are subtree roots, we are done and skip to the next iteration - if(children_root_mask == 0x0) - { - continue; - } - // When all children are subtree roots, put them to sg_fatleaf_array - else if(children_root_mask == 0x3F) - { - set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane); - } - -#if MORTON_VERBOSE_LOG - if(lane == 
0) - printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, child_idx: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, boxless_node.backPointer >> 6, reduce_bounds.lower.x, qnode->offset); -#endif - } - - while (sg_bu_startpoints_cnt > 0) - { - uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane); - - SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex); - } - } -} diff --git a/src/intel/vulkan/grl/gpu/morton/post_sort.cl b/src/intel/vulkan/grl/gpu/morton/post_sort.cl deleted file mode 100644 index c13762438a3..00000000000 --- a/src/intel/vulkan/grl/gpu/morton/post_sort.cl +++ /dev/null @@ -1,521 +0,0 @@ -// -// Copyright (C) 2009-2022 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "libs/lsc_intrinsics.h" -#include "morton/morton_common.h" - -//////////////////////////////////////////////////////////////////////////////////////////////////////// -/* - - This kernel constructs a binary hierarchy in bottom up fashion from - the morton codes. - -*/ -//////////////////////////////////////////////////////////////////////////////////////////////////////// - -int Delta(global struct MortonCodePrimitive* mc, const uint64_t key0, const uint i1 ) -{ - const uint64_t key1 = mc[i1].index_code; - return clz(key0 ^ key1); -} - -int sign( int d ) -{ - return (d > 0) ? 1 : -1; -} - -__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) ) -void kernel build_bottom_up_indirect( global struct Globals* globals, - global struct BinaryMortonCodeHierarchy* bnodes, - global struct MortonCodePrimitive* mc ) -{ - /* construct range of primitives that each work group will process */ - const uint numPrimitives = globals->numPrimitives; - - uint i = get_group_id( 0 ) * get_local_size(0) + get_local_id( 0 ); - - if (i == 0) - { - globals->binary_hierarchy_root = 0; - if (numPrimitives == 1) - { - // special kludge for 1-prim tree. Make sure the one leaf node is initialized - bnodes[i].range.start = 0; - bnodes[i].range.end = 0; - bnodes[i].leftChild = -1; - bnodes[i].rightChild = -1; - } - - // store pointer to the binary hierarchy in the globals struct. - // This will be used - globals->binary_hierarchy_buffer = (gpuva_t) bnodes; - } - - uint num_inner_nodes = numPrimitives-1; - if ( i < num_inner_nodes ) - { - // - // direction is 1 if this morton code is the node's first key, -1 if it's the last - // By construction every internal node is either the start or the end of a given key range - // direction should be towards the neighbor with the most bits in common - - uint64_t ki = mc[i].index_code; - - int direction, delta_min; - uint lmax; - if( i == 0 ) - { - direction = 1; - delta_min = -1; - lmax = numPrimitives; - } - else - { - direction = sign( Delta( mc, ki, i + 1 ) - Delta( mc, ki, i - 1 ) ); - delta_min = Delta( mc, ki, i - direction ); - - // find upper bound for length of this node's key range - lmax = 8; - while ( (i+lmax*direction) < numPrimitives && Delta( mc, ki, i+lmax*direction ) > delta_min) - lmax = lmax * 2; - } - - // clamp max length so that the binary searches are fully in-bounds - uint maxLen = (direction>0) ? 
(numPrimitives - i) : (i+1); - lmax = min(lmax, maxLen); - - // find end of range using binary search - uint length = 0; - uint end = lmax-1; - while (length != end) - { - uint mid = length + ((end-length)/2) + ((end-length)%2); - bool bigger = Delta( mc, ki, i+mid*direction) > delta_min; - length = bigger ? mid : length; - end = bigger ? end : mid-1; - } - uint j = i + length*direction ; - - // find split position using binary search - uint split = 0; - end = length-1; - int delta_node = Delta(mc, ki, j); - while (split != end) - { - uint mid = split + ((end-split)/2) + ((end-split)%2); - bool bigger = Delta( mc, ki, i+mid*direction) > delta_node; - split = bigger ? mid : split; - end = bigger ? end : mid-1; - } - split = i + split*direction + min(direction,0); - - uint left = split; - uint right = split+1; - - // mark leaves - if( min(i,j) == split ) - left = left | (1<<31); - if( max(i,j) == split+1 ) - right = right | (1<<31); - - bnodes[i].range.start = min(i,j); - bnodes[i].range.end = max(i,j); - bnodes[i].leftChild = left; - bnodes[i].rightChild = right; - } -} - - - - - -#if 0 -__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) ) -void kernel build_bottom_up_indirect( global struct Globals* globals, - global struct BinaryMortonCodeHierarchy* bnodes, - global struct MortonCodePrimitive* mc ) -{ - /* construct range of primitives that each work group will process */ - const uint numPrimitives = globals->numPrimitives; - - // RangeFactor determines the distance between adjacent nodeIds in work group. - // The aim of the nodes distribution within work group, for rangeFactor > 1 - // is to be sure that half of the work groups will entirelly be dropped off - // at the bottom layer of the graph. This way the EUs can be reused faster. - // The factor needs to be smaller than MAX_HW_SIMD_WIDTH - const uint rangeFactor = 2; - - const uint numGroups = ((numPrimitives + MAX_HW_SIMD_WIDTH - 1) / MAX_HW_SIMD_WIDTH); - const uint globalId = get_group_id( 0 ) * MAX_HW_SIMD_WIDTH + get_local_id( 0 ); - const uint numPrimitivesAlignedToWGSize = MAX_HW_SIMD_WIDTH * numGroups; - const uint groupsRange = numPrimitivesAlignedToWGSize / rangeFactor; - - /* iterate over all primitives the work group should process */ - const uint i = (globalId * rangeFactor) % numPrimitivesAlignedToWGSize + globalId / groupsRange; - - if ( i < numPrimitives ) - { - uint node = i | ((uint)1 << 31); - uint start = i; - uint end = i; - - /* bottom up */ - while ( true ) - { - /* goto parent node and link parent node to current node */ - node = updateParent( bnodes, mc, node, start, end, numPrimitives - 1 ); - - /* do not continue if we reached this node the first time */ - if ( node == -1 ) - break; - - mem_fence_gpu_invalidate(); - - /* update range */ - start = bnodes[node].range.start; - end = bnodes[node].range.end; - - /* stop when we reached the root node */ - if ( start == 0 && end == numPrimitives - 1 ) - { - globals->binary_hierarchy_root = node; - break; - } - } - } -} - -#endif - -/* - - This function builds one QBVH6 node by opening the provided binary - BVH nodes until the QBVH node is full. 
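   For example (counts purely illustrative, assuming cfg_minLeafSize permits
   each split): starting from the two binary children with {9, 3} primitives,
   the largest still-splittable child is replaced by its two binary children
   each round,

      {9, 3} -> {5, 4, 3} -> {4, 3, 3, 2} -> {3, 3, 2, 2, 2}
             -> {3, 2, 2, 2, 2, 1}

   stopping once six (BVH_NODE_N6) children exist or every remaining child is
   at or below cfg_minLeafSize.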
- - */ - -GRL_INLINE void create_node(global struct Globals *globals, - global struct BinaryMortonCodeHierarchy *bnodes, - global char *bvh_mem, - uint rID, - local uint *local_numRecords, - local uint *local_QNodeOffset, - struct BuildRecordMorton *records, - struct BuildRecordMorton *current) -{ - global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; - const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; - global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh); - BackPointers *backPointers = BVHBase_GetBackPointers(bvh); - - /* initialize child array */ - uint numChildren = 2; - struct BuildRecordMorton children[BVH_NODE_N6]; - children[0].nodeID = bnodes[current->nodeID].leftChild; - children[0].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[0].nodeID); - children[1].nodeID = bnodes[current->nodeID].rightChild; - children[1].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[1].nodeID); - - /* fill QBVH6 node with up to 6 children */ - while (numChildren < BVH_NODE_N6) - { - /*! find best child to split */ - uint bestItems = 0; - int bestChild = -1; - for (int i = 0; i < numChildren; i++) - { - const uint items = children[i].items; - - /* ignore leaves as they cannot get split */ - if (items <= cfg_minLeafSize) - continue; - - /* find child with largest number of items */ - if (items > bestItems) - { - bestItems = items; - bestChild = i; - } - } - if (bestChild == -1) - break; - - /* perform best found split */ - const uint bestNodeID = children[bestChild].nodeID; - struct BuildRecordMorton *lrecord = &children[bestChild]; - struct BuildRecordMorton *rrecord = &children[numChildren]; - lrecord->nodeID = bnodes[bestNodeID].leftChild; - lrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, lrecord->nodeID); - rrecord->nodeID = bnodes[bestNodeID].rightChild; - rrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, rrecord->nodeID); - numChildren++; - } - - /* allocate memory for all children */ - const uint child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren); - global struct QBVHNodeN *childNodes = (global struct QBVHNodeN *)(bvh_mem + child_node_offset); - - /* create node, but to not set bounds yet as these get calculated during refit */ - const uint current_index = current->current_index; - struct QBVHNodeN *qnode = nodeData + current_index; - QBVH6Node_set_type(qnode, BVH_INTERNAL_NODE); - QBVHNodeN_setChildIncr1(qnode); - QBVH6Node_set_offset(qnode, childNodes); - - /* set back pointers */ - *InnerNode_GetBackPointer(backPointers, current_index) = (current->parent_index << 6) | (numChildren << 3); - - /* update parent pointer of build records of all children */ - for (uint ID = 0; ID < numChildren; ID++) - { - children[ID].current_index = childNodes - nodeData + ID; - children[ID].parent_index = current_index; - } - - /* write out child build records */ - const uint global_offset = atomic_add_local(local_numRecords, numChildren - 1); - records[rID] = children[0]; - - for (uint i = 1; i < numChildren; i++) - records[global_offset + i - 1] = children[i]; - - mem_fence_workgroup_default(); - -} - -#if 0 -/* This function calculates the similarity between two morton - * codes. It essentially counts how many bits of the morton codes are - * equal starting at the top. The more bits are equal, the similar the - * codes, and the closer the primitives are located spatially. 
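 * For example, with 8-bit keys for brevity (the real keys are the 64-bit
 * index_code values): 0b00101100 ^ 0b00100111 = 0b00001011, whose clz is 4,
 * i.e. the two codes share their top four bits. merge_to_right() below just
 * compares this count on both sides of a range and merges towards the
 * neighbour that shares more leading bits.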
*/ - -GRL_INLINE uint64_t delta(global struct MortonCodePrimitive *mc, - const uint id) -{ - const uint64_t key0 = mc[id + 0].index_code; - const uint64_t key1 = mc[id + 1].index_code; - return clz(key0 ^ key1); -} - - - -/* This function checks for a range [left,right] of morton codes, if - * it is spatially closer to the left or to the right nodes. */ - -GRL_INLINE bool merge_to_right(global struct MortonCodePrimitive *mc, - const uint left, - const uint right, - const uint last) -{ - /* merge to right if we are at the left end of the array */ - if (left == 0) - return true; - - /* merge to left if we are at the right end of the array */ - if (right == last) - return false; - - /* otherwise merge to the side where the morton code sequence has - * the largest number of equal bits from the top */ - return delta(mc, right) > delta(mc, left - 1); -} - -GRL_INLINE uint updateParent(global struct BinaryMortonCodeHierarchy *bnodes, - global struct MortonCodePrimitive *mc, - const uint nodeID, - const uint left, - const uint right, - const uint last) -{ - uint parent; - - /* check if we should merge this node to the left or right */ - if (merge_to_right(mc, left, right, last)) - { - parent = right; - bnodes[parent].leftChild = nodeID; - bnodes[parent].range.start = left; - } - else - { - parent = left - 1; - bnodes[parent].rightChild = nodeID; - bnodes[parent].range.end = right; - } - - mem_fence_gpu_default(); - - /* stop ascending the tree if we reached this node the first time */ - const bool first = atomic_inc_global((global uint *)&bnodes[parent].flag) == 0; - return first ? -1 : parent; -} - -GRL_INLINE void -DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals, - global struct MortonCodePrimitive* mc, - global struct AABB* primref, - global struct BinaryMortonCodeHierarchy* bnodes, - global char* bvh_mem, - uint startID, uint endID, - local uint* local_numRecords, - local uint* local_numRecordsOld, - local struct BuildRecordMorton* local_records -) -{ - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64); - - /* iterate over all subtrees this workgroup should build */ - for ( uint recordID = startID; recordID < endID; recordID++ ) - { - /* add start build record to local stack */ - if ( get_local_id( 0 ) == 0 ) - { - local_records[0] = records[recordID]; - *local_numRecords = 1; - *local_numRecordsOld = 0; - } - work_group_barrier( CLK_LOCAL_MEM_FENCE ); - - /* terminate when all subtrees are leaves */ - while ( *local_numRecords != *local_numRecordsOld ) - { - /* remember the old number of build records to detect later - * whether we are done */ - if ( get_local_id( 0 ) == 0 ) - { - *local_numRecordsOld = *local_numRecords; - } - work_group_barrier( CLK_LOCAL_MEM_FENCE ); - - /* all work items in the sub group pick a subtree to build */ - for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) ) - { - /* ignore small subtrees */ - if ( local_records[ID].items <= BVH_NODE_N6 ) - continue; - - /* create QBVH node */ - create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] ); - } - - /* wait for all work items to have updated local_records array */ - work_group_barrier( CLK_LOCAL_MEM_FENCE ); - } - - const uint shift_mask = globals->shift_mask; - const uint leafPrimType = globals->leafPrimType; - const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET; - BackPointers* backPointers = 
BVHBase_GetBackPointers( bvh ); - global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh ); - - /* create all fat leaf nodes and initiate refit */ - for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) ) - { - struct BuildRecordMorton current = local_records[ID]; - const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID ); - - global struct QBVHNodeN* qnode = nodeData + current.current_index; - - /* get bounds of all children of the fat leaf node */ - struct AABB bounds[BVH_NODE_N6]; - for ( uint i = 0; i < current.items; i++ ) - { - /* get primID and bounds of primitive */ - const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask); - bounds[i] = primref[primID]; - - /* For all primitives in a fat leaf we store a back - * pointer. This way we can modify the fat leaf node at leaf construction time. */ - const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem; - - /* Store back pointer and primID inside morton code array to - * be later used by leaf creation. */ - mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID; - } - - /* update fat leaf node */ - QBVHNodeN_setType( qnode, leafPrimType ); - global void* offset; - if ( leafPrimType != BVH_INSTANCE_NODE ) - { - offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad ); - QBVHNodeN_setChildIncr1( qnode ); - } - else - { - offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf ); - QBVHNodeN_setChildIncr2( qnode ); - } - QBVH6Node_set_offset( qnode, offset ); - QBVHNodeN_setBounds( qnode, bounds, current.items ); - - /* set back pointers for fat leaf nodes */ - *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3); - - /* bottom up refit */ - refit_bottom_up( qnode, bvh, bounds, current.items ); - } - } -} - -/* - - This phase takes the build records calculated in phase0 as input and - finished the BVH construction for all these subtrees. 
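  The reuse of the 64-bit morton code slot during fat leaf creation above
  (back pointer in the high dword, primref index in the low dword) amounts to
  a pack/unpack pair; a minimal sketch, independent of the GRL structs:

  // high 32 bits: back pointer of the fat leaf node, low 32 bits: primref index
  ulong pack_backpointer_primid(uint back_pointer, uint prim_id)
  {
      return ((ulong)back_pointer << 32) | (ulong)prim_id;
  }

  uint unpack_backpointer(ulong slot) { return (uint)(slot >> 32); }
  uint unpack_primid(ulong slot)      { return (uint)(slot & 0xFFFFFFFFul); }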
- -*/ -__attribute__((reqd_work_group_size(8, 1, 1))) -old_parallel_build_phase1(global struct Globals *globals, - global struct MortonCodePrimitive *mc, - global struct AABB *primref, - global struct BinaryMortonCodeHierarchy *bnodes, - global char *bvh_mem) -{ - global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem; - global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart); - - /* a queue of build records */ - local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; - local uint local_numRecords; - local uint local_numRecordsOld; - - /* construct range of build records that each sub group will process */ - const uint numRecords = globals->numBuildRecords; - const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0); - const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0); - - DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); - -} - -__attribute__( (reqd_work_group_size( 8, 1, 1 )) ) -old_parallel_build_phase1_Indirect( global struct Globals* globals, - global struct MortonCodePrimitive* mc, - global struct AABB* primref, - global struct BinaryMortonCodeHierarchy* bnodes, - global char* bvh_mem ) -{ - global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem; - global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart); - - /* a queue of build records */ - local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD]; - local uint local_numRecords; - local uint local_numRecordsOld; - - /* construct range of build records that each sub group will process */ - const uint numRecords = globals->numBuildRecords; - uint startID = get_group_id( 0 ); - uint endID = startID + 1; - - DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records ); - -} -#endif diff --git a/src/intel/vulkan/grl/gpu/morton/pre_sort.cl b/src/intel/vulkan/grl/gpu/morton/pre_sort.cl deleted file mode 100644 index 099f926e194..00000000000 --- a/src/intel/vulkan/grl/gpu/morton/pre_sort.cl +++ /dev/null @@ -1,117 +0,0 @@ -// -// Copyright (C) 2009-2022 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "morton/morton_common.h" - -GRL_INLINE uint get_morton_shift( uint numPrimitives ) -{ - return 32 - clz( numPrimitives ); -} - -GRL_INLINE uint get_morton_shift_mask( uint numPrimitives ) -{ - uint shift = get_morton_shift( numPrimitives ); - uint mask =(uint)(((ulong)1 << shift)); - return mask - 1; // separated due to problems in DX -} - -__attribute__((reqd_work_group_size(1, 1, 1))) void kernel init( global struct Globals *globals ) -{ - /* variable shift for putting morton code + index to 64 bit */ - const uint shift = 32 - clz(globals->numPrimitives); - globals->shift = shift; - globals->shift_mask = (uint)(((ulong)1 << shift)); - globals->shift_mask -= 1; // separated due to problems in DX - globals->binary_hierarchy_root = 0; - globals->morton_sort_in_flight = 0; - globals->sort_iterations = get_morton_sort_lsb_req_iterations(shift); -} - -/* - - This kernel create a morton code array containing a morton code and - index into the primref array. - - The code uses the maximal number of bits for the morton code, such - that the morton code and index can still both get stored in 64 bits. 
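  For a concrete feel of the packing, a minimal sketch assuming
  numPrimitives == 100000 (so 17 low bits hold the index and the remaining
  47 bits hold the morton code):

  ulong pack_morton_and_index(ulong code, uint prim_id, uint num_primitives)
  {
      const uint shift = 32 - clz(num_primitives);              // 100000 -> 17
      const uint shift_mask = (uint)(((ulong)1 << shift) - 1);  // 0x1FFFF

      // morton code in the top 64 - shift bits, primref index in the low bits
      const ulong index_code = (code << shift) | (ulong)prim_id;

      // the index is later recovered with index_code & shift_mask,
      // exactly as done during fat leaf creation
      return index_code;
  }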
- - The algorithm first maps the centroids of the primitives and their - bounding box diagonal into a 4D grid, and then interleaves all 4 - grid coordinates to construct the to morton code. - - */ - -__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) -__attribute__( (intel_reqd_sub_group_size( 16 )) ) void kernel -create_morton_codes_indirect( global struct Globals* globals, - global struct BVHBase* bvh, - global struct AABB* primref, - global struct MortonCodePrimitive* morton_codes, - global struct MortonCodePrimitive* morton_codes_tmp, - uint use_new_morton_sort) -{ - /* construct range of morton codes each work group should create */ - const uint numPrimitives = globals->numPrimitives; - const uint startID = get_group_id( 0 ) * get_local_size( 0 ); - const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives); - - /* get lower and upper bounds of geometry and length of scene diagonal */ - const float3 lower = globals->centroidBounds.lower.xyz; - const float3 upper = globals->centroidBounds.upper.xyz; - const float diag = length( AABB3f_size( &bvh->Meta.bounds ).xyz ); - - /* calculates the 4D grid */ - const uint shift = get_morton_shift( numPrimitives ); - const uint grid_size = 1 << (64 - shift) / 4; - const float4 grid_base = (float4)(lower, 0.0f); - const float4 grid_extend = (float4)(upper - lower, diag); - const float4 grid_scale = select( (grid_size * 0.99f) / grid_extend, 0.0f, grid_extend == 0.0f ); // FIXME: 0.99f!!!!! - - const uint req_iterations = get_morton_sort_lsb_req_iterations(shift); - - /* each work group iterates over its range of morton codes to create */ - uint primID = startID + get_local_id( 0 ); - if( primID < endID ) - { - /* calculate position inside 4D grid */ - float4 centroid2 = AABB_centroid2( &primref[primID] ); - centroid2.w = length( AABB_size( &primref[primID] ).xyz ); - const uint4 gridpos = convert_uint4_rtz( (centroid2 - grid_base) * grid_scale ); - - /* calculate and store morton code */ - const ulong code = ulong_bitInterleave4D( gridpos ); - const ulong index_code = ((ulong)code << shift) | (ulong)primID; - - // It is required for morton code to be in morton_codes buffer after LSB sort finishes. - // If there would be odd iteration number needed for sorting, it is needed - // to skip some iterations of sorting. For odd number of iteration start with morton_codes_tmp buffer - if(req_iterations & 1 && !use_new_morton_sort) - morton_codes_tmp[primID].index_code = index_code; - else - morton_codes[primID].index_code = index_code; - } -} - -/* - - Initialization of the binary morton code hierarchy. 
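  The 4D interleave used by the kernel above (ulong_bitInterleave4D) spreads
  the four grid coordinates bit by bit; an illustrative version, not
  necessarily bit-exact with the GRL helper:

  ulong interleave_4d(const uint g[4], uint bits_per_dim)
  {
      ulong code = 0;
      for (uint b = 0; b < bits_per_dim; b++)
          for (uint d = 0; d < 4; d++)
              // bit b of dimension d lands at bit position 4*b + d
              code |= ((ulong)((g[d] >> b) & 1u)) << (4u * b + d);
      return code;
  }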
- - */ - -__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) void kernel init_bottom_up_indirect( global struct Globals* globals, - global struct BinaryMortonCodeHierarchy* bnodes ) -{ - /* construct range each work group will process */ - const uint numPrimitives = globals->numPrimitives; - const uint startID = get_group_id( 0 ) * get_local_size(0); - const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives); - - /* each workgroup iterates over its range to initialize the binary BVH */ - uint i = startID + get_local_id( 0 ); - if( i < endID ) - BinaryMortonCodeHierarchy_init( &bnodes[i], 0, numPrimitives - 1 ); -} diff --git a/src/intel/vulkan/grl/gpu/morton_builder.grl b/src/intel/vulkan/grl/gpu/morton_builder.grl deleted file mode 100644 index f221fd39fed..00000000000 --- a/src/intel/vulkan/grl/gpu/morton_builder.grl +++ /dev/null @@ -1,335 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module morton_builder; - -kernel_module morton_kernels ("morton/pre_sort.cl") -{ - kernel opencl_build_kernel_init < kernelFunction="init" >; - kernel opencl_build_morton_kernel_create_morton_codes_indirect < kernelFunction="create_morton_codes_indirect" >; - kernel opencl_build_morton_kernel_init_bottom_up_indirect < kernelFunction="init_bottom_up_indirect" >; -} - -kernel_module morton_kernels ("morton/post_sort.cl") -{ - links lsc_intrinsics; - - kernel opencl_build_morton_kernel_build_bottom_up_indirect < kernelFunction="build_bottom_up_indirect" >; -} - -kernel_module morton_kernels ("morton/phase0.cl") -{ - links lsc_intrinsics; - - kernel opencl_build_morton_kernel_parallel_build_phase0 < kernelFunction="parallel_build_phase0" >; - kernel opencl_build_morton_kernel_parallel_build_phase0_local_sync < kernelFunction="parallel_build_phase0_local_sync" >; -} - -kernel_module morton_kernels ("morton/phase1.cl") -{ - links lsc_intrinsics; - - kernel opencl_build_morton_kernel_parallel_build_phase1_Indirect < kernelFunction="parallel_build_phase1_Indirect_SG" >; - kernel opencl_build_morton_kernel_parallel_build_phase1_root < kernelFunction="parallel_build_phase1_Indirect_global_root" >; -} - -kernel_module morton_kernels ("morton/phase2.cl") -{ - links lsc_intrinsics; - - kernel opencl_build_morton_kernel_parallel_build_phase2_refit < kernelFunction="parallel_build_phase2_refit" >; - kernel opencl_build_morton_kernel_parallel_build_phase2_refit_local < kernelFunction="parallel_build_phase2_refit_local" >; -} - -import struct MKBuilderState "structs.grl"; - -/* -metakernel begin( - MKBuilderState state, - qword morton_code_buffer, - dword primLeafType, - dword numHwThreads) -{ - dispatch opencl_build_kernel_init(1, 1, 1) args( - state.build_globals - ); - - control(wait_idle); - - - dispatch opencl_build_morton_kernel_create_morton_codes(numHwThreads, 1, 1) args( - state.build_globals, - state.bvh_buffer, - state.build_primref_buffer, - morton_code_buffer); - - control(wait_idle); - -} - -metakernel build_bottom_up( - MKBuilderState state, - qword buildrecords_bottom_up, - qword morton_code_buffer, - dword numHwThreads) -{ - dispatch opencl_build_morton_kernel_init_bottom_up(numHwThreads, 1, 1) args( - state.build_globals, - buildrecords_bottom_up); - - control(wait_idle); - - dispatch opencl_build_morton_kernel_build_bottom_up(numHwThreads, 1, 1) args( - state.build_globals, - buildrecords_bottom_up, - morton_code_buffer); - - control(wait_idle); - -} - - -metakernel parallel_build( - MKBuilderState state, - 
qword buildrecords_bottom_up, - qword morton_code_buffer, - dword numHwThreads) -{ - dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args( - state.build_globals, - buildrecords_bottom_up, - state.bvh_buffer); - - control(wait_idle); - - dispatch opencl_build_morton_kernel_parallel_build_phase1(numHwThreads, 1, 1) args( - state.build_globals, - morton_code_buffer, - state.build_primref_buffer, - buildrecords_bottom_up, - state.bvh_buffer); - - control(wait_idle); - -} - -*/ - -metakernel NewMorton_pre_sort( - qword num_primrefs_counter, - MKBuilderState state, - qword morton_code_buffer, - qword morton_code_buffer_tmp, - qword buildrecords_bottom_up, - dword use_new_morton_sort) -{ - - - { - REG1 = 15; - REG2 = 4; - REG0 = load_dword( num_primrefs_counter ); - - REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals - REG1 = ~REG1; - REG0 = REG0 & REG1; - REG0 = REG0 >> REG2; - } - - dispatch opencl_build_kernel_init(1, 1, 1) args( state.build_globals ); - - DISPATCHDIM_X = REG0.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - /* - // new bottom-up kernel does not need this - dispatch_indirect opencl_build_morton_kernel_init_bottom_up_indirect args( - state.build_globals, - buildrecords_bottom_up); - */ - dispatch_indirect opencl_build_morton_kernel_create_morton_codes_indirect args( - state.build_globals, - state.bvh_buffer, - state.build_primref_buffer, - morton_code_buffer, - morton_code_buffer_tmp, - use_new_morton_sort); - - -} - - - -metakernel NewMorton_post_sort( - qword num_primrefs_counter, - qword num_buildrecords_counter, - MKBuilderState state, - qword buildrecords_bottom_up, - qword morton_code_buffer ) -{ - - { - REG1 = 15; - REG2 = 4; - REG0 = load_dword( num_primrefs_counter ); - - REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals - REG1 = ~REG1; - REG0 = REG0 & REG1; - REG0 = REG0 >> REG2; - } - - DISPATCHDIM_X = REG0.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args( - state.build_globals, - buildrecords_bottom_up, - morton_code_buffer); - - - /* - dispatch opencl_build_morton_kernel_build_bottom_up(16, 1, 1) args( - state.build_globals, - buildrecords_bottom_up, - morton_code_buffer); - */ - - control(wait_idle); - - dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args( - state.build_globals, - buildrecords_bottom_up, - state.bvh_buffer); - - control(wait_idle); - - DISPATCHDIM_X = load_dword( num_buildrecords_counter ); - - dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args( - state.build_globals, - morton_code_buffer, - state.build_primref_buffer, - buildrecords_bottom_up, - state.bvh_buffer); - - control(wait_idle); - -} - -metakernel NewMorton_bottom_up( - qword num_primrefs_counter, - MKBuilderState state, - qword buildrecords_bottom_up, - qword morton_code_buffer ) -{ - - { - REG1 = 15; - REG2 = 4; - REG0 = load_dword( num_primrefs_counter ); - - REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals - REG1 = ~REG1; - REG0 = REG0 & REG1; - REG0 = REG0 >> REG2; - } - - DISPATCHDIM_X = REG0.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args( - state.build_globals, - buildrecords_bottom_up, - morton_code_buffer); -} - - -metakernel NewMorton_phase0( - MKBuilderState state, - qword buildrecords_bottom_up, 
- qword morton_p0_refit_startpoints) -{ - - dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args( - state.build_globals, - buildrecords_bottom_up, - state.bvh_buffer, - morton_p0_refit_startpoints); -} - -metakernel NewMorton_phase0_local_sync( - MKBuilderState state, - qword buildrecords_bottom_up, - qword p0_boxless_nodes) -{ - - dispatch opencl_build_morton_kernel_parallel_build_phase0_local_sync(1, 1, 1) args( - state.build_globals, - buildrecords_bottom_up, - state.bvh_buffer, - p0_boxless_nodes); -} - - -metakernel NewMorton_phase1( - qword num_buildrecords_counter, - MKBuilderState state, - qword buildrecords_bottom_up, - qword morton_code_buffer) -{ - - DISPATCHDIM_X = load_dword( num_buildrecords_counter ); - - dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args( - state.build_globals, - morton_code_buffer, - state.build_primref_buffer, - buildrecords_bottom_up, - state.bvh_buffer); -} - -metakernel NewMorton_phase1_root( - qword num_buildrecords_counter, - MKBuilderState state, - qword buildrecords_bottom_up, - qword morton_code_buffer) -{ - dispatch opencl_build_morton_kernel_parallel_build_phase1_root(1, 1, 1) args( - state.build_globals, - morton_code_buffer, - state.build_primref_buffer, - buildrecords_bottom_up, - state.bvh_buffer); -} - -metakernel NewMorton_phase2( - qword num_leaves_counter, - MKBuilderState state, - qword bottom_node_ids ) -{ - - DISPATCHDIM_X = load_dword( num_leaves_counter ); - - dispatch_indirect opencl_build_morton_kernel_parallel_build_phase2_refit args( - state.bvh_buffer, - bottom_node_ids); -} - -metakernel NewMorton_phase2_local( - MKBuilderState state, - qword p0_boxless_nodes) -{ - - dispatch opencl_build_morton_kernel_parallel_build_phase2_refit_local(1, 1, 1) args( - state.build_globals, - state.bvh_buffer, - p0_boxless_nodes); -} diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl deleted file mode 100644 index 075d44a51ba..00000000000 --- a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl +++ /dev/null @@ -1,9 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// just inlines the kernels that are there in the header -#include "morton_msb_radix_bitonic_sort.h" \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h deleted file mode 100644 index 4fb6c21b014..00000000000 --- a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h +++ /dev/null @@ -1,924 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "common.h" -#include "morton_msb_radix_bitonic_sort_shared.h" - -#include "libs/lsc_intrinsics.h" - -/////////////////////////////////////////////////////////////////////////////// -// -// Configuration switches -// -/////////////////////////////////////////////////////////////////////////////// - -#define DEBUG 0 -#define MERGE_BLS_WITHIN_SG 0 - -/////////////////////////////////////////////////////////////////////////////// - - -#if DEBUG -#define DEBUG_CODE(A) A -#else -#define DEBUG_CODE(A) -#endif - -#define BOTTOM_LEVEL_SORT_WG_SIZE 512 - -// this kernel is only used to put into metakernel for debug to print that the code reached that place -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) -void kernel debug_print_kernel(uint variable) -{ 
- if(get_local_id(0) == 0) - printf("I'm here! %d\n", variable); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(1, 1, 1))) -void kernel check_bls_sort(global struct Globals* globals, global ulong* input) -{ - uint prims_num = globals->numPrimitives; - - printf("in check_bls_sort kernel. Values count:: %d\n", prims_num); - - ulong left = input[0]; - ulong right; - for (int i = 0; i < prims_num - 1; i++) - { - right = input[i + 1]; - printf("sorted val: %llu\n", left); - if (left > right) - { - printf("element %d is bigger than %d: %llu > %llu\n", i, i+1, left, right); - } - left = right; - } -} - -inline uint wg_scan_inclusive_add_opt(local uint* tmp, uint val, uint SG_SIZE, uint WG_SIZE) -{ - const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE; - const uint sg_local_id = get_local_id(0) % SG_SIZE; - const uint NUM_HW_THREADS_IN_WG = WG_SIZE / SG_SIZE; - - uint acc = sub_group_scan_inclusive_add(val); - if (NUM_HW_THREADS_IN_WG == 1) - { - return acc; - } - tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc, SG_SIZE - 1); - barrier(CLK_LOCAL_MEM_FENCE); - - uint loaded_val = sg_local_id < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0; - uint wgs_acc = sub_group_scan_exclusive_add(loaded_val); - uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id); - // for > 256 workitems in SIMD16 we won't fit in 16 workitems per subgroup, so we need additional iteration - // same for > 64 workitems and more in SIMD8 - uint num_iterations = (NUM_HW_THREADS_IN_WG + SG_SIZE - 1) / SG_SIZE; - for (int i = 1; i < num_iterations; i++) - { - // need to add tmp[] because of "exclusive" scan, so last element misses it - uint prev_max_sum = sub_group_broadcast(wgs_acc, SG_SIZE - 1) + tmp[(i * SG_SIZE) - 1]; - loaded_val = (sg_local_id + i * SG_SIZE) < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0; - wgs_acc = sub_group_scan_exclusive_add(loaded_val); - wgs_acc += prev_max_sum; - uint new_acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id % SG_SIZE); - if (hw_thread_in_wg_id >= i * SG_SIZE) - acc_for_this_hw_thread = new_acc_for_this_hw_thread; - } - return acc + acc_for_this_hw_thread; -} - -struct MSBDispatchArgs -{ - global struct MSBRadixContext* context; - uint num_of_wgs; // this is the number of workgroups that was dispatched for this context - ulong* wg_key_start; // this is where keys to process start for current workgroup - ulong* wg_key_end; - uint shift_bit; -}; - - - - -struct MSBDispatchArgs get_msb_dispatch_args(global struct VContextScheduler* scheduler) -{ - global struct MSBDispatchQueue* queue = &scheduler->msb_queue; - - uint group = get_group_id(0); - struct MSBDispatchRecord record; - - // TODO_OPT: Load this entire prefix array into SLM instead of searching.. 
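  // Worked example of the search below (illustrative numbers only): with
  // msb_queue.records[].wgs_to_dispatch == {3, 5, 2} and get_group_id(0) == 6,
  // the loop first subtracts 3 (group becomes 3), then finds 3 < 5 and stops,
  // so this work group binds to context 1 as the 4th of its 5 dispatched groups.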
- // Or use sub-group ops - uint i = 0; - while (i < queue->num_records) - { - uint n = queue->records[i].wgs_to_dispatch; - - if (group < n) - { - record = queue->records[i]; - break; - } - - group -= n; - i++; - } - - uint context_id = i; - global struct MSBRadixContext* context = &scheduler->contexts[context_id]; - - // moving to ulongs to avoid uint overflow - ulong group_id_in_dispatch = group; - ulong start_offset = context->start_offset; - ulong num_keys = context->num_keys; - ulong wgs_to_dispatch = record.wgs_to_dispatch; - - struct MSBDispatchArgs args; - args.context = context; - args.num_of_wgs = record.wgs_to_dispatch; - args.wg_key_start = context->keys_in + start_offset + (group_id_in_dispatch * num_keys / wgs_to_dispatch); - args.wg_key_end = context->keys_in + start_offset + ((group_id_in_dispatch+1) * num_keys / wgs_to_dispatch); - args.shift_bit = MSB_SHIFT_BYTE_START_OFFSET - context->iteration * MSB_BITS_PER_ITERATION; - return args; -} - - - - -void BLSDispatchQueue_push(global struct BLSDispatchQueue* queue, struct BLSDispatchRecord* record) -{ - uint new_idx = atomic_inc_global(&queue->num_records); - queue->records[new_idx] = *record; - DEBUG_CODE(printf("adding bls of size: %d\n", record->count)); -} - - - - -void DO_CountSort(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output) -{ - uint tid = get_local_id(0); - - global ulong* in = ((global ulong*)(dispatchRecord.keys_in)) + dispatchRecord.start_offset; - - ulong a = tid < dispatchRecord.count ? in[tid] : ULONG_MAX; - - SLM_shared[tid] = a; - - uint counter = 0; - - barrier(CLK_LOCAL_MEM_FENCE); - - ulong curr = SLM_shared[get_sub_group_local_id()]; - - for (uint i = 16; i < dispatchRecord.count; i += 16) - { - ulong next = SLM_shared[i + get_sub_group_local_id()]; - - for (uint j = 0; j < 16; j++) - { - // some older drivers have bug when shuffling ulong so we process by shuffling 2x uint - uint2 curr_as_uint2 = as_uint2(curr); - uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j)); - ulong c = as_ulong(sg_curr_as_uint2); - if (c < a) - counter++; - } - - curr = next; - } - - - // last iter - for (uint j = 0; j < 16; j++) - { - // some older drivers have bug when shuffling ulong so we process by shuffling 2x uint - uint2 curr_as_uint2 = as_uint2(curr); - uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j)); - ulong c = as_ulong(sg_curr_as_uint2); - if (c < a) - counter++; - } - - // save elements to its sorted positions - if (tid < dispatchRecord.count) - output[dispatchRecord.start_offset + counter] = a; -} - -void DO_Bitonic(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output) -{ - uint lid = get_local_id(0); - uint elements_to_sort = BOTTOM_LEVEL_SORT_THRESHOLD; - while ((elements_to_sort >> 1) >= dispatchRecord.count && elements_to_sort >> 1 >= BOTTOM_LEVEL_SORT_WG_SIZE) - { - elements_to_sort >>= 1; - } - - for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++) - { - uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE; - - if (tid >= dispatchRecord.count) - SLM_shared[tid] = ULONG_MAX; - else - SLM_shared[tid] = ((global ulong*)(dispatchRecord.keys_in))[dispatchRecord.start_offset + tid]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - uint k_iterations = elements_to_sort; - while(k_iterations >> 1 >= dispatchRecord.count && k_iterations != 0) - { - k_iterations >>= 1; - } - - for (unsigned int k = 2; k <= 
k_iterations; k *= 2) - { - for (unsigned int j = k / 2; j > 0; j /= 2) - { - // this loop is needed when we can't create big enough workgroup so we need to process multiple times - for (uint i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++) - { - uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE; - unsigned int ixj = tid ^ j; - if (ixj > tid) - { - if ((tid & k) == 0) - { - if (SLM_shared[tid] > SLM_shared[ixj]) - { - ulong tmp = SLM_shared[tid]; - SLM_shared[tid] = SLM_shared[ixj]; - SLM_shared[ixj] = tmp; - } - } - else - { - if (SLM_shared[tid] < SLM_shared[ixj]) - { - ulong tmp = SLM_shared[tid]; - SLM_shared[tid] = SLM_shared[ixj]; - SLM_shared[ixj] = tmp; - } - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - } - - for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++) - { - uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE; - - if (tid < dispatchRecord.count) - output[dispatchRecord.start_offset + tid] = SLM_shared[tid]; - } -} - - - - -void DO_Create_Separate_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input) -{ - uint lid = get_local_id(0); - - uint start = context->start[lid]; - uint count = context->count[lid]; - uint start_offset = context->start_offset + start; - - struct BLSDispatchRecord record; - record.start_offset = start_offset; - record.count = count; - record.keys_in = context->keys_out; - - if (count == 0) // we don't have elements so don't do anything - { - } - else if (count == 1) // single element so just write it out - { - input[start_offset] = ((global ulong*)record.keys_in)[start_offset]; - } - else if (count <= BOTTOM_LEVEL_SORT_THRESHOLD) - { - BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); - } -} - - - - -// We try to merge small BLS into larger one within the sub_group -void DO_Create_SG_Merged_BLS_Work_Parallel(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input) -{ - uint lid = get_local_id(0); - uint sid = get_sub_group_local_id(); - - uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 
1 : 0; - - uint start = context->start[lid]; - uint count = context->count[lid]; - uint ctx_start_offset = context->start_offset; - - if (sid == 0 || create_msb_work) // these SIMD lanes are the begining of merged BLS - { - struct BLSDispatchRecord record; - if (create_msb_work) - { - record.start_offset = ctx_start_offset + start + count; - record.count = 0; - } - else // SIMD lane 0 case - { - record.start_offset = ctx_start_offset + start; - record.count = count; - } - - record.keys_in = context->keys_out; - - uint loop_idx = 1; - while (sid + loop_idx < 16) // loop over subgroup - { - uint _create_msb_work = intel_sub_group_shuffle_down(create_msb_work, 0u, loop_idx); - uint _count = intel_sub_group_shuffle_down(count, 0u, loop_idx); - uint _start = intel_sub_group_shuffle_down(start, 0u, loop_idx); - - if (_create_msb_work) // found out next MSB work, so range of merges ends - break; - - // need to push record since nothing more will fit - if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD) - { - if (record.count == 1) - { - input[record.start_offset] = record.keys_in[record.start_offset]; - } - else if (record.count > 1) - { - BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); - } - record.start_offset = ctx_start_offset + _start; - record.count = _count; - } - else - { - record.count += _count; - } - loop_idx++; - } - // if we have any elements left, then schedule them - if (record.count == 1) // only one element, so just write it out - { - input[record.start_offset] = record.keys_in[record.start_offset]; - } - else if (record.count > 1) - { - BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); - } - } -} - - - - -// We try to merge small BLS into larger one within the sub_group -void DO_Create_SG_Merged_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input) -{ - uint lid = get_local_id(0); - uint sid = get_sub_group_local_id(); - - uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 
1 : 0; - - uint start = context->start[lid]; - uint count = context->count[lid]; - uint ctx_start_offset = context->start_offset; - - if (sid == 0) - { - struct BLSDispatchRecord record; - record.start_offset = ctx_start_offset + start; - record.count = 0; - record.keys_in = context->keys_out; - - for (int i = 0; i < 16; i++) - { - uint _create_msb_work = sub_group_broadcast(create_msb_work, i); - uint _count = sub_group_broadcast(count, i); - uint _start = sub_group_broadcast(start, i); - if (_create_msb_work) - { - if (record.count == 1) // only one element, so just write it out - { - input[record.start_offset] = record.keys_in[record.start_offset]; - } - else if (record.count > 1) - { - BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); - } - record.start_offset = ctx_start_offset + _start + _count; - record.count = 0; - continue; - } - // need to push record since nothing more will fit - if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD) - { - BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); - record.start_offset = ctx_start_offset + _start; - record.count = _count; - } - else - { - record.count += _count; - } - } - // if we have any elements left, then schedule them - if (record.count == 1) // only one element, so just write it out - { - input[record.start_offset] = record.keys_in[record.start_offset]; - } - else if (record.count > 1) - { - BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record); - } - } -} - - - - -void DO_Create_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input, local uint* slm_for_wg_scan, uint sg_size, uint wg_size) -{ - uint lid = get_local_id(0); - - uint iteration = context->iteration + 1; - uint start = context->start[lid]; - uint count = context->count[lid]; - uint start_offset = context->start_offset + start; - - uint create_msb_work = count > BOTTOM_LEVEL_SORT_THRESHOLD ? 
1 : 0; - -#if MERGE_BLS_WITHIN_SG - DO_Create_SG_Merged_BLS_Work_Parallel(scheduler, context, input); -#else - DO_Create_Separate_BLS_Work(scheduler, context, input); -#endif - - uint new_entry_id = wg_scan_inclusive_add_opt(slm_for_wg_scan, create_msb_work, sg_size, wg_size);//work_group_scan_inclusive_add(create_msb_work); - uint stack_begin_entry; - // last workitem in wg contains number of all new entries - if (lid == (MSB_RADIX_NUM_BINS - 1)) - { - stack_begin_entry = atomic_add_global(&scheduler->msb_stack.num_entries, new_entry_id); - } - stack_begin_entry = work_group_broadcast(stack_begin_entry, (MSB_RADIX_NUM_BINS - 1)); - new_entry_id += stack_begin_entry -1; - - - if (create_msb_work) - { - scheduler->msb_stack.entries[new_entry_id].start_offset = start_offset; - scheduler->msb_stack.entries[new_entry_id].count = count; - scheduler->msb_stack.entries[new_entry_id].iteration = iteration; - } - - if (lid == 0) { - DEBUG_CODE(printf("num of new bls: %d\n", scheduler->next_bls_queue->num_records)); - } -} - - -struct BatchedBLSDispatchEntry -{ - ///////////////////////////////////////////////////////////// - // State data used for communication with command streamer - // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl' - ///////////////////////////////////////////////////////////// - qword p_data_buffer; - qword num_elements; // number of elements in p_data_buffer -}; - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel sort_morton_codes_batched_BLS_dispatch(global struct BatchedBLSDispatchEntry* bls_dispatches) -{ - uint dispatch_id = get_group_id(0); - uint lid = get_local_id(0); - - local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD]; - - struct BatchedBLSDispatchEntry dispatchArgs = bls_dispatches[dispatch_id]; - struct BLSDispatchRecord dispatchRecord; - dispatchRecord.start_offset = 0; - dispatchRecord.count = dispatchArgs.num_elements; - dispatchRecord.keys_in = (ulong*)dispatchArgs.p_data_buffer; - - DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", dispatchRecord.count)); - - if(dispatchRecord.count > 1) - DO_Bitonic(dispatchRecord, SLM_shared, (global ulong*)dispatchRecord.keys_in); -} - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel sort_morton_codes_bottom_level_single_wg(global struct Globals* globals, global ulong* input, global ulong* output) -{ - uint lid = get_local_id(0); - - DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", globals->numPrimitives)); - - local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD]; - - struct BLSDispatchRecord dispatchRecord; - dispatchRecord.start_offset = 0; - dispatchRecord.count = globals->numPrimitives; - dispatchRecord.keys_in = (ulong*)input; - - //TODO: count or bitonic here? 
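  // Rough cost of the two options at BOTTOM_LEVEL_SORT_THRESHOLD == 512 with
  // SIMD16 (illustrative arithmetic): DO_CountSort has every lane rank its key
  // against all 512 keys (32 blocks of 16 broadcast compares) behind a single
  // work group barrier, while DO_Bitonic runs 1 + 2 + ... + 9 = 45
  // compare-exchange phases, each ending in a work group barrier.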
- //DO_Bitonic(dispatchRecord, SLM_shared, output); - DO_CountSort(dispatchRecord, SLM_shared, output); -} - - - - -// This kernel initializes first context to start up the whole execution -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel sort_morton_codes_msb_begin( - global struct Globals* globals, - global struct VContextScheduler* scheduler, - global ulong* buf0, - global ulong* buf1) -{ - uint lid = get_local_id(0); - uint gid = get_group_id(0); - - DEBUG_CODE(if (lid == 0)printf("running sort_morton_codes_msb_begin\n")); - - scheduler->contexts[gid].count[lid] = 0; - - if (gid == 0 && lid == 0) - { - global struct MSBRadixContext* context = &scheduler->contexts[lid]; - const uint num_prims = globals->numPrimitives; - - scheduler->bls_queue0.num_records = 0; - scheduler->bls_queue1.num_records = 0; - - scheduler->curr_bls_queue = &scheduler->bls_queue1; - scheduler->next_bls_queue = &scheduler->bls_queue0; - - context->start_offset = 0; - context->num_wgs_in_flight = 0; - context->num_keys = num_prims; - context->iteration = 0; - context->keys_in = buf0; - context->keys_out = buf1; - - uint msb_wgs_to_dispatch = (num_prims + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD; - scheduler->msb_queue.records[0].wgs_to_dispatch = msb_wgs_to_dispatch; - - scheduler->num_wgs_msb = msb_wgs_to_dispatch; - scheduler->num_wgs_bls = 0; - scheduler->msb_stack.num_entries = 0; - scheduler->msb_queue.num_records = 1; - } -} - - - - -__attribute__((reqd_work_group_size(MSB_RADIX_NUM_VCONTEXTS, 1, 1))) -kernel void -scheduler(global struct VContextScheduler* scheduler, global ulong* buf0, global ulong* buf1) -{ - uint lid = get_local_id(0); - - DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler\n")); - - uint context_idx = lid; - - const uint num_of_stack_entries = scheduler->msb_stack.num_entries; - - uint msb_wgs_to_dispatch = 0; - if (lid < num_of_stack_entries) - { - struct MSBStackEntry entry = scheduler->msb_stack.entries[(num_of_stack_entries-1) - lid]; - global struct MSBRadixContext* context = &scheduler->contexts[lid]; - context->start_offset = entry.start_offset; - context->num_wgs_in_flight = 0; - context->num_keys = entry.count; - context->iteration = entry.iteration; - context->keys_in = entry.iteration % 2 == 0 ? buf0 : buf1; - context->keys_out = entry.iteration % 2 == 0 ? 
buf1 : buf0; - - msb_wgs_to_dispatch = (entry.count + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD; - scheduler->msb_queue.records[lid].wgs_to_dispatch = msb_wgs_to_dispatch; - } - - msb_wgs_to_dispatch = work_group_reduce_add(msb_wgs_to_dispatch);// TODO: if compiler implementation is slow, then consider to manually write it - - if (lid == 0) - { - // swap queue for next iteration - struct BLSDispatchQueue* tmp = scheduler->curr_bls_queue; - scheduler->curr_bls_queue = scheduler->next_bls_queue; - scheduler->next_bls_queue = tmp; - - scheduler->next_bls_queue->num_records = 0; - - scheduler->num_wgs_bls = scheduler->curr_bls_queue->num_records; - scheduler->num_wgs_msb = msb_wgs_to_dispatch; - - if (num_of_stack_entries < MSB_RADIX_NUM_VCONTEXTS) - { - scheduler->msb_queue.num_records = num_of_stack_entries; - scheduler->msb_stack.num_entries = 0; - } - else - { - scheduler->msb_queue.num_records = MSB_RADIX_NUM_VCONTEXTS; - scheduler->msb_stack.num_entries -= MSB_RADIX_NUM_VCONTEXTS; - } - } - - DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler finished, to spawn %d MSB wgs in %d contexts and %d BLS wgs, MSB records on stack %d\n", - scheduler->num_wgs_msb, scheduler->msb_queue.num_records, scheduler->num_wgs_bls, scheduler->msb_stack.num_entries)); -} - - - - -// this is the lowest sub-task, which should end return sorted codes -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel sort_morton_codes_bottom_level( global struct VContextScheduler* scheduler, global ulong* output) -{ - uint lid = get_local_id(0); - - DEBUG_CODE(if (get_group_id(0) == 0 && lid == 0) printf("running sort_morton_codes_bottom_level\n")); - - local struct BLSDispatchRecord l_dispatchRecord; - if (lid == 0) - { - uint record_idx = get_group_id(0); - l_dispatchRecord = scheduler->curr_bls_queue->records[record_idx]; - //l_dispatchRecord = BLSDispatchQueue_pop((global struct BLSDispatchQueue*)scheduler->curr_bls_queue); - atomic_dec_global(&scheduler->num_wgs_bls); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - struct BLSDispatchRecord dispatchRecord = l_dispatchRecord; - - local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD]; - - // right now use only bitonic sort - // TODO: maybe implement something else - if (1) - { - //DO_Bitonic(dispatchRecord, SLM_shared, output); - DO_CountSort(dispatchRecord, SLM_shared, output); - } -} - - - - -#define MSB_COUNT_WG_SIZE MSB_RADIX_NUM_BINS -#define MSB_COUNT_SG_SIZE 16 - -// count how many elements per buckets we have -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MSB_COUNT_WG_SIZE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MSB_COUNT_SG_SIZE))) -void kernel sort_morton_codes_msb_count_items( global struct VContextScheduler* scheduler) -{ - uint lid = get_local_id(0); - uint lsz = MSB_RADIX_NUM_BINS; - - DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_count_items\n")); - - local uint bucket_count[MSB_RADIX_NUM_BINS]; - local uint finish_count; - bucket_count[lid] = 0; - if (lid == 0) - { - finish_count = 0; - } - - struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler); - - global struct MSBRadixContext* context = dispatchArgs.context; - - global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid; - global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end; - uint shift_bit = dispatchArgs.shift_bit; - uchar shift_byte = shift_bit / 8; // so we count 
how many uchars to shift - barrier(CLK_LOCAL_MEM_FENCE); - - global uchar* ks = (global uchar*)key_start; - ks += shift_byte; - global uchar* ke = (global uchar*)key_end; - ke += shift_byte; - - // double buffering on value loading - if (ks < ke) - { - uchar bucket_id = *ks; - ks += lsz * sizeof(ulong); - - for (global uchar* k = ks; k < ke; k += lsz * sizeof(ulong)) - { - uchar next_bucket_id = *k; - atomic_inc_local(&bucket_count[bucket_id]); - bucket_id = next_bucket_id; - } - - atomic_inc_local(&bucket_count[bucket_id]); - - } - - barrier(CLK_LOCAL_MEM_FENCE); - - //update global counters for context - uint count = bucket_count[lid]; - if (count > 0) - atomic_add_global(&context->count[lid], bucket_count[lid]); - - mem_fence_gpu_invalidate(); - work_group_barrier(0); - - bool final_wg = true; - // count WGs which have reached the end - if (dispatchArgs.num_of_wgs > 1) - { - if (lid == 0) - finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1; - - barrier(CLK_LOCAL_MEM_FENCE); - - final_wg = finish_count == dispatchArgs.num_of_wgs; - } - - local uint partial_dispatches[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE]; - // if this is last wg for current dispatch, update context - if (final_wg) - { - // code below does work_group_scan_exclusive_add(context->count[lid]); - { - uint lane_val = context->count[lid]; - uint sg_result = sub_group_scan_inclusive_add(lane_val); - - partial_dispatches[get_sub_group_id()] = sub_group_broadcast(sg_result, MSB_COUNT_SG_SIZE - 1); - barrier(CLK_LOCAL_MEM_FENCE); - - uint slm_result = sub_group_scan_exclusive_add(partial_dispatches[get_sub_group_local_id()]); - slm_result = sub_group_broadcast(slm_result, get_sub_group_id()); - uint result = slm_result + sg_result - lane_val; - context->start[lid] = result;//work_group_scan_exclusive_add(context->count[lid]); - } - - context->count[lid] = 0; - if(lid == 0) - context->num_wgs_in_flight = 0; - } -} - - - - -// sort elements into appropriate buckets -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) -void kernel sort_morton_codes_msb_bin_items( - global struct VContextScheduler* scheduler, global ulong* input) -{ - uint lid = get_local_id(0); - uint lsz = get_local_size(0); - - DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_bin_items\n")); - - local uint finish_count; - if (lid == 0) - { - finish_count = 0; - } - - struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler); - global struct MSBRadixContext* context = dispatchArgs.context; - - global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid; - global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end; - uint shift_bit = dispatchArgs.shift_bit; - - barrier(CLK_LOCAL_MEM_FENCE); - - global ulong* sorted_keys = (global ulong*)context->keys_out + context->start_offset; - -#if MSB_RADIX_NUM_BINS == MSB_WG_SORT_ELEMENTS_THRESHOLD // special case meaning that we process exactly 1 element per workitem - // here we'll do local counting, then move to global - - local uint slm_counters[MSB_RADIX_NUM_BINS]; - slm_counters[lid] = 0; - - barrier(CLK_LOCAL_MEM_FENCE); - - uint place_in_slm_bucket; - uint bucket_id; - ulong val; - - bool active_lane = key_start < key_end; - - if (active_lane) - { - val = *key_start; - - bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1); - place_in_slm_bucket = atomic_inc_local(&slm_counters[bucket_id]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // override slm_counters 
with global counters - we don't need to override counters with 0 elements since we won't use them anyway - if (slm_counters[lid]) - slm_counters[lid] = atomic_add_global(&context->count[lid], slm_counters[lid]); - - barrier(CLK_LOCAL_MEM_FENCE); - - uint id_in_bucket = slm_counters[bucket_id] + place_in_slm_bucket;//atomic_inc_global(&context->count[bucket_id]); - - if (active_lane) - sorted_keys[context->start[bucket_id] + id_in_bucket] = val; -#else - // double buffering on value loading - if (key_start < key_end) - { - ulong val = *key_start; - key_start += lsz; - - for (global ulong* k = key_start; k < key_end; k += lsz) - { - ulong next_val = *k; - uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1); - uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]); - - //printf("dec: %llu, val: %llX bucket_id: %X", *k, *k, bucket_id); - sorted_keys[context->start[bucket_id] + id_in_bucket] = val; - - val = next_val; - } - - uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1); - uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]); - - sorted_keys[context->start[bucket_id] + id_in_bucket] = val; - } -#endif - - // make sure all groups's "counters" and "starts" are visible to final workgroup - mem_fence_gpu_invalidate(); - work_group_barrier(0); - - bool final_wg = true; - // count WGs which have reached the end - if (dispatchArgs.num_of_wgs > 1) - { - if (lid == 0) - finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1; - - barrier(CLK_LOCAL_MEM_FENCE); - - final_wg = finish_count == dispatchArgs.num_of_wgs; - } - - local uint slm_for_wg_funcs[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE]; - // if this is last wg for current dispatch, then prepare sub-tasks - if (final_wg) - { - DO_Create_Work(scheduler, context, input, slm_for_wg_funcs, 16, MSB_RADIX_NUM_BINS); - - // clear context's counters for future execution - context->count[lid] = 0; - } - -} \ No newline at end of file diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h deleted file mode 100644 index c2ab0d4a2c9..00000000000 --- a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h +++ /dev/null @@ -1,135 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// -// This file contains structure definitions shared by GRL OCL kernels and host code -// - -#pragma once - -#include "GRLGen12.h" - -// NOTE: -// MSB(Most significant byte) - here I refer to it as a part of sorting that does MSB Radix sort, which can spawn additional work -// BLS(Bottom level sort) - here I refer to it as a last part of sorting a particular range(currently Bitonic), which cannot spawn additional work -// - -#define MSB_RADIX_NUM_BINS 256 -#define MSB_BITS_PER_ITERATION 8 // how many bits are sorted per iteration -#define MSB_SHIFT_BYTE_START_OFFSET 56 // start offset for byte shifting, first iteration will start from here - -#define MSB_RADIX_NUM_VCONTEXTS 8 // NOTE: mkulikow: maybe expand/shrink? 
More means more MSB processed in parallel but more memory used - -#define MSB_STACK_ENTRIES_NUM (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS * 7) // first level doesn't get spawned, so 7 iterations must fit here, -// since at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS we need 7 of these - -#define MSB_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS) // one per context - -#define BLS_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS) // each context can spawn MSB_RADIX_NUM_BINS, -// so at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS - -#define MSB_WG_SORT_ELEMENTS_THRESHOLD 256 // This tells us how many elements at max we can process in a single workgroup. - // If a single MSB entry needs more, then it will spawn more WGs - // after updating this also needs to update msb_radix_bitonic_sort.grl's computation of initial workgroups num - -#define BOTTOM_LEVEL_SORT_THRESHOLD 512 // TODO: is 4096 best value? ON skl gives best performance -// Right now we use 256 workitems in simd16 which give us 16 hw threads, assuming 2KB per thread, we have 32KB SLM to play with. -// Since we use ulong(8bytes) we can store 4096 elements -// This also tells us that if number of elements to sort is less than this, we don't need to allocate scheduler -// Need to keep in sync with the GRL const BOTTOM_LEVEL_SORT_THRESHOLD - -#define BOTTOM_LEVEL_SORT_MERGING_THRESHOLD 512 // This is the amount till which we'll merge small BLS'es produced by MSB into a single bigger BLS - -GRL_NAMESPACE_BEGIN(GRL) - - - - -GRL_NAMESPACE_BEGIN(RTAS) -GRL_NAMESPACE_BEGIN(MORTON_MSB_RADIX_BITONIC_SORT) - -struct MSBStackEntry -{ - uint start_offset; - uint count; - uint iteration; -}; - -struct MSBStack -{ - dword num_entries; - struct MSBStackEntry entries[MSB_STACK_ENTRIES_NUM]; -}; - -struct MSBRadixContext -{ - uint start[MSB_RADIX_NUM_BINS]; - uint count[MSB_RADIX_NUM_BINS]; - uint num_wgs_in_flight; // this is used to identify which msb wg is last - uint num_keys; // number of keys to process - uint iteration; - ulong* keys_in; - ulong* keys_out; - - uint start_offset; //offset from the beginning of the buffer -}; - -struct MSBDispatchRecord -{ - uint wgs_to_dispatch; // amount of workgroups to dispatch for this current record -}; - -struct MSBDispatchQueue -{ - dword num_records; - struct MSBDispatchRecord records[MSB_RADIX_NUM_VCONTEXTS]; // each context have its own record -}; - -// BLS(Bottom Level Sort) - last stage of sorting which will not spawn any new tasks -struct BLSDispatchRecord -{ - uint start_offset; // offset from the beginning of the buffer - uint count; - ulong* keys_in; // we don't need keys_out since we will write always to the same output buffer -}; - -struct BLSDispatchQueue -{ - dword num_records; - struct BLSDispatchRecord records[BLS_DISPATCH_QUEUE_NUM_RECORDS]; -}; - -struct VContextScheduler -{ - ///////////////////////////////////////////////////////////// - // State data used for communication with command streamer - // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl' - ///////////////////////////////////////////////////////////// - - dword num_wgs_msb; // number of MSB workgroups being processed by current iteration - dword num_wgs_bls; // number of BLS workgroups being processed by current iteration - - dword scheduler_postsync; - dword _pad1; - - ///////////////////////////////////////////////////////////// - - struct MSBDispatchQueue msb_queue; - struct BLSDispatchQueue 
bls_queue0; - struct BLSDispatchQueue bls_queue1; - - struct BLSDispatchQueue* curr_bls_queue; - struct BLSDispatchQueue* next_bls_queue; - - struct MSBStack msb_stack; - - struct MSBRadixContext contexts[MSB_RADIX_NUM_VCONTEXTS]; -}; - -GRL_NAMESPACE_END(MORTON_MSB_RADIX_BITONIC_SORT) -GRL_NAMESPACE_END(RTAS) -GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.cl b/src/intel/vulkan/grl/gpu/morton_radix_sort.cl deleted file mode 100644 index e123b2f46d3..00000000000 --- a/src/intel/vulkan/grl/gpu/morton_radix_sort.cl +++ /dev/null @@ -1,9 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// just inlines the kernels that are there in the header -#include "morton_radix_sort.h" diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.h b/src/intel/vulkan/grl/gpu/morton_radix_sort.h deleted file mode 100644 index d58ec829883..00000000000 --- a/src/intel/vulkan/grl/gpu/morton_radix_sort.h +++ /dev/null @@ -1,855 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "common.h" -#include "libs/lsc_intrinsics.h" - -/* ============================================================================= */ -/* ============================== LSB RADIX SORT =============================== */ -/* ============================================================================= */ - -#define RADIX_BINS 256 -#define SCATTER_WG_SIZE 512 -#define MORTON_LSB_SORT_NO_SHIFT_THRESHOLD 0xFFFFFFFF // turn off, because current hierarchy build requires full sort - -uint2 get_thread_range( uint numItems, uint numGroups, uint taskID ) -{ - uint items_per_group = (numItems / numGroups); - uint remainder = numItems - (items_per_group * numGroups); - uint startID = taskID * items_per_group + min(taskID, remainder); - uint endID = startID + items_per_group + ((taskID < remainder) ? 
1 : 0); - - return (uint2)(startID,endID); -} - -GRL_INLINE void sort_morton_codes_bin_items_taskID_func(global struct Globals* globals, - global uint* global_histogram, - global uchar* input, - local uint* histogram, - uint iteration, - uint numGroups, - uint numItems, - bool shift_primID, - uint taskID, - uint startID, - uint endID) -{ - const uint shift = globals->shift; - - for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0)) - histogram[i] = 0; - - barrier(CLK_LOCAL_MEM_FENCE); - - if (shift_primID) - { - for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) - { - // Read input as ulong to make bitshift, so the bits representing primID are not being - // taken into account during sorting, which would result in smaller sort loops for - // cases where morton shift are bigger than 8 bits - ulong* ptr_ul = (ulong*)&input[8 * i]; - ulong code = *ptr_ul; - uchar* ptr = (uchar*)&code; - code >>= shift; - - uchar bin = ptr[iteration]; - atomic_inc_local(&histogram[bin]); - } - } - else - { - for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0)) - { - uchar bin = input[8 * i + iteration]; - atomic_inc_local(&histogram[bin]); - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0)) - global_histogram[RADIX_BINS * taskID + i] = histogram[i]; -} - -GRL_INLINE void sort_morton_codes_bin_items_func(global struct Globals* globals, - global uint* global_histogram, - global uint* wg_flags, - global uchar* input, - local uint* histogram, - uint iteration, - uint numGroups, - uint numItems, - bool shift_primID, - bool update_wg_flags) -{ - if (shift_primID) - { - // This check is present in other LSB sort functions as well, its purpose is - // to skip first n iterations where n is the difference between max iterations - // and actually needed iterations to sort without primIDs - const uint req_iterations = globals->sort_iterations; - if (iteration < req_iterations) - return; - - // iteration needs to be adjusted to reflect the skipped cycles - iteration -= req_iterations; - } - - const uint taskID = get_group_id(0); - - if (taskID == 0 && update_wg_flags) - { - for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0)) - wg_flags[i] = 0; - } - - uint2 ids = get_thread_range(numItems, numGroups, taskID); - uint startID = ids.x; - uint endID = ids.y; - - sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, shift_primID, - taskID, startID, endID); -} - -__attribute__((reqd_work_group_size(512, 1, 1))) -void kernel -sort_morton_codes_bin_items( - global struct Globals* globals, - global uint* global_histogram, - global uint* wg_flags, - global uchar* input, - uint iteration, - uint numGroups, - uint update_wg_flags -) -{ - local uint histogram[RADIX_BINS]; - const uint numItems = globals->numPrimitives; - if(numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) - sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, false, update_wg_flags); - else - sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, true, update_wg_flags); -} - - -GRL_INLINE void sort_morton_codes_reduce_bins_func(global struct Globals* globals, - global uint* global_histogram, - local uint* partials, - uint numTasks, - uint iteration, - bool shift_primID) -{ - const uint localID = get_local_id(0); - - if (shift_primID) 
- { - const uint req_iterations = globals->sort_iterations; - if (iteration < req_iterations) - return; - } - - uint t = 0; - for (uint j = 0; j < numTasks; j++) - { - const uint count = load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + localID], 0); - store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + localID], 0, t); - t += count; - } - - // each lane now contains the number of elements in the corresponding bin - // prefix sum this for use in the subsequent scattering pass. - uint global_count = t; - - partials[get_sub_group_id()] = sub_group_reduce_add(global_count); - - barrier(CLK_LOCAL_MEM_FENCE); - - uint lane = get_sub_group_local_id(); - uint p = partials[lane]; - p = (lane < get_sub_group_id()) ? p : 0; - - global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count); - - store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numTasks + localID], 0, global_count); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(256, 1, 1))) -void kernel -sort_morton_codes_reduce_bins(global struct Globals* globals, - uint numTasks, - global uint* global_histogram, - uint iteration) -{ - local uint partials[RADIX_BINS]; - const uint numItems = globals->numPrimitives; - if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) - sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, false); - else - sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, true); -} - - -#if 1 -GRL_INLINE void sort_morton_codes_scatter_items_func( - global struct Globals* globals, - global uint* global_histogram, - global ulong* input, - global ulong* output, - local uint* local_offset, - local uint* flags, - uint iteration, - uint numGroups, - uint numItems, - bool shift_primID, - bool update_morton_sort_in_flight) -{ - const uint gID = get_local_id(0) + get_group_id(0) * get_local_size(0); - - const uint global_shift = globals->shift; - const uint localID = get_local_id(0); - const uint taskID = get_group_id(0); - - if (gID == 0 && update_morton_sort_in_flight) - globals->morton_sort_in_flight = 0; - - uint2 ids = get_thread_range(numItems, numGroups, taskID); - uint startID = ids.x; - uint endID = ids.y; - - if (shift_primID) - { - const uint req_iterations = globals->sort_iterations; - if (iteration < req_iterations) - return; - - iteration -= req_iterations; - } - - const uint shift = 8 * iteration; - - // load the global bin counts, and add each bin's global prefix - // to the local prefix - { - uint global_prefix = 0, local_prefix = 0; - if (localID < RADIX_BINS) - { - local_prefix = global_histogram[RADIX_BINS * taskID + localID]; - global_prefix = global_histogram[RADIX_BINS * numGroups + localID]; - local_offset[localID] = global_prefix + local_prefix; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - - // move elements in WG-sized chunks. 
The elements need to be moved sequentially (can't use atomics) - // because relative order has to be preserved for LSB radix sort to work - - // For each bin, a bit vector indicating which elements are in the bin - for (uint block_base = startID; block_base < endID; block_base += get_local_size(0)) - { - // initialize bit vectors - for (uint i = 4 * localID; i < RADIX_BINS * SCATTER_WG_SIZE / 32; i += 4 * get_local_size(0)) - { - flags[i + 0] = 0; - flags[i + 1] = 0; - flags[i + 2] = 0; - flags[i + 3] = 0; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // read sort key, determine which bin it goes into, scatter into the bit vector - // and pre-load the local offset - uint ID = localID + block_base; - ulong key = 0; - uint bin_offset = 0; - uint bin = 0; - uint bin_word = localID / 32; - uint bin_bit = 1 << (localID % 32); - - if (ID < endID) - { - key = input[ID]; - - if (shift_primID) - bin = ((key >> global_shift) >> shift) & (RADIX_BINS - 1); - else - bin = (key >> shift) & (RADIX_BINS - 1); - - atomic_add_local(&flags[(SCATTER_WG_SIZE / 32) * bin + bin_word], bin_bit); - bin_offset = local_offset[bin]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (ID < endID) - { - // each key reads the bit-vectors for its bin, - // - Computes local prefix sum to determine its output location - // - Computes number of items added to its bin (last thread adjusts bin position) - uint prefix = 0; - uint count = 0; - for (uint i = 0; i < (SCATTER_WG_SIZE / 32); i++) - { - uint bits = flags[(SCATTER_WG_SIZE / 32) * bin + i]; - uint bc = popcount(bits); - uint pc = popcount(bits & (bin_bit - 1)); - prefix += (i < bin_word) ? bc : 0; - prefix += (i == bin_word) ? pc : 0; - - count += bc; - } - - // store the key in its proper place.. - output[prefix + bin_offset] = key; - - // last item for each bin adjusts local offset for next outer loop iteration - if (prefix == count - 1) - local_offset[bin] += count; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - } - - /* uint local_offset[RADIX_BINS]; */ - /* uint offset_global = 0; */ - /* for (int i=0;i> shift) & (RADIX_BINS-1); */ - /* const uint offset = local_offset[bin]; */ - /* output[offset] = input[ID]; */ - /* local_offset[bin]++; */ - /* } */ -} - -#else - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -sort_morton_codes_scatter_items( - global struct Globals* globals, - uint shift, - global uint* global_histogram, - global char* input0, - global char* input1, - unsigned int input0_offset, - unsigned int input1_offset, - uint iteration) -{ - const uint numItems = globals->numPrimitives; - const uint local_size = get_local_size(0); - const uint taskID = get_group_id(0); - const uint numTasks = get_num_groups(0); - const uint localID = get_local_id(0); - const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - - const uint startID = (taskID + 0) * numItems / numTasks; - const uint endID = (taskID + 1) * numItems / numTasks; - - global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset); - global ulong* output = (global ulong*)((iteration % 2) == 0 ? 
input1 + input1_offset : input0 + input0_offset); - - local uint local_offset[RADIX_BINS]; - uint off = 0; - for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) - { - const uint count = global_histogram[RADIX_BINS * numTasks + i]; - const uint offset_task = global_histogram[RADIX_BINS * taskID + i]; - const uint sum = sub_group_reduce_add(count); - const uint prefix_sum = sub_group_scan_exclusive_add(count); - local_offset[i] = off + offset_task + prefix_sum; - off += sum; - } - - for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) - { - const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1); - const uint offset = atomic_add_local(&local_offset[bin], 1); - output[offset] = input[ID]; - } - - /* uint local_offset[RADIX_BINS]; */ - /* uint offset_global = 0; */ - /* for (int i=0;i> shift) & (RADIX_BINS-1); */ - /* const uint offset = local_offset[bin]; */ - /* output[offset] = input[ID]; */ - /* local_offset[bin]++; */ - /* } */ -} -#endif - -#if 1 -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(SCATTER_WG_SIZE, 1, 1))) -void kernel -sort_morton_codes_scatter_items( - global struct Globals *globals, - global uint *global_histogram, - global ulong *input, - global ulong *output, - uint iteration, - uint numGroups, - uint update_morton_sort_in_flight) -{ - local uint local_offset[RADIX_BINS]; - local uint flags[RADIX_BINS*SCATTER_WG_SIZE/32]; - const uint numItems = globals->numPrimitives; - if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) - sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset, - flags, iteration, numGroups, numItems, false, update_morton_sort_in_flight); - else - sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset, - flags, iteration, numGroups, numItems, true, update_morton_sort_in_flight); -} - -#else - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -sort_morton_codes_scatter_items( - global struct Globals *globals, - uint shift, - global uint *global_histogram, - global char *input0, - global char *input1, - unsigned int input0_offset, - unsigned int input1_offset, - uint iteration) -{ - const uint numItems = globals->numPrimitives; - const uint local_size = get_local_size(0); - const uint taskID = get_group_id(0); - const uint numTasks = get_num_groups(0); - const uint localID = get_local_id(0); - const uint globalID = get_local_id(0) + get_group_id(0)*get_local_size(0); - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - - const uint startID = (taskID + 0) * numItems / numTasks; - const uint endID = (taskID + 1) * numItems / numTasks; - - global ulong *input = (global ulong *)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset); - global ulong *output = (global ulong *)((iteration % 2) == 0 ? 
input1 + input1_offset : input0 + input0_offset); - - local uint local_offset[RADIX_BINS]; - uint off = 0; - for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) - { - const uint count = global_histogram[RADIX_BINS * numTasks + i]; - const uint offset_task = global_histogram[RADIX_BINS * taskID + i]; - const uint sum = sub_group_reduce_add(count); - const uint prefix_sum = sub_group_scan_exclusive_add(count); - local_offset[i] = off + offset_task + prefix_sum; - off += sum; - } - - for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) - { - const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1); - const uint offset = atomic_add_local(&local_offset[bin], 1); - output[offset] = input[ID]; - } - - /* uint local_offset[RADIX_BINS]; */ - /* uint offset_global = 0; */ - /* for (int i=0;i> shift) & (RADIX_BINS-1); */ - /* const uint offset = local_offset[bin]; */ - /* output[offset] = input[ID]; */ - /* local_offset[bin]++; */ - /* } */ -} -#endif - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(512, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) -void kernel -sort_morton_codes_merged( - global struct Globals* globals, - global uint* global_histogram, - global uchar* input, - uint iteration, - uint numGroups -) -{ - const uint numItems = globals->numPrimitives; - const uint taskID = get_group_id(0); - const uint loc_id = get_local_id(0); - const uint lane = get_sub_group_local_id(); - - uint2 ids = get_thread_range(numItems, numGroups, taskID); - uint startID = ids.x; - uint endID = ids.y; - - local uint histogram[RADIX_BINS]; - local uint hist_tmp[RADIX_BINS]; - - if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) - { - sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, false, - taskID, startID, endID); - } - else - { - const uint req_iterations = globals->sort_iterations; - if (iteration < req_iterations) - return; - - iteration -= req_iterations; - - sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, true, - taskID, startID, endID); - } - - uint last_group = 0; - if (loc_id == 0) - last_group = atomic_inc_global(&globals->morton_sort_in_flight); - - write_mem_fence(CLK_GLOBAL_MEM_FENCE); - barrier(CLK_LOCAL_MEM_FENCE); - - last_group = work_group_broadcast(last_group, 0); - - bool isLastGroup = (loc_id < RADIX_BINS) && (last_group == numGroups - 1); - - uint global_count = 0; - - if (isLastGroup) - { - for (uint j = 0; j < numGroups; j++) - { - const uint count = (j == taskID) ? histogram[loc_id] : load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + loc_id], 0); - store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + loc_id], 0, global_count); - global_count += count; - } - - hist_tmp[get_sub_group_id()] = (get_sub_group_id() < MAX_HW_SIMD_WIDTH) ? sub_group_reduce_add(global_count) : 0; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (isLastGroup) - { - uint p = hist_tmp[lane]; - p = (lane < get_sub_group_id()) ? 
p : 0; - - global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count); - - store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numGroups + loc_id], 0, global_count); - } -} - -#if 0 -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(16))) void kernel -sort_morton_codes_bin_items( - global struct Globals* globals, - uint shift, - global uint* global_histogram, - global char* input0, - global char* input1, - unsigned int input0_offset, - unsigned int input1_offset, - uint iteration) -{ - const uint numItems = globals->numPrimitives; - const uint local_size = get_local_size(0); - const uint taskID = get_group_id(0); - const uint numTasks = get_num_groups(0); - const uint localID = get_local_id(0); - const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0); - const uint subgroupLocalID = get_sub_group_local_id(); - const uint subgroup_size = get_sub_group_size(); - - const uint startID = (taskID + 0) * numItems / numTasks; - const uint endID = (taskID + 1) * numItems / numTasks; - - global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset); - -#if 1 - local uint histogram[RADIX_BINS]; - for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) - histogram[i] = 0; - - for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) - { - const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1); - atomic_add(&histogram[bin], 1); - } - - for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size) - global_histogram[RADIX_BINS * taskID + i] = histogram[i]; - -#else - uint histogram[RADIX_BINS]; - for (int i = 0; i < RADIX_BINS; i++) - histogram[i] = 0; - - for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size) - { - const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1); - histogram[bin]++; - } - - for (uint i = 0; i < RADIX_BINS; i++) - { - const uint reduced_counter = sub_group_reduce_add(histogram[i]); - global_histogram[RADIX_BINS * taskID + i] = reduced_counter; - } -#endif -} - -#endif - -#define WG_SIZE_WIDE 256 -#define SG_SIZE_SCAN 16 - -// Fast implementation of work_group_scan_exclusive using SLM for WG size 256 and SG size 16 -GRL_INLINE uint work_group_scan_exclusive_add_opt(local uint* tmp, uint val) -{ - const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE_SCAN; - const uint sg_local_id = get_local_id(0) % SG_SIZE_SCAN; - const uint NUM_HW_THREADS_IN_WG = WG_SIZE_WIDE / SG_SIZE_SCAN; - - uint acc = sub_group_scan_exclusive_add(val); - uint acc2 = acc + val; - - tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc2, SG_SIZE_SCAN - 1); - barrier(CLK_LOCAL_MEM_FENCE); - uint loaded_val = tmp[sg_local_id]; - uint wgs_acc = sub_group_scan_exclusive_add(loaded_val); - uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id); - return acc + acc_for_this_hw_thread; -} - -// Wide reduce algorithm is divided into 2 kernels: -// 1. First, partial exclusive add scans are made within each work group using SLM. -// Then, The last work group for each histogram bin perform exclusive add scan along the bins using separate histgram_partials buffer. -// Last work group is determined using global atomics on wg_flags buffer. -// 2. Second kernel globally adds the values from histgram_partials to the histogram buffer where partial sums are. 
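(Illustrative aside, not part of the removed sources: a minimal OpenCL C sketch of the two-kernel wide-scan pattern described here, assuming a flat counts[] array, one block_sums slot per 256-element block, and the OpenCL 2.0 work_group_scan_exclusive_add built-in in place of the hand-rolled subgroup scan above; all names are hypothetical.)

// Kernel 1: exclusive scan within each 256-wide work group; the last lane
// also records the block total so the block sums can be scanned separately.
__attribute__((reqd_work_group_size(256, 1, 1)))
kernel void wide_scan_partial(global uint* counts, global uint* block_sums, uint n)
{
    const uint gid = get_global_id(0);
    const uint val = (gid < n) ? counts[gid] : 0;
    const uint scanned = work_group_scan_exclusive_add(val);
    if (gid < n)
        counts[gid] = scanned;
    if (get_local_id(0) == get_local_size(0) - 1)
        block_sums[get_group_id(0)] = scanned + val;
}

// Kernel 2: after block_sums has itself been exclusive-scanned, add each
// block's scanned base back onto every element of that block (block 0's base is 0).
__attribute__((reqd_work_group_size(256, 1, 1)))
kernel void wide_scan_add(global uint* counts, global const uint* scanned_block_sums, uint n)
{
    const uint gid = get_global_id(0);
    if (gid < n)
        counts[gid] += scanned_block_sums[get_group_id(0)];
}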
-// Then, last work group performs one more work_group scan and add so the histogram buffer values are adjusted with the global ones. -GRL_INLINE void sort_morton_codes_reduce_bins_wide_partial_sum_func( - global struct Globals* globals, - global uint* global_histogram, - global uint* global_histogram_partials, - global uint* wg_flags, - local uint* exclusive_scan_tmp, - uint numTasks, - uint numGroups, - uint iteration, - bool shift_primID) -{ - if (shift_primID) - { - const uint req_iterations = globals->sort_iterations; - if (iteration < req_iterations) - return; - - iteration -= req_iterations; - } - - const uint groupID = get_group_id(0) % RADIX_BINS; - const uint scanGroupID = get_group_id(0) / RADIX_BINS; - uint localID = get_local_id(0); - uint globalID = localID + (scanGroupID * WG_SIZE_WIDE); - const uint lastGroup = (numGroups / WG_SIZE_WIDE); - const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1; - - uint temp = 0; - uint last_count = 0; - if (globalID < numTasks) - { - temp = global_histogram[RADIX_BINS * globalID + groupID]; - - // Store the last value of the work group, it is either last element of histogram or last item in work group - if (globalID == endID) - last_count = temp; - } - - uint val = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp); - - if (globalID <= numTasks) - { - global_histogram[RADIX_BINS * globalID + groupID] = val; - - // Store the block sum value to separate buffer - if (globalID == endID) - global_histogram_partials[scanGroupID * WG_SIZE_WIDE + groupID] = val + last_count; - } - - // Make sure that global_histogram_partials is updated in all work groups - write_mem_fence(CLK_GLOBAL_MEM_FENCE); - barrier(0); - - // Now, wait for the last group for each histogram bin, so we know that - // all work groups already updated the global_histogram_partials buffer - uint last_group = 0; - if (localID == 0) - last_group = atomic_inc_global(&wg_flags[groupID]); - - last_group = work_group_broadcast(last_group, 0); - bool isLastGroup = (last_group == lastGroup - 1); - - // Each of the last groups computes the scan exclusive add for each partial sum we have - if (isLastGroup) - { - uint temp1 = 0; - if (localID < lastGroup) - temp1 = global_histogram_partials[localID * WG_SIZE_WIDE + groupID]; - - uint val2 = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp1); - - if (localID < lastGroup) - global_histogram_partials[localID * WG_SIZE_WIDE + groupID] = val2; - } -} - -GRL_INLINE void sort_morton_codes_reduce_bins_wide_add_reduce_func( - global struct Globals* globals, - global uint* global_histogram, - global uint* global_histogram_partials, - local uint* partials, - uint numTasks, - uint numGroups, - uint iteration, - bool shift_primID) -{ - if (shift_primID) - { - const uint req_iterations = globals->sort_iterations; - if (iteration < req_iterations) - return; - - iteration -= req_iterations; - } - - const uint groupID = get_group_id(0) % RADIX_BINS; - const uint scanGroupID = get_group_id(0) / RADIX_BINS; - const uint lastGroup = (numGroups / WG_SIZE_WIDE); - uint localID = get_local_id(0); - uint globalID = localID + (scanGroupID * WG_SIZE_WIDE); - const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1; - - // Add the global sums to the partials, skip the firsy scanGroupID as the first add - // value is 0 in case of exclusive add scans - if (scanGroupID > 0 && globalID <= numTasks) - { - uint add_val = global_histogram_partials[scanGroupID * RADIX_BINS + groupID]; - 
atomic_add_global(&global_histogram[globalID * RADIX_BINS + groupID], add_val); - } - - // Wait for the last group - uint last_group = 0; - if (localID == 0) - last_group = atomic_inc_global(&globals->morton_sort_in_flight); - - last_group = work_group_broadcast(last_group, 0); - bool isLastGroup = (last_group == numGroups - 1); - - // Do the exclusive scan within all bins with global data now - if (isLastGroup) - { - mem_fence_gpu_invalidate(); - - uint global_count = global_histogram[numTasks * RADIX_BINS + localID]; - - partials[get_sub_group_id()] = sub_group_reduce_add(global_count); - - barrier(CLK_LOCAL_MEM_FENCE); - - uint lane = get_sub_group_local_id(); - uint p = partials[lane]; - p = (lane < get_sub_group_id()) ? p : 0; - - global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count); - - store_uint_L1WB_L3WB(&global_histogram[numTasks * RADIX_BINS + localID], 0, global_count); - } -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN))) -void kernel -sort_morton_codes_reduce_bins_wide_partial_sum( - global struct Globals* globals, - uint numTasks, - uint numGroups, - global uint* global_histogram, - global uint* global_histogram_partials, - global uint* wg_flags, - uint iteration) -{ - local uint exclusive_scan_tmp[WG_SIZE_WIDE / SG_SIZE_SCAN]; - - const uint numItems = globals->numPrimitives; - if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) - sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, false); - else - sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, true); -} - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1))) -__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN))) -void kernel -sort_morton_codes_reduce_bins_wide_add_reduce( - global struct Globals* globals, - uint numTasks, - uint numGroups, - global uint* global_histogram, - global uint* global_histogram_partials, - uint iteration) -{ - local uint partials[RADIX_BINS]; - - const uint numItems = globals->numPrimitives; - if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD) - sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, false); - else - sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, true); -} diff --git a/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl b/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl deleted file mode 100644 index dee315adcda..00000000000 --- a/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl +++ /dev/null @@ -1,297 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module msb_radix_bitonic_sort; - -kernel_module msb_radix_sort ("morton_msb_radix_bitonic_sort.cl") -{ - links lsc_intrinsics; - - kernel opencl_debug_print < kernelFunction="debug_print_kernel">; - kernel opencl_check_bls < kernelFunction="check_bls_sort">; - - kernel opencl_bottom_level_sort_single_wg < kernelFunction="sort_morton_codes_bottom_level_single_wg">; - - kernel opencl_build_morton_kernel_sort_msb_init < kernelFunction="sort_morton_codes_msb_begin">; - - kernel 
opencl_build_morton_kernel_sort_msb_scheduler < kernelFunction="scheduler">; - - kernel opencl_build_morton_kernel_sort_bottom_level < kernelFunction="sort_morton_codes_bottom_level">; - - kernel opencl_build_morton_kernel_sort_msb_count_items < kernelFunction="sort_morton_codes_msb_count_items">; - kernel opencl_build_morton_kernel_sort_msb_bin_items < kernelFunction="sort_morton_codes_msb_bin_items">; - - kernel opencl_build_morton_kernel_sort_batched_bls_dispatch < kernelFunction="sort_morton_codes_batched_BLS_dispatch">; -} - - -const MSB_RADIX_NUM_VCONTEXTS = 8; -const BOTTOM_LEVEL_SORT_THRESHOLD = 512; - -struct MSBRadixScheduler -{ - dword num_wgs_msb; - dword num_wgs_bls; - - dword scheduler_postsync; - dword _pad1; -}; - -struct MSBRadixArgs -{ - qword p_scheduler; - qword p_num_primitives; -}; - - - - -struct BatchedBLSDispatchEntry -{ - qword p_data_buffer; - qword num_elements; // number of elements in p_data_buffer -}; - - - - -metakernel add_bls_dispatch_init(qword p_storage) -{ - define REG_numWgs REG14; - define REG_p_storage REG15; - - REG_numWgs = 0; - REG_p_storage = p_storage; -} - - - - -// basically this code does: -// bls_args_for_dispatches[dispatchID] = { bls_new_pointer, numPrimitives }; -// dispatchId++; -// -metakernel add_bls_dispatch( - qword p_data, - qword p_num_primitives -) -{ - define C_1 REG0; - define C_8 REG1; - - define C_MIN_PRIMREFS REG2; - - define REG_p_data REG3; - define REG_num_prims REG4; - define REG_no_dispatch REG5; - - define REG_numWgs REG14; - define REG_p_storage REG15; - - C_MIN_PRIMREFS = 2; - - REG_num_prims = 0; - REG_num_prims.lo = load_dword(p_num_primitives); - - REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS; - - goto l_finish if(REG_no_dispatch.lo); - - C_1 = 1; - C_8 = 8; - - // pseudocode: BatchedBLSDispatchEntry.p_data_buffer = p_data - REG_p_data = p_data; - store_qword( REG_p_storage, REG_p_data ); // store the data pointer - - REG_p_storage = REG_p_storage + C_8; // point to next member in BatchedBLSDispatchEntry struct - - // pseudocode: BatchedBLSDispatchEntry.num_elements = *p_num_primitives - store_qword( REG_p_storage, REG_num_prims ); - - REG_p_storage = REG_p_storage + C_8; // point to next BatchedBLSDispatchEntry instance - - REG_numWgs = REG_numWgs + C_1; - -l_finish: - -} - - - - -metakernel batched_bls_dispatch( - qword private_mem -) -{ - define REG_numWgs REG14; - - DISPATCHDIM_X = REG_numWgs; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect opencl_build_morton_kernel_sort_batched_bls_dispatch args(private_mem); -} - - - - -metakernel sort_bottom_level( - qword build_globals, - qword input, - qword p_num_primitives) -{ - define REG_num_prims REG0; - define C_MIN_PRIMREFS REG1; - define REG_no_dispatch REG2; - - REG_num_prims = load_dword( p_num_primitives ); - - C_MIN_PRIMREFS = 2; - - REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS; - - goto l_finish if(REG_no_dispatch.lo); - - dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input); - -l_finish: - -} - - - - -metakernel sort( - qword build_globals, - qword input, - qword tmp, - MSBRadixArgs sort_args) -{ - define REG_num_prims REG0; - { - define C_MIN_PRIMREFS REG1; - define C_MAX_PRIMREFS REG2; - define REG_no_dispatch REG3; - define REG_dispatch_single_wg REG4; - - REG_num_prims = load_dword( sort_args.p_num_primitives ); - C_MIN_PRIMREFS = 2; - C_MAX_PRIMREFS = BOTTOM_LEVEL_SORT_THRESHOLD; - - REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS; - REG_dispatch_single_wg = REG_num_prims < C_MAX_PRIMREFS; - - 
goto l_sort_finish if(REG_no_dispatch.lo); - goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo); - goto l_full_sort; - } - -l_dispatch_single_wg: - - { - dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input); - goto l_sort_finish; - } - -l_full_sort: - - define p_scheduler sort_args.p_scheduler; - define p_scheduler_postsync (sort_args.p_scheduler + offsetof(MSBRadixScheduler.scheduler_postsync) ); - define p_num_wgs_bls (sort_args.p_scheduler + offsetof(MSBRadixScheduler.num_wgs_bls) ); - - define REG_scheduler_postsync REG3; - REG_scheduler_postsync = p_scheduler_postsync; - - define C_0 REG4; - define C_8 REG5; - define C_255 REG6; - C_0 = 0; - C_8 = 8; - C_255 = 255; - - store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore - - REG_num_prims = REG_num_prims + C_255; - REG_num_prims = REG_num_prims >> C_8; - - DISPATCHDIM_X = REG_num_prims.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - control( cs_store_fence ); // commit the semaphore write - - // initialize the whole execution - dispatch opencl_build_morton_kernel_sort_msb_init (MSB_RADIX_NUM_VCONTEXTS, 1, 1) args(build_globals, sort_args.p_scheduler, input, tmp) - postsync store_dword( p_scheduler_postsync, 1 ); - - // wait on count_items kernel - semaphore_wait while( *p_scheduler_postsync != 1 ); - - dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler) - postsync store_dword( p_scheduler_postsync, 2 ); - - // wait on count_items kernel - semaphore_wait while( *p_scheduler_postsync != 2 ); - - dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input) - postsync store_dword( p_scheduler_postsync, 0 ); - - define C_MASK_HI REG4; - C_MASK_HI = 0x00000000ffffffff; - - l_build_loop: - { - semaphore_wait while( *p_scheduler_postsync != 0 ); - { - dispatch opencl_build_morton_kernel_sort_msb_scheduler(1,1,1) args( sort_args.p_scheduler, input, tmp ) - postsync store_dword( p_scheduler_postsync, 1 ); - - // wait on scheduler kernel - semaphore_wait while( *p_scheduler_postsync != 1 ); - } - - // load and process the scheduler results - define REG_wg_counts REG0; - define REG_num_msb_wgs REG0.lo; - define REG_num_bls_wgs REG0.hi; - define REG_p_scheduler REG1; - define REG_no_msb_wgs REG2; - { - REG_p_scheduler = p_scheduler; - REG_wg_counts = load_qword( REG_p_scheduler ); - - REG_no_msb_wgs = REG_wg_counts & C_MASK_HI; - REG_no_msb_wgs = REG_no_msb_wgs == 0; - } - - // dispatch new bls WGs - DISPATCHDIM_X = REG_num_bls_wgs; - dispatch_indirect opencl_build_morton_kernel_sort_bottom_level args( p_scheduler, input ); - - // jump out if there are no msb WGs - goto l_sort_finish if (REG_no_msb_wgs); - - DISPATCHDIM_X = REG_num_msb_wgs; - dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler) - postsync store_dword( p_scheduler_postsync, 2 ); - - // wait on count_items kernel - semaphore_wait while( *p_scheduler_postsync != 2 ); - - dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input) - postsync store_dword( p_scheduler_postsync, 0 ); - - // wait till all BLS finished launching - semaphore_wait while( *p_num_wgs_bls != 0 ); - - goto l_build_loop; - } - -l_sort_finish: - -} diff --git a/src/intel/vulkan/grl/gpu/new_sah_builder.grl b/src/intel/vulkan/grl/gpu/new_sah_builder.grl deleted file mode 100644 index d0a9694acc2..00000000000 --- a/src/intel/vulkan/grl/gpu/new_sah_builder.grl +++ /dev/null @@ -1,665 +0,0 @@ -// 
-// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module new_sah_builder; - -kernel_module bfs_kernels ("bvh_build_BFS.cl") -{ - links lsc_intrinsics; - - kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial < kernelFunction="BFS_pass1_initial" > ; - kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed < kernelFunction="BFS_pass1_indexed" > ; - kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial < kernelFunction="BFS_pass2_initial" > ; - kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed < kernelFunction="BFS_pass2_indexed" > ; - - kernel opencl_build_kernel_BinnedSAH_DFS < kernelFunction="DFS" >; - // kernel opencl_build_kernel_BinnedSAH_BuildQNodes < kernelFunction="build_qnodes" >; - kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff < kernelFunction="build_qnodes_pc_kickoff" >; - kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify < kernelFunction="build_qnodes_pc_amplify" >; - kernel opencl_build_kernel_BinnedSAH_begin < kernelFunction = "begin" >; - kernel opencl_build_kernel_BinnedSAH_scheduler < kernelFunction = "scheduler" >; - - kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch < kernelFunction="BFS_pass1_initial_batchable" >; - kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch < kernelFunction="BFS_pass1_indexed_batchable" >; - kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch < kernelFunction="BFS_pass2_initial_batchable" >; - kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch < kernelFunction="BFS_pass2_indexed_batchable" >; - - kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >; - kernel opencl_build_kernel_BinnedSAH_begin_batched < kernelFunction="begin_batchable" >; - - kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched < kernelFunction="build_qnodes_init_scheduler_batched" >; - kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched < kernelFunction="build_qnodes_begin_batchable" >; - kernel opencl_build_kernel_BinnedSAH_qnode_scheduler < kernelFunction="build_qnodes_scheduler" >; - kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch < kernelFunction="build_qnodes_pc_amplify_batched" >; - - kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >; - -} - -kernel opencl_build_kernel_DFS_single_wg < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" > -kernel opencl_build_kernel_DFS_trivial < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial" > -kernel opencl_build_kernel_DFS_single_wg_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" > -kernel opencl_build_kernel_DFS_trivial_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable" > - -kernel single_pass_binsah < source="bvh_build_DFS.cl", kernelFunction="DFS" > - - -const DFS_MIN_PRIMREFS = 6; -const DFS_MAX_PRIMREFS = 256; -const BFS_WG_SIZE_SHIFT = 9; - - - -struct Scheduler -{ - dword num_bfs_wgs; - dword num_dfs_wgs; - - dword scheduler_postsync; - dword _pad1; - - dword num_trivial_builds; - dword num_single_builds; - - dword batched_build_wg_count; - dword batched_build_loop_mask; - -}; - - -struct SAHBuildArgs -{ - qword p_num_primitives; - qword p_qnode_child_buffer; - qword p_scheduler; - qword p_sah_globals; - qword p_globals; - qword p_primref_buffer; - qword p_primref_index_buffers; - qword p_bvh_base; - qword p_bvh2; - qword p_root_buffer_counters; - dword sah_build_flags; 
- dword leaf_size; - dword leaf_type; - dword max_internal_nodes; -}; - - -metakernel single_pass_binsah( - qword build_globals, - qword bvh_buffer, - qword build_primref_buffer, - qword build_primref_index_buffers, - dword alloc_backpointers ) -{ - - dispatch single_pass_binsah(1, 1, 1) args( - build_globals, - bvh_buffer, - build_primref_buffer, - build_primref_index_buffers, - alloc_backpointers - ); - -} - - - -metakernel new_sah_build( SAHBuildArgs build_args ) -{ - define REG_num_prims REG0; - - { - define C_MIN_PRIMREFS REG1; - define C_MAX_PRIMREFS REG2; - define REG_dispatch_trivial REG3; - define REG_dispatch_single_wg REG4; - - REG_num_prims = load_dword( build_args.p_num_primitives ); - C_MIN_PRIMREFS = DFS_MIN_PRIMREFS; - C_MAX_PRIMREFS = DFS_MAX_PRIMREFS; - - REG_dispatch_trivial = REG_num_prims <= C_MIN_PRIMREFS; - REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS; - - goto l_dispatch_trivial if(REG_dispatch_trivial.lo); - goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo); - goto l_full_build; - } - -l_dispatch_trivial: - { - dispatch opencl_build_kernel_DFS_trivial (1,1,1) - args( build_args.p_globals, - build_args.p_bvh_base, - build_args.p_primref_buffer, - build_args.p_primref_index_buffers, - build_args.sah_build_flags - ); - - control( wait_idle ); - goto l_done; - } - -l_dispatch_single_wg: - { - dispatch opencl_build_kernel_DFS_single_wg (1,1,1) - args( build_args.p_globals, - build_args.p_bvh_base, - build_args.p_primref_buffer, - build_args.p_primref_index_buffers, - build_args.sah_build_flags - ); - - control( wait_idle ); - goto l_done; - } - - -l_full_build: - - - { - define p_scheduler build_args.p_scheduler; - define p_num_dfs_wgs build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs); - define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) ); - define C_0 REG1; - define C_8 REG2; - C_8 = 8; - C_0 = 0; - - - // - // Init pass - // - store_dword( p_scheduler_postsync, C_0.lo ); - - // compute number of BFS WGs from prim-count - // NOTE: This code uses a hardcoded WG size of 512 for BFS - // If the BFS wg size ever changes, it needs to be touched - // This is necessary because DG2 shifter only supports POW2 shifts - { - define REG_scheduler_postsync REG3; - define C_511 REG4; - define C_1 REG5; - - REG_scheduler_postsync = p_scheduler_postsync; - C_511 = 511; - C_1 = 1; - - store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore - - REG_num_prims = REG_num_prims + C_511; - REG_num_prims = REG_num_prims >> C_8; - REG_num_prims = REG_num_prims >> C_1; - - DISPATCHDIM_X = REG_num_prims.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - control( cs_store_fence ); // commit the semaphore write - - // launch scheduler init kernel - dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1) - args( - build_args.p_scheduler, - build_args.leaf_size, - build_args.leaf_type, - build_args.p_primref_index_buffers, - build_args.p_primref_buffer, - build_args.p_bvh2, - build_args.p_bvh_base, - build_args.p_globals, - build_args.p_sah_globals, - build_args.p_qnode_child_buffer, - build_args.sah_build_flags - ) - postsync store_dword( p_scheduler_postsync, 1 ); - - // wait on init kernel - semaphore_wait while( *p_scheduler_postsync != 1 ); - - // launch BFS1 pass1 - dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial - args( build_args.p_scheduler, - build_args.p_sah_globals) - postsync store_dword( p_scheduler_postsync, 0 ); - - // wait on BFS pass1 - semaphore_wait while( *p_scheduler_postsync 
!= 0 ); - - // launch BFS pass2 - dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial - args( build_args.p_scheduler, - build_args.p_sah_globals ) - postsync store_dword( p_scheduler_postsync, 1 ); - } - - // after BFS pass 2 we drop into a scheduling loop - - l_build_loop: - { - semaphore_wait while( *p_scheduler_postsync != 1 ); - - { - dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1) - args( build_args.p_scheduler, build_args.p_sah_globals ) - postsync store_dword( p_scheduler_postsync, 0 ); - - // wait on the scheduler - semaphore_wait while( *p_scheduler_postsync != 0 ); - } - - // load and process the scheduler results - define REG_wg_counts REG0; - define REG_num_bfs_wgs REG0.lo; - define REG_num_dfs_wgs REG0.hi; - define REG_loop_break REG1; - define REG_p_scheduler REG2; - { - REG_p_scheduler = p_scheduler; - REG_wg_counts = load_qword( REG_p_scheduler ); - - define C_MASK_LO REG3 ; - C_MASK_LO = 0xffffffff; - - REG_loop_break = REG_wg_counts & C_MASK_LO; - REG_loop_break = REG_loop_break == 0; - } - - // dispatch new DFS WGs - DISPATCHDIM_X = REG_num_dfs_wgs; - dispatch_indirect opencl_build_kernel_BinnedSAH_DFS - args( p_scheduler, - build_args.p_sah_globals ); - - // jump out if there are no bfs WGs - goto l_build_qnodes if (REG_loop_break); - - // dispatch new BFS1 WGs - DISPATCHDIM_X = REG_num_bfs_wgs; - dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed - args( p_scheduler, - build_args.p_sah_globals ) - postsync store_dword( p_scheduler_postsync, 2 ); - - semaphore_wait while( *p_scheduler_postsync != 2 ); - - // dispatch new BFS2 WGs - dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed - args( p_scheduler, - build_args.p_sah_globals ) - postsync store_dword( p_scheduler_postsync, 1 ); - - //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore - - // wait until all upcoming DFS WGs have finished launching - // so that the scheduler can refill the launch array - // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely) - semaphore_wait while( *p_num_dfs_wgs != 0 ); - - - goto l_build_loop; - } - } - -l_build_qnodes: - - control( wait_idle ); - - // P/C qnode build - - dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1) - args( build_args.p_sah_globals, - build_args.p_qnode_child_buffer, - build_args.sah_build_flags ); - - { - define p_pc_counters ( build_args.p_root_buffer_counters ); - - define REG_addr REG0; - define REG_produced REG1; - define REG_consumed REG2; - define REG_have_work REG3; - define REG_wg_count REG4; - define C_8 REG5; - define C_16 REG6; - define C_1 REG7; - C_1 = 1; - C_8 = 8; - C_16 = 16; - REG_addr = build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address - - REG_consumed = 0; - - l_qnode_loop: - - control( wait_idle ); // wait for previous pass - - // load counters and compute number of wgs to respawn - REG_produced = load_qword( REG_addr ); REG_addr = REG_addr + C_8; - REG_wg_count = REG_produced - REG_consumed; - REG_have_work = REG_wg_count > 0; - - goto l_done if not(REG_have_work.lo); - - // save REG_consumed as a starting position in p_qnode_child_buffer - store_qword(REG_addr, REG_consumed); REG_addr = REG_addr + C_8; - - // save REG_produced as ending position in p_qnode_child_buffer - store_qword(REG_addr, REG_produced); REG_addr 
= REG_addr - C_16; - - REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration - - // calculate amount of workgroups to schedule - REG_wg_count = REG_wg_count + C_1; - REG_wg_count = REG_wg_count >> C_1; - - DISPATCHDIM_X = REG_wg_count.lo; - - control( cs_store_fence ); // commit the stores - - dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify - args( build_args.p_sah_globals, - build_args.p_qnode_child_buffer, - build_args.sah_build_flags); - - goto l_qnode_loop; - } - -l_done: -} - - - - - - - - - -struct SAHBuildArgsBatchable -{ - qword p_globals_ptrs; - qword p_scheduler; - qword p_buffers_info; - qword p_sah_globals; - - dword num_max_qnode_global_root_buffer_entries; - dword num_builds; - -}; - - -metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args ) -{ - define p_scheduler build_args.p_scheduler; - define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) ); - define p_num_dfs_wgs (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs)); - - // initialize scheduler semaphore - REG0.lo = 0; - store_dword( p_scheduler_postsync, REG0.lo ); - - - // dispatch categorization pass - dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1) - args( - build_args.p_scheduler, - build_args.p_globals_ptrs, - build_args.p_buffers_info, - build_args.p_sah_globals, - build_args.num_builds - ) - postsync store_dword( p_scheduler_postsync, 1 ); - - // wait on the categorization pass - semaphore_wait while( *p_scheduler_postsync != 1 ); - - - // dispatch the trivial and single-WG passes - { - REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) ); - DISPATCHDIM_X = REG0.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - // dispatch trivial builds - - dispatch_indirect opencl_build_kernel_DFS_trivial_batch - args( build_args.p_sah_globals ); - - control( wait_idle ); - - // dispatch single-wg builds - - DISPATCHDIM_X = REG0.hi; - dispatch_indirect opencl_build_kernel_DFS_single_wg_batch - args( build_args.p_sah_globals, build_args.p_scheduler ); - } - - // compute the number of builds not covered by the trivial passes - // skip the builder loop if all builds are satisfied by trivial passes - { - REG1 = REG0.lo; - REG2 = REG0.hi; - REG3 = build_args.num_builds; - REG5 = REG2 + REG1; - REG5 = REG3 - REG5; - REG4 = REG5 == 0 ; - - goto l_done if (REG4.lo); - } - - // REG5 (number of non-trivial builds) will be used to launch build_qnodes kernel after the build loop - define REG_num_nontrivial REG5; - -l_build_outer_loop: - { - - // configure the scheduler to initiate a new block of builds - - dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1) - args( build_args.p_scheduler, build_args.p_sah_globals ) - postsync store_dword( p_scheduler_postsync, 0 ); - - // wait on init kernel - semaphore_wait while( *p_scheduler_postsync != 0 ); - - - // read results produced by scheduler init kernel - // lo == BFS wg count. 
hi == all ones if we need to loop again - // - REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count); - REG4 = load_qword( REG0 ); - - // launch BFS1 pass1 - DISPATCHDIM_X = REG4.lo; - dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch - args( build_args.p_scheduler, - build_args.p_sah_globals) - postsync store_dword( p_scheduler_postsync, 1 ); - - // wait on BFS pass1 - semaphore_wait while( *p_scheduler_postsync != 1 ); - - // launch BFS pass2 - dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch - args( build_args.p_scheduler, - build_args.p_sah_globals ) - postsync store_dword( p_scheduler_postsync, 0 ); - - l_build_loop: - { - semaphore_wait while( *p_scheduler_postsync != 0 ); - - { - dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1) - args( build_args.p_scheduler, build_args.p_sah_globals ) - postsync store_dword( p_scheduler_postsync, 1 ); - - // wait on the scheduler - semaphore_wait while( *p_scheduler_postsync != 1 ); - } - - // load and process the scheduler results - define REG_wg_counts REG0; - define REG_num_bfs_wgs REG0.lo; - define REG_num_dfs_wgs REG0.hi; - define REG_loop_break REG1; - define REG_p_scheduler REG2; - { - REG_p_scheduler = p_scheduler; - REG_wg_counts = load_qword( REG_p_scheduler ); - - define C_MASK_LO REG3 ; - C_MASK_LO = 0xffffffff; - - REG_loop_break = REG_wg_counts & C_MASK_LO; - REG_loop_break = REG_loop_break == 0; - } - - // dispatch new DFS WGs - DISPATCHDIM_X = REG_num_dfs_wgs; - dispatch_indirect opencl_build_kernel_BinnedSAH_DFS - args( p_scheduler, - build_args.p_sah_globals ); - - // jump out if there are no bfs WGs - goto l_continue_outer_loop if (REG_loop_break); - - // dispatch new BFS1 WGs - DISPATCHDIM_X = REG_num_bfs_wgs; - dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch - args( p_scheduler, - build_args.p_sah_globals ) - postsync store_dword( p_scheduler_postsync, 2 ); - - semaphore_wait while( *p_scheduler_postsync != 2 ); - - // dispatch new BFS2 WGs - dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch - args( p_scheduler, - build_args.p_sah_globals ) - postsync store_dword( p_scheduler_postsync, 0 ); - - //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore - - // wait until all upcoming DFS WGs have finished launching - // so that the scheduler can refill the launch array - // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely) - semaphore_wait while( *p_num_dfs_wgs != 0 ); - - goto l_build_loop; - } - - - l_continue_outer_loop: - - - goto l_build_outer_loop if(REG4.hi); - - } - -//////// -// -// Qnode build phase -// -//////// - - // Wait for all outstanding DFS dispatches to complete, then build the QNodes - control( wait_idle ); - - define REG_wg_counts REG1; - define REG_p_scheduler REG2; - define REG_have_work REG3; - define REG_GRB_NUM_MAX_ENTRIES REG4; - - // init scheduler for qnode phase - dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1) - args( build_args.p_scheduler, - build_args.num_builds, - build_args.num_max_qnode_global_root_buffer_entries); - - REG_p_scheduler = p_scheduler; - - control( wait_idle ); - - REG_wg_counts = load_qword( REG_p_scheduler ); - - DISPATCHDIM_X = REG_wg_counts.lo; - - // configure the scheduler to initiate a new block of builds - dispatch_indirect 
opencl_build_kernel_BinnedSAH_qnode_begin_batched - args( build_args.p_scheduler, - build_args.p_sah_globals); - - // read results produced by init scheduler kernel - // lo == num of builds processed. hi == num of maximum global root buffer entries - // - REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count); - REG5 = load_qword( REG0 ); - - REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi; - REG_GRB_NUM_MAX_ENTRIES.hi = 0; - -l_qnode_loop: - { - control( wait_idle ); // wait for previous pass - - dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1) args( build_args.p_scheduler ); - - control( wait_idle ); - - REG_wg_counts = load_qword( REG_p_scheduler ); - REG_have_work = REG_wg_counts > 0; - - goto l_done if not(REG_have_work.lo); - - DISPATCHDIM_X = REG_wg_counts.lo; - - dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch - args( build_args.p_sah_globals, - build_args.p_scheduler ); - - control( wait_idle ); - - REG_wg_counts = load_qword( REG_p_scheduler ); // reload values - REG_wg_counts.lo = REG_wg_counts.hi; - REG_wg_counts.hi = 0; - - REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES; - - goto l_qnode_loop if not(REG_have_work.lo); - - DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled - - dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched - args( build_args.p_sah_globals, - build_args.p_scheduler ); - - goto l_qnode_loop; - } - -//////// -// -// Old implementation - TODO: maybe add switch between two implementations? -// -//////// - // Wait for all outstanding DFS dispatches to complete, then build the QNodes - //DISPATCHDIM_X = REG5.lo; - - //dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes - // args( build_args.p_sah_globals, build_args.p_scheduler ); - - -l_done: - - control( wait_idle ); - -} diff --git a/src/intel/vulkan/grl/gpu/postbuild_info.grl b/src/intel/vulkan/grl/gpu/postbuild_info.grl deleted file mode 100644 index 3039e533a9b..00000000000 --- a/src/intel/vulkan/grl/gpu/postbuild_info.grl +++ /dev/null @@ -1,49 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module postbuild_info; // In postbuild we assume output data structure to be DXR compatible - -kernel compacted_size < source="bvh_postbuild_info.cl", kernelFunction="compacted_size" > -kernel current_size < source="bvh_postbuild_info.cl", kernelFunction="current_size" > -kernel serialized_size < source="bvh_postbuild_info.cl", kernelFunction="serialized_size" > -kernel decoded_size < source="bvh_postbuild_info.cl", kernelFunction="decoded_size" > - -metakernel compacted_size( - qword bvh, - qword postbuildInfo) -{ - dispatch compacted_size(1,1,1) args( - bvh, - postbuildInfo); -} - -metakernel current_size( - qword bvh, - qword postbuildInfo) -{ - dispatch current_size(1,1,1) args( - bvh, - postbuildInfo); -} - -metakernel serialized_size( - qword bvh, - qword postbuildInfo) -{ - dispatch serialized_size(1,1,1) args( - bvh, - postbuildInfo); -} - -metakernel decoded_size( - qword bvh, - qword postbuildInfo) -{ - dispatch decoded_size(1,1,1) args( - bvh, - postbuildInfo); -} diff --git a/src/intel/vulkan/grl/gpu/presplit.grl b/src/intel/vulkan/grl/gpu/presplit.grl deleted file mode 100644 index d0f6e53fbb1..00000000000 --- a/src/intel/vulkan/grl/gpu/presplit.grl +++ /dev/null @@ -1,62 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module presplit; - -kernel_module presplit_kernels 
("bvh_build_presplit.cl") -{ - links lsc_intrinsics; - - kernel opencl_kernel_compute_num_presplits < kernelFunction="compute_num_presplits" >; - kernel opencl_kernel_priority_sum < kernelFunction="priority_sum" >; - kernel opencl_kernel_perform_presplits < kernelFunction="perform_presplits" >; -} - -import struct MKBuilderState "structs.grl"; -import struct MKSizeEstimate "structs.grl"; - - -metakernel compute_num_presplits( - MKBuilderState state, - qword presplit_buffer, - dword numHwThreads ) -{ - dispatch opencl_kernel_compute_num_presplits ( numHwThreads, 1, 1 ) args( - state.build_globals, - state.bvh_buffer, - state.build_primref_buffer, - presplit_buffer, - state.geomDesc_buffer ); -} - - -metakernel priority_sum( - MKBuilderState state, - MKSizeEstimate estimate, - qword presplit_buffer ) -{ - dispatch opencl_kernel_priority_sum ( 1, 1, 1 ) args( - state.build_globals, - presplit_buffer, - estimate.numPrimitivesToSplit / 2 ); -} - -metakernel perform_presplits( - MKBuilderState state, - MKSizeEstimate estimate, - qword presplit_buffer, - dword numHwThreads ) -{ - dispatch opencl_kernel_perform_presplits ( numHwThreads, 1, 1 ) args( - state.build_globals, - state.bvh_buffer, - state.build_primref_buffer, - presplit_buffer, - state.bvh_buffer, - state.geomDesc_buffer, - estimate.numPrimitivesToSplit / 2 ); -} diff --git a/src/intel/vulkan/grl/gpu/qbvh6.h b/src/intel/vulkan/grl/gpu/qbvh6.h deleted file mode 100644 index 22260d07f41..00000000000 --- a/src/intel/vulkan/grl/gpu/qbvh6.h +++ /dev/null @@ -1,933 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "GRLGen12.h" - -#include "shared.h" -#include "quad.h" - -/* ====== GENERAL BVH config ====== */ - -#define BVH_NODE_N6 6 -#define BVH_NODE_N 8 -#define BVH_NODE_N_LOG 3 - -#define SAH_LOG_BLOCK_SHIFT 2 -#define BVH_LEAF_N_MIN BVH_NODE_N6 -#define BVH_LEAF_N_MAX BVH_NODE_N6 - -#define BVH_NODE_DEFAULT_MASK 0xff -#define BVH_NODE_DEGENERATED_MASK 0x00 - -/* ====== QUANTIZATION config ====== */ - -#define QUANT_BITS 8 -#define QUANT_MIN 0 -#define QUANT_MAX 255 -#define QUANT_MAX_MANT (255.0f / 256.0f) - -#define NO_NODE_OFFSET 0 - -/* ======================================================================= */ -/* ============================== BVH BASE =============================== */ -/* ======================================================================= */ - -GRL_INLINE void setBVHBaseBounds(struct BVHBase *base, struct AABB *aabb) -{ - base->Meta.bounds.lower[0] = aabb->lower.x; - base->Meta.bounds.lower[1] = aabb->lower.y; - base->Meta.bounds.lower[2] = aabb->lower.z; - - base->Meta.bounds.upper[0] = aabb->upper.x; - base->Meta.bounds.upper[1] = aabb->upper.y; - base->Meta.bounds.upper[2] = aabb->upper.z; -} - -GRL_INLINE global struct QBVHNodeN *BVHBase_nodeData(struct BVHBase *bvh) -{ - return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET); -} - -GRL_INLINE global struct QBVHNodeN *BVHBase_rootNode(struct BVHBase *bvh) -{ - return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET); -} - -GRL_INLINE global struct Quad *BVHBase_quadLeaves(struct BVHBase *bvh) -{ - return (global struct Quad *)((void *)bvh + 64 * (ulong)bvh->quadLeafStart); -} - -GRL_INLINE uint64_t BVHBase_numNodes(struct BVHBase *bvh) -{ - return bvh->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64; -} - -GRL_INLINE uint64_t BVHBase_numQuads(struct BVHBase *bvh) -{ - return bvh->quadLeafCur - bvh->quadLeafStart; -} - -GRL_INLINE uint64_t 
BVHBase_numProcedurals(struct BVHBase *bvh) -{ - return bvh->proceduralDataCur - bvh->proceduralDataStart; -} - -GRL_INLINE uint64_t BVHBase_numInstances(struct BVHBase *bvh) -{ - return bvh->instanceLeafEnd - bvh->instanceLeafStart; -} - -/* =================================================================== */ -/* ============================== QBVH =============================== */ -/* =================================================================== */ - -__constant const float ulp = FLT_EPSILON; - -GRL_INLINE struct AABB conservativeAABB(struct AABB *aabb) -{ - struct AABB box; - const float4 v4 = max(fabs(aabb->lower), fabs(aabb->upper)); - const float v = ulp * max(v4.x, max(v4.y, v4.z)); - box.lower = aabb->lower - (float4)v; - box.upper = aabb->upper + (float4)v; - return box; -} - -GRL_INLINE struct AABB3f conservativeAABB3f(struct AABB3f* aabb3d) -{ - struct AABB aabb4d = AABBfromAABB3f(*aabb3d); - struct AABB box = conservativeAABB(&aabb4d); - return AABB3fFromAABB(box); -} - -struct QBVH_AABB -{ - uchar lower_x[BVH_NODE_N6]; - uchar upper_x[BVH_NODE_N6]; - uchar lower_y[BVH_NODE_N6]; - uchar upper_y[BVH_NODE_N6]; - uchar lower_z[BVH_NODE_N6]; - uchar upper_z[BVH_NODE_N6]; -}; - -struct QBVHNodeN -{ - float lower[3]; - int offset; - // 16 bytes - uchar type; - uchar pad; - // 18 bytes - char exp[3]; - uchar instMask; - // 22 bytes - uchar childData[6]; - // 28 bytes - struct QBVH_AABB qbounds; // + 36 bytes - // 64 bytes -}; - -GRL_INLINE uint QBVHNodeN_blockIncr(struct QBVHNodeN *This, uint childID) -{ - return This->childData[childID] & 0x3; -} - -GRL_INLINE uint QBVHNodeN_startPrim(struct QBVHNodeN *This, uint childID) -{ - return (This->childData[childID] >> 2) & 0xF; -} - -GRL_INLINE void initQBVHNodeN(struct QBVHNodeN *qnode) -{ - uint *ptr = (uint *)qnode; - for (uint i = 0; i < 16; i++) - ptr[i] = 0; -} - -GRL_INLINE struct AABB extractAABB_QBVHNodeN(struct QBVHNodeN *qnode, uint i) -{ - struct AABB aabb; - const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f); - const int4 lower_i = (int4)(qnode->qbounds.lower_x[i], qnode->qbounds.lower_y[i], qnode->qbounds.lower_z[i], 0); - const int4 upper_i = (int4)(qnode->qbounds.upper_x[i], qnode->qbounds.upper_y[i], qnode->qbounds.upper_z[i], 0); - const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f); - aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8); - aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8); - return aabb; -} - -GRL_INLINE struct AABB getAABB_QBVHNodeN(struct QBVHNodeN *qnode) -{ - struct AABB aabb; -#if 0 - AABB_init(&aabb); - for (uint i = 0; i < BVH_NODE_N6; i++) - { - struct AABB v = extractAABB_QBVHNodeN(qnode, i); - AABB_extend(&aabb, &v); - } -#else - uint lower_x = qnode->qbounds.lower_x[0]; - uint lower_y = qnode->qbounds.lower_y[0]; - uint lower_z = qnode->qbounds.lower_z[0]; - - uint upper_x = qnode->qbounds.upper_x[0]; - uint upper_y = qnode->qbounds.upper_y[0]; - uint upper_z = qnode->qbounds.upper_z[0]; - - for (uint i = 1; i < BVH_NODE_N6; i++) - { - uint lx = qnode->qbounds.lower_x[i]; - uint ly = qnode->qbounds.lower_y[i]; - uint lz = qnode->qbounds.lower_z[i]; - - uint ux = qnode->qbounds.upper_x[i]; - uint uy = qnode->qbounds.upper_y[i]; - uint uz = qnode->qbounds.upper_z[i]; - - bool valid = lx <= ux; - if (valid) - { - lower_x = min(lower_x, lx); - lower_y = min(lower_y, ly); - lower_z = min(lower_z, lz); - - upper_x = max(upper_x, ux); - upper_y = max(upper_y, uy); - upper_z = 
max(upper_z, uz); - } - } - - const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f); - const int4 lower_i = (int4)(lower_x, lower_y, lower_z, 0); - const int4 upper_i = (int4)(upper_x, upper_y, upper_z, 0); - const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f); - aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8); - aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8); -#endif - return aabb; -} - -GRL_INLINE struct AABB3f InternalNode_getAABB3f(struct InternalNode* node) -{ - return AABB3fFromAABB(getAABB_QBVHNodeN((struct QBVHNodeN*)node)); -} - -GRL_INLINE uint getNumChildren_QBVHNodeN(struct QBVHNodeN *qnode) -{ - uint children = 0; - for (uint i = 0; i < BVH_NODE_N6; i++) - { - uint lx = qnode->qbounds.lower_x[i]; - uint ux = qnode->qbounds.upper_x[i]; - bool valid = lx <= ux; - if (valid) - children++; - } - return children; -} - -GRL_INLINE long extractQBVHNodeN_offset(struct QBVHNodeN *qnode) -{ - return ((long)qnode->offset) << 6; -} - -GRL_INLINE void *QBVHNodeN_childrenPointer(struct QBVHNodeN *qnode) -{ - const int offset = qnode->offset; - return (void *)(qnode + offset); -} - -GRL_INLINE void subgroup_setQBVHNodeN_setFields_reduced_bounds(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, struct AABB reduced_aabb) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - const uint k = subgroupLocalID; - const float up = 1.0f + ulp; - const float down = 1.0f - ulp; - - struct AABB aabb = reduced_aabb; // needs to execute with full subgroup width - aabb = AABB_sub_group_broadcast(&aabb, 0); - - if (subgroupLocalID < BVH_NODE_N6) - { - struct AABB conservative_aabb = conservativeAABB(&aabb); - const float3 len = AABB_size(&conservative_aabb).xyz * up; - int3 exp; - const float3 mant = frexp_vec3(len, &exp); - const float3 org = conservative_aabb.lower.xyz; - - exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); - - qbvh_node->offset = offset; - qbvh_node->type = type; - - qbvh_node->lower[0] = org.x; - qbvh_node->lower[1] = org.y; - qbvh_node->lower[2] = org.z; - - qbvh_node->exp[0] = exp.x; - qbvh_node->exp[1] = exp.y; - qbvh_node->exp[2] = exp.z; - - qbvh_node->instMask = mask; - - uchar3 lower_uchar = (uchar3)(0x80); - uchar3 upper_uchar = (uchar3)(0); - - if (subgroupLocalID < numChildren) - { - struct AABB child_aabb = conservativeAABB(input_aabb); - - float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); - lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); - float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); - upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); - - lower_uchar = convert_uchar3_rtn(lower); - upper_uchar = convert_uchar3_rtp(upper); - - if (degenerated) - { - lower_uchar = upper_uchar = 0; - } - } - - qbvh_node->qbounds.lower_x[k] = lower_uchar.x; - qbvh_node->qbounds.lower_y[k] = lower_uchar.y; - qbvh_node->qbounds.lower_z[k] = lower_uchar.z; - qbvh_node->qbounds.upper_x[k] = upper_uchar.x; - qbvh_node->qbounds.upper_y[k] = upper_uchar.y; - qbvh_node->qbounds.upper_z[k] = upper_uchar.z; - - qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 
2 : 1; - -#if ENABLE_CONVERSION_CHECKS == 1 - - if (!(exp.x >= -128 && exp.x <= 127)) - printf("exp_x error \n"); - if (!(exp.y >= -128 && exp.y <= 127)) - printf("exp_y error \n"); - if (!(exp.z >= -128 && exp.z <= 127)) - printf("exp_z error \n"); - - struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k); - if (!AABB_subset(&child_aabb, &child_qaabb)) - { - uint3 lower_i = convert_uint3(lower_uchar); - uint3 upper_i = convert_uint3(upper_uchar); - - printf("\n ERROR %d\n", k); - printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i); - printf("%i uncompressed \n", k); - AABB_print(&child_aabb); - printf("%i compressed \n", k); - AABB_print(&child_qaabb); - - printf("%i uncompressed (as int) \n", k); - AABB_printasInt(&child_aabb); - printf("%i compressed (as int) \n", k); - AABB_printasInt(&child_qaabb); - - int4 e0 = child_aabb.lower < child_qaabb.lower; - int4 e1 = child_aabb.upper > child_qaabb.upper; - printf("e0 %d e1 %d \n", e0, e1); - } -#endif - } -} - -GRL_INLINE void subgroup_setQBVHNodeN_setFields(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated) -{ - struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); - subgroup_setQBVHNodeN_setFields_reduced_bounds(offset, type, input_aabb, numChildren, mask, qbvh_node, degenerated, aabb); -} - -GRL_INLINE void subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, bool active_lane) -{ - const uint lane = get_sub_group_local_id() % 8; - const uint node_in_sg = get_sub_group_local_id() / 8; - const uint k = lane; - const float up = 1.0f + ulp; - const float down = 1.0f - ulp; - - struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); // needs to execute with full subgroup width - aabb = AABB_sub_group_shuffle(&aabb, node_in_sg * 8); - - if (lane < BVH_NODE_N6 && active_lane) - { - struct AABB conservative_aabb = conservativeAABB(&aabb); - const float3 len = AABB_size(&conservative_aabb).xyz * up; - int3 exp; - const float3 mant = frexp_vec3(len, &exp); - const float3 org = conservative_aabb.lower.xyz; - - exp += (mant > (float3)QUANT_MAX_MANT ? 
(int3)1 : (int3)0); - - qbvh_node->offset = offset; - qbvh_node->type = type; - - qbvh_node->lower[0] = org.x; - qbvh_node->lower[1] = org.y; - qbvh_node->lower[2] = org.z; - - qbvh_node->exp[0] = exp.x; - qbvh_node->exp[1] = exp.y; - qbvh_node->exp[2] = exp.z; - - qbvh_node->instMask = mask; - - uchar3 lower_uchar = (uchar3)(0x80); - uchar3 upper_uchar = (uchar3)(0); - - if (lane < numChildren) - { - struct AABB child_aabb = conservativeAABB(input_aabb); - - float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); - lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); - float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); - upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); - - lower_uchar = convert_uchar3_rtn(lower); - upper_uchar = convert_uchar3_rtp(upper); - - if (degenerated) - { - lower_uchar = upper_uchar = 0; - } - } - - qbvh_node->qbounds.lower_x[k] = lower_uchar.x; - qbvh_node->qbounds.lower_y[k] = lower_uchar.y; - qbvh_node->qbounds.lower_z[k] = lower_uchar.z; - qbvh_node->qbounds.upper_x[k] = upper_uchar.x; - qbvh_node->qbounds.upper_y[k] = upper_uchar.y; - qbvh_node->qbounds.upper_z[k] = upper_uchar.z; - - qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1; - -#if ENABLE_CONVERSION_CHECKS == 1 - - if (!(exp.x >= -128 && exp.x <= 127)) - printf("exp_x error \n"); - if (!(exp.y >= -128 && exp.y <= 127)) - printf("exp_y error \n"); - if (!(exp.z >= -128 && exp.z <= 127)) - printf("exp_z error \n"); - - struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k); - if (!AABB_subset(&child_aabb, &child_qaabb)) - { - uint3 lower_i = convert_uint3(lower_uchar); - uint3 upper_i = convert_uint3(upper_uchar); - - printf("\n ERROR %d\n", k); - printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i); - printf("%i uncompressed \n", k); - AABB_print(&child_aabb); - printf("%i compressed \n", k); - AABB_print(&child_qaabb); - - printf("%i uncompressed (as int) \n", k); - AABB_printasInt(&child_aabb); - printf("%i compressed (as int) \n", k); - AABB_printasInt(&child_qaabb); - - int4 e0 = child_aabb.lower < child_qaabb.lower; - int4 e1 = child_aabb.upper > child_qaabb.upper; - printf("e0 %d e1 %d \n", e0, e1); - } -#endif - } -} - -GRL_INLINE void subgroup_setInstanceQBVHNodeN(const int offset, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node, const uint instMask) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - - // for degenerated (or inactive) instance ignore this box in exp, origin calculation and make its box be a point in the node origin. - // if it becomes non_degenerated on update, tree topology will be equivalent to what it would be if we would account this degenerated node here. 
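/*
 * Minimal scalar sketch (plain C, hypothetical helper names; the 1 +/- ulp
 * padding and the conservativeAABB() step used by the real helpers are
 * omitted) of the 8-bit child-bound encoding performed by the
 * subgroup_setQBVHNodeN_setFields* functions above: each bound is stored as
 * a uchar offset from the node origin, scaled per axis by 2^(8 - exp);
 * lower bounds round toward -inf and upper bounds toward +inf, so the
 * decoded box always contains the original child box.
 */
#include <assert.h>
#include <math.h>

static void quantize_child_axis(float org, int exp, float lo, float hi,
                                unsigned char *qlo, unsigned char *qhi)
{
    float l = floorf(ldexpf(lo - org, -exp + 8)); /* cf. bitShiftLdexp3(x, -exp + 8) */
    float h = ceilf(ldexpf(hi - org, -exp + 8));
    l = fminf(fmaxf(l, 0.0f), 255.0f);            /* clamp to QUANT_MIN..QUANT_MAX */
    h = fminf(fmaxf(h, 0.0f), 255.0f);
    *qlo = (unsigned char)l;
    *qhi = (unsigned char)h;
}

static void dequantize_child_axis(float org, int exp,
                                  unsigned char qlo, unsigned char qhi,
                                  float *lo, float *hi)
{
    *lo = org + ldexpf((float)qlo, exp - 8);      /* cf. extractAABB_QBVHNodeN() */
    *hi = org + ldexpf((float)qhi, exp - 8);
}

int main(void)
{
    unsigned char qlo, qhi;
    float lo, hi;
    /* node origin 1.0, per-axis exponent 2, child interval [1.3, 2.7] */
    quantize_child_axis(1.0f, 2, 1.3f, 2.7f, &qlo, &qhi);   /* -> 19, 109 */
    dequantize_child_axis(1.0f, 2, qlo, qhi, &lo, &hi);     /* -> [1.296875, 2.703125] */
    assert(lo <= 1.3f && hi >= 2.7f);                       /* conservative round trip */
    return 0;
}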
- bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK); - - struct AABB aabb; - AABB_init(&aabb); - - // if every child is degenerated (or inactive) instance, we need to init aabb with origin point - uchar commonMask = sub_group_reduce_or_N6(instMask); - if (subgroupLocalID < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK)) - aabb = *input_aabb; - - subgroup_setQBVHNodeN_setFields(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated); -} - - -// return true if is degenerated -GRL_INLINE bool subgroup_setInstanceBox_2xSIMD8_in_SIMD16(struct AABB* input_aabb, const uint numChildren, uchar* mask, const uint instMask, bool active_lane) -{ - const uint lane = get_sub_group_local_id() % 8; - - // for degenerated (or inactive) instance ignore this box in exp, origin calculation and make its box be a point in the node origin. - // if it becomes non_degenerated on update, tree topology will be equivalent to what it would be if we would account this degenerated node here. - bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK); - - // if every child is degenerated (or inactive) instance, we need to init aabb with origin point - uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask); - if (active_lane) - *mask = commonMask; - - if (active_lane && (degenerated && commonMask != BVH_NODE_DEGENERATED_MASK)) - AABB_init(input_aabb); - - return active_lane ? degenerated : false; -} - -GRL_INLINE void subgroup_setInstanceQBVHNodeN_x2(const int offset, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, const uint instMask, bool active_lane) -{ - const uint lane = get_sub_group_local_id() % 8; - - // for degenerated (or inactive) instance ignore this box in exp, origin calculation and make its box be a point in the node origin. - // if it becomes non_degenerated on update, tree topology will be equivalent to what it would be if we would account this degenerated node here. 
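/*
 * The rule shared by subgroup_setInstanceQBVHNodeN(),
 * subgroup_setInstanceBox_2xSIMD8_in_SIMD16() and
 * subgroup_setInstanceQBVHNodeN_x2(): a child whose instance mask is
 * BVH_NODE_DEGENERATED_MASK (0x00) is left out of the node's origin and
 * exponent computation and collapses to a point, unless every child is
 * degenerated, in which case all boxes are kept so the node still has a
 * defined origin. Sketched below as a scalar predicate (hypothetical
 * helper, plain C).
 */
#include <stdbool.h>

#define DEGENERATED_MASK 0x00  /* BVH_NODE_DEGENERATED_MASK */

static bool child_box_contributes(unsigned char child_mask,
                                  unsigned char common_mask /* OR of all child masks */)
{
    bool degenerated = (child_mask == DEGENERATED_MASK);
    return !degenerated || common_mask == DEGENERATED_MASK;
}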
- bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK); - - struct AABB aabb; - AABB_init(&aabb); - - // if every child is degenerated (or inactive) instance, we need to init aabb with origin point - uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask); - if (lane < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK)) - aabb = *input_aabb; - - subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated, active_lane); -} - - -GRL_INLINE void subgroup_setQBVHNodeN(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, uint mask) -{ - const uint subgroupLocalID = get_sub_group_local_id(); - - struct AABB aabb; - AABB_init(&aabb); - - if (subgroupLocalID < numChildren) - aabb = *input_aabb; - - subgroup_setQBVHNodeN_setFields(offset, type, &aabb, numChildren, mask, qbvh_node, false); -} - - -GRL_INLINE void subgroup_setQBVHNodeN_x2(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, bool active_lane) -{ - const uint lane = get_sub_group_local_id() % 8; - - struct AABB aabb; - AABB_init(&aabb); - - if (lane < numChildren) - aabb = *input_aabb; - - subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, type, &aabb, numChildren, BVH_NODE_DEFAULT_MASK, qbvh_node, false, active_lane); -} - - -GRL_INLINE void subgroup_QBVHNodeN_setBounds( uniform struct QBVHNodeN* qbvh_node, - uniform struct AABB reduced_bounds, - varying struct AABB input_aabb, - uniform uint numChildren, - varying ushort lane ) -{ - const float up = 1.0f + ulp; - const float down = 1.0f - ulp; - - int3 exp; - - struct AABB conservative_aabb = conservativeAABB( &reduced_bounds); - const float3 len = AABB_size( &conservative_aabb ).xyz * up; - const float3 mant = frexp_vec3( len, &exp ); - const float3 org = conservative_aabb.lower.xyz; - - exp += (mant > ( float3 )QUANT_MAX_MANT ? (int3)1 : (int3)0); - - qbvh_node->lower[0] = org.x; - qbvh_node->lower[1] = org.y; - qbvh_node->lower[2] = org.z; - - qbvh_node->exp[0] = exp.x; - qbvh_node->exp[1] = exp.y; - qbvh_node->exp[2] = exp.z; - - qbvh_node->instMask = 0xff; - - uchar3 lower_uchar = 0x80; - uchar3 upper_uchar = 0; - - if ( lane < BVH_NODE_N6 ) - { - ushort k = lane; - if( lane < numChildren ) - { - struct AABB child_aabb = conservativeAABB( &input_aabb ); // conservative ??? 
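/*
 * Conservative quantization: the lower offset is scaled by down = 1 - ulp
 * and rounded toward -inf, the upper offset by up = 1 + ulp and rounded
 * toward +inf (floor/ceil plus convert_uchar3_rtn/_rtp), so the 8-bit box
 * can only grow relative to the exact child AABB, never shrink.
 */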
- - float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) ); - lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) ); - float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) ); - upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) ); - - lower_uchar = convert_uchar3_rtn( lower ); - upper_uchar = convert_uchar3_rtp( upper ); - } - - qbvh_node->qbounds.lower_x[k] = lower_uchar.x; - qbvh_node->qbounds.lower_y[k] = lower_uchar.y; - qbvh_node->qbounds.lower_z[k] = lower_uchar.z; - qbvh_node->qbounds.upper_x[k] = upper_uchar.x; - qbvh_node->qbounds.upper_y[k] = upper_uchar.y; - qbvh_node->qbounds.upper_z[k] = upper_uchar.z; - } - -} - -GRL_INLINE void QBVHNodeN_setBounds(struct QBVHNodeN *qbvh_node, struct AABB *input_aabb, const uint numChildren) -{ - const float up = 1.0f + ulp; - const float down = 1.0f - ulp; - - int3 exp; - struct AABB aabb; - AABB_init(&aabb); - for (uint i = 0; i < numChildren; i++) - AABB_extend(&aabb, &input_aabb[i]); - - struct AABB conservative_aabb = conservativeAABB(&aabb); - const float3 len = AABB_size(&conservative_aabb).xyz * up; - const float3 mant = frexp_vec3(len, &exp); - const float3 org = conservative_aabb.lower.xyz; - - exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0); - - qbvh_node->lower[0] = org.x; - qbvh_node->lower[1] = org.y; - qbvh_node->lower[2] = org.z; - - qbvh_node->exp[0] = exp.x; - qbvh_node->exp[1] = exp.y; - qbvh_node->exp[2] = exp.z; - - qbvh_node->instMask = 0xff; - - for (uint k = 0; k < numChildren; k++) - { - struct AABB child_aabb = conservativeAABB(&input_aabb[k]); // conservative ??? - - float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8)); - lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX)); - float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8)); - upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX)); - - uchar3 lower_uchar = convert_uchar3_rtn(lower); - uchar3 upper_uchar = convert_uchar3_rtp(upper); - - qbvh_node->qbounds.lower_x[k] = lower_uchar.x; - qbvh_node->qbounds.lower_y[k] = lower_uchar.y; - qbvh_node->qbounds.lower_z[k] = lower_uchar.z; - qbvh_node->qbounds.upper_x[k] = upper_uchar.x; - qbvh_node->qbounds.upper_y[k] = upper_uchar.y; - qbvh_node->qbounds.upper_z[k] = upper_uchar.z; - -#if ENABLE_CONVERSION_CHECKS == 1 - if (!(exp.x >= -128 && exp.x <= 127)) - printf("exp_x error \n"); - if (!(exp.y >= -128 && exp.y <= 127)) - printf("exp_y error \n"); - if (!(exp.z >= -128 && exp.z <= 127)) - printf("exp_z error \n"); - - struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k); - if (!AABB_subset(&child_aabb, &child_qaabb)) - { - uint3 lower_i = convert_uint3(lower_uchar); - uint3 upper_i = convert_uint3(upper_uchar); - - printf("\n ERROR %d\n", k); - printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i); - printf("%i uncompressed \n", k); - AABB_print(&child_aabb); - printf("%i compressed \n", k); - AABB_print(&child_qaabb); - - printf("%i uncompressed (as int) \n", k); - AABB_printasInt(&child_aabb); - printf("%i compressed (as int) \n", k); - AABB_printasInt(&child_qaabb); - - int4 e0 = child_aabb.lower < child_qaabb.lower; - int4 e1 = child_aabb.upper > child_qaabb.upper; - printf("e0 %d e1 %d \n", e0, e1); - } -#endif - } - for (uint k = numChildren; k < BVH_NODE_N6; k++) - { - qbvh_node->qbounds.lower_x[k] = 0x80; - qbvh_node->qbounds.lower_y[k] = 0x80; - qbvh_node->qbounds.lower_z[k] = 0x80; - 
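/*
 * lower = 0x80 with upper = 0 leaves lower > upper for the unused slots,
 * which is the condition getNumChildren_QBVHNodeN() and getAABB_QBVHNodeN()
 * test (lower_x <= upper_x) to tell valid children from empty ones.
 */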
qbvh_node->qbounds.upper_x[k] = 0; - qbvh_node->qbounds.upper_y[k] = 0; - qbvh_node->qbounds.upper_z[k] = 0; - } -} - -GRL_INLINE void QBVHNodeN_setChildren(struct QBVHNodeN *qbvh_node, const int offset, const uint numChildren) -{ - qbvh_node->offset = offset; - for (uint k = 0; k < BVH_NODE_N6; k++) - qbvh_node->childData[k] = 1; -} - -GRL_INLINE void QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node) -{ - for (uint k = 0; k < BVH_NODE_N6; k++) - qbvh_node->childData[k] = 1; -} - -GRL_INLINE void SUBGROUP_QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node) -{ - if( get_sub_group_local_id() < BVH_NODE_N6 ) - qbvh_node->childData[get_sub_group_local_id()] = 1; -} - - -GRL_INLINE void QBVHNodeN_setChildIncr2(struct QBVHNodeN *qbvh_node) -{ - for (uint k = 0; k < BVH_NODE_N6; k++) - qbvh_node->childData[k] = 2; -} - -GRL_INLINE void QBVHNodeN_setType(struct QBVHNodeN *qbvh_node, const uint type) -{ - qbvh_node->type = type; -} - -GRL_INLINE void setQBVHNodeN(const int offset, const uint type, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node) -{ - QBVHNodeN_setType(qbvh_node, type); - QBVHNodeN_setChildren(qbvh_node, offset, numChildren); - QBVHNodeN_setBounds(qbvh_node, input_aabb, numChildren); -} - -GRL_INLINE void printQBVHNodeN(struct QBVHNodeN *qnode) -{ - printf(" offset %d type %d \n", qnode->offset, (int)qnode->type); - printf(" lower %f %f %f \n", qnode->lower[0], qnode->lower[1], qnode->lower[2]); - printf(" exp %d %d %d \n", (int)qnode->exp[0], (int)qnode->exp[1], (int)qnode->exp[2]); - printf(" instMask %d \n", qnode->instMask); - - struct AABB aabb0 = extractAABB_QBVHNodeN(qnode, 0); - struct AABB aabb1 = extractAABB_QBVHNodeN(qnode, 1); - struct AABB aabb2 = extractAABB_QBVHNodeN(qnode, 2); - struct AABB aabb3 = extractAABB_QBVHNodeN(qnode, 3); - struct AABB aabb4 = extractAABB_QBVHNodeN(qnode, 4); - struct AABB aabb5 = extractAABB_QBVHNodeN(qnode, 5); - - printf(" lower_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_x[0], qnode->qbounds.lower_x[1], qnode->qbounds.lower_x[2], qnode->qbounds.lower_x[3], qnode->qbounds.lower_x[4], qnode->qbounds.lower_x[5], aabb0.lower.x, aabb1.lower.x, aabb2.lower.x, aabb3.lower.x, aabb4.lower.x, aabb5.lower.x); - printf(" upper_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_x[0], qnode->qbounds.upper_x[1], qnode->qbounds.upper_x[2], qnode->qbounds.upper_x[3], qnode->qbounds.upper_x[4], qnode->qbounds.upper_x[5], aabb0.upper.x, aabb1.upper.x, aabb2.upper.x, aabb3.upper.x, aabb4.upper.x, aabb5.upper.x); - - printf(" lower_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_y[0], qnode->qbounds.lower_y[1], qnode->qbounds.lower_y[2], qnode->qbounds.lower_y[3], qnode->qbounds.lower_y[4], qnode->qbounds.lower_y[5], aabb0.lower.y, aabb1.lower.y, aabb2.lower.y, aabb3.lower.y, aabb4.lower.y, aabb5.lower.y); - printf(" upper_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_y[0], qnode->qbounds.upper_y[1], qnode->qbounds.upper_y[2], qnode->qbounds.upper_y[3], qnode->qbounds.upper_y[4], qnode->qbounds.upper_y[5], aabb0.upper.y, aabb1.upper.y, aabb2.upper.y, aabb3.upper.y, aabb4.upper.y, aabb5.upper.y); - - printf(" lower_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_z[0], qnode->qbounds.lower_z[1], qnode->qbounds.lower_z[2], qnode->qbounds.lower_z[3], qnode->qbounds.lower_z[4], qnode->qbounds.lower_z[5], aabb0.lower.z, aabb1.lower.z, aabb2.lower.z, aabb3.lower.z, aabb4.lower.z, aabb5.lower.z); - printf(" upper_z %d %d %d %d %d %d %f %f %f %f %f %f\n", 
qnode->qbounds.upper_z[0], qnode->qbounds.upper_z[1], qnode->qbounds.upper_z[2], qnode->qbounds.upper_z[3], qnode->qbounds.upper_z[4], qnode->qbounds.upper_z[5], aabb0.upper.z, aabb1.upper.z, aabb2.upper.z, aabb3.upper.z, aabb4.upper.z, aabb5.upper.z); -} - -GRL_INLINE int encodeOffset(global char *bvh_mem, global void *parent, int global_child_offset) -{ - long global_parent_offset = (long)parent - (long)bvh_mem; - global_parent_offset = global_parent_offset & (~(64 - 1)); // FIXME: (sw) this should not be necessary? - int relative_offset = global_child_offset - global_parent_offset; // FIXME: this limits BVH size to 4GB - //if ((int)relative_offset <= 0) printf("relative offset <= 0 %d global_child_offset %d global_parent_offset %d \n", relative_offset,global_child_offset,global_parent_offset); - return relative_offset; -} - -GRL_INLINE void QBVH6Node_set_offset(struct QBVHNodeN *qnode, void *children) -{ - int ofs = (struct QBVHNodeN *)children - qnode; - qnode->offset = ofs; -} - -GRL_INLINE void QBVH6Node_set_type(struct QBVHNodeN *qnode, uint type) -{ - qnode->type = type; -} - -GRL_INLINE uint sortBVHChildrenIDs(uint input) -{ -#if BVH_NODE_N == 8 - return sort8_descending(input); -#else - return sort4_descending(input); -#endif -} - -enum XFM_BOX_OPTION { - XFM_BOX_NO_CLIP = 0, - XFM_BOX_NOT_REFINED_CLIPPED = 1, //<upper); - AABB3f_trim_upper(&child_bounds1, clipBox->upper); - AABB3f_trim_upper(&child_bounds2, clipBox->upper); - AABB3f_trim_upper(&child_bounds3, clipBox->upper); - AABB3f_trim_upper(&child_bounds4, clipBox->upper); - AABB3f_trim_upper(&child_bounds5, clipBox->upper); - } - - child_bounds0 = transform_aabb(child_bounds0, xfm); - child_bounds1 = transform_aabb(child_bounds1, xfm); - child_bounds2 = transform_aabb(child_bounds2, xfm); - child_bounds3 = transform_aabb(child_bounds3, xfm); - child_bounds4 = transform_aabb(child_bounds4, xfm); - child_bounds5 = transform_aabb(child_bounds5, xfm); - - AABB3f_extend(&child_bounds0, &child_bounds1); - AABB3f_extend(&child_bounds2, &child_bounds3); - AABB3f_extend(&child_bounds4, &child_bounds5); - AABB3f_extend(&child_bounds0, &child_bounds2); - AABB3f_extend(&child_bounds0, &child_bounds4); - - return child_bounds0; - } -#endif - -#if DEB_PRINTFS - printf("0"); -#endif - - struct AABB3f child_bounds; - - if (clipOpt != XFM_BOX_NOT_REFINED_TAKE_CLIPBOX) - { - // XFM_BOX_NOT_REFINED_CLIPPED || XFM_BOX_NO_CLIP - child_bounds = InternalNode_getAABB3f(pnode); - if (clipOpt != XFM_BOX_NO_CLIP) - { - AABB3f_intersect(&child_bounds, *clipBox); - } - } - else - { - //XFM_BOX_NOT_REFINED_TAKE_CLIPBOX - child_bounds = *clipBox; - } - - child_bounds = transform_aabb(child_bounds, xfm); - //child_bounds = conservativeAABB3f(&child_bounds); - return child_bounds; -} - -GRL_INLINE AABB3f GRL_OVERLOADABLE compute_xfm_bbox(struct AffineSpace3f xfm, InternalNode* pnode, bool clip, AABB3f* clipBox, float matOverhead) -{ - float transform[12]; - load_row_major_from_AffineSpace3f(xfm, transform); - return compute_xfm_bbox(transform, pnode, clip, clipBox, matOverhead); -} - -GRL_INLINE uint64_t compute_refit_structs_compacted_size(BVHBase* base) -{ - uint dataSize = 0; - - if (BVHBase_HasBackPointers(base)) - { - const uint fatleafEntrySize = (base->fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63; - const uint innerEntrySize = (base->innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63; - - // New atomic update - if(base->quadIndicesDataStart > base->backPointerDataStart) - { - uint numQuads = BVHBase_GetNumQuads(base); - - const uint 
quadTableMainBufferSize = (numQuads + 255) & ~255; - const uint quadLeftoversSize = (base->quadLeftoversCountNewAtomicUpdate + 255) & ~255; - const uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63); - - const uint quadIndicesDataSize = (numQuads * sizeof(QuadDataIndices) + 63) & ~63; - - dataSize += quadTableEntriesSize + quadIndicesDataSize; - } - - dataSize += - ((BVHBase_GetNumInternalNodes(base) * sizeof(uint) + 63) & ~63) - + fatleafEntrySize + innerEntrySize; - } - - return (uint64_t)dataSize; -} - -GRL_INLINE uint64_t compute_compacted_size(BVHBase* base) -{ - uint64_t size = sizeof(BVHBase); - size += BVHBase_GetNumHWInstanceLeaves(base) * sizeof(HwInstanceLeaf); - size += BVHBase_GetNumProcedurals(base) * sizeof(ProceduralLeaf); - size += BVHBase_GetNumQuads(base) * sizeof(QuadLeaf); - size += compute_refit_structs_compacted_size(base); - size += BVHBase_GetNumInternalNodes(base) * sizeof(InternalNode); - size += sizeof(InstanceDesc) * base->Meta.instanceCount; - size += (sizeof(GeoMetaData) * base->Meta.geoCount + 63) & ~63; // align to 64 - size = (size + 63) & ~63; - - return size; -} diff --git a/src/intel/vulkan/grl/gpu/quad.h b/src/intel/vulkan/grl/gpu/quad.h deleted file mode 100644 index cc1b7d470f8..00000000000 --- a/src/intel/vulkan/grl/gpu/quad.h +++ /dev/null @@ -1,127 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "shared.h" -#include "intrinsics.h" -#include "AABB.h" -#include "AABB3f.h" - -// JDB TODO: Use corresponding GRL structures!!! - -struct Quad -{ - unsigned int shaderIndex; // note: also mask - unsigned int geomIndex; // note: also geom flags in upper 2 bits - unsigned int primIndex0; - unsigned int primIndex1Delta; - float v[4][3]; -}; - -GRL_INLINE unsigned int Quad_getGeomIndex(global struct Quad *quad) -{ - return quad->geomIndex; -} - -GRL_INLINE unsigned int Quad_getPrimIndex0(global struct Quad *quad) -{ - return quad->primIndex0; -} - -GRL_INLINE unsigned int Quad_getPrimIndex1(global struct Quad *quad) -{ - return quad->primIndex0 + (quad->primIndex1Delta & 0xFFFF); -} - -GRL_INLINE float3 load_float3(float *p) -{ - return (float3)(p[0], p[1], p[2]); -} - -GRL_INLINE float3 load_perm_float3(float *p, const uint3 perm) -{ - return (float3)(p[perm.x], p[perm.y], p[perm.z]); -} - -GRL_INLINE float2 load_perm_float2(float *p, const uint2 perm) -{ - return (float2)(p[perm.x], p[perm.y]); -} - -GRL_INLINE float load_perm_float(float *p, const uint perm) -{ - return p[perm]; -} - -GRL_INLINE struct AABB getAABB_Quad(struct Quad *q) -{ - struct AABB aabb; - const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3]))); - const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3]))); - aabb.lower = (float4)(lower, 0.0f); - aabb.upper = (float4)(upper, 0.0f); - return aabb; -} - -GRL_INLINE void Quad_ExtendAABB(struct Quad* q, struct AABB* box) -{ - struct AABB aabb; - const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3]))); - const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3]))); - aabb.lower = (float4)(lower, 0.0f); - aabb.upper = (float4)(upper, 0.0f); - AABB_extend(box, &aabb); -} - -GRL_INLINE float4 getCentroid2_Quad(struct Quad *q) -{ - struct AABB aabb 
= getAABB_Quad(q); - return aabb.lower + aabb.upper; -} - -GRL_INLINE void setQuad(struct Quad *quad, const float4 v0, const float4 v1, const float4 v2, const float4 v3, - const uchar j0, const uchar j1, const uchar j2, - const uint geomID, const uint primID0, const uint primID1, const uint geomMask, const uint geomFlags ) -{ - quad->v[0][0] = v0.x; - quad->v[0][1] = v0.y; - quad->v[0][2] = v0.z; - quad->v[1][0] = v1.x; - quad->v[1][1] = v1.y; - quad->v[1][2] = v1.z; - quad->v[2][0] = v2.x; - quad->v[2][1] = v2.y; - quad->v[2][2] = v2.z; - quad->v[3][0] = v3.x; - quad->v[3][1] = v3.y; - quad->v[3][2] = v3.z; - - quad->shaderIndex = (geomMask << 24) | geomID; - quad->geomIndex = geomID | (geomFlags << 30); - quad->primIndex0 = primID0; - const uint delta = primID1 - primID0; - const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4)); - quad->primIndex1Delta = delta | (j << 16) | (1 << 22); // single prim in leaf - -} - -GRL_INLINE void setQuadVertices(struct Quad *quad, const float3 v0, const float3 v1, const float3 v2, const float3 v3) -{ - quad->v[0][0] = v0.x; - quad->v[0][1] = v0.y; - quad->v[0][2] = v0.z; - quad->v[1][0] = v1.x; - quad->v[1][1] = v1.y; - quad->v[1][2] = v1.z; - quad->v[2][0] = v2.x; - quad->v[2][1] = v2.y; - quad->v[2][2] = v2.z; - quad->v[3][0] = v3.x; - quad->v[3][1] = v3.y; - quad->v[3][2] = v3.z; -} diff --git a/src/intel/vulkan/grl/gpu/radix_sort.grl b/src/intel/vulkan/grl/gpu/radix_sort.grl deleted file mode 100644 index df932057a10..00000000000 --- a/src/intel/vulkan/grl/gpu/radix_sort.grl +++ /dev/null @@ -1,163 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module radix_sort; - -kernel_module radix_kernels ("morton_radix_sort.cl") -{ - links lsc_intrinsics; - kernel opencl_build_morton_kernel_sort_bin_items < kernelFunction="sort_morton_codes_bin_items">; - kernel opencl_build_morton_kernel_sort_reduce_bins < kernelFunction="sort_morton_codes_reduce_bins">; - kernel opencl_build_morton_kernel_sort_scatter_items < kernelFunction="sort_morton_codes_scatter_items">; - - kernel opencl_build_morton_codes_sort_merged < kernelFunction="sort_morton_codes_merged">; - - kernel opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum < kernelFunction="sort_morton_codes_reduce_bins_wide_partial_sum">; - kernel opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce < kernelFunction="sort_morton_codes_reduce_bins_wide_add_reduce">; -} - -metakernel sort( - qword build_globals, - dword shift, - qword global_histogram, - qword input0, - qword input1, - dword input0_offset, - dword input1_offset, - dword iteration, - dword threads) -{ - dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args( - build_globals, - shift, - global_histogram, - input0, - input1, - input0_offset, - input1_offset, - iteration); - - control(wait_idle); - - dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args( - threads, - global_histogram); - - control(wait_idle); - - dispatch opencl_build_morton_kernel_sort_scatter_items (threads, 1, 1) args( - build_globals, - shift, - global_histogram, - input0, - input1, - input0_offset, - input1_offset, - iteration); - - control(wait_idle); - -} - -metakernel sort_bin_items( - qword build_globals, - qword global_histogram, - qword wg_flags, - qword input0, - dword iteration, - dword threads, - dword update_wg_flags - ) -{ - dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args( - build_globals, - global_histogram, - wg_flags, - input0, - iteration, 
- threads, - update_wg_flags - ); -} - -metakernel sort_reduce_bins( - qword build_globals, - qword global_histogram, - dword threads, - dword iteration) -{ - dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args( - build_globals, - threads, - global_histogram, - iteration); -} - -metakernel sort_scatter_items( - qword build_globals, - qword global_histogram, - qword input0, - qword input1, - dword iteration, - dword threads, - dword update_morton_sort_in_flight ) -{ - dispatch opencl_build_morton_kernel_sort_scatter_items( threads, 1, 1 ) args( - build_globals, - global_histogram, - input0, - input1, - iteration, - threads, - update_morton_sort_in_flight - ); -} - -metakernel sort_bin_items_merged( - qword build_globals, - qword global_histogram, - qword input0, - dword iteration, - dword threads) -{ - dispatch opencl_build_morton_codes_sort_merged (threads, 1, 1) args( - build_globals, - global_histogram, - input0, - iteration, - threads - ); -} - -metakernel sort_reduce_bins_wide( - qword build_globals, - qword global_histogram, - qword global_histogram_tmp, - qword wg_flags, - dword threads, - dword threads_groups, - dword iteration) -{ - dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum(threads_groups, 1, 1) args( - build_globals, - threads, - threads_groups, - global_histogram, - global_histogram_tmp, - wg_flags, - iteration); - - control(wait_idle); - - dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce(threads_groups, 1, 1) args( - build_globals, - threads, - threads_groups, - global_histogram, - global_histogram_tmp, - iteration); -} diff --git a/src/intel/vulkan/grl/gpu/rebraid.grl b/src/intel/vulkan/grl/gpu/rebraid.grl deleted file mode 100644 index 5aa809637a3..00000000000 --- a/src/intel/vulkan/grl/gpu/rebraid.grl +++ /dev/null @@ -1,167 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module rebraid; - -kernel init_scratch < source="bvh_rebraid.cl", kernelFunction="rebraid_init_scratch" > -kernel chase_instance_ptrs < source="bvh_rebraid.cl", kernelFunction="rebraid_chase_instance_pointers" > -kernel calc_aabb < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances" > -kernel calc_aabb_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_indirect" > -kernel calc_aabb_ptr < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers" > -kernel calc_aabb_ptr_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers_indirect" > -kernel count_splits < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits" > -kernel count_splits_SG < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG" > -kernel count_splits_SG_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG_indirect" > -kernel build_primrefs < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs" > -kernel build_primrefs_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs_indirect" > - -//kernel ISA_TEST < source="bvh_rebraid.cl", kernelFunction="ISA_TEST" > -//kernel DEBUG_PRINT < source="bvh_rebraid.cl", kernelFunction="DEBUG_PRINT" > - - -const PRIMREF_GROUP_SIZE = 256; - -const COUNT_SPLITS_GROUP_SIZE = 16; - -struct MKRebraidArgs -{ - qword bvh_buffer; - qword primref_buffer; - qword global_buffer; - qword instances_buffer; - qword rebraid_scratch; - qword flat_instances_buffer; - dword num_instances; - dword 
num_extra_primrefs; -}; - -metakernel rebraid( - MKRebraidArgs Args - ) -{ - dispatch init_scratch(1,1,1) args( Args.rebraid_scratch ); - dispatch calc_aabb(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer ); - control( wait_idle ); - - //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE); - //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.num_instances ); - - dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch ); - control( wait_idle ); - - define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE); - - dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances ); - control( wait_idle ); - - //dispatch DEBUG_PRINT(1,1,1) args( Args.global_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances ); -} - -metakernel rebraid_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo) -{ - - dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch); - - define num_groups REG0; - num_groups = load_dword(indirectBuildRangeInfo); - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect calc_aabb_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo); - control(wait_idle); - - dispatch_indirect count_splits_SG_indirect - args(Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo); - - define groupsize_1 REG1; // groupsize - 1 - define C_8 REG2; - - groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1 - C_8 = 8; // log_2(PRIMREF_GROUP_SIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE; - DISPATCHDIM_X = num_groups.lo; - - control(wait_idle); - - dispatch_indirect build_primrefs_indirect args( - Args.global_buffer, - Args.bvh_buffer, - Args.instances_buffer, - Args.rebraid_scratch, - Args.primref_buffer, - indirectBuildRangeInfo, - Args.num_extra_primrefs); - control(wait_idle); -} - -metakernel rebraid_ptrs( - MKRebraidArgs Args - ) -{ - dispatch init_scratch(1,1,1) args( Args.rebraid_scratch ); - dispatch chase_instance_ptrs( Args.num_instances, 1, 1) args( Args.instances_buffer, Args.flat_instances_buffer ); - dispatch calc_aabb_ptr(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer ); - control( wait_idle ); - - //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE); - //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch ); - - dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch ); - control( wait_idle ); - - define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE); - - - dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances ); - control( wait_idle ); - -} - -metakernel rebraid_ptrs_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo) -{ - dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch); - - define num_groups 
REG0; - num_groups = load_dword(indirectBuildRangeInfo); - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect chase_instance_ptrs - args(Args.instances_buffer, Args.flat_instances_buffer, indirectBuildRangeInfo); - dispatch_indirect calc_aabb_ptr_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo); - control(wait_idle); - - dispatch_indirect count_splits_SG_indirect - args(Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo); - - define groupsize_1 REG1; // groupsize - 1 - define C_8 REG2; - - groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1 - C_8 = 8; // log_2(PRIMREF_GROUP_SIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE; - DISPATCHDIM_X = num_groups.lo; - - control(wait_idle); - - dispatch_indirect build_primrefs_indirect args( - Args.global_buffer, - Args.bvh_buffer, - Args.flat_instances_buffer, - Args.rebraid_scratch, - Args.primref_buffer, - Args.num_extra_primrefs, - indirectBuildRangeInfo, - Args.num_instances); - control(wait_idle); -} diff --git a/src/intel/vulkan/grl/gpu/shared.h b/src/intel/vulkan/grl/gpu/shared.h deleted file mode 100644 index 0d42d98a1d4..00000000000 --- a/src/intel/vulkan/grl/gpu/shared.h +++ /dev/null @@ -1,182 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "GRLGen12.h" -#pragma once - -#define sizeof_Quad 64 -#define sizeof_Procedural 64 -#define sizeof_PrimRef 32 -#define sizeof_PresplitItem 8 -#define sizeof_HwInstanceLeaf 128 -#define MORTON_BUILDER_SUBTREE_THRESHOLD 256 -#define MORTON_BUILDER_P2_ELEMENTS_IN_SLM 16 * 1024 / 32 -// Temporarily disable localized phase2 due to issues in ELG presi -// This implementation would be replaced with bottom_up + bounding box approach without the need for phase2 refit -#define MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD /*100000*/ 0 - -#define BVH_QUAD_NODE 4 -#define BVH_INSTANCE_NODE 1 -#define BVH_INTERNAL_NODE 0 -#define BVH_PROCEDURAL_NODE 3 -#define BUILDRECORD_STACK_SIZE 48 -#define BINS 16 - -GRL_NAMESPACE_BEGIN(GRL) -GRL_NAMESPACE_BEGIN(RTAS) -GRL_NAMESPACE_BEGIN(GPUBVHBuilder) - -struct AABB -{ - float4 lower; - float4 upper; -}; - -typedef struct BlockAllocator -{ - unsigned int start; - unsigned int cur; -} BlockAllocator; - -struct Globals -{ - struct AABB centroidBounds; - - unsigned int build_record_start; - unsigned int numPrimitives; - unsigned int leafPrimType; - unsigned int leafSize; - - unsigned int numSplittedPrimitives; - unsigned int numBuildRecords; - - // spatial split sate - unsigned int numOriginalPrimitives; - float presplitPrioritySum; - float probThreshold; - - // binned-sah bfs state - unsigned int counter; - unsigned int numBuildRecords_extended; - - // sync variable used for global-sync on work groups - unsigned int sync; - - - /* morton code builder state */ - unsigned int shift; // used by adaptive mc-builder - unsigned int shift_mask; // used by adaptive mc-builder - unsigned int binary_hierarchy_root; - unsigned int p0_allocated_num; - unsigned int p0_created_num; - unsigned int morton_sort_in_flight; - unsigned int sort_iterations; - - gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. 
Stashed here as a debug aid -}; - -struct Range -{ - unsigned int start, end; -}; - -struct Triangle -{ - unsigned int vtx[3]; - //unsigned int primID; - //unsigned int geomID; -}; - -struct MortonCodePrimitive -{ - uint64_t index_code; // 64bit code + index combo -}; - -struct BuildRecord -{ - struct AABB centroidBounds; - unsigned int start, end; - __global void *current; -}; - -struct BinaryMortonCodeHierarchy -{ - struct Range range; - unsigned int leftChild; - unsigned int rightChild; - // unsigned int flag; -}; - -typedef struct MortonFlattenedBoxlessNode { - uint binary_hierarchy_index; // only needed when type != BVH_INTERNAL_NODE - uint childOffset_type; // childOffset : 26, type : 6 - uint backPointer; // same usage as in bvh -} MortonFlattenedBoxlessNode; - -struct StatStackEntry -{ - struct AABB aabb; - unsigned int node; - unsigned int type; - unsigned int depth; - float area; -}; - -struct BuildRecordMorton -{ - unsigned int nodeID; - unsigned int items; - unsigned int current_index; - unsigned int parent_index; -}; - -struct Split -{ - float sah; - int dim; - int pos; -}; - -struct BinMapping -{ - float4 ofs, scale; -}; - -struct BinInfo -{ - struct AABB3f boundsX[BINS]; - struct AABB3f boundsY[BINS]; - struct AABB3f boundsZ[BINS]; - uint3 counts[BINS]; -}; - -struct BinInfo2 -{ - struct AABB3f boundsX[BINS * 2]; - struct AABB3f boundsY[BINS * 2]; - struct AABB3f boundsZ[BINS * 2]; - uint3 counts[BINS * 2]; -}; - -struct GlobalBuildRecord -{ - struct BinInfo2 binInfo; - struct BinMapping binMapping; - struct Split split; - struct Range range; - struct AABB leftCentroid; - struct AABB rightCentroid; - struct AABB leftGeometry; - struct AABB rightGeometry; - unsigned int atomicCountLeft; - unsigned int atomicCountRight; - unsigned int buildRecordID; -}; - -GRL_NAMESPACE_END(GPUBVHBuilder) -GRL_NAMESPACE_END(RTAS) -GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/gpu/structs.grl b/src/intel/vulkan/grl/gpu/structs.grl deleted file mode 100644 index f15b1d2346b..00000000000 --- a/src/intel/vulkan/grl/gpu/structs.grl +++ /dev/null @@ -1,38 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module structs; - -struct MKBuilderState { - qword geomDesc_buffer; - qword build_primref_buffer; - qword build_globals; - qword bvh_buffer; - dword leaf_type; - dword leaf_size; -}; - -struct MKSizeEstimate { - dword numTriangles; - dword numProcedurals; - dword numPrimitives; - dword numMeshes; - dword numBuildPrimitives; - dword numPrimitivesToSplit; - dword instance_descs_start; - dword geo_meta_data_start; - dword node_data_start; - dword leaf_data_start; - dword procedural_data_start; - dword back_pointer_start; - dword sizeTotal; - dword updateScratchSizeTotal; - dword fatleaf_table_start; - dword innernode_table_start; - dword max_fatleaves; - dword quad_indices_data_start; -}; diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.cl b/src/intel/vulkan/grl/gpu/traversal_shader.cl deleted file mode 100644 index ee5d2afcc75..00000000000 --- a/src/intel/vulkan/grl/gpu/traversal_shader.cl +++ /dev/null @@ -1,277 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#include "instance.h" -#include "api_interface.h" - -#include "bvh_build_primref.h" -#include "bvh_build_refit.h" - -/* - Create primrefs from array of instance descriptors. 
- */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel -TS_primrefs_from_instances( - global struct Globals* globals, - global struct BVHBase* bvh, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, - uint numInstances, - global struct AABB* primrefs, - global uchar* pAABBs, - global uchar* pIsProcedural, - dword aabb_stride, - uint allowUpdate - ) -{ - const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; - if (instanceIndex < numInstances) - { - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; - - global struct GRL_RAYTRACING_AABB* procedural_bb = 0; - if ( pIsProcedural[instanceIndex] ) - { - procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); - } - - primrefs_from_instances( - globals, - bvh, - instance, - instanceIndex, - primrefs, - procedural_bb, - allowUpdate); - } -} - -/* - Create primrefs from array of instance descriptors. - */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -void kernel -TS_primrefs_from_instances_indirect( - global struct Globals* globals, - global struct BVHBase* bvh, - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances, - uint numInstances, - global struct AABB* primrefs, - global uchar* pAABBs, - global uchar* pIsProcedural, - dword aabb_stride, - uint allowUpdate, - global struct IndirectBuildRangeInfo* indirect_data - ) -{ - const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; - if (instanceIndex < indirect_data->primitiveCount) - { - instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*) - (((global char*)instances) + indirect_data->primitiveOffset); - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex; - - global struct GRL_RAYTRACING_AABB* procedural_bb = 0; - if ( pIsProcedural[instanceIndex] ) - { - procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); - } - - primrefs_from_instances( - globals, - bvh, - instance, - instanceIndex, - primrefs, - procedural_bb, - allowUpdate); - } -} - -/* - Create primrefs from array of pointers to instance descriptors. 
- */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel -TS_primrefs_from_instances_pointers(global struct Globals* globals, - global struct BVHBase* bvh, - global void* instances_in, - uint numInstances, - global struct AABB* primrefs, - global uchar* pAABBs, - global uchar* pIsProcedural, - dword aabb_stride, - uint allowUpdate - ) -{ - global const struct GRL_RAYTRACING_INSTANCE_DESC** instances = - (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in; - - const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH; - if (instanceIndex < numInstances) - { - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; - - global struct GRL_RAYTRACING_AABB* procedural_bb = 0; - if (pIsProcedural[instanceIndex]) - { - procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); - } - - primrefs_from_instances( - globals, - bvh, - instance, - instanceIndex, - primrefs, - procedural_bb, - allowUpdate); - } -} - -/* - Create primrefs from array of pointers to instance descriptors. - */ - GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) -void kernel -TS_primrefs_from_instances_pointers_indirect(global struct Globals* globals, - global struct BVHBase* bvh, - global void* instances_in, - global struct AABB* primrefs, - global uchar* pAABBs, - global uchar* pIsProcedural, - dword aabb_stride, - uint allowUpdate, - global struct IndirectBuildRangeInfo* indirect_data - ) -{ - const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH; - if (instanceIndex < indirect_data->primitiveCount) - { - instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset; - global const struct GRL_RAYTRACING_INSTANCE_DESC** instances = - (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in; - global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex]; - - global struct GRL_RAYTRACING_AABB* procedural_bb = 0; - if (pIsProcedural[instanceIndex]) - { - procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex); - } - - primrefs_from_instances( - globals, - bvh, - instance, - instanceIndex, - primrefs, - procedural_bb, - allowUpdate); - } -} - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel -TS_update_instance_leaves(global struct BVHBase* bvh, - uint64_t dxrInstancesArray, - uint64_t dxrInstancesPtr, - global struct AABB3f* instance_aabb_scratch, - global uchar* aabbs, - global uchar* is_procedural, - dword aabb_stride -) -{ - uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh); - uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); - if (id >= num_leaves) - return; - - struct HwInstanceLeaf* leaves = BVHBase_GetHWInstanceLeaves(bvh); - uint idx = HwInstanceLeaf_GetInstanceIndex(&leaves[id]); - - global GRL_RAYTRACING_AABB* procedural_box = 0; - if (is_procedural[idx]) - { - procedural_box = (global GRL_RAYTRACING_AABB*)(aabbs + (aabb_stride * idx)); - } - - DO_update_instance_leaves( - bvh, - dxrInstancesArray, - dxrInstancesPtr, - instance_aabb_scratch, - id, - procedural_box); -} - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(16, 1, 1))) -void kernel -TS_fixup_leaves( global struct BVHBase* bvh, - global uchar* primref_index, - global PrimRef* primrefs, - uint 
stride ) - -{ - uint num_inners = BVHBase_GetNumInternalNodes(bvh); - uint id = get_local_id(0) + get_local_size(0) * get_group_id(0); - - // assign 8 lanes to each inner node, 6 of which will do useful work - uint node_id = id / 8; - uint child_id = id % 8; - - bool node_valid = (node_id < num_inners); - - if (node_valid ) - { - global InternalNode* nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh); - global InternalNode* my_node = nodes + node_id; - - if (my_node->nodeType == BVH_INSTANCE_NODE) - { - bool child_valid = (child_id < 6) && InternalNode_IsChildValid(my_node, child_id); - if (child_valid) - { - global HwInstanceLeaf* leaves = (global HwInstanceLeaf*)InternalNode_GetChildren(my_node); - uint leafIndex = (leaves - BVHBase_GetHWInstanceLeaves(bvh)) + child_id; - - const uint primrefID = *(uint*)(primref_index + leafIndex * stride); - - uint type = PRIMREF_isProceduralInstance(&primrefs[primrefID]) ? - BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE; - - InternalNode_SetChildType(my_node, child_id, type); - } - - if (child_id == 0) - my_node->nodeType = BVH_INTERNAL_NODE; - } - } -} - - - - - -GRL_ANNOTATE_IGC_DO_NOT_SPILL -__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel -TS_Refit_per_one_startpoint_sg( - global struct BVHBase* bvh, - global struct AABB3f* instance_leaf_aabbs, - global uchar* procedural_instance_enable_buffer ) -{ - DO_Refit_per_one_startpoint_sg(bvh, (global GRL_RAYTRACING_GEOMETRY_DESC*) bvh, instance_leaf_aabbs, procedural_instance_enable_buffer ); - -} diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.grl b/src/intel/vulkan/grl/gpu/traversal_shader.grl deleted file mode 100644 index 3820996c348..00000000000 --- a/src/intel/vulkan/grl/gpu/traversal_shader.grl +++ /dev/null @@ -1,244 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -module traversal_shader; - -kernel_module morton_kernels ("traversal_shader.cl") -{ - links lsc_intrinsics; - - kernel TS_primrefs_from_instances < kernelFunction = "TS_primrefs_from_instances" >; - kernel TS_primrefs_from_instances_indirect < kernelFunction = "TS_primrefs_from_instances_indirect" >; - kernel TS_primrefs_from_instances_ptrs < kernelFunction = "TS_primrefs_from_instances_pointers" >; - kernel TS_primrefs_from_instances_ptrs_indirect < kernelFunction = "TS_primrefs_from_instances_pointers_indirect" >; - kernel TS_update_instance_leaves < kernelFunction = "TS_update_instance_leaves" >; - kernel TS_Refit_per_one_startpoint_sg < kernelFunction = "TS_Refit_per_one_startpoint_sg" >; - kernel TS_fixup_leaves < kernelFunction = "TS_fixup_leaves" >; -} - -struct MKTSBuildArgs -{ - qword build_globals; - qword bvh_buffer; - qword instance_descs; - qword build_primref_buffer; - qword aabb_buffer; - qword is_procedural_buffer; - qword leaf_creation_index_buffer; - dword aabb_stride; - dword num_instances; - dword leaf_creation_index_stride; -}; - -const BUILD_PRIMREFS_GROUPSIZE = 16; - - -metakernel TS_build_primrefs( MKTSBuildArgs build_state, dword allowUpdate ) -{ - define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE); - dispatch TS_primrefs_from_instances(num_groups, 1, 1) args( - build_state.build_globals, - build_state.bvh_buffer, - build_state.instance_descs, - build_state.num_instances, - build_state.build_primref_buffer, - build_state.aabb_buffer, - build_state.is_procedural_buffer, - build_state.aabb_stride, - allowUpdate - ); - -} - -metakernel 
TS_build_primrefs_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1 - C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect TS_primrefs_from_instances_indirect args( - build_state.build_globals, - build_state.bvh_buffer, - build_state.instance_descs, - build_state.build_primref_buffer, - build_state.aabb_buffer, - build_state.is_procedural_buffer, - build_state.aabb_stride, - allowUpdate, - indirectBuildRangeInfo - ); - -} - -metakernel TS_build_primrefs_array_of_pointers( MKTSBuildArgs build_state, dword allowUpdate ) -{ - define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE); - dispatch TS_primrefs_from_instances_ptrs(num_groups, 1, 1) args( - build_state.build_globals, - build_state.bvh_buffer, - build_state.instance_descs, - build_state.num_instances, - build_state.build_primref_buffer, - build_state.aabb_buffer, - build_state.is_procedural_buffer, - build_state.aabb_stride, - allowUpdate - ); -} - -metakernel -TS_build_primrefs_array_of_pointers_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1 - C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE) - - num_groups = num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect TS_primrefs_from_instances_ptrs_indirect args( - build_state.build_globals, - build_state.bvh_buffer, - build_state.instance_descs, - build_state.build_primref_buffer, - build_state.aabb_buffer, - build_state.is_procedural_buffer, - build_state.aabb_stride, - allowUpdate, - indirectBuildRangeInfo - ); -} - - - - -const UPDATE_INSTANCE_LEAVES_GROUPSIZE = 16; - -struct MKTSUpdateArgs -{ - qword bvh_buffer; - qword instance_descs; - qword instance_descs_ptrs; - qword aabb_buffer; - qword is_procedural_buffer; - qword refit_scratch; - dword aabb_stride; - dword num_instances; -}; - -metakernel TS_update_instance_leaves( MKTSUpdateArgs update_state ) -{ - define num_groups((update_state.num_instances + UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1) / UPDATE_INSTANCE_LEAVES_GROUPSIZE); - dispatch TS_update_instance_leaves(num_groups, 1, 1) args( - update_state.bvh_buffer, - update_state.instance_descs, - update_state.instance_descs_ptrs, - update_state.refit_scratch, - update_state.aabb_buffer, - update_state.is_procedural_buffer, - update_state.aabb_stride - ); -} - -metakernel TS_update_instance_leaves_indirect( MKTSUpdateArgs update_state, qword indirectBuildRangeInfo ) -{ - define num_groups REG0; - define groupsize_1 REG1; // groupsize - 1 - define C_4 REG2; - - // init with primitiveCount - num_groups = load_dword(indirectBuildRangeInfo); - groupsize_1 = 15; // UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1 - C_4 = 4; // log_2(UPDATE_INSTANCE_LEAVES_GROUPSIZE) - - num_groups = 
num_groups + groupsize_1; - num_groups = num_groups >> C_4; // num_groups / UPDATE_INSTANCE_LEAVES_GROUPSIZE; - - DISPATCHDIM_X = num_groups.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - // need to add indirect offset? - dispatch_indirect TS_update_instance_leaves args( - update_state.bvh_buffer, - update_state.instance_descs, - update_state.instance_descs_ptrs, - update_state.refit_scratch, - update_state.aabb_buffer, - update_state.is_procedural_buffer, - update_state.aabb_stride - ); -} - -metakernel TS_refit(MKTSUpdateArgs update_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end ) -{ - REG0 = bvh_inner_nodes_start_value; - REG1.lo = load_dword(bvh_inner_nodes_end); - REG1.hi = 0; - REG2 = REG1 - REG0; - - DISPATCHDIM_X = REG2.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect TS_Refit_per_one_startpoint_sg - args( - update_state.bvh_buffer, - update_state.refit_scratch, - update_state.is_procedural_buffer - ); -} - - -const FIXUP_LEAVES_NODES_PER_GROUP = 2; - -metakernel TS_fixup_leaves(MKTSBuildArgs build_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end ) -{ - define ONE REG3; - - ONE = 1; - REG0 = bvh_inner_nodes_start_value; - REG1.lo = load_dword(bvh_inner_nodes_end); - REG1.hi = 0; - REG2 = REG1 - REG0; - REG2 = REG2 + ONE; - REG2 = REG2 >> ONE; - - DISPATCHDIM_X = REG2.lo; - DISPATCHDIM_Y = 1; - DISPATCHDIM_Z = 1; - - dispatch_indirect TS_fixup_leaves - args( - build_state.bvh_buffer, - build_state.leaf_creation_index_buffer, - build_state.build_primref_buffer, - build_state.leaf_creation_index_stride - ); - -} diff --git a/src/intel/vulkan/grl/grl_cl_kernel_gen.py b/src/intel/vulkan/grl/grl_cl_kernel_gen.py deleted file mode 100644 index 148438e9fa6..00000000000 --- a/src/intel/vulkan/grl/grl_cl_kernel_gen.py +++ /dev/null @@ -1,226 +0,0 @@ -COPYRIGHT = """\ -/* - * Copyright 2021 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ -""" - -import argparse -import os - -from grl_parser import parse_grl_file -from mako.template import Template - -TEMPLATE_H = Template(COPYRIGHT + """ -/* This file generated from ${filename}, don't edit directly. 
*/ - -#ifndef GRL_CL_KERNEL_H -#define GRL_CL_KERNEL_H - -#include "genxml/gen_macros.h" -#include "compiler/brw_kernel.h" - -#ifdef __cplusplus -extern "C" { -#endif - -enum grl_cl_kernel { -% for k in kernels: - GRL_CL_KERNEL_${k.upper()}, -% endfor - GRL_CL_KERNEL_MAX, -}; - -const char *genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel); - -const char *genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id); - -void genX(grl_get_cl_kernel)(struct brw_kernel *kernel, enum grl_cl_kernel id); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* INTEL_GRL_H */ -""") - -TEMPLATE_C = Template(COPYRIGHT + """ -/* This file generated from ${filename}, don't edit directly. */ - -#include "grl_cl_kernel.h" - -% for k in kernels: -#include "${prefix}_${k}.h" -% endfor - -const char * -genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel) -{ - switch (kernel) { -% for k in kernels: - case GRL_CL_KERNEL_${k.upper()}: return "${k}"; -% endfor - default: return "unknown"; - } -} - -const char * -genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id) -{ - switch (id) { -% for k in kernels: - case GRL_CL_KERNEL_${k.upper()}: return ${prefix}_${k}_sha1; -% endfor - default: - unreachable("Invalid GRL kernel enum"); - } -}; - -void -${prefix}_grl_get_cl_kernel(struct brw_kernel *kernel, enum grl_cl_kernel id) -{ - switch (id) { -% for k in kernels: - case GRL_CL_KERNEL_${k.upper()}: - *kernel = ${prefix}_${k}; - break; -% endfor - default: - unreachable("Invalid GRL kernel enum"); - } -} -""") - -def get_libraries_files(kernel_module): - lib_files = [] - for item in kernel_module[3]: - if item[0] != 'library': - continue - default_file = None - fallback_file = None - path_directory = None - for props in item[2]: - if props[0] == 'fallback': - fallback_file = props[1] - elif props[0] == 'default': - default_file = props[1] - elif props[0] == 'path': - path_directory = props[1] - assert path_directory - assert default_file or fallback_file - if fallback_file: - lib_files.append(os.path.join(path_directory, fallback_file)) - else: - lib_files.append(os.path.join(path_directory, default_file)) - return lib_files - -def add_kernels(kernels, cl_file, entrypoint, libs): - assert cl_file.endswith('.cl') - for lib_file in libs: - assert lib_file.endswith('.cl') - kernels.append((cl_file, entrypoint, ','.join(libs))) - -def get_kernels(grl_nodes): - kernels = [] - for item in grl_nodes: - assert isinstance(item, tuple) - if item[0] == 'kernel': - ann = item[2] - add_kernels(kernels, ann['source'], ann['kernelFunction'], []) - elif item[0] == 'kernel-module': - cl_file = item[2] - libfiles = get_libraries_files(item) - for kernel_def in item[3]: - if kernel_def[0] == 'kernel': - ann = kernel_def[2] - add_kernels(kernels, cl_file, ann['kernelFunction'], libfiles) - return kernels - -def parse_libraries(filenames): - libraries = {} - for fname in filenames: - lib_package = parse_grl_file(fname, []) - for lib in lib_package: - assert lib[0] == 'library' - # Add the directory of the library so that CL files can be found. 
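For reference, the two templates above expand into a per-kernel enum plus name and SHA-1 lookup switches. With a hypothetical kernel entry named misc_copy (the combined CL-file/entrypoint name; invented here for illustration), the generated header and C would contain roughly:

enum grl_cl_kernel {
   GRL_CL_KERNEL_MISC_COPY,   /* one enumerant per CL file + entrypoint */
   GRL_CL_KERNEL_MAX,
};

const char *
genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel)
{
   switch (kernel) {
   case GRL_CL_KERNEL_MISC_COPY: return "misc_copy";
   default: return "unknown";
   }
}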
- lib[2].append(('path', os.path.dirname(fname))) - libraries[lib[1]] = lib - return libraries - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--out-c', help='Output C file') - parser.add_argument('--out-h', help='Output H file') - parser.add_argument('--ls-kernels', action='store_const', const=True, - help='List all openCL kernels') - parser.add_argument('--prefix', help='Prefix') - parser.add_argument('--library', dest='libraries', action='append', - default=[], help='Libraries to include') - parser.add_argument('files', type=str, nargs='*', help='GRL files') - args = parser.parse_args() - - libraries = parse_libraries(args.libraries) - - kernels = [] - for fname in args.files: - kernels += get_kernels(parse_grl_file(fname, libraries)) - - # Make the list of kernels unique and sorted - kernels = sorted(list(set(kernels))) - - if args.ls_kernels: - for cl_file, entrypoint, libs in kernels: - if not os.path.isabs(cl_file): - cl_file = os.path.join(os.path.dirname(fname), cl_file) - print('{}:{}:{}'.format(cl_file, entrypoint, libs)) - - kernel_c_names = [] - for cl_file, entrypoint, libs in kernels: - cl_file = os.path.splitext(cl_file)[0] - cl_file_name = cl_file.replace('/', '_') - kernel_c_names.append('_'.join([cl_file_name, entrypoint])) - - try: - if args.out_h: - with open(args.out_h, 'w', encoding='utf-8') as f: - f.write(TEMPLATE_H.render(kernels=kernel_c_names, - filename=os.path.basename(__file__))) - - if args.out_c: - with open(args.out_c, 'w', encoding='utf-8') as f: - f.write(TEMPLATE_C.render(kernels=kernel_c_names, - prefix=args.prefix, - filename=os.path.basename(__file__))) - except Exception: - # In the event there's an error, this imports some helpers from mako - # to print a useful stack trace and prints it, then exits with - # status 1, if python is run with debug; otherwise it just raises - # the exception - if __debug__: - import sys - from mako import exceptions - sys.stderr.write(exceptions.text_error_template().render() + '\n') - sys.exit(1) - raise - -if __name__ == '__main__': - main() diff --git a/src/intel/vulkan/grl/grl_metakernel_gen.py b/src/intel/vulkan/grl/grl_metakernel_gen.py deleted file mode 100644 index 7861b085c62..00000000000 --- a/src/intel/vulkan/grl/grl_metakernel_gen.py +++ /dev/null @@ -1,930 +0,0 @@ -#!/bin/env python -COPYRIGHT = """\ -/* - * Copyright 2021 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ -""" - -import argparse -import os.path -import re -import sys - -from grl_parser import parse_grl_file - -class Writer(object): - def __init__(self, file): - self._file = file - self._indent = 0 - self._new_line = True - - def push_indent(self, levels=4): - self._indent += levels - - def pop_indent(self, levels=4): - self._indent -= levels - - def write(self, s, *fmt): - if self._new_line: - s = '\n' + s - self._new_line = False - if s.endswith('\n'): - self._new_line = True - s = s[:-1] - if fmt: - s = s.format(*fmt) - self._file.write(s.replace('\n', '\n' + ' ' * self._indent)) - -# Internal Representation - -class Value(object): - def __init__(self, name=None, zone=None): - self.name = name - self._zone = zone - self.live = False - - @property - def zone(self): - assert self._zone is not None - return self._zone - - def is_reg(self): - return False - - def c_val(self): - if not self.name: - print(self) - assert self.name - return self.name - - def c_cpu_val(self): - assert self.zone == 'cpu' - return self.c_val() - - def c_gpu_val(self): - if self.zone == 'gpu': - return self.c_val() - else: - return 'mi_imm({})'.format(self.c_cpu_val()) - -class Constant(Value): - def __init__(self, value): - super().__init__(zone='cpu') - self.value = value - - def c_val(self): - if self.value < 100: - return str(self.value) - elif self.value < (1 << 32): - return '0x{:x}u'.format(self.value) - else: - return '0x{:x}ull'.format(self.value) - -class Register(Value): - def __init__(self, name): - super().__init__(name=name, zone='gpu') - - def is_reg(self): - return True - -class FixedGPR(Register): - def __init__(self, num): - super().__init__('REG{}'.format(num)) - self.num = num - - def write_c(self, w): - w.write('UNUSED struct mi_value {} = mi_reserve_gpr(&b, {});\n', - self.name, self.num) - -class GroupSizeRegister(Register): - def __init__(self, comp): - super().__init__('DISPATCHDIM_' + 'XYZ'[comp]) - self.comp = comp - -class Member(Value): - def __init__(self, value, member): - super().__init__(zone=value.zone) - self.value = value - self.member = member - - def is_reg(self): - return self.value.is_reg() - - def c_val(self): - c_val = self.value.c_val() - if self.zone == 'gpu': - assert isinstance(self.value, Register) - if self.member == 'hi': - return 'mi_value_half({}, true)'.format(c_val) - elif self.member == 'lo': - return 'mi_value_half({}, false)'.format(c_val) - else: - assert False, 'Invalid member: {}'.format(self.member) - else: - return '.'.join([c_val, self.member]) - -class OffsetOf(Value): - def __init__(self, mk, expr): - super().__init__(zone='cpu') - assert isinstance(expr, tuple) and expr[0] == 'member' - self.type = mk.m.get_type(expr[1]) - self.field = expr[2] - - def c_val(self): - return 'offsetof({}, {})'.format(self.type.c_name, self.field) - -class Scope(object): - def __init__(self, m, mk, parent): - self.m = m - self.mk = mk - self.parent = parent - self.defs = {} - - def add_def(self, d, name=None): - if name is None: - name = d.name - assert name not in self.defs - self.defs[name] = d - - def get_def(self, name): - if name in self.defs: - return self.defs[name] - assert self.parent, 'Unknown definition: "{}"'.format(name) - return self.parent.get_def(name) - -class Statement(object): - def __init__(self, srcs=[]): - assert isinstance(srcs, (list, tuple)) - self.srcs = list(srcs) - -class SSAStatement(Statement, Value): - _count = 0 - - def __init__(self, zone, srcs): - Statement.__init__(self, srcs) - Value.__init__(self, None, zone) - self.c_name = 
'_tmp{}'.format(SSAStatement._count) - SSAStatement._count += 1 - - def c_val(self): - return self.c_name - - def write_c_refs(self, w): - assert self.zone == 'gpu' - assert self.uses > 0 - if self.uses > 1: - w.write('mi_value_add_refs(&b, {}, {});\n', - self.c_name, self.uses - 1) - -class Half(SSAStatement): - def __init__(self, value, half): - assert half in ('hi', 'lo') - super().__init__(None, [value]) - self.half = half - - @property - def zone(self): - return self.srcs[0].zone - - def write_c(self, w): - assert self.half in ('hi', 'lo') - if self.zone == 'cpu': - if self.half == 'hi': - w.write('uint32_t {} = (uint64_t)({}) >> 32;\n', - self.c_name, self.srcs[0].c_cpu_val()) - else: - w.write('uint32_t {} = {};\n', - self.c_name, self.srcs[0].c_cpu_val()) - else: - if self.half == 'hi': - w.write('struct mi_value {} = mi_value_half({}, true);\n', - self.c_name, self.srcs[0].c_gpu_val()) - else: - w.write('struct mi_value {} = mi_value_half({}, false);\n', - self.c_name, self.srcs[0].c_gpu_val()) - self.write_c_refs(w) - -class Expression(SSAStatement): - def __init__(self, mk, op, *srcs): - super().__init__(None, srcs) - self.op = op - - @property - def zone(self): - zone = 'cpu' - for s in self.srcs: - if s.zone == 'gpu': - zone = 'gpu' - return zone - - def write_c(self, w): - if self.zone == 'cpu': - c_cpu_vals = [s.c_cpu_val() for s in self.srcs] - # There is one bitfield that is a uint64_t, but only holds 2 bits. - # In practice we won't overflow, but let's help the compiler (and - # coverity) out here. - if self.op == '<<': - w.write(f'assume({c_cpu_vals[0]} < (1 << 8));') - w.write('uint64_t {} = ', self.c_name) - if len(self.srcs) == 1: - w.write('({} {})', self.op, c_cpu_vals[0]) - elif len(self.srcs) == 2: - w.write('({} {} {})', c_cpu_vals[0], self.op, c_cpu_vals[1]) - else: - assert len(self.srcs) == 3 and op == '?' - w.write('({} ? 
{} : {})', *c_cpu_vals) - w.write(';\n') - return - - w.write('struct mi_value {} = ', self.c_name) - if self.op == '~': - w.write('mi_inot(&b, {});\n', self.srcs[0].c_gpu_val()) - elif self.op == '+': - w.write('mi_iadd(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val()) - elif self.op == '-': - w.write('mi_isub(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val()) - elif self.op == '&': - w.write('mi_iand(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val()) - elif self.op == '|': - w.write('mi_ior(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val()) - elif self.op == '<<': - if self.srcs[1].zone == 'cpu': - w.write('mi_ishl_imm(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val()) - else: - w.write('mi_ishl(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val()) - elif self.op == '>>': - if self.srcs[1].zone == 'cpu': - w.write('mi_ushr_imm(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val()) - else: - w.write('mi_ushr(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val()) - elif self.op == '==': - w.write('mi_ieq(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val()) - elif self.op == '<': - w.write('mi_ult(&b, {}, {});\n', - self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val()) - elif self.op == '>': - w.write('mi_ult(&b, {}, {});\n', - self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val()) - elif self.op == '<=': - w.write('mi_uge(&b, {}, {});\n', - self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val()) - else: - assert False, 'Unknown expression opcode: {}'.format(self.op) - self.write_c_refs(w) - -class StoreReg(Statement): - def __init__(self, mk, reg, value): - super().__init__([mk.load_value(value)]) - self.reg = mk.parse_value(reg) - assert self.reg.is_reg() - - def write_c(self, w): - value = self.srcs[0] - w.write('mi_store(&b, {}, {});\n', - self.reg.c_gpu_val(), value.c_gpu_val()) - -class LoadMem(SSAStatement): - def __init__(self, mk, bit_size, addr): - super().__init__('gpu', [mk.load_value(addr)]) - self.bit_size = bit_size - - def write_c(self, w): - addr = self.srcs[0] - w.write('struct mi_value {} = ', self.c_name) - if addr.zone == 'cpu': - w.write('mi_mem{}(anv_address_from_u64({}));\n', - self.bit_size, addr.c_cpu_val()) - else: - assert self.bit_size == 64 - w.write('mi_load_mem64_offset(&b, anv_address_from_u64(0), {});\n', - addr.c_gpu_val()) - self.write_c_refs(w) - -class StoreMem(Statement): - def __init__(self, mk, bit_size, addr, src): - super().__init__([mk.load_value(addr), mk.load_value(src)]) - self.bit_size = bit_size - - def write_c(self, w): - addr, data = tuple(self.srcs) - if addr.zone == 'cpu': - w.write('mi_store(&b, mi_mem{}(anv_address_from_u64({})), {});\n', - self.bit_size, addr.c_cpu_val(), data.c_gpu_val()) - else: - assert self.bit_size == 64 - w.write('mi_store_mem64_offset(&b, anv_address_from_u64(0), {}, {});\n', - addr.c_gpu_val(), data.c_gpu_val()) - -class GoTo(Statement): - def __init__(self, mk, target_id, cond=None, invert=False): - cond = [mk.load_value(cond)] if cond is not None else [] - super().__init__(cond) - self.target_id = target_id - self.invert = invert - self.mk = mk - - def write_c(self, w): - # Now that we've parsed the entire metakernel, we can look up the - # actual target from the id - target = self.mk.get_goto_target(self.target_id) - - if self.srcs: - cond = self.srcs[0] - if self.invert: - w.write('mi_goto_if(&b, mi_inot(&b, {}), &{});\n', cond.c_gpu_val(), 
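The zone split above is what drives code generation: 'cpu' operands stay plain C integers evaluated when the command buffer is recorded, while 'gpu' operands become mi_value registers, and each GRL operator is lowered to the matching mi_builder helper (a shift by a constant becomes mi_ushr_imm/mi_ishl_imm, a shift by a register becomes mi_ushr/mi_ishl, and so on). A hand-written sketch of the C this would emit for the GRL statement REG0 = REG0 >> 4, pieced together from the generator strings above (not verbatim generator output):

struct mi_value REG0 = mi_reserve_gpr(&b, 0);     /* FixedGPR.write_c */
struct mi_value _tmp0 = mi_ushr_imm(&b, REG0, 4); /* Expression '>>' with a cpu-zone shift count */
mi_store(&b, REG0, _tmp0);                        /* StoreReg.write_c */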
target.c_name) - else: - w.write('mi_goto_if(&b, {}, &{});\n', cond.c_gpu_val(), target.c_name) - else: - w.write('mi_goto(&b, &{});\n', target.c_name) - -class GoToTarget(Statement): - def __init__(self, mk, name): - super().__init__() - self.name = name - self.c_name = '_goto_target_' + name - self.goto_tokens = [] - - mk = mk.add_goto_target(self) - - def write_decl(self, w): - w.write('struct mi_goto_target {} = MI_GOTO_TARGET_INIT;\n', - self.c_name) - - def write_c(self, w): - w.write('mi_goto_target(&b, &{});\n', self.c_name) - -class Dispatch(Statement): - def __init__(self, mk, kernel, group_size, args, postsync): - if group_size is None: - srcs = [mk.scope.get_def('DISPATCHDIM_{}'.format(d)) for d in 'XYZ'] - else: - srcs = [mk.load_value(s) for s in group_size] - srcs += [mk.load_value(a) for a in args] - super().__init__(srcs) - self.kernel = mk.m.kernels[kernel] - self.indirect = group_size is None - self.postsync = postsync - - def write_c(self, w): - w.write('{\n') - w.push_indent() - - group_size = self.srcs[:3] - args = self.srcs[3:] - if not self.indirect: - w.write('const uint32_t _group_size[3] = {{ {}, {}, {} }};\n', - *[s.c_cpu_val() for s in group_size]) - gs = '_group_size' - else: - gs = 'NULL' - - w.write('const struct anv_kernel_arg _args[] = {\n') - w.push_indent() - for arg in args: - w.write('{{ .u64 = {} }},\n', arg.c_cpu_val()) - w.pop_indent() - w.write('};\n') - - w.write('genX(grl_dispatch)(cmd_buffer, {},\n', self.kernel.c_name) - w.write(' {}, ARRAY_SIZE(_args), _args);\n', gs) - w.pop_indent() - w.write('}\n') - -class SemWait(Statement): - def __init__(self, scope, wait): - super().__init__() - self.wait = wait - -class Control(Statement): - def __init__(self, scope, wait): - super().__init__() - self.wait = wait - - def write_c(self, w): - w.write('cmd_buffer->state.pending_pipe_bits |=\n') - w.write(' ANV_PIPE_CS_STALL_BIT |\n') - w.write(' ANV_PIPE_DATA_CACHE_FLUSH_BIT |\n') - w.write(' ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;\n') - w.write('genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);\n') - -TYPE_REMAPS = { - 'dword' : 'uint32_t', - 'qword' : 'uint64_t', -} - -class Module(object): - def __init__(self, grl_dir, elems): - assert isinstance(elems[0], tuple) - assert elems[0][0] == 'module-name' - self.grl_dir = grl_dir - self.name = elems[0][1] - self.kernels = {} - self.structs = {} - self.constants = [] - self.metakernels = [] - self.regs = {} - - scope = Scope(self, None, None) - for e in elems[1:]: - if e[0] == 'kernel': - k = Kernel(self, *e[1:]) - assert k.name not in self.kernels - self.kernels[k.name] = k - elif e[0] == 'kernel-module': - m = KernelModule(self, *e[1:]) - for k in m.kernels: - assert k.name not in self.kernels - self.kernels[k.name] = k - elif e[0] == 'struct': - s = Struct(self, *e[1:]) - assert s.name not in self.kernels - self.structs[s.name] = s - elif e[0] == 'named-constant': - c = NamedConstant(*e[1:]) - scope.add_def(c) - self.constants.append(c) - elif e[0] == 'meta-kernel': - mk = MetaKernel(self, scope, *e[1:]) - self.metakernels.append(mk) - elif e[0] == 'import': - assert e[2] == 'struct' - self.import_struct(e[1], e[3]) - else: - assert False, 'Invalid module-level token: {}'.format(t[0]) - - def import_struct(self, filename, struct_name): - elems = parse_grl_file(os.path.join(self.grl_dir, filename), []) - assert elems - for e in elems[1:]: - if e[0] == 'struct' and e[1] == struct_name: - s = Struct(self, *e[1:]) - assert s.name not in self.kernels - self.structs[s.name] = s - return - assert False, 
"Struct {0} not found in {1}".format(struct_name, filename) - - def get_type(self, name): - if name in self.structs: - return self.structs[name] - return BasicType(TYPE_REMAPS.get(name, name)) - - def get_fixed_gpr(self, num): - assert isinstance(num, int) - if num in self.regs: - return self.regs[num] - - reg = FixedGPR(num) - self.regs[num] = reg - return reg - - def optimize(self): - progress = True - while progress: - progress = False - - # Copy Propagation - for mk in self.metakernels: - if mk.opt_copy_prop(): - progress = True - - # Dead Code Elimination - for r in self.regs.values(): - r.live = False - for c in self.constants: - c.live = False - for mk in self.metakernels: - mk.opt_dead_code1() - for mk in self.metakernels: - if mk.opt_dead_code2(): - progress = True - for n in list(self.regs.keys()): - if not self.regs[n].live: - del self.regs[n] - progress = True - self.constants = [c for c in self.constants if c.live] - - def compact_regs(self): - old_regs = self.regs - self.regs = {} - for i, reg in enumerate(old_regs.values()): - reg.num = i - self.regs[i] = reg - - def write_h(self, w): - for s in self.structs.values(): - s.write_h(w) - for mk in self.metakernels: - mk.write_h(w) - - def write_c(self, w): - for c in self.constants: - c.write_c(w) - for mk in self.metakernels: - mk.write_c(w) - -class Kernel(object): - def __init__(self, m, name, ann): - self.name = name - self.source_file = ann['source'] - self.kernel_name = self.source_file.replace('/', '_')[:-3].upper() - self.entrypoint = ann['kernelFunction'] - - assert self.source_file.endswith('.cl') - self.c_name = '_'.join([ - 'GRL_CL_KERNEL', - self.kernel_name, - self.entrypoint.upper(), - ]) - -class KernelModule(object): - def __init__(self, m, name, source, kernels): - self.name = name - self.kernels = [] - self.libraries = [] - - for k in kernels: - if k[0] == 'kernel': - k[2]['source'] = source - self.kernels.append(Kernel(m, *k[1:])) - elif k[0] == 'library': - # Skip this for now. 
- pass - -class BasicType(object): - def __init__(self, name): - self.name = name - self.c_name = name - -class Struct(object): - def __init__(self, m, name, fields, align): - assert align == 0 - self.name = name - self.c_name = 'struct ' + '_'.join(['grl', m.name, self.name]) - self.fields = [(m.get_type(t), n) for t, n in fields] - - def write_h(self, w): - w.write('{} {{\n', self.c_name) - w.push_indent() - for f in self.fields: - w.write('{} {};\n', f[0].c_name, f[1]) - w.pop_indent() - w.write('};\n') - -class NamedConstant(Value): - def __init__(self, name, value): - super().__init__(name, 'cpu') - self.name = name - self.value = Constant(value) - self.written = False - - def set_module(self, m): - pass - - def write_c(self, w): - if self.written: - return - w.write('static const uint64_t {} = {};\n', - self.name, self.value.c_val()) - self.written = True - -class MetaKernelParameter(Value): - def __init__(self, mk, type, name): - super().__init__(name, 'cpu') - self.type = mk.m.get_type(type) - -class MetaKernel(object): - def __init__(self, m, m_scope, name, params, ann, statements): - self.m = m - self.name = name - self.c_name = '_'.join(['grl', m.name, self.name]) - self.goto_targets = {} - self.num_tmps = 0 - - mk_scope = Scope(m, self, m_scope) - - self.params = [MetaKernelParameter(self, *p) for p in params] - for p in self.params: - mk_scope.add_def(p) - - mk_scope.add_def(GroupSizeRegister(0), name='DISPATCHDIM_X') - mk_scope.add_def(GroupSizeRegister(1), name='DISPATCHDIM_Y') - mk_scope.add_def(GroupSizeRegister(2), name='DISPATCHDIM_Z') - - self.statements = [] - self.parse_stmt(mk_scope, statements) - self.scope = None - - def get_tmp(self): - tmpN = '_tmp{}'.format(self.num_tmps) - self.num_tmps += 1 - return tmpN - - def add_stmt(self, stmt): - self.statements.append(stmt) - return stmt - - def parse_value(self, v): - if isinstance(v, Value): - return v - elif isinstance(v, str): - if re.match(r'REG\d+', v): - return self.m.get_fixed_gpr(int(v[3:])) - else: - return self.scope.get_def(v) - elif isinstance(v, int): - return Constant(v) - elif isinstance(v, tuple): - if v[0] == 'member': - return Member(self.parse_value(v[1]), v[2]) - elif v[0] == 'offsetof': - return OffsetOf(self, v[1]) - else: - op = v[0] - srcs = [self.parse_value(s) for s in v[1:]] - return self.add_stmt(Expression(self, op, *srcs)) - else: - assert False, 'Invalid value: {}'.format(v[0]) - - def load_value(self, v): - v = self.parse_value(v) - if isinstance(v, Member) and v.zone == 'gpu': - v = self.add_stmt(Half(v.value, v.member)) - return v - - def parse_stmt(self, scope, s): - self.scope = scope - if isinstance(s, list): - subscope = Scope(self.m, self, scope) - for stmt in s: - self.parse_stmt(subscope, stmt) - elif s[0] == 'define': - scope.add_def(self.parse_value(s[2]), name=s[1]) - elif s[0] == 'assign': - self.add_stmt(StoreReg(self, *s[1:])) - elif s[0] == 'dispatch': - self.add_stmt(Dispatch(self, *s[1:])) - elif s[0] == 'load-dword': - v = self.add_stmt(LoadMem(self, 32, s[2])) - self.add_stmt(StoreReg(self, s[1], v)) - elif s[0] == 'load-qword': - v = self.add_stmt(LoadMem(self, 64, s[2])) - self.add_stmt(StoreReg(self, s[1], v)) - elif s[0] == 'store-dword': - self.add_stmt(StoreMem(self, 32, *s[1:])) - elif s[0] == 'store-qword': - self.add_stmt(StoreMem(self, 64, *s[1:])) - elif s[0] == 'goto': - self.add_stmt(GoTo(self, s[1])) - elif s[0] == 'goto-if': - self.add_stmt(GoTo(self, s[1], s[2])) - elif s[0] == 'goto-if-not': - self.add_stmt(GoTo(self, s[1], s[2], invert=True)) - elif 
s[0] == 'label': - self.add_stmt(GoToTarget(self, s[1])) - elif s[0] == 'control': - self.add_stmt(Control(self, s[1])) - elif s[0] == 'sem-wait-while': - self.add_stmt(Control(self, s[1])) - else: - assert False, 'Invalid statement: {}'.format(s[0]) - - def add_goto_target(self, t): - assert t.name not in self.goto_targets - self.goto_targets[t.name] = t - - def get_goto_target(self, name): - return self.goto_targets[name] - - def opt_copy_prop(self): - progress = False - copies = {} - for stmt in self.statements: - for i in range(len(stmt.srcs)): - src = stmt.srcs[i] - if isinstance(src, FixedGPR) and src.num in copies: - stmt.srcs[i] = copies[src.num] - progress = True - - if isinstance(stmt, StoreReg): - reg = stmt.reg - if isinstance(reg, Member): - reg = reg.value - - if isinstance(reg, FixedGPR): - copies.pop(reg.num, None) - if not stmt.srcs[0].is_reg(): - copies[reg.num] = stmt.srcs[0] - elif isinstance(stmt, (GoTo, GoToTarget)): - copies = {} - - return progress - - def opt_dead_code1(self): - for stmt in self.statements: - # Mark every register which is read as live - for src in stmt.srcs: - if isinstance(src, Register): - src.live = True - - # Initialize every SSA statement to dead - if isinstance(stmt, SSAStatement): - stmt.live = False - - def opt_dead_code2(self): - def yield_live(statements): - gprs_read = set(self.m.regs.keys()) - for stmt in statements: - if isinstance(stmt, SSAStatement): - if not stmt.live: - continue - elif isinstance(stmt, StoreReg): - reg = stmt.reg - if isinstance(reg, Member): - reg = reg.value - - if not stmt.reg.live: - continue - - if isinstance(reg, FixedGPR): - if reg.num in gprs_read: - gprs_read.remove(reg.num) - else: - continue - elif isinstance(stmt, (GoTo, GoToTarget)): - gprs_read = set(self.m.regs.keys()) - - for src in stmt.srcs: - src.live = True - if isinstance(src, FixedGPR): - gprs_read.add(src.num) - yield stmt - - old_stmt_list = self.statements - old_stmt_list.reverse() - self.statements = list(yield_live(old_stmt_list)) - self.statements.reverse() - return len(self.statements) != len(old_stmt_list) - - def count_ssa_value_uses(self): - for stmt in self.statements: - if isinstance(stmt, SSAStatement): - stmt.uses = 0 - - for src in stmt.srcs: - if isinstance(src, SSAStatement): - src.uses += 1 - - def write_h(self, w): - w.write('void\n') - w.write('genX({})(\n', self.c_name) - w.push_indent() - w.write('struct anv_cmd_buffer *cmd_buffer') - for p in self.params: - w.write(',\n{} {}', p.type.c_name, p.name) - w.write(');\n') - w.pop_indent() - - def write_c(self, w): - w.write('void\n') - w.write('genX({})(\n', self.c_name) - w.push_indent() - w.write('struct anv_cmd_buffer *cmd_buffer') - for p in self.params: - w.write(',\n{} {}', p.type.c_name, p.name) - w.write(')\n') - w.pop_indent() - w.write('{\n') - w.push_indent() - - w.write('struct mi_builder b;\n') - w.write('mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);\n') - w.write('/* TODO: use anv_mocs? 
*/\n'); - w.write('const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);\n'); - w.write('mi_builder_set_mocs(&b, mocs);\n'); - w.write('\n') - - for r in self.m.regs.values(): - r.write_c(w) - w.write('\n') - - for t in self.goto_targets.values(): - t.write_decl(w) - w.write('\n') - - self.count_ssa_value_uses() - for s in self.statements: - s.write_c(w) - - w.pop_indent() - - w.write('}\n') - -HEADER_PROLOGUE = COPYRIGHT + ''' -#include "anv_private.h" -#include "grl/genX_grl.h" - -#ifndef {0} -#define {0} - -#ifdef __cplusplus -extern "C" {{ -#endif - -''' - -HEADER_EPILOGUE = ''' -#ifdef __cplusplus -}} -#endif - -#endif /* {0} */ -''' - -C_PROLOGUE = COPYRIGHT + ''' -#include "{0}" - -#include "genxml/gen_macros.h" -#include "genxml/genX_pack.h" -#include "genxml/genX_rt_pack.h" - -#include "genX_mi_builder.h" - -#define MI_PREDICATE_RESULT mi_reg32(0x2418) -#define DISPATCHDIM_X mi_reg32(0x2500) -#define DISPATCHDIM_Y mi_reg32(0x2504) -#define DISPATCHDIM_Z mi_reg32(0x2508) -''' - -def parse_libraries(filenames): - libraries = {} - for fname in filenames: - lib_package = parse_grl_file(fname, []) - for lib in lib_package: - assert lib[0] == 'library' - # Add the directory of the library so that CL files can be found. - lib[2].append(('path', os.path.dirname(fname))) - libraries[lib[1]] = lib - return libraries - -def main(): - argparser = argparse.ArgumentParser() - argparser.add_argument('--out-c', help='Output C file') - argparser.add_argument('--out-h', help='Output C file') - argparser.add_argument('--library', dest='libraries', action='append', - default=[], help='Libraries to include') - argparser.add_argument('grl', help="Input file") - args = argparser.parse_args() - - grl_dir = os.path.dirname(args.grl) - - libraries = parse_libraries(args.libraries) - - ir = parse_grl_file(args.grl, libraries) - - m = Module(grl_dir, ir) - m.optimize() - m.compact_regs() - - with open(args.out_h, 'w') as f: - guard = os.path.splitext(os.path.basename(args.out_h))[0].upper() - w = Writer(f) - w.write(HEADER_PROLOGUE, guard) - m.write_h(w) - w.write(HEADER_EPILOGUE, guard) - - with open(args.out_c, 'w') as f: - w = Writer(f) - w.write(C_PROLOGUE, os.path.basename(args.out_h)) - m.write_c(w) - -if __name__ == '__main__': - main() diff --git a/src/intel/vulkan/grl/grl_parser.py b/src/intel/vulkan/grl/grl_parser.py deleted file mode 100644 index 2d62b25a169..00000000000 --- a/src/intel/vulkan/grl/grl_parser.py +++ /dev/null @@ -1,586 +0,0 @@ -#!/bin/env python -COPYRIGHT = """\ -/* - * Copyright 2021 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
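Each metakernel ends up as one genX() C function that replays the GRL statements through mi_builder at command-buffer recording time. Stitched together from the write_c fragments and prologues above, the emitted skeleton looks roughly like this (function and parameter names are invented for illustration):

void
genX(grl_module_example_metakernel)(struct anv_cmd_buffer *cmd_buffer,
                                    uint64_t some_buffer_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
   const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
   mi_builder_set_mocs(&b, mocs);

   /* reserved GPRs, goto-target declarations, then the lowered statements follow */
}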
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ -""" - -import os -import re -import ply.lex as lex -import ply.yacc as yacc - -# Libraries - -libraries = {} - -# LEXER - -keywords = { - '__debugbreak': 'KW_DEBUGBREAK', - 'alignas': 'KW_ALIGNAS', - 'args': 'KW_ARGS', - 'atomic': 'KW_ATOMIC', - 'atomic_return': 'KW_ATOMIC_RETURN', - 'const': 'KW_CONST', - 'control': 'KW_CONTROL', - 'define': 'KW_DEFINE', - 'dispatch': 'KW_DISPATCH', - 'dispatch_indirect': 'KW_DISPATCH_INDIRECT', - 'goto': 'KW_GOTO', - 'if': 'KW_IF', - 'kernel': 'KW_KERNEL', - 'kernel_module': 'KW_KERNEL_MODULE', - 'import': 'KW_IMPORT', - 'library': 'KW_LIBRARY', - 'links': 'KW_LINKS', - 'load_dword': 'KW_LOAD_DWORD', - 'load_qword': 'KW_LOAD_QWORD', - 'metakernel': 'KW_METAKERNEL', - 'module': 'KW_MODULE', - 'not': 'KW_NOT', - 'offsetof': 'KW_OFFSETOF', - 'postsync': 'KW_POSTSYNC', - 'print': 'KW_PRINT', - 'semaphore_wait': 'KW_SEMAPHORE_WAIT', - 'shiftof': 'KW_SHIFTOF', - 'sizeof': 'KW_SIZEOF', - 'store_dword': 'KW_STORE_DWORD', - 'store_qword': 'KW_STORE_QWORD', - 'store_timestamp': 'KW_STORE_TIMESTAMP', - 'struct': 'KW_STRUCT', - 'unsigned': 'KW_UNSIGNED', - 'while': 'KW_WHILE' -} - -ops = { - '&&': 'OP_LOGICAL_AND', - '||': 'OP_LOGICAL_OR', - '==': 'OP_EQUALEQUAL', - '!=': 'OP_NOTEQUAL', - '<=': 'OP_LESSEQUAL', - '>=': 'OP_GREATEREQUAL', - '<<': 'OP_LSHIFT', - '>>': 'OP_RSHIFT' -} - -tokens = [ - 'INT_LITERAL', - 'STRING_LITERAL', - 'OP', - 'IDENTIFIER' -] + list(keywords.values()) + list(ops.values()) - -def t_INT_LITERAL(t): - r'(0x[a-fA-F0-9]+|\d+)' - if t.value.startswith('0x'): - t.value = int(t.value[2:], 16) - else: - t.value = int(t.value) - return t - -def t_OP(t): - r'(&&|\|\||==|!=|<=|>=|<<|>>)' - t.type = ops.get(t.value) - return t - -def t_IDENTIFIER(t): - r'[a-zA-Z_][a-zA-Z_0-9]*' - t.type = keywords.get(t.value, 'IDENTIFIER') - return t - -def t_STRING_LITERAL(t): - r'"(\\.|[^"\\])*"' - t.value = t.value[1:-1] - return t - -literals = "+*/(){};:,=&|!~^.%?-<>[]" - -t_ignore = ' \t' - -def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - -def t_error(t): - print("WUT: {}".format(t.value)) - t.lexer.skip(1) - -LEXER = lex.lex() - -# PARSER - -precedence = ( - ('right', '?', ':'), - ('left', 'OP_LOGICAL_OR', 'OP_LOGICAL_AND'), - ('left', '|'), - ('left', '^'), - ('left', '&'), - ('left', 'OP_EQUALEQUAL', 'OP_NOTEQUAL'), - ('left', '<', '>', 'OP_LESSEQUAL', 'OP_GREATEREQUAL'), - ('left', 'OP_LSHIFT', 'OP_RSHIFT'), - ('left', '+', '-'), - ('left', '*', '/', '%'), - ('right', '!', '~'), - ('left', '[', ']', '.') -) - -def p_module(p): - 'module : element_list' - p[0] = p[1] - -def p_element_list(p): - '''element_list : element_list element - | element''' - if len(p) == 2: - p[0] = [p[1]] - else: - p[0] = p[1] + [p[2]] - -def p_element(p): - '''element : kernel_definition - | kernel_module_definition - | library_definition - | metakernel_definition - | module_name - | struct_definition - | const_definition - | import_definition''' - p[0] = p[1] - -def p_module_name(p): - 'module_name : KW_MODULE IDENTIFIER ";"' - p[0] = ('module-name', p[2]) - -def p_kernel_module_definition(p): - 'kernel_module_definition : KW_KERNEL_MODULE IDENTIFIER "(" STRING_LITERAL ")" "{" kernel_definition_list "}"' - p[0] = ('kernel-module', p[2], p[4], p[7]) - -def p_kernel_definition(p): - 
'kernel_definition : KW_KERNEL IDENTIFIER optional_annotation_list' - p[0] = ('kernel', p[2], p[3]) - -def p_library_definition(p): - 'library_definition : KW_LIBRARY IDENTIFIER "{" library_definition_list "}"' - p[0] = ('library', p[2], p[4]) - -def p_library_definition_list(p): - '''library_definition_list : - | library_definition_list IDENTIFIER STRING_LITERAL ";"''' - if len(p) < 3: - p[0] = [] - else: - p[0] = p[1] - p[0].append((p[2], p[3])) - -def p_import_definition(p): - 'import_definition : KW_IMPORT KW_STRUCT IDENTIFIER STRING_LITERAL ";"' - p[0] = ('import', p[4], 'struct', p[3]) - -def p_links_definition(p): - 'links_definition : KW_LINKS IDENTIFIER' - - # Process a library include like a preprocessor - global libraries - - if not p[2] in libraries: - raise "Not able to find library {0}".format(p[2]) - p[0] = libraries[p[2]] - -def p_metakernel_definition(p): - 'metakernel_definition : KW_METAKERNEL IDENTIFIER "(" optional_parameter_list ")" optional_annotation_list scope' - p[0] = ('meta-kernel', p[2], p[4], p[6], p[7]) - -def p_kernel_definition_list(p): - '''kernel_definition_list : - | kernel_definition_list kernel_definition ";" - | kernel_definition_list links_definition ";"''' - if len(p) < 3: - p[0] = [] - else: - p[0] = p[1] - p[0].append(p[2]) - -def p_optional_annotation_list(p): - '''optional_annotation_list : - | "<" ">" - | "<" annotation_list ">"''' - if len(p) < 4: - p[0] = {} - else: - p[0] = p[2] - -def p_optional_parameter_list(p): - '''optional_parameter_list : - | parameter_list''' - p[0] = p[1] - -def p_annotation_list(p): - '''annotation_list : annotation''' - p[0] = p[1] - -def p_annotation_list_append(p): - '''annotation_list : annotation_list "," annotation''' - p[0] = {**p[1], **p[3]} - -def p_annotation(p): - '''annotation : IDENTIFIER "=" INT_LITERAL - | IDENTIFIER "=" IDENTIFIER - | IDENTIFIER "=" STRING_LITERAL''' - p[0] = {p[1]: p[3]} - -def p_parameter_list(p): - '''parameter_list : parameter_definition''' - p[0] = [p[1]] - -def p_parameter_list_append(p): - '''parameter_list : parameter_list "," parameter_definition''' - p[0] = p[1] - p[0].append(p[3]) - -def p_parameter_definition(p): - 'parameter_definition : IDENTIFIER IDENTIFIER' - p[0] = (p[1], p[2]) - -def p_scope(p): - '''scope : "{" optional_statement_list "}"''' - p[0] = p[2] - -def p_optional_statement_list(p): - '''optional_statement_list : - | statement_list''' - p[0] = p[1] - -def p_statement_list(p): - '''statement_list : statement''' - p[0] = [p[1]] - -def p_statement_list_append(p): - '''statement_list : statement_list statement''' - p[0] = p[1] - p[0].append(p[2]) - -def p_statement(p): - '''statement : definition_statement ";" - | assignment_statement ";" - | load_store_statement ";" - | dispatch_statement ";" - | semaphore_statement ";" - | label - | goto_statement ";" - | scope_statement - | atomic_op_statement ";" - | control_statement ";" - | print_statement ";" - | debug_break_statement ";"''' - p[0] = p[1] - -def p_definition_statement(p): - 'definition_statement : KW_DEFINE IDENTIFIER value' - p[0] = ('define', p[2], p[3]) - -def p_assignemt_statement(p): - 'assignment_statement : value "=" value' - p[0] = ('assign', p[1], p[3]) - -def p_load_store_statement_load_dword(p): - '''load_store_statement : value "=" KW_LOAD_DWORD "(" value ")"''' - p[0] = ('load-dword', p[1], p[5]) - -def p_load_store_statement_load_qword(p): - '''load_store_statement : value "=" KW_LOAD_QWORD "(" value ")"''' - p[0] = ('load-qword', p[1], p[5]) - -def p_load_store_statement_store_dword(p): 
- '''load_store_statement : KW_STORE_DWORD "(" value "," value ")"''' - p[0] = ('store-dword', p[3], p[5]) - -def p_load_store_statement_store_qword(p): - '''load_store_statement : KW_STORE_QWORD "(" value "," value ")"''' - p[0] = ('store-qword', p[3], p[5]) - -def p_dispatch_statement(p): - '''dispatch_statement : direct_dispatch_statement - | indirect_dispatch_statement''' - p[0] = p[1] - -def p_direct_dispatch_statement(p): - '''direct_dispatch_statement : KW_DISPATCH IDENTIFIER "(" value "," value "," value ")" optional_kernel_arg_list optional_postsync''' - p[0] = ('dispatch', p[2], (p[4], p[6], p[8]), p[10], p[11]) - -def p_indirect_dispatch_statement(p): - '''indirect_dispatch_statement : KW_DISPATCH_INDIRECT IDENTIFIER optional_kernel_arg_list optional_postsync''' - p[0] = ('dispatch', p[2], None, p[3], p[4]) - -def p_optional_kernel_arg_list(p): - '''optional_kernel_arg_list : - | KW_ARGS "(" value_list ")"''' - p[0] = p[3] - -def p_value_list(p): - '''value_list : value''' - p[0] = [p[1]] - -def p_value_list_append(p): - '''value_list : value_list "," value''' - p[0] = p[1] - p[0].append(p[3]) - -def p_optional_postsync(p): - '''optional_postsync : - | postsync_operation''' - if len(p) > 1: - p[0] = p[1] - -def p_postsync_operation(p): - '''postsync_operation : postsync_write_dword - | postsync_write_timestamp''' - p[0] = p[1] - -def p_postsync_write_dword(p): - '''postsync_write_dword : KW_POSTSYNC KW_STORE_DWORD "(" value "," value ")"''' - p[0] = ('postsync', 'store-dword', p[4], p[6]) - -def p_postsync_write_timestamp(p): - '''postsync_write_timestamp : KW_POSTSYNC KW_STORE_TIMESTAMP "(" value ")"''' - p[0] = ('postsync', 'timestamp', p[4]) - -def p_semaphore_statement(p): - '''semaphore_statement : KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value "<" value ")" - | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value ">" value ")" - | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_LESSEQUAL value ")" - | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_GREATEREQUAL value ")" - | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_EQUALEQUAL value ")" - | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_NOTEQUAL value ")"''' - p[0] = ('sem-wait-while', p[5], p[6], p[7]) - -def p_atomic_op_statement(p): - '''atomic_op_statement : KW_ATOMIC IDENTIFIER IDENTIFIER "(" value_list ")"''' - p[0] = ('atomic', p[2], p[3], p[5]) - -def p_atomic_op_statement_return(p): - '''atomic_op_statement : KW_ATOMIC_RETURN IDENTIFIER IDENTIFIER "(" value_list ")"''' - p[0] = ('atomic-return', p[2], p[3], p[5]) - -def p_label(p): - '''label : IDENTIFIER ":"''' - p[0] = ('label', p[1]) - -def p_goto_statement(p): - '''goto_statement : KW_GOTO IDENTIFIER''' - p[0] = ('goto', p[2]) - -def p_goto_statement_if(p): - '''goto_statement : KW_GOTO IDENTIFIER KW_IF "(" value ")"''' - p[0] = ('goto-if', p[2], p[5]) - -def p_goto_statement_if_not(p): - '''goto_statement : KW_GOTO IDENTIFIER KW_IF KW_NOT "(" value ")"''' - p[0] = ('goto-if-not', p[2], p[6]) - -def p_scope_statement(p): - '''scope_statement : scope''' - p[0] = (p[1]) - -def p_control_statement(p): - '''control_statement : KW_CONTROL "(" id_list ")"''' - p[0] = ('control', p[3]) - -def p_print_statement(p): - '''print_statement : KW_PRINT "(" printable_list ")"''' - p[0] = ('print', p[3]) - -def p_printable_list(p): - '''printable_list : printable''' - p[0] = [p[1]] - -def p_printable_list_append(p): - '''printable_list : printable_list "," printable''' - p[0] = p[1] - p[0].append(p[3]) - -def p_printable_str_lit(p): - '''printable : STRING_LITERAL''' - p[0] = '"{}"'.format(p[1]) - -def 
p_printable_value(p): - '''printable : value''' - p[0] = p[1] - -def p_printable_str_lit_value(p): - '''printable : STRING_LITERAL value''' - p[0] = ('"{}"'.format(p[1]), p[2]) - -def p_debug_break_statement(p): - '''debug_break_statement : KW_DEBUGBREAK''' - p[0] = ('debug-break') - -def p_id_list(p): - '''id_list : IDENTIFIER''' - p[0] = p[1] - -def p_id_list_append(p): - '''id_list : id_list "," IDENTIFIER''' - p[0] = p[1] - p[0].append(p[3]) - -def p_value(p): - '''value : IDENTIFIER - | INT_LITERAL''' - p[0] = p[1] - -def p_value_braces(p): - '''value : "(" value ")"''' - p[0] = (p[2]) - -def p_value_member(p): - '''value : value "." IDENTIFIER''' - p[0] = ('member', p[1], p[3]) - -def p_value_idx(p): - '''value : value "[" value "]"''' - p[0] = ('index', p[1], p[3]) - -def p_value_binop(p): - '''value : value "+" value - | value "-" value - | value "*" value - | value "/" value - | value "%" value - | value "&" value - | value "|" value - | value "<" value - | value ">" value - | value "^" value - | value OP_LESSEQUAL value - | value OP_GREATEREQUAL value - | value OP_EQUALEQUAL value - | value OP_NOTEQUAL value - | value OP_LOGICAL_AND value - | value OP_LOGICAL_OR value - | value OP_LSHIFT value - | value OP_RSHIFT value''' - p[0] = (p[2], p[1], p[3]) - -def p_value_uniop(p): - '''value : "!" value - | "~" value''' - p[0] = (p[1], p[2]) - -def p_value_cond(p): - '''value : value "?" value ":" value''' - p[0] = ('?', p[1], p[3], p[5]) - -def p_value_funcop(p): - '''value : KW_OFFSETOF "(" offset_expression ")" - | KW_SHIFTOF "(" IDENTIFIER ")" - | KW_SIZEOF "(" IDENTIFIER ")"''' - p[0] = (p[1], p[3]) - -def p_offset_expression(p): - '''offset_expression : IDENTIFIER''' - p[0] = p[1] - -def p_offset_expression_member(p): - '''offset_expression : offset_expression "." 
IDENTIFIER''' - p[0] = ('member', p[1], p[3]) - -def p_offset_expression_idx(p): - '''offset_expression : offset_expression "[" INT_LITERAL "]"''' - p[0] = ('index', p[1], p[3]) - -def p_struct_definition(p): - '''struct_definition : KW_STRUCT optional_alignment_specifier IDENTIFIER "{" optional_struct_member_list "}" ";"''' - p[0] = ('struct', p[3], p[5], p[2]) - -def p_optional_alignment_specifier(p): - '''optional_alignment_specifier : - | KW_ALIGNAS "(" INT_LITERAL ")"''' - if len(p) == 1: - p[0] = 0 - else: - p[0] = p[3] - -def p_optional_struct_member_list(p): - '''optional_struct_member_list : - | struct_member_list''' - if len(p) == 1: - p[0] = {} - else: - p[0] = p[1] - -def p_struct_member_list(p): - '''struct_member_list : struct_member''' - p[0] = [p[1]] - -def p_struct_member_list_append(p): - '''struct_member_list : struct_member_list struct_member''' - p[0] = p[1] + [p[2]] - -def p_struct_member(p): - '''struct_member : struct_member_typename IDENTIFIER ";"''' - p[0] = (p[1], p[2]) - -def p_struct_member_array(p): - '''struct_member : struct_member_typename IDENTIFIER "[" INT_LITERAL "]" ";"''' - '''struct_member : struct_member_typename IDENTIFIER "[" IDENTIFIER "]" ";"''' - p[0] = {p[1]: p[2], 'count': p[4]} - -def p_struct_member_typename(p): - '''struct_member_typename : IDENTIFIER''' - p[0] = p[1] - -def p_struct_member_typename_unsigned(p): - '''struct_member_typename : KW_UNSIGNED IDENTIFIER''' - p[0] = ('unsigned', p[2]) - -def p_struct_member_typename_struct(p): - '''struct_member_typename : KW_STRUCT IDENTIFIER''' - p[0] = ('struct', p[2]) - -def p_const_definition(p): - '''const_definition : KW_CONST IDENTIFIER "=" INT_LITERAL ";"''' - p[0] = ('named-constant', p[2], p[4]) - -PARSER = yacc.yacc() - -# Shamelessly stolen from some StackOverflow answer -def _remove_comments(text): - def replacer(match): - s = match.group(0) - if s.startswith('/'): - return " " # note: a space and not an empty string - else: - return s - pattern = re.compile( - r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', - re.DOTALL | re.MULTILINE - ) - return re.sub(pattern, replacer, text) - -def parse_grl_file(grl_fname, libs): - global libraries - - libraries = libs - with open(grl_fname, 'r') as f: - return PARSER.parse(_remove_comments(f.read())) diff --git a/src/intel/vulkan/grl/grl_structs.h b/src/intel/vulkan/grl/grl_structs.h deleted file mode 100644 index ed721afa6a2..00000000000 --- a/src/intel/vulkan/grl/grl_structs.h +++ /dev/null @@ -1,479 +0,0 @@ -/* - * Copyright © 2022 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -/** - * This file contains a redefinition of structures defined in the GRL library. - * We need to have those structures defined to allocate & prepare data for - * the OpenCL kernels building acceleration structures. Unfortunately because - * of C++ & OpenCL assumptions in GRL, it's no possible to just include GRL - * header files directly so we have to redefine stuff here. - */ - -#ifndef GRL_STRUCTS_H -#define GRL_STRUCTS_H - -#include "GRLStructs.h" -#include "GRLRTASCommon.h" - -struct MKBuilderState { - qword geomDesc_buffer; - qword build_primref_buffer; - qword build_globals; - qword bvh_buffer; - dword leaf_type; - dword leaf_size; -}; - -#define PREFIX_MK_STATE(prefix, obj) \ - (struct prefix##_MKBuilderState) { \ - .geomDesc_buffer = (obj).geomDesc_buffer, \ - .build_primref_buffer = (obj).build_primref_buffer, \ - .build_globals = (obj).build_globals, \ - .bvh_buffer = (obj).bvh_buffer, \ - .leaf_type = (obj).leaf_type, \ - .leaf_size = (obj).leaf_size, \ - } - -struct MKSizeEstimate { - dword numTriangles; - dword numProcedurals; - dword numPrimitives; - dword numMeshes; - dword numBuildPrimitives; - dword numPrimitivesToSplit; - dword instance_descs_start; - dword geo_meta_data_start; - dword node_data_start; - dword leaf_data_start; - dword procedural_data_start; - dword back_pointer_start; - dword sizeTotal; - dword updateScratchSizeTotal; - dword fatleaf_table_start; - dword innernode_table_start; - dword max_fatleaves; - - size_t max_instance_leafs; - size_t max_inner_nodes; - size_t leaf_data_size; - size_t min_primitives; - size_t max_primitives; -}; - -#define PREFIX_MK_SIZE(prefix, obj) \ - (struct prefix##_MKSizeEstimate) { \ - .numTriangles = (obj).numTriangles, \ - .numProcedurals = (obj).numProcedurals, \ - .numPrimitives = (obj).numPrimitives, \ - .numMeshes = (obj).numMeshes, \ - .numBuildPrimitives = (obj).numBuildPrimitives, \ - .numPrimitivesToSplit = (obj).numPrimitivesToSplit, \ - .instance_descs_start = (obj).instance_descs_start, \ - .geo_meta_data_start = (obj).geo_meta_data_start, \ - .node_data_start = (obj).node_data_start, \ - .leaf_data_start = (obj).leaf_data_start, \ - .procedural_data_start = (obj).procedural_data_start, \ - .back_pointer_start = (obj).back_pointer_start, \ - .sizeTotal = (obj).sizeTotal, \ - .updateScratchSizeTotal = (obj).updateScratchSizeTotal, \ - .fatleaf_table_start = (obj).fatleaf_table_start, \ - .innernode_table_start = (obj).innernode_table_start, \ - .max_fatleaves = (obj).max_fatleaves, \ - } - -typedef struct AABB { - float lower[4]; - float upper[4]; -} AABB; - -struct Globals -{ - struct AABB centroidBounds; - - unsigned int build_record_start; - unsigned int numPrimitives; - unsigned int leafPrimType; - unsigned int leafSize; - - unsigned int numSplittedPrimitives; - unsigned int numBuildRecords; - - // spatial split sate - unsigned int numOriginalPrimitives; - float presplitPrioritySum; - float probThreshold; - - // binned-sah bfs state - unsigned int counter; - unsigned int numBuildRecords_extended; - - // sync variable used for global-sync on work groups - unsigned int sync; - - - /* morton code builder state */ - unsigned int shift; // used by adaptive mc-builder - unsigned int shift_mask; // used by adaptive mc-builder - unsigned 
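The PREFIX_MK_STATE / PREFIX_MK_SIZE helpers above copy a struct's fields into a compound literal of a prefixed twin type (struct <prefix>_MKBuilderState and so on), letting per-generation code pass the same data under its own type name. A one-line usage sketch, assuming a gfx125_MKBuilderState with identical members exists (the prefix and variable names are illustrative):

/* Copy a generic MKBuilderState 'state' into its gfx125-prefixed twin. */
struct gfx125_MKBuilderState s = PREFIX_MK_STATE(gfx125, state);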
int binary_hierarchy_root; - unsigned int p0_allocated_num; - unsigned int p0_created_num; - unsigned int morton_sort_in_flight; - unsigned int sort_iterations; - - gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid -}; - -typedef struct BVHBase -{ - // TODO: Implement the "copy-first-node" trick... duplicate root node here - - uint64_t rootNodeOffset; - - uint32_t reserved; - - uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64 - uint32_t quadLeafStart; - uint32_t quadLeafCur; - uint32_t proceduralDataStart; - uint32_t proceduralDataCur; - uint32_t instanceLeafStart; - uint32_t instanceLeafEnd; - uint32_t backPointerDataStart; // - uint32_t refitTreeletsDataStart; // refit structs - uint32_t refitStartPointDataStart; // - uint32_t BVHDataEnd; - - // number of bottom treelets - // if 1, then the bottom treelet is also tip treelet - uint32_t refitTreeletCnt; - uint32_t refitTreeletCnt2; // always 0, used for atomic updates - // data layout: - // @backPointerDataStart - // 'backpointer' - a dword per inner node. - // The bits are used as follows: - // 2:0 --> Used as a refit counter during BVH refitting. MBZ - // 5:3 --> Number of children - // 31:6 --> Index of the parent node in the internal node array - // The root node has a parent index of all ones - // @refitTreeletsDataStart - // RefitTreelet[], the last treelet is for top treelet all previous are for bottom - // @refitStartPointDataStart - // for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space - // @backPointerDataEnd - - uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves" - uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children) - uint32_t fatLeafTableStart; - uint32_t innerTableStart; - - uint32_t _pad[12]; - - struct RTASMetaData Meta; -} BVHBase; - - -struct BatchedInitGlobalsData -{ - qword p_build_globals; - qword p_bvh_buffer; - dword numPrimitives; - dword numGeometries; - dword numInstances; - dword instance_descs_start; - dword geo_meta_data_start; - dword node_data_start; - dword leaf_data_start; - dword procedural_data_start; - dword back_pointer_start; - dword sizeTotal; - dword leafType; - dword leafSize; - dword fatleaf_table_start; - dword innernode_table_start; -}; - - -#define BFS_NUM_BINS 16 -#define BFS_NUM_VCONTEXTS 256 -#define BFS_MAX_DEPTH 32 - -#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384 - -struct BFS_Split -{ - float sah; - int dim; - int pos; -}; - -struct BFS_BinInfo -{ - float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6] - // The 6 are lower(xyz) and -upper(xyz) - // bins use negated-max so that we can use vectorized mins instead of min/max pairs - uint counts[3 * BFS_NUM_BINS]; -}; - -struct SAHBuildGlobals -{ - qword p_primref_index_buffers; - qword p_primrefs_buffer; - qword p_bvh2; - qword p_globals; // TODO: deprecate this - qword p_bvh_base; - gpuva_t p_qnode_root_buffer; - - dword flags; // bit 1 is 'alloc_backpointers'. 
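Given the backpointer layout documented above (bits 2:0 refit counter, bits 5:3 child count, bits 31:6 parent node index, with the root holding an all-ones parent index), decoding is just shifts and masks. A small illustrative helper, not taken from the deleted headers:

#include <stdbool.h>
#include <stdint.h>

static inline uint32_t backpointer_refit_count(uint32_t bp)  { return bp & 0x7u; }
static inline uint32_t backpointer_num_children(uint32_t bp) { return (bp >> 3) & 0x7u; }
static inline uint32_t backpointer_parent_index(uint32_t bp) { return bp >> 6; }
static inline bool     backpointer_is_root(uint32_t bp)      { return (bp >> 6) == 0x03ffffffu; }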
bit 2 is 'need_masks' - dword num_primrefs; - dword leaf_size; - dword leaf_type; - - dword root_buffer_num_produced; - dword root_buffer_num_produced_hi; - dword root_buffer_num_consumed; - dword root_buffer_num_consumed_hi; - dword root_buffer_num_to_consume; - dword root_buffer_num_to_consume_hi; -}; - -typedef union LRBounds -{ - struct - { - struct AABB3f left_centroid_bounds; - struct AABB3f left_geom_bounds; - struct AABB3f right_centroid_bounds; - struct AABB3f right_geom_bounds; - } boxes; - struct - { - float Array[24]; - } scalars; -} LRBounds; - - -struct VContext -{ - uint dispatch_primref_begin; // range of primrefs for this task - uint dispatch_primref_end; - uint bvh2_root; // BVH2 root node for this task - uint tree_depth; // depth of this node in the tree - uint num_left; // primref counts - uint num_right; - uint lr_mask; // lower 8b : left mask. upper 8b : right mask - uint batch_index; - - // pass1 global working state and output - struct BFS_Split split; - struct BFS_BinInfo global_bin_info; - - // pass2 global working state and output - LRBounds lr_bounds; -}; - - - -struct BFSDispatchRecord -{ - ushort batch_index; - ushort context_id; -}; - - -struct BFSDispatchQueue -{ - uint num_dispatches; - uint wg_count[BFS_NUM_VCONTEXTS]; - struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS]; -}; - -struct BFS1SpillStackEntry -{ - uint primref_begin; - uint primref_end; - uint bvh2_root; - ushort tree_depth; - ushort batch_index; -}; - -struct BFS1SpillStack -{ - uint size; - struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH]; -}; - -struct QNodeGlobalRootBufferEntry -{ - uint bvh2_node; - uint qnode; - uint build_idx; - uint _pad; -}; - -struct QNodeGlobalRootBuffer -{ - uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM - struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2]; -}; - -struct DFSDispatchRecord -{ - uint primref_base; - uint bvh2_base; - uint batch_index; - ushort num_primrefs; - ushort tree_depth; -}; - - -struct DFSDispatchQueue -{ - struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2]; -}; - -#define VCONTEXT_STATE_EXECUTING 0 -#define VCONTEXT_STATE_UNALLOCATED 1 - -union SchedulerUnion -{ - struct VContextScheduler - { - ///////////////////////////////////////////////////////////// - // State data used for communication with command streamer - // NOTE: This part must match definition in 'new_sah_builder.grl' - ///////////////////////////////////////////////////////////// - - dword num_bfs_wgs; - dword num_dfs_wgs; - - dword scheduler_postsync; - dword _pad1; - - dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). - dword num_single_builds; // number of single-wg builds (#primrefs < threshold) - - dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass - dword batched_build_loop_mask; // value is 0 if #builds <= #contexts. 
else 1 command streamer uses this as a loop condition - - ///////////////////////////////////////////////////////////// - - dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer - dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer - - dword vcontext_state[BFS_NUM_VCONTEXTS]; - - struct BFSDispatchQueue bfs_queue; - struct DFSDispatchQueue dfs_queue; - - struct VContext contexts[BFS_NUM_VCONTEXTS]; - - struct BFS1SpillStack bfs2_spill_stack; - } vContextScheduler; - - struct QnodeScheduler - { - dword num_qnode_grb_curr_entries; - dword num_qnode_grb_new_entries; - - dword scheduler_postsync; - dword _pad1; - - dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size). - dword num_single_builds; // number of single-wg builds (#primrefs < threshold) - - dword batched_builds_to_process; - dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer - - ///////////////////////////////////////////////////////////// - - dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer - dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer - - struct QNodeGlobalRootBuffer qnode_global_root_buffer; - } qnodeScheduler; -}; - - -struct BVH2Node -{ - struct AABB3f box; - uint meta_u; // leaf: primref start. inner: offset from node to its first child - uint meta_ss; - //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes - //uchar is_inner; // 1 if inner, 0 if leaf - //uchar mask; -}; - -struct BVH2 -{ - uint num_nodes; - uint _pad[7]; // align to 32B -}; - -struct BatchedBLSDispatchEntry -{ - ///////////////////////////////////////////////////////////// - // State data used for communication with command streamer - // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl' - ///////////////////////////////////////////////////////////// - qword p_data_buffer; - qword num_elements; // number of elements in p_data_buffer -}; - -struct SAHBuildArgsBatchable -{ - qword p_globals_ptrs; - qword p_scheduler; - qword p_buffers_info; - qword p_sah_globals; - - dword num_max_qnode_global_root_buffer_entries; - dword num_builds; -}; - -#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj) \ - (struct prefix##_SAHBuildArgsBatchable) { \ - .p_globals_ptrs = (obj).p_globals_ptrs, \ - .p_scheduler = (obj).p_scheduler, \ - .p_buffers_info = (obj).p_buffers_info, \ - .p_sah_globals = (obj).p_sah_globals, \ - .num_max_qnode_global_root_buffer_entries = \ - (obj).num_max_qnode_global_root_buffer_entries, \ - .num_builds = (obj).num_builds, \ - } - - -struct SAHBuildBuffersInfo -{ - gpuva_t p_globals; - gpuva_t p_primref_index_buffers; - gpuva_t p_primrefs_buffer; - gpuva_t p_bvh2; - gpuva_t p_bvh_base; - gpuva_t p_qnode_root_buffer; - dword sah_globals_flags; - dword _pad; - gpuva_t _pad2; -}; - -#endif /* GRL_STRUCTS_H */ diff --git a/src/intel/vulkan/grl/include/AABB3f.h b/src/intel/vulkan/grl/include/AABB3f.h deleted file mode 100644 index a3412332c77..00000000000 --- a/src/intel/vulkan/grl/include/AABB3f.h +++ /dev/null @@ -1,459 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "GRLRTASCommon.h" - -#include "affinespace.h" - -#ifndef __OPENCL_VERSION__ -# include "stdio.h" //for printf -#endif - -GRL_NAMESPACE_BEGIN(GRL) -GRL_NAMESPACE_BEGIN(RTAS) - -GRL_INLINE void AABB3f_init(struct 
AABB3f *aabb) -{ - aabb->lower[0] = (float)(INFINITY); - aabb->lower[1] = (float)(INFINITY); - aabb->lower[2] = (float)(INFINITY); - - aabb->upper[0] = -(float)(INFINITY); - aabb->upper[1] = -(float)(INFINITY); - aabb->upper[2] = -(float)(INFINITY); -} - -GRL_INLINE float3 AABB3f_load_lower( const struct AABB3f* aabb ) -{ - float3 v = { aabb->lower[0], aabb->lower[1], aabb->lower[2] }; - return v; -} -GRL_INLINE float3 AABB3f_load_upper( const struct AABB3f* aabb ) -{ - float3 v = { aabb->upper[0], aabb->upper[1], aabb->upper[2] }; - return v; -} - -GRL_INLINE void AABB3f_extend(struct AABB3f *aabb, const struct AABB3f *v) -{ - aabb->lower[0] = fmin(aabb->lower[0], v->lower[0]); - aabb->lower[1] = fmin(aabb->lower[1], v->lower[1]); - aabb->lower[2] = fmin(aabb->lower[2], v->lower[2]); - aabb->upper[0] = fmax(aabb->upper[0], v->upper[0]); - aabb->upper[1] = fmax(aabb->upper[1], v->upper[1]); - aabb->upper[2] = fmax(aabb->upper[2], v->upper[2]); -} - -GRL_INLINE void AABB3f_intersect(struct AABB3f* aabb, struct AABB3f inters) -{ - aabb->upper[0] = fmin(inters.upper[0],aabb->upper[0]); - aabb->upper[1] = fmin(inters.upper[1],aabb->upper[1]); - aabb->upper[2] = fmin(inters.upper[2],aabb->upper[2]); - aabb->lower[0] = fmax(inters.lower[0],aabb->lower[0]); - aabb->lower[1] = fmax(inters.lower[1],aabb->lower[1]); - aabb->lower[2] = fmax(inters.lower[2],aabb->lower[2]); -} - -GRL_INLINE void AABB3f_trim_upper(struct AABB3f* aabb, const float* upper) -{ - aabb->upper[0] = fmin(upper[0], aabb->upper[0]); - aabb->upper[1] = fmin(upper[1], aabb->upper[1]); - aabb->upper[2] = fmin(upper[2], aabb->upper[2]); -} - -GRL_INLINE void AABB3f_set( struct AABB3f* aabb, float3 lower, float3 upper ) -{ - aabb->lower[0] = lower.x ; - aabb->lower[1] = lower.y ; - aabb->lower[2] = lower.z ; - aabb->upper[0] = upper.x ; - aabb->upper[1] = upper.y ; - aabb->upper[2] = upper.z ; -} - -inline void AABB3f_extend_point(struct AABB3f *aabb, const float3 p) -{ - aabb->lower[0] = fmin(aabb->lower[0], p.x); - aabb->lower[1] = fmin(aabb->lower[1], p.y); - aabb->lower[2] = fmin(aabb->lower[2], p.z); - aabb->upper[0] = fmax(aabb->upper[0], p.x); - aabb->upper[1] = fmax(aabb->upper[1], p.y); - aabb->upper[2] = fmax(aabb->upper[2], p.z); -} - -GRL_INLINE void AABB3f_extendlu(struct AABB3f *aabb, const float3 lower, const float3 upper) -{ - aabb->lower[0] = fmin(aabb->lower[0], lower.x); - aabb->lower[1] = fmin(aabb->lower[1], lower.y); - aabb->lower[2] = fmin(aabb->lower[2], lower.z); - aabb->upper[0] = fmax(aabb->upper[0], upper.x); - aabb->upper[1] = fmax(aabb->upper[1], upper.y); - aabb->upper[2] = fmax(aabb->upper[2], upper.z); -} - -GRL_INLINE float3 AABB3f_size(struct AABB3f* aabb) -{ - return AABB3f_load_upper(aabb) - AABB3f_load_lower(aabb); -} - -GRL_INLINE float AABB3f_halfArea(struct AABB3f *aabb) -{ - const float3 d = AABB3f_load_upper( aabb ) - AABB3f_load_lower( aabb ); - return d.x* (d.y + d.z) + (d.y * d.z); -} - -GRL_INLINE float halfArea_AABB3f(struct AABB3f *aabb) // TODO: Remove me -{ - const float3 d = { aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2] }; - return fma(d.x, (d.y + d.z), d.y * d.z); -} - -GRL_INLINE void AABB3f_set_lower(struct AABB3f* aabb, float3 lower) -{ - aabb->lower[0] = lower.x; - aabb->lower[1] = lower.y; - aabb->lower[2] = lower.z; -} - -GRL_INLINE void AABB3f_set_upper(struct AABB3f* aabb, float3 upper) -{ - aabb->upper[0] = upper.x; - aabb->upper[1] = upper.y; - aabb->upper[2] = upper.z; -} - -GRL_INLINE float3 
conservativeExtent(float3 extent) -{ - const float v = FLT_EPSILON * fmax(extent.x, fmax(extent.y, extent.z)); - float3 v3 = { v,v,v }; - extent = extent + v3; - return extent; -} - -inline struct AABB3f GRL_OVERLOADABLE transform_aabb(float3 lower, float3 upper, const float* Transform) -{ -#if 1 - // We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area - // New AABB is center +- Extent. - // - // For derivation see: - // https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/ - // - - float3 Center = (upper + lower) * 0.5f; - float3 Extent = (conservativeExtent(upper) - lower) * 0.5f; - - float cx = Center.x * Transform[0] + Center.y * Transform[1] + Center.z * Transform[2] + Transform[3]; - float cy = Center.x * Transform[4] + Center.y * Transform[5] + Center.z * Transform[6] + Transform[7]; - float cz = Center.x * Transform[8] + Center.y * Transform[9] + Center.z * Transform[10] + Transform[11]; - float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]); - float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]); - float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]); - - Center.x = cx; Center.y = cy; Center.z = cz; - Extent.x = ex; Extent.y = ey; Extent.z = ez; - - struct AABB3f box; - AABB3f_set_lower(&box, Center - Extent); - AABB3f_set_upper(&box, Center + Extent); - return box; -#else - struct AffineSpace3f xfm = AffineSpace3f_load_row_major(Transform); - - float3 plll = { lower.x, lower.y, lower.z }; - float3 pllu = { lower.x, lower.y, upper.z }; - float3 plul = { lower.x, upper.y, lower.z }; - float3 pluu = { lower.x, upper.y, upper.z }; - float3 pull = { upper.x, lower.y, lower.z }; - float3 pulu = { upper.x, lower.y, upper.z }; - float3 puul = { upper.x, upper.y, lower.z }; - float3 puuu = { upper.x, upper.y, upper.z }; - plll = xfmPoint(xfm, plll) ; - pllu = xfmPoint(xfm, pllu) ; - plul = xfmPoint(xfm, plul) ; - pluu = xfmPoint(xfm, pluu) ; - pull = xfmPoint(xfm, pull) ; - pulu = xfmPoint(xfm, pulu) ; - puul = xfmPoint(xfm, puul) ; - puuu = xfmPoint(xfm, puuu) ; - - float3 p1_min = fmin(plll, pull); - float3 p2_min = fmin(pllu, pulu); - float3 p3_min = fmin(plul, puul); - float3 p4_min = fmin(pluu, puuu); - float3 p1_max = fmax(plll, pull); - float3 p2_max = fmax(pllu, pulu); - float3 p3_max = fmax(plul, puul); - float3 p4_max = fmax(pluu, puuu); - p1_min = fmin(p1_min, p3_min); - p2_min = fmin(p2_min, p4_min); - p1_max = fmax(p1_max, p3_max); - p2_max = fmax(p2_max, p4_max); - p1_min = fmin(p1_min, p2_min); - p1_max = fmax(p1_max, p2_max); - - AABB3f out = { - {p1_min.x,p1_min.y,p1_min.z}, - {p1_max.x,p1_max.y,p1_max.z} - }; - return out; -#endif -} - -GRL_INLINE struct AABB3f GRL_OVERLOADABLE transform_aabb(struct AABB3f box, const float* Transform) -{ - float3 lower = { box.lower[0], box.lower[1], box.lower[2] }; - float3 upper = { box.upper[0], box.upper[1], box.upper[2] }; - return transform_aabb(lower, upper, Transform); -} - -GRL_INLINE struct AABB3f AABB3f_transform(struct AffineSpace3f xfm, struct AABB3f in) -{ - struct AABB3f out; - float rmTransform[12]; - load_row_major_from_AffineSpace3f(xfm, rmTransform); - out = transform_aabb(in, rmTransform); - - return out; -} - -GRL_INLINE bool AABB3f_isIn(struct AABB3f bigger, float3 contained) -{ - bool iscontained = - contained.x >= bigger.lower[0] && - contained.y >= bigger.lower[1] && - contained.z >= bigger.lower[2] && - 
contained.x <= bigger.upper[0] && - contained.y <= bigger.upper[1] && - contained.z <= bigger.upper[2]; - - return iscontained; -} - -GRL_INLINE bool AABB3f_isSubset(struct AABB3f bigger, struct AABB3f contained) -{ - bool iscontained = - contained.lower[0] >= bigger.lower[0] && - contained.lower[1] >= bigger.lower[1] && - contained.lower[2] >= bigger.lower[2] && - contained.upper[0] <= bigger.upper[0] && - contained.upper[1] <= bigger.upper[1] && - contained.upper[2] <= bigger.upper[2]; - - return iscontained; -} - -GRL_INLINE bool AABB3f_is_degenerate(struct AABB3f* box ) -{ - return box->lower[0] > box->upper[0] || - box->lower[1] > box->upper[1] || - box->lower[2] > box->upper[2]; -} - -GRL_INLINE void AABB3f_print(struct AABB3f *aabb) -{ - printf("AABB {\n"); - printf(" lower = %f, %f, %f\n", aabb->lower[0], aabb->lower[1], aabb->lower[2]); - printf(" upper = %f, %f, %f\n", aabb->upper[0], aabb->upper[1], aabb->upper[2]); - printf("}\n"); -} - - - -#ifdef __OPENCL_VERSION__ -GRL_INLINE struct AABB3f AABB3f_sub_group_shuffle(struct AABB3f *aabb, const uint slotID) -{ - struct AABB3f bounds; - bounds.lower[0] = intel_sub_group_shuffle(aabb->lower[0], slotID); - bounds.lower[1] = intel_sub_group_shuffle(aabb->lower[1], slotID); - bounds.lower[2] = intel_sub_group_shuffle(aabb->lower[2], slotID); - bounds.upper[0] = intel_sub_group_shuffle(aabb->upper[0], slotID); - bounds.upper[1] = intel_sub_group_shuffle(aabb->upper[1], slotID); - bounds.upper[2] = intel_sub_group_shuffle(aabb->upper[2], slotID); - return bounds; -} - -GRL_INLINE struct AABB3f AABB3f_sub_group_reduce(struct AABB3f *aabb) -{ - struct AABB3f bounds; - bounds.lower[0] = sub_group_reduce_min(aabb->lower[0]); - bounds.lower[1] = sub_group_reduce_min(aabb->lower[1]); - bounds.lower[2] = sub_group_reduce_min(aabb->lower[2]); - bounds.upper[0] = sub_group_reduce_max(aabb->upper[0]); - bounds.upper[1] = sub_group_reduce_max(aabb->upper[1]); - bounds.upper[2] = sub_group_reduce_max(aabb->upper[2]); - return bounds; -} - -GRL_INLINE struct AABB3f AABB3f_sub_group_scan_exclusive_min_max(struct AABB3f *aabb) -{ - struct AABB3f bounds; - bounds.lower[0] = sub_group_scan_exclusive_min(aabb->lower[0]); - bounds.lower[1] = sub_group_scan_exclusive_min(aabb->lower[1]); - bounds.lower[2] = sub_group_scan_exclusive_min(aabb->lower[2]); - bounds.upper[0] = sub_group_scan_exclusive_max(aabb->upper[0]); - bounds.upper[1] = sub_group_scan_exclusive_max(aabb->upper[1]); - bounds.upper[2] = sub_group_scan_exclusive_max(aabb->upper[2]); - return bounds; -} - -GRL_INLINE struct AABB3f AABB3f_sub_group_scan_inclusive_min_max(struct AABB3f *aabb) -{ - struct AABB3f bounds; - bounds.lower[0] = sub_group_scan_inclusive_min(aabb->lower[0]); - bounds.lower[1] = sub_group_scan_inclusive_min(aabb->lower[1]); - bounds.lower[2] = sub_group_scan_inclusive_min(aabb->lower[2]); - bounds.upper[0] = sub_group_scan_inclusive_max(aabb->upper[0]); - bounds.upper[1] = sub_group_scan_inclusive_max(aabb->upper[1]); - bounds.upper[2] = sub_group_scan_inclusive_max(aabb->upper[2]); - return bounds; -} - -GRL_INLINE void AABB3f_atomic_merge_local_nocheck(local struct AABB3f *aabb, const float4 lower, const float4 upper) -{ - atomic_min((local float *)&aabb->lower + 0, lower.x); - atomic_min((local float *)&aabb->lower + 1, lower.y); - atomic_min((local float *)&aabb->lower + 2, lower.z); - atomic_max((local float *)&aabb->upper + 0, upper.x); - atomic_max((local float *)&aabb->upper + 1, upper.y); - atomic_max((local float *)&aabb->upper + 2, upper.z); -} - - 
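For context, a minimal usage sketch of the atomic AABB merge helpers above (this is editorial illustration, not part of the deleted sources): each work-item folds its share of primitives into a private box, then one sub-group-wide merge touches the shared bound. It assumes the deleted AABB3f.h is on the include path; the kernel name example_reduce_prim_bounds and the centroid buffer layout are hypothetical.

/*
 * Sketch only: typical usage of the AABB3f atomic-merge helpers.
 * Assumes AABB3f.h (and its GRL compatibility headers) are includable;
 * the kernel name and centroid buffer are invented for illustration.
 */
#include "AABB3f.h"

__kernel void example_reduce_prim_bounds(global const float3 *centroids,
                                         uint num_prims,
                                         global struct AABB3f *out_bounds)
{
    /* out_bounds is expected to be pre-initialized to an empty box
     * (lower = +INF, upper = -INF), e.g. with AABB3f_init. */

    /* Fold a strided share of primitives into a private box. */
    struct AABB3f box;
    AABB3f_init(&box);
    for (uint i = get_global_id(0); i < num_prims; i += get_global_size(0))
        AABB3f_extend_point(&box, centroids[i]);

    /* One sub-group reduction, then at most three atomics per sub-group
     * (lanes 0..2 handle x/y/z) instead of six atomics per work-item. */
    AABB3f_atomic_merge_global_sub_group_lu(out_bounds,
                                            AABB3f_load_lower(&box),
                                            AABB3f_load_upper(&box));
}

The design choice the helpers encode is to keep contention low: per-component atomic_min/atomic_max only ever run after a sub-group (or work-group) reduction, which is why both the "_lu" and "_sub_group_lu" variants exist in the deleted header.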
-GRL_INLINE void AABB3f_atomic_merge_global_lu( global struct AABB3f* aabb, const float3 lower, const float3 upper ) -{ - atomic_min( (global float*) & aabb->lower + 0, lower.x ); - atomic_min( (global float*) & aabb->lower + 1, lower.y ); - atomic_min( (global float*) & aabb->lower + 2, lower.z ); - atomic_max( (global float*) & aabb->upper + 0, upper.x ); - atomic_max( (global float*) & aabb->upper + 1, upper.y ); - atomic_max( (global float*) & aabb->upper + 2, upper.z ); -} - -GRL_INLINE void AABB3f_atomic_merge_local_lu( local struct AABB3f* aabb, const float3 lower, const float3 upper ) -{ - atomic_min( (local float*) & aabb->lower + 0, lower.x ); - atomic_min( (local float*) & aabb->lower + 1, lower.y ); - atomic_min( (local float*) & aabb->lower + 2, lower.z ); - atomic_max( (local float*) & aabb->upper + 0, upper.x ); - atomic_max( (local float*) & aabb->upper + 1, upper.y ); - atomic_max( (local float*) & aabb->upper + 2, upper.z ); -} - -GRL_INLINE void Uniform_AABB3f_atomic_merge_local_sub_group_lu(uniform local struct AABB3f* aabb, const float3 lower, const float3 upper) -{ - float lx = sub_group_reduce_min(lower.x); - float ly = sub_group_reduce_min(lower.y); - float lz = sub_group_reduce_min(lower.z); - - float ux = sub_group_reduce_max(upper.x); - float uy = sub_group_reduce_max(upper.y); - float uz = sub_group_reduce_max(upper.z); - - if (get_sub_group_local_id() == 0) - { - atomic_min((local float*) & aabb->lower + 0, lx); - atomic_min((local float*) & aabb->lower + 1, ly); - atomic_min((local float*) & aabb->lower + 2, lz); - atomic_max((local float*) & aabb->upper + 0, ux); - atomic_max((local float*) & aabb->upper + 1, uy); - atomic_max((local float*) & aabb->upper + 2, uz); - } -} - -GRL_INLINE void AABB3f_atomic_merge_global_sub_group_lu(uniform global struct AABB3f* aabb, const float3 lower, const float3 upper) -{ - uint lane = get_sub_group_local_id(); - float l[3]; - l[0] = sub_group_reduce_min(lower.x); - l[1] = sub_group_reduce_min(lower.y); - l[2] = sub_group_reduce_min(lower.z); - float u[3]; - u[0] = sub_group_reduce_max(upper.x); - u[1] = sub_group_reduce_max(upper.y); - u[2] = sub_group_reduce_max(upper.z); - - if (lane < 3) - { - atomic_min((global float*)&aabb->lower + lane, l[lane]); - atomic_max((global float*)&aabb->upper + lane, u[lane]); - } -} - -GRL_INLINE void AABB3f_atomic_merge_global( global struct AABB3f* aabb, struct AABB3f* other ) -{ - float3 lower = AABB3f_load_lower( other ); - float3 upper = AABB3f_load_upper( other ); - atomic_min( (global float*) & aabb->lower + 0, lower.x ); - atomic_min( (global float*) & aabb->lower + 1, lower.y ); - atomic_min( (global float*) & aabb->lower + 2, lower.z ); - atomic_max( (global float*) & aabb->upper + 0, upper.x ); - atomic_max( (global float*) & aabb->upper + 1, upper.y ); - atomic_max( (global float*) & aabb->upper + 2, upper.z ); -} - -GRL_INLINE void AABB3f_atomic_merge_localBB_nocheck( local struct AABB3f* aabb, struct AABB3f* bb ) -{ - atomic_min( (local float*) & aabb->lower + 0, bb->lower[0] ); - atomic_min( (local float*) & aabb->lower + 1, bb->lower[1] ); - atomic_min( (local float*) & aabb->lower + 2, bb->lower[2] ); - atomic_max( (local float*) & aabb->upper + 0, bb->upper[0] ); - atomic_max( (local float*) & aabb->upper + 1, bb->upper[1] ); - atomic_max( (local float*) & aabb->upper + 2, bb->upper[2] ); -} - -GRL_INLINE void AABB3f_atomic_merge_local(local struct AABB3f *aabb, const float4 lower, const float4 upper) -{ - if (lower.x < aabb->lower[0]) - atomic_min((local float 
*)&aabb->lower + 0, lower.x); - if (lower.y < aabb->lower[1]) - atomic_min((local float *)&aabb->lower + 1, lower.y); - if (lower.z < aabb->lower[2]) - atomic_min((local float *)&aabb->lower + 2, lower.z); - if (upper.x > aabb->upper[0]) - atomic_max((local float *)&aabb->upper + 0, upper.x); - if (upper.y > aabb->upper[1]) - atomic_max((local float *)&aabb->upper + 1, upper.y); - if (upper.z > aabb->upper[2]) - atomic_max((local float *)&aabb->upper + 2, upper.z); -} - -GRL_INLINE void AABB3f_atomic_merge_global_local(global struct AABB3f *dest, local struct AABB3f *source) -{ - float3 l = AABB3f_load_lower(source); - float3 u = AABB3f_load_upper(source); - atomic_min((global float *)&dest->lower + 0, l.x ); - atomic_min((global float *)&dest->lower + 1, l.y ); - atomic_min((global float *)&dest->lower + 2, l.z ); - atomic_max((global float *)&dest->upper + 0, u.x ); - atomic_max((global float *)&dest->upper + 1, u.y ); - atomic_max((global float *)&dest->upper + 2, u.z ); -} - - -struct AABB3f AABB3f_construct( float3 min, float3 max ) -{ - struct AABB3f bb; - bb.lower[0] = min.x; bb.lower[1] = min.y; bb.lower[2] = min.z; - bb.upper[0] = max.x; bb.upper[1] = max.y; bb.upper[2] = max.z; - return bb; -} - -struct AABB3f AABB3f_select( struct AABB3f left, struct AABB3f right, int3 cond ) -{ - float3 l = select( AABB3f_load_lower(&left), AABB3f_load_lower(&right), cond ); - float3 u = select( AABB3f_load_upper(&left), AABB3f_load_upper(&right), cond ); - return AABB3f_construct( l, u ); -} - -#endif - -GRL_NAMESPACE_END(RTAS) -GRL_NAMESPACE_END(GRL) - diff --git a/src/intel/vulkan/grl/include/GRLGen12.h b/src/intel/vulkan/grl/include/GRLGen12.h deleted file mode 100644 index 20849599e91..00000000000 --- a/src/intel/vulkan/grl/include/GRLGen12.h +++ /dev/null @@ -1,691 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// -// This file is to contain structure definitions related to the Gen12 QBVH6 acceleration structures -// -// - -//******************************************************************************************** -// WARNING!!!!! -// This file is shared by OpenCL and C++ source code and must be compatible. -// There should only be C structure definitions and trivial GRL_INLINE functions here -// -//******************************************************************************************** - -#pragma once - -#include "GRLRTASCommon.h" -#include "GRLUtilities.h" - -GRL_NAMESPACE_BEGIN(GRL) -GRL_NAMESPACE_BEGIN(RTAS) -GRL_NAMESPACE_BEGIN(GEN12) - - enum_uint8(NodeType) - { - NODE_TYPE_MIXED = 0x0, // identifies a mixed internal node where each child can have a different type - NODE_TYPE_INTERNAL = 0x0, // internal BVH node with 6 children - NODE_TYPE_INSTANCE = 0x1, // instance leaf - NODE_TYPE_PROCEDURAL = 0x3, // procedural leaf - NODE_TYPE_QUAD = 0x4, // quad leaf - NODE_TYPE_INVALID = 0x7 // indicates invalid node - }; - - - typedef enum PrimLeafType - { - TYPE_NONE = 0, - - TYPE_QUAD = 0, - - /* For a node type of NODE_TYPE_PROCEDURAL we support enabling - * and disabling the opaque/non_opaque culling. */ - - TYPE_OPACITY_CULLING_ENABLED = 0, - TYPE_OPACITY_CULLING_DISABLED = 1 - } PrimLeafType; - - #define BVH_MAGIC_MACRO "GEN12_RTAS_005" // If serialization-breaking or algorithm-breaking changes are made, increment the digits at the end - static const char BVH_MAGIC[16] = BVH_MAGIC_MACRO; - - typedef struct BVHBase - { - // TODO: Implement the "copy-first-node" trick... 
duplicate root node here - - uint64_t rootNodeOffset; - - uint32_t reserved; - - uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64 - uint32_t quadLeafStart; - uint32_t quadLeafCur; - uint32_t proceduralDataStart; - uint32_t proceduralDataCur; - uint32_t instanceLeafStart; - uint32_t instanceLeafEnd; - uint32_t backPointerDataStart; // - uint32_t refitTreeletsDataStart; // refit structs - uint32_t refitStartPointDataStart; // - uint32_t BVHDataEnd; - - // number of bottom treelets - // if 1, then the bottom treelet is also tip treelet - uint32_t refitTreeletCnt; - uint32_t refitTreeletCnt2; // always 0, used for atomic updates - // data layout: - // @backPointerDataStart - // 'backpointer' - a dword per inner node. - // The bits are used as follows: - // 2:0 --> Used as a refit counter during BVH refitting. MBZ - // 5:3 --> Number of children - // 31:6 --> Index of the parent node in the internal node array - // The root node has a parent index of all ones - // @refitTreeletsDataStart - // RefitTreelet[], the last treelet is for top treelet all previous are for bottom - // @refitStartPointDataStart - // for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space - // @backPointerDataEnd - - uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves" - uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children) - uint32_t fatLeafTableStart; - uint32_t innerTableStart; - - uint32_t quadLeftoversCountNewAtomicUpdate; // number of quad leftovers for new atomic update - uint32_t quadTableSizeNewAtomicUpdate; // size of quad Table including leftovers, padded to 256 - uint32_t quadIndicesDataStart; - - uint32_t _pad[9]; - - struct RTASMetaData Meta; - - } BVHBase; - - GRL_INLINE struct GeoMetaData* BVHBase_GetGeoMetaData(BVHBase* base) - { - return (struct GeoMetaData*)(((char*)base) + base->Meta.geoDescsStart); - } - -#ifdef __OPENCL_VERSION__ -#define BVH_ROOT_NODE_OFFSET sizeof(BVHBase) -#else -#define BVH_ROOT_NODE_OFFSET sizeof(GRL::RTAS::GEN12::BVHBase) -#endif - -GRL_STATIC_ASSERT( sizeof(BVHBase) == BVH_ROOT_NODE_OFFSET, "Wrong size!"); -GRL_STATIC_ASSERT( (sizeof(BVHBase) % 64) == 0 , "Misaligned size!"); - - typedef struct BackPointers { - } BackPointers; - - // threshold for size of bottom treelets, note usually treelets will be 2-3x smaller than that number - // means that no bottom treelet has more paths than this number - #define TREELET_NUM_STARTPOINTS 1536 - - // threshold under which only one treelet will be created - #define SINGLE_TREELET_THRESHOLD 3072 - - typedef struct LeafTableEntry { - - uint backpointer; - uint inner_node_index; - uint leaf_index; - } LeafTableEntry; - - typedef struct InnerNodeTableEntry { - - uint node_index_and_numchildren; // numchildren in 3 lsbs - uint first_child; - - } InnerNodeTableEntry; - - typedef struct QuadDataIndices - { - uint header_data[4]; - uint vert_idx[4]; - } QuadDataIndices; - - typedef struct RefitTreelet { - uint32_t startpoint_offset; - uint32_t numStartpoints; - uint32_t numNonTrivialStartpoints; - uint8_t maxDepth; - uint8_t depthLess64; // depth from bottom at which there are less 64 paths - uint8_t depthLess128;// depth from bottom at which there are less 128 paths - uint8_t depthLess256;// depth from bottom at which there are less 256 paths - } RefitTreelet; - - // if RefitTreelet has number of startpoints == 1 - // it should be reinterpreted as: - typedef struct 
RefitTreeletTrivial { - uint32_t theOnlyNodeIndex; - uint32_t numStartpoints; // have to be 1 or 0 - int32_t childrenOffsetOfTheNode; // 0th node based - uint8_t maxDepth; - uint8_t numChildrenOfTheNode; - } RefitTreeletTrivial; - - // 5:0 - depth after you die - // 31:6 - Index of the inner node - typedef uint32_t StartPoint; - - struct HwInstanceLeaf; - struct QuadLeaf; - struct ProceduralLeaf; - struct InternalNode; - - typedef struct HwInstanceLeaf HwInstanceLeaf; - typedef struct InternalNode InternalNode; - typedef struct QuadLeaf QuadLeaf; - typedef struct ProceduralLeaf ProceduralLeaf; - - GRL_INLINE uint32_t BackPointer_GetParentIndex( uint32_t bp ) - { - return bp >> 6; - } - GRL_INLINE uint32_t BackPointer_GetNumChildren( uint32_t bp ) - { - return (bp >> 3) & (7); - } - GRL_INLINE uint32_t BackPointer_GetRefitCount( uint32_t bp ) - { - return bp & 7; - } - GRL_INLINE bool BackPointer_IsRoot( uint32_t bp ) - { - return (bp >> 6) == 0x03FFFFFF; - } - - GRL_INLINE InternalNode* BVHBase_GetRootNode( const BVHBase* p ) - { - return (InternalNode*)( ((char*)p) + BVH_ROOT_NODE_OFFSET); - } - - GRL_INLINE AABB3f BVHBase_GetRootAABB(const BVHBase* p) - { - return p->Meta.bounds; - } - - GRL_INLINE InternalNode* BVHBase_GetInternalNodes(const BVHBase* p) - { - return (InternalNode*)(((char*)p) + BVH_ROOT_NODE_OFFSET); - } - GRL_INLINE InternalNode* BVHBase_GetInternalNodesEnd(const BVHBase* p) - { - return (InternalNode*)(((char*)p) + (size_t)(64u * p->nodeDataCur)); - } - GRL_INLINE uint32_t BVHBase_GetNumInternalNodes(const BVHBase* p) - { - return p->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64; - } - - - GRL_INLINE QuadLeaf* BVHBase_GetQuadLeaves(const BVHBase* p) - { - return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafStart)); - } - GRL_INLINE const QuadLeaf* BVHBase_GetQuadLeaves_End(const BVHBase* p) - { - return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafCur)); - } - - GRL_INLINE const ProceduralLeaf* BVHBase_GetProceduralLeaves_End(const BVHBase* p) - { - return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataCur)); - } - - GRL_INLINE ProceduralLeaf* BVHBase_GetProceduralLeaves(const BVHBase* p) - { - return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataStart)); - } - - GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves(const BVHBase* p ) - { - char* pRTASBits = (char*)p; - return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafStart)); - } - - GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves_End(const BVHBase* p ) - { - char* pRTASBits = (char*) p; - return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafEnd)); - } - - GRL_INLINE uint BVHBase_GetNumHWInstanceLeaves( const BVHBase* p ) - { - return (p->instanceLeafEnd - p->instanceLeafStart) / 2; - } - - GRL_INLINE uint* BVHBase_GetRefitStartPoints(const BVHBase* p) - { - return (uint32_t*)(((char*)p) + (size_t)(64u * p->refitStartPointDataStart)); - } - - GRL_INLINE uint BVHBase_GetRefitStartPointsSize(const BVHBase* p) - { - return 64u * (p->fatLeafTableStart - p->refitStartPointDataStart); - } - - GRL_INLINE uint StartPoint_GetDepth(StartPoint s) - { - return s & ((1 << 6) - 1); - } - - GRL_INLINE uint StartPoint_GetNodeIdx(StartPoint s) - { - return s >> 6; - } - - GRL_INLINE RefitTreelet* BVHBase_GetRefitTreeletDescs(const BVHBase* p) - { - return (RefitTreelet*)(((char*)p) + (size_t)(64u * p->refitTreeletsDataStart)); - } - - // this is treelet count as should be executed, ie. num of bottom treelets if there are top and bottoms. 
- // to get real number of all treelets including tip, the formula is - // actualNumTreelets = refitTreeletCnt > 1 ? refitTreeletCnt + 1 : 1; - GRL_INLINE uint32_t* BVHBase_GetRefitTreeletCntPtr(BVHBase* p) - { - return &p->refitTreeletCnt; - } - - GRL_INLINE uint32_t BVHBase_GetRefitTreeletCnt(const BVHBase* p) - { - return p->refitTreeletCnt; - } - - GRL_INLINE uint32_t BVHBase_IsSingleTreelet(const BVHBase* p) - { - return p->refitTreeletCnt == 1; - } - - GRL_INLINE BackPointers* BVHBase_GetBackPointers(const BVHBase* p) - { - return (BackPointers*)(((char*)p) + (size_t)(64u * p->backPointerDataStart)); - } - - - GRL_INLINE LeafTableEntry* BVHBase_GetFatLeafTable(const BVHBase* p) - { - return (LeafTableEntry*)(((char*)p) + (size_t)(64u * p->fatLeafTableStart)); - } - GRL_INLINE InnerNodeTableEntry* BVHBase_GetInnerNodeTable(const BVHBase* p) - { - return (InnerNodeTableEntry*)(((char*)p) + (size_t)(64u * p->innerTableStart)); - } - GRL_INLINE QuadDataIndices* BVHBase_GetQuadDataIndicesTable(const BVHBase* p) - { - return (QuadDataIndices*)(((char*)p) + (size_t)(64u * p->quadIndicesDataStart)); - } - - GRL_INLINE unsigned* InnerNode_GetBackPointer( - BackPointers* backpointersStruct, - uint32_t inodeOffset /*in 64B units, from the earliest Inner node*/) - { - uint* backpointersArray = (uint*)backpointersStruct; - // BACKPOINTER_LAYOUT - uint new_index = inodeOffset; //<-layout canonical - //uint new_index = inodeOffset*16; //<-layout scattered - // uint new_index = (inodeOffset & (~0xFFFF)) | (((inodeOffset & 0xFF) << 8) | ((inodeOffset & 0xFF00) >> 8)); //<-layout hashed - - return backpointersArray + new_index; - } - - GRL_INLINE uint32_t BVHBase_GetRefitStructsDataSize(const BVHBase* p) - { - return 64u * (p->BVHDataEnd - p->backPointerDataStart); - } - - GRL_INLINE uint32_t BVHBase_GetBackpointersDataSize(const BVHBase* p) - { - return 64u * (p->refitTreeletsDataStart - p->backPointerDataStart); - } - - GRL_INLINE uint32_t* BVHBase_GetBVHDataEnd( const BVHBase* p ) - { - return (uint32_t*)(((char*)p) + (size_t)(64u * p->BVHDataEnd)); - } - - GRL_INLINE bool BVHBase_HasBackPointers( const BVHBase* p ) - { - return p->refitTreeletsDataStart > p->backPointerDataStart; - } - - GRL_INLINE const size_t BVHBase_GetNumQuads(const BVHBase* p) - { - return p->quadLeafCur - p->quadLeafStart; - } - - GRL_INLINE const size_t BVHBase_GetNumProcedurals(const BVHBase* p) - { - return p->proceduralDataCur - p->proceduralDataStart; - } - - GRL_INLINE const size_t BVHBase_GetNumInstances(const BVHBase* p) - { - return (p->instanceLeafEnd - p->instanceLeafStart) / 2; - } - - GRL_INLINE const size_t BVHBase_totalBytes(const BVHBase* p) - { - return p->BVHDataEnd * 64u; - } - - - - struct HwInstanceLeaf - { - /* first 64 bytes accessed during traversal */ - struct Part0 - { - //uint32_t shaderIndex : 24; - //uint32_t geomMask : 8; - uint32_t DW0; - - // uint32_t instanceContributionToHitGroupIndex : 24; - // uint32_t pad0 : 8 - // - // NOTE: Traversal shaders are implemented by aliasing instance leaves as procedural and sending them through the procedural path - // For a procedural instance, bit 29 should be set to 1, to disable "opaque culling" - // and bits 30 and 31 must be zero. 
See also the definition of the 'PrimLeafDesc' structure - uint32_t DW1; - - // uint64_t rootNodePtr : 48; - // uint64_t instFlags : 8; - // uint64_t pad1 : 8; - uint64_t DW2_DW3; - - // Vec3f world2obj_vx; // 1st row of Worl2Obj transform - float world2obj_vx_x; - float world2obj_vx_y; - float world2obj_vx_z; - - // Vec3f world2obj_vy; // 2nd row of Worl2Obj transform - float world2obj_vy_x; - float world2obj_vy_y; - float world2obj_vy_z; - - // Vec3f world2obj_vz; // 3rd row of Worl2Obj transform - float world2obj_vz_x; - float world2obj_vz_y; - float world2obj_vz_z; - - // Vec3f obj2world_p; // translation of Obj2World transform (on purpose in fist 64 bytes) - float obj2world_p_x; - float obj2world_p_y; - float obj2world_p_z; - } part0; - - /* second 64 bytes accessed during shading */ - // NOTE: Everything in this block is under SW control - struct Part1 - { - // uint64_t bvhPtr : 48; - // uint64_t pad : 16; - uint64_t DW0_DW1; - - uint32_t instanceID; - uint32_t instanceIndex; - - // Vec3f world2obj_vx; // 1st row of Worl2Obj transform - float obj2world_vx_x; - float obj2world_vx_y; - float obj2world_vx_z; - - // Vec3f world2obj_vy; // 2nd row of Worl2Obj transform - float obj2world_vy_x; - float obj2world_vy_y; - float obj2world_vy_z; - - // Vec3f world2obj_vz; // 3rd row of Worl2Obj transform - float obj2world_vz_x; - float obj2world_vz_y; - float obj2world_vz_z; - - // Vec3f obj2world_p; // translation of Obj2World transform (on purpose in fist 64 bytes) - float world2obj_p_x; - float world2obj_p_y; - float world2obj_p_z; - } part1; - }; - - __constant const uint64_t c_one = 1ul; - - GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceMask( const HwInstanceLeaf* p ) - { - return p->part0.DW0 >> 24; - } - - GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceContributionToHitGroupIndex( const HwInstanceLeaf* p ) - { - return p->part0.DW1 & 0x00ffffff; - } - - GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceFlags( const HwInstanceLeaf* p ) - { - return (p->part0.DW2_DW3 >> 48) & 0xff; - } - GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceID( const HwInstanceLeaf* p ) - { - return p->part1.instanceID; - } - - GRL_INLINE gpuva_t HwInstanceLeaf_GetBVH( const HwInstanceLeaf* p ) { return p->part1.DW0_DW1 & ((c_one << 48) - 1); } - GRL_INLINE gpuva_t HwInstanceLeaf_GetStartNode( const HwInstanceLeaf* p ) { return p->part0.DW2_DW3 & ((c_one << 48) - 1); } - GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceIndex( const HwInstanceLeaf* p ) { return p->part1.instanceIndex; } - - GRL_INLINE void HwInstanceLeaf_GetTransform(struct HwInstanceLeaf* p, float* transform) - { - transform[0] = p->part1.obj2world_vx_x; - transform[1] = p->part1.obj2world_vy_x; - transform[2] = p->part1.obj2world_vz_x; - transform[3] = p->part0.obj2world_p_x; - transform[4] = p->part1.obj2world_vx_y; - transform[5] = p->part1.obj2world_vy_y; - transform[6] = p->part1.obj2world_vz_y; - transform[7] = p->part0.obj2world_p_y; - transform[8] = p->part1.obj2world_vx_z; - transform[9] = p->part1.obj2world_vy_z; - transform[10] = p->part1.obj2world_vz_z; - transform[11] = p->part0.obj2world_p_z; - } - - GRL_INLINE void HwInstanceLeaf_SetBVH( HwInstanceLeaf* p, gpuva_t b ) { - uint64_t mask = ((c_one << 48) - 1); - uint64_t v = p->part1.DW0_DW1; - v = (b & mask) | (v & ~mask); - p->part1.DW0_DW1 = v; - } - GRL_INLINE void HwInstanceLeaf_SetStartNode( HwInstanceLeaf* p, gpuva_t b ) { - uint64_t mask = ((c_one << 48) - 1); - uint64_t v = p->part0.DW2_DW3; - v = (b & mask) | (v & ~mask); - p->part0.DW2_DW3 = v; - } - GRL_INLINE void 
HwInstanceLeaf_SetStartNodeAndInstanceFlags( HwInstanceLeaf* p, - gpuva_t root, - uint8_t flags ) { - uint64_t mask = ((1ull << 48) - 1); - uint64_t v = (root & mask) | ((uint64_t)(flags)<<48); - p->part1.DW0_DW1 = v; - } - - struct InternalNode - { - float lower[3]; // world space origin of quantization grid - int32_t childOffset; // offset to all children in 64B multiples - - uint8_t nodeType; // the type of the node - uint8_t pad; // unused byte - - int8_t exp_x; // 2^exp_x is the size of the grid in x dimension - int8_t exp_y; // 2^exp_y is the size of the grid in y dimension - int8_t exp_z; // 2^exp_z is the size of the grid in z dimension - uint8_t nodeMask; // mask used for ray filtering - - struct ChildData - { - //uint8_t blockIncr : 2; // size of child in 64 byte blocks. Must be ==2 for instance leaves, <=2 for quad leaves. - //uint8_t startPrim : 4; // start primitive in fat leaf mode or child type in mixed mode - //uint8_t pad : 2; // unused bits - uint8_t bits; - } childData[6]; - - uint8_t lower_x[6]; // the quantized lower bounds in x-dimension - uint8_t upper_x[6]; // the quantized upper bounds in x-dimension - uint8_t lower_y[6]; // the quantized lower bounds in y-dimension - uint8_t upper_y[6]; // the quantized upper bounds in y-dimension - uint8_t lower_z[6]; // the quantized lower bounds in z-dimension - uint8_t upper_z[6]; // the quantized upper bounds in z-dimension - }; - - GRL_INLINE uint InternalNode_GetChildBlockIncr( const InternalNode* p, uint idx ) - { - return p->childData[idx].bits & 3; - } - GRL_INLINE uint InternalNode_GetChildStartPrim( const InternalNode* p, uint idx ) - { - return (p->childData[idx].bits>>2) & 0xf; - } - - GRL_INLINE uint8_t InternalNode_GetChildType( const InternalNode* p, uint idx ) - { - return (p->childData[idx].bits >> 2) & 0xF; - } - - GRL_INLINE void InternalNode_SetChildType( InternalNode* p, uint idx, uint type ) - { - uint bits = p->childData[idx].bits; - const uint mask = (0xF << 2); - bits = ((type << 2) & mask) | (bits & ~mask); - p->childData[idx].bits = (uint8_t)bits; - } - - GRL_INLINE bool InternalNode_IsChildValid( const InternalNode* p, size_t child ) - { - bool lower = p->lower_x[child] & 0x80; // invalid nodes are indicated by setting lower_msb = 1 and upper_msb=0 - bool upper = p->upper_x[child] & 0x80; - return !lower || upper; - } - - GRL_INLINE AABB3f InternalNode_GetChildAABB(const InternalNode* node, size_t i) - { - float4 lower, upper; - const float4 base = { node->lower[0], node->lower[1], node->lower[2], 0.0f }; - const int4 lower_i = { node->lower_x[i], node->lower_y[i], node->lower_z[i], 0 }; - const int4 upper_i = { node->upper_x[i], node->upper_y[i], node->upper_z[i], 0 }; - const int4 exp_i = { node->exp_x, node->exp_y, node->exp_z, 0 }; - lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8); - upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8); - AABB3f aabb3f = { - { lower.x, lower.y, lower.z }, - { upper.x, upper.y, upper.z } }; - return aabb3f; - } - - GRL_INLINE void* InternalNode_GetChildren( InternalNode* node) - { - return (void*)(((char*)node) + node->childOffset * 64); - } - - typedef struct PrimLeafDesc - { - //uint32_t shaderIndex : 24; // shader index used for shader record calculations - //uint32_t geomMask : 8; // geometry mask used for ray masking - uint32_t shaderIndex_geomMask; - - //uint32_t geomIndex : 29; // the geometry index specifies the n'th geometry of the scene - //PrimLeafType type : 1; // see above - //GeometryFlags geomFlags : 2; // 
geometry flags of this geometry - uint32_t geomIndex_flags; - } PrimLeafDesc; - - GRL_INLINE uint32_t PrimLeaf_GetShaderIndex( const PrimLeafDesc* p ) - { - return p->shaderIndex_geomMask & ((1 << 24) - 1); - } - GRL_INLINE uint32_t PrimLeaf_GetGeoIndex( const PrimLeafDesc* p ) - { - return p->geomIndex_flags & ((1<<29)-1); - } - GRL_INLINE uint32_t PrimLeaf_GetGeomFlags( const PrimLeafDesc* p ) - { - return (p->geomIndex_flags >> 30); - } - GRL_INLINE uint32_t PrimLeaf_GetType(const PrimLeafDesc* p) - { - return (p->geomIndex_flags >> 29) & 1; - } - - struct QuadLeaf - { - PrimLeafDesc leafDesc; - - uint32_t primIndex0; - - //uint32_t primIndex1Delta : 16; - //uint32_t j0 : 2; - //uint32_t j1 : 2; - //uint32_t j2 : 2; - //uint32_t last : 1; // last quad in list - //uint32_t pad : 9; - uint32_t DW1; - - float v[4][3]; - }; - - GRL_INLINE uint32_t QuadLeaf_GetPrimIndexDelta( const QuadLeaf* p ) - { - return p->DW1 & 0x0000ffff; - } - GRL_INLINE uint32_t QuadLeaf_GetPrimIndex0( const QuadLeaf* p ) - { - return p->primIndex0; - } - GRL_INLINE uint32_t QuadLeaf_GetPrimIndex1( const QuadLeaf* p ) - { - return p->primIndex0 + QuadLeaf_GetPrimIndexDelta(p); - } - GRL_INLINE bool QuadLeaf_IsSingleTriangle( const QuadLeaf* p ) - { - return QuadLeaf_GetPrimIndexDelta(p) == 0; - } - GRL_INLINE uint32_t QuadLeaf_GetSecondTriangleIndices( const QuadLeaf* p ) - { - return (p->DW1>>16) & 0x3f; - } - - GRL_INLINE void QuadLeaf_SetVertices( QuadLeaf* quad, float3 v0, float3 v1, float3 v2, float3 v3 ) - { - quad->v[0][0] = v0.x; - quad->v[0][1] = v0.y; - quad->v[0][2] = v0.z; - quad->v[1][0] = v1.x; - quad->v[1][1] = v1.y; - quad->v[1][2] = v1.z; - quad->v[2][0] = v2.x; - quad->v[2][1] = v2.y; - quad->v[2][2] = v2.z; - quad->v[3][0] = v3.x; - quad->v[3][1] = v3.y; - quad->v[3][2] = v3.z; - } - - - struct ProceduralLeaf { - PrimLeafDesc leafDesc; - - // Number of primitives + "last" bits. - // The meaning of this section is SW-defined and flexible - uint32_t DW1 ; - uint32_t _primIndex[13]; - } ; - -GRL_NAMESPACE_END(Gen12) -GRL_NAMESPACE_END(RTAS) -GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/include/GRLIntTypes.h b/src/intel/vulkan/grl/include/GRLIntTypes.h deleted file mode 100644 index 573dbbc7481..00000000000 --- a/src/intel/vulkan/grl/include/GRLIntTypes.h +++ /dev/null @@ -1,152 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -//******************************************************************************************** -// WARNING!!!!! 
-// -// This file is shared by OpenCL and C++ source code and must be a pure C header -// There should only be C structure definitions and trivial inline functions here -// -//******************************************************************************************** - -#pragma once - -#include "GRLOCLCompatibility.h" - -GRL_NAMESPACE_BEGIN(GRL) - - typedef uint32_t dword; - typedef uint64_t qword; - typedef qword gpuva_t; - - - enum_uint8( InstanceFlags ) - { - INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1, - INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2, - INSTANCE_FLAG_FORCE_OPAQUE = 0x4, - INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8, - }; - - enum_uint8( GeometryFlags ) - { - GEOMETRY_FLAG_NONE = 0x0, - GEOMETRY_FLAG_OPAQUE = 0x1, - GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2, - }; - - enum_uint8( GeometryType ) - { - GEOMETRY_TYPE_TRIANGLES = 0, - GEOMETRY_TYPE_PROCEDURAL = 1, - NUM_GEOMETRY_TYPES = 2 - }; - - // NOTE: Does NOT match DXR - enum_uint8( IndexFormat ) - { - INDEX_FORMAT_NONE = 0, // INDEX_FORMAT_NONE Indicates non-indexed geometry - INDEX_FORMAT_R16_UINT = 2, - INDEX_FORMAT_R32_UINT = 4, - INDEX_FORMAT_END = INDEX_FORMAT_R32_UINT + 1 - }; - - // NOTE: Does NOT match DXR - enum_uint8( VertexFormat ) - { - VERTEX_FORMAT_R32G32_FLOAT = 0, - VERTEX_FORMAT_R32G32B32_FLOAT = 1, - VERTEX_FORMAT_R16G16_FLOAT = 2, - VERTEX_FORMAT_R16G16B16A16_FLOAT = 3, - VERTEX_FORMAT_R16G16_SNORM = 4, - VERTEX_FORMAT_R16G16B16A16_SNORM = 5, - VERTEX_FORMAT_R16G16B16A16_UNORM = 6, - VERTEX_FORMAT_R16G16_UNORM = 7, - VERTEX_FORMAT_R10G10B10A2_UNORM = 8, - VERTEX_FORMAT_R8G8B8A8_UNORM = 9, - VERTEX_FORMAT_R8G8_UNORM = 10, - VERTEX_FORMAT_R8G8B8A8_SNORM = 11, - VERTEX_FORMAT_R8G8_SNORM = 12, - VERTEX_FORMAT_END = VERTEX_FORMAT_R8G8_SNORM + 1 - }; - - - - enum_uint32(RTASFlags) - { - // These flags match DXR - BUILD_FLAG_ALLOW_UPDATE = 1<<0, - BUILD_FLAG_ALLOW_COMPACTION = 1<<1, - BUILD_FLAG_PREFER_FAST_TRACE = 1<<2, - BUILD_FLAG_PREFER_FAST_BUILD = 1<<3, - BUILD_FLAG_MINIMIZE_MEMORY = 1<<4, - BUILD_FLAG_PERFORM_UPDATE = 1<<5, - - // internal flags start here - BUILD_FLAG_DISALLOW_REBRAID = 1<<16, - - BUILD_FLAG_ALL = 0x0001003f - }; - - enum_uint8(BVHType) - { - BVH_TYPE_NONE, // This is a sentinel for drivers to use when compiling out GRL on non-RT devices - BVH_TYPE_GEN12, - }; - - enum_uint8(PostBuildInfoType) - { - PBI_CURRENT_SIZE, - PBI_COMPACTED_SIZE, - PBI_DXR_TOOLS_VISUALIZATION_DESC, - PBI_DXR_SERIALIZATION_DESC, - }; - - enum_uint32(HazardTypes) - { - HAZARD_RTAS_READ = 1 << 0, - HAZARD_RTAS_WRITE = 1 << 1, - HAZARD_READ = 1 << 2, - HAZARD_WRITE = 1 << 3, - HAZARD_ALL = 0xf - }; - - enum_uint32(RaytracingAccelerationStructureType) - { - TOP_LEVEL = 0x0, - BOTTOM_LEVEL = 0x1, - }; - - typedef struct PostbuildInfoCurrentSize - { - uint64_t CurrentSizeInBytes; - } PostbuildInfoCurrentSize; - - typedef struct PostbuildInfoCompactedSize - { - uint64_t CompactedSizeInBytes; - } PostbuildInfoCompactedSize; - - typedef struct PostbuildInfoToolsVisualizationDesc - { - uint64_t DecodedSizeInBytes; - } PostbuildInfoToolsVisualizationDesc; - - typedef struct PostbuildInfoSerializationDesc - { - uint64_t SerializedSizeInBytes; - uint64_t NumBottomLevelAccelerationStructurePointers; - } PostbuildInfoSerializationDesc; - - typedef struct DecodeHeader - { - RaytracingAccelerationStructureType Type; - uint32_t NumDesc; - } DecodeHeader; - - -GRL_NAMESPACE_END(GRL) \ No newline at end of file diff --git a/src/intel/vulkan/grl/include/GRLOCLCompatibility.h 
b/src/intel/vulkan/grl/include/GRLOCLCompatibility.h deleted file mode 100644 index 119104f1532..00000000000 --- a/src/intel/vulkan/grl/include/GRLOCLCompatibility.h +++ /dev/null @@ -1,210 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#ifdef __OPENCL_VERSION__ - -typedef uchar uint8_t; -typedef ushort uint16_t; -typedef uint uint32_t; -typedef ulong uint64_t; -typedef char int8_t; -typedef short int16_t; -typedef int int32_t; -typedef long int64_t; - -#else - -#include - -typedef uint8_t uchar; -typedef uint16_t ushort; -typedef uint32_t uint; -typedef uint64_t ulong; - -#define __constant -#define __global - -typedef struct uint2 -{ -#ifdef __cplusplus - uint2() {}; - uint2( uint ix, uint iy ) : x( ix ), y( iy ) {}; -#endif - uint x; - uint y; -} uint2; - -typedef struct uint3 -{ -#ifdef __cplusplus - uint3() {}; - uint3( uint ix, uint iy, uint iz ) : x( ix ), y( iy ), z( iz ) {}; -#endif - uint x; - uint y; - uint z; -} uint3; - -typedef struct int3 -{ - int32_t x; - int32_t y; - int32_t z; - -#ifdef __cplusplus - int3() {}; - int3(int32_t ix, int32_t iy, int32_t iz) : x(ix), y(iy), z(iz) {}; - - int3 operator+(const int32_t i) const { return int3(this->x + i, this->y + i, this->z + i); } - int3 operator<<(const int32_t i) const { return int3(this->x << i, this->y << i, this->z << i); } -#endif -} int3; - -typedef struct int4 -{ - int32_t x; - int32_t y; - int32_t z; - int32_t w; - -#ifdef __cplusplus - int4() {}; - int4(int32_t ix, int32_t iy, int32_t iz, int32_t iw) : x(ix), y(iy), z(iz), w(iw) {}; - - int4 operator+(const int32_t i) const { return int4(this->x + i, this->y + i, this->z + i, this->w + i); } - int4 operator-(const int32_t i) const { return int4(this->x - i, this->y - i, this->z - i, this->w - i); } - int4 operator<<(const int32_t i) const { return int4(this->x << i, this->y << i, this->z << i, this->w << i); } -#endif -} int4; - -typedef struct float3 -{ - float x; - float y; - float z; - -#ifdef __cplusplus - float3(){}; - float3( float ix, float iy, float iz ) : x(ix), y(iy), z(iz){}; - - float3 operator+( const float3& f3 ) { return float3( this->x + f3.x, this->y + f3.y, this->z + f3.z ); } - float3 operator*( const float& f ) { return float3( this->x * f, this->y * f, this->z * f ); } - float3 operator*( const float3& f3 ) const { return float3(this->x * f3.x, this->y * f3.y, this->z * f3.z); } - float3 operator-() { return float3(-this->x, -this->y, -this->z); } - float3 operator-( const float3& f3) { return float3(this->x - f3.x, this->y - f3.y, this->z - f3.z); } -#endif -} float3; - -typedef struct float4 -{ - float x; - float y; - float z; - float w; - -#ifdef __cplusplus - float4() {}; - float4( float ix, float iy, float iz, float iw ) : x( ix ), y( iy ), z( iz ), w( iw ) {}; - - float4 operator+(const float4& f4) const { return float4(this->x + f4.x, this->y + f4.y, this->z + f4.z, this->w + f4.w); } - float4 operator*(const float4& f4) const { return float4(this->x * f4.x, this->y * f4.y, this->z * f4.z, this->w * f4.w); } -#endif -} float4; - -#endif /* ! 
__OPENCL_VERSION__ */ - - -#ifndef __cplusplus - -#define GRL_NAMESPACE_BEGIN(x) -#define GRL_NAMESPACE_END(x) -#define GRL_OVERLOADABLE __attribute((overloadable)) -#define GRL_INLINE __attribute__((always_inline)) inline static - -# define enum_uint8(name) \ - typedef uint8_t name; \ - enum name##_uint32 -# define enum_uint16(name) \ - typedef uint16_t name; \ - enum name##_uint32 -# define enum_uint32(name) \ - typedef uint32_t name; \ - enum name##_uint32 - -#define OCL_BYTE_ALIGN(n) __attribute__ ((aligned (n))) -#define GRL_STATIC_ASSERT(condition,desc) - -#else /* C++ */ -#ifdef __OPENCL_VERSION__ -#error "OpenCL C++ not supported by this header" -#endif - -#define GRL_NAMESPACE_BEGIN(x) namespace x { -#define GRL_NAMESPACE_END(x) } -#define GRL_OVERLOADABLE -#define GRL_INLINE inline - -#define enum_uint8(N) enum N : uint8_t -#define enum_uint16(N) enum N : uint16_t -#define enum_uint32(N) enum N : uint32_t - -#define OCL_BYTE_ALIGN(n) -#define GRL_STATIC_ASSERT(condition,desc) static_assert( condition, desc ) - -#include - -inline float3 fmin(float3 a, float3 b) -{ - float3 o = { std::fmin(a.x, b.x), std::fmin(a.y, b.y), std::fmin(a.z, b.z) }; - return o; -} - -inline float3 fmax(float3 a, float3 b) -{ - float3 o = { std::fmax(a.x, b.x), std::fmax(a.y, b.y), std::fmax(a.z, b.z) }; - return o; -} - -inline float3 operator/(const float3& f3, const float& f) { return float3(f3.x / f, f3.y / f, f3.z / f); } - -inline float dot(const float3& a, const float3& b) { - return a.x * b.x + a.y * b.y + a.z * b.z; -} - -inline float as_float(uint32_t i) -{ - union { float f; uint32_t i; } fi; - - fi.i = i; - return fi.f; -} - -inline float3 as_float3(int3 i3) -{ - float3 o = { as_float(i3.x), as_float(i3.y), as_float(i3.z) }; - return o; -} - -inline float4 as_float4(int4 i4) -{ - float4 o = { as_float(i4.x), as_float(i4.y), as_float(i4.z), as_float(i4.w) }; - return o; -} - -inline float4 convert_float4_rtn(int4 i4) -{ - return float4(static_cast(i4.x), static_cast(i4.y), static_cast(i4.z), static_cast(i4.w)); -} - -inline float4 convert_float4_rtp(int4 i4) -{ - return convert_float4_rtn(i4); -} - -#endif diff --git a/src/intel/vulkan/grl/include/GRLRTASCommon.h b/src/intel/vulkan/grl/include/GRLRTASCommon.h deleted file mode 100644 index 1f2cda2ea0b..00000000000 --- a/src/intel/vulkan/grl/include/GRLRTASCommon.h +++ /dev/null @@ -1,142 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -// -// This file is to contain structure definitions for RTAS-related meta-deta. -// The structures here should be generic enough to apply to any acceleration structure. -// If we ever move to KD-Trees or Octrees, this file should not need to change. -// - -//******************************************************************************************** -// WARNING!!!!! -// -// This file is shared by OpenCL and C++ source code and must be a pure C header -// There should only be C structure definitions and trivial inline functions here -// -//******************************************************************************************** - - -#pragma once -#include "GRLIntTypes.h" - -GRL_NAMESPACE_BEGIN(GRL) -GRL_NAMESPACE_BEGIN(RTAS) - - typedef struct SerializationIdentifier - { - uint8_t Bytes[16]; - } SerializationIdentifier; - - GRL_STATIC_ASSERT(sizeof(SerializationIdentifier) == 16, "Wrong size!"); - - - // Header structure for RTAS serialization. 
- // This structure is binary-compatible with the DXR and Vulkan API definitions - typedef struct SerializationHeader - { - SerializationIdentifier DriverID; // DXR 'DriverOpaqueGUID'. Vulkan: 'driverUUID' - SerializationIdentifier GRLID; // DXR 'DriverOpaqueVersioningData'. Vulkan: 'accelerationStructureUUID' - - uint64_t SerializedSizeInBytesIncludingHeader; - uint64_t DeserializedSizeInBytes; - uint64_t InstanceHandleCount; - } SerializationHeader; - - GRL_STATIC_ASSERT(sizeof(SerializationHeader) == 56, "Wrong size!"); - - // This structure is binary-compatible with DXR and Vulkan 'InstanceDesc' structures - typedef struct InstanceDesc { - float Transform[3][4]; - uint32_t InstanceIDAndMask; // mask in 8 msbs - uint32_t InstanceContributionToHitGroupIndexAndFlags; // flags in 8 msbs - gpuva_t AccelerationStructureGPUVA; // NOTE: In GRL this is always a VA. Vulkan CPU builds use handles here, and these may need to be translated - } InstanceDesc; - GRL_STATIC_ASSERT(sizeof(InstanceDesc) == 64, "Wrong size!"); - - typedef struct GeoMetaData{ - uint32_t PrimitiveCount; - uint16_t Type; - uint16_t Flags; - } GeoMetaData; - GRL_STATIC_ASSERT(sizeof(GeoMetaData) == 8, "Wrong size!"); - - typedef struct AABB3f { - float lower[3]; - float upper[3]; - } AABB3f; - GRL_STATIC_ASSERT(sizeof(AABB3f) == 24, "Wrong size!"); - - enum_uint32(error_t_) { - error_t_no_error = 0x0, - error_t_internal_node_child_OOB = 0x1, - error_t_leaf_node_child_OOB = 0x2, - error_t_unrecognised_node_t = 0x4, - error_t_mixed_node_unsupported = 0x8, - error_t_instance_pointers_inconsistent = 0x10, - error_t_instance_pointed_root_not_internal = 0x20, - error_t_leaf_node_instance_child_missed_by_64B = 0x40, - error_t_internal_node_child_cycle = 0x80, - error_t_input_geo_insane = 0x100, - error_t_quad_leaf_broken = 0x200, - error_t_backpointer_not_reset = 0x400, - error_t_backpointer_wrong_children_num = 0x500, - error_t_backpointer_inconsitent_parent_child = 0x600, - error_t_backpointer_root_not_root_error = 0x700, - error_t_backpointer_OOB = 0x800, - error_t_backpointers_buffer_too_small = 0x900, - error_t_atomic_update_struct_fatleaf_count_oob = 0x1000, // for this and following: - error_t_atomic_update_struct_fatleaf_node_idx_oob = 0x2000, // offset_in_BVH is just index in fatleaf or inner node arrays - error_t_atomic_update_struct_fatleaf_backpointer_mismatch = 0x3000, - error_t_atomic_update_struct_fatleaf_num_children_error = 0x4000, - error_t_atomic_update_struct_fatleaf_children_non_leaf = 0x5000, - error_t_atomic_update_struct_inner_count_oob = 0x6000, - error_t_atomic_update_struct_inner_node_idx_oob = 0x7000, - error_t_atomic_update_struct_inner_node_child_idx_error = 0x8000, - error_t_atomic_update_struct_inner_num_children_error = 0x9000, - error_t_atomic_update_struct_inner_children_non_internal = 0xA000, - error_t_unknown = 1u << 31, - }; - - enum_uint32(error_phase_t) { - error_phase_t_unknown = 0, - error_phase_t_post_build_Morton = 1, - error_phase_t_post_build_Trivial = 2, - error_phase_t_post_build_NewSAH = 3, - error_phase_t_post_update = 4, - error_phase_t_pre_update = 5, - error_phase_t_post_copy_op = 6, - }; - - typedef struct ERROR_INFO { - error_t_ type; - uint offset_in_BVH; //in 64B units - error_phase_t when; - uint reserved; - } ERROR_INFO; - - // Meta-data common to all acceleration structures, which is needed to implement required functionality - // All RTAS structures must contain a struct of this type named 'Meta' - typedef struct RTASMetaData { - struct AABB3f bounds; - - uint32_t 
instanceDescsStart; // byte offset to array of original instance_descs used for build. Required for DXR visualization and serialization - uint32_t instanceCount; - - uint32_t geoDescsStart; // byte offset to array of 'GeoMetaData' matching input geos. Required for DXR visualization - uint32_t geoCount; - - uint64_t allocationSize; // Size of the memory allocation containing this RTAS - // This is the size given to the app in the prebuild info when the RTAS was first created - // If RTAS was compacted, this will be the compacted size - - ERROR_INFO errors; // only used in debug mode - } RTASMetaData; - - GRL_STATIC_ASSERT( sizeof(RTASMetaData) == 64, "Wrong size!"); - -GRL_NAMESPACE_END(RTAS) -GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/include/GRLStructs.h b/src/intel/vulkan/grl/include/GRLStructs.h deleted file mode 100644 index 35130ec0810..00000000000 --- a/src/intel/vulkan/grl/include/GRLStructs.h +++ /dev/null @@ -1,60 +0,0 @@ -// -// Copyright (C) 2009-2021 Intel Corporation -// -// SPDX-License-Identifier: MIT -// -// - -#pragma once - -#include "GRLIntTypes.h" - -GRL_NAMESPACE_BEGIN(GRL) -GRL_NAMESPACE_BEGIN(_INTERNAL) - - struct GeometryTriangles - { - gpuva_t pTransformBuffer; - gpuva_t pIndexBuffer; - gpuva_t pVertexBuffer; - qword VertexBufferByteStride; - dword IndexCount; - dword VertexCount; - IndexFormat IndexFormat; - VertexFormat VertexFormat; - }; - - struct GeometryProcedural - { - gpuva_t pAABBs_GPUVA; ///, <0,1,0>, <0,0,1> - float obx = Transform[0] * Transform[0] + Transform[4] * Transform[4] + Transform[8] * Transform[8]; - float oby = Transform[1] * Transform[1] + Transform[5] * Transform[5] + Transform[9] * Transform[9]; - float obz = Transform[2] * Transform[2] + Transform[6] * Transform[6] + Transform[10] * Transform[10]; - - float obb_sq_half_surf = obx * oby + oby * obz + obz * obx; - - return obb_sq_half_surf / aabb_sq_half_surf; - - // ex = 2.0 - // ey = 2.0 - // ez = 2.0 - // ex = 4.0 - // ey = 4.0 - // ez = 4.0 - // aabb_half_surf = 16+16 *2.0 + 2.0*2.0+ 2.0*2.0; = 12; - // aabb_sq_half_surf = 144; - // - // obx = 4.0; - // oby = 4.0; - // obz = 4.0; - // obb_sq_half_surf = 16 + 16+ 16; - // obb_sq_half_surf = 16.0 *3 = 48 -} - -GRL_INLINE void load_row_major_from_AffineSpace3f(struct AffineSpace3f in, float* out) -{ - out[0] = in.l.vx.x; - out[4] = in.l.vx.y; - out[8] = in.l.vx.z; - out[1] = in.l.vy.x; - out[5] = in.l.vy.y; - out[9] = in.l.vy.z; - out[2] = in.l.vz.x; - out[6] = in.l.vz.y; - out[10] = in.l.vz.z; - - out[3] = in.p.x; - out[7] = in.p.y; - out[11] = in.p.z; -} - -GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct AffineSpace3f xfm, float3 p) -{ - return xfmPoint(xfm.l, p) + xfm.p; -} - -/* compute inverse matrix */ -GRL_INLINE struct AffineSpace3f AffineSpace3f_invert(struct AffineSpace3f in) -{ - const struct LinearSpace3f il = LinearSpace3f_invert(in.l); - float3 ip = -xfmPoint(il, in.p); - return AffineSpace3f_Constructor(il, ip); -} - -GRL_NAMESPACE_END(RTAS) -GRL_NAMESPACE_END(GRL) diff --git a/src/intel/vulkan/grl/meson.build b/src/intel/vulkan/grl/meson.build deleted file mode 100644 index 0bac5f8e460..00000000000 --- a/src/intel/vulkan/grl/meson.build +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright © 2021 Intel Corporation -# SPDX-License-Identifier: MIT - -grl_lib_files = [ - 'gpu/libs/libraries.grl', -] - -grl_grl_files = [ - 'gpu/build_leaf.grl', - 'gpu/build_primref.grl', -# 'gpu/build_refit.grl', - 'gpu/copy.grl', -# 'gpu/grl_api_interface_verify.grl', - 'gpu/misc.grl', -# 'gpu/morton_builder.grl', -# 
-  'gpu/new_sah_builder.grl',
-  'gpu/postbuild_info.grl',
-# 'gpu/presplit.grl',
-# 'gpu/radix_sort.grl',
-# 'gpu/rebraid.grl',
-# 'gpu/traversal_shader.grl',
-]
-
-grl_lib_args = []
-foreach libfile : grl_lib_files
-  grl_lib_args += '--library'
-  grl_lib_args += files(libfile)
-endforeach
-
-grl_genX_files = [
-  'genX_grl_dispatch.c',
-  'genX_grl_uuid.cpp',
-]
-
-grl_lib_args = []
-foreach libfile : grl_lib_files
-  grl_lib_args += '--library'
-  grl_lib_args += files(libfile)
-endforeach
-
-grl_cl_kernel_h = custom_target(
-  'grl_cl_kernel.h',
-  input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
-  output : 'grl_cl_kernel.h',
-  command : [
-    prog_python, '@INPUT0@', '--out-h', '@OUTPUT@',
-    grl_lib_args, files(grl_grl_files),
-  ],
-)
-
-has_ply = run_command(
-  prog_python, '-c',
-  '''
-import ply
-  ''', check : false)
-if has_ply.returncode() != 0
-  error('Python (3.x) ply module required to build GRL kernels.')
-endif
-
-r = run_command(prog_python, 'grl_cl_kernel_gen.py',
-                grl_lib_args, '--ls-kernels', grl_grl_files, check : false)
-assert(r.returncode() == 0, 'Failed to fetch GRL CL kernels')
-grl_kernels = r.stdout().strip().split()
-
-grl_metakernel_c = []
-grl_metakernel_h = []
-foreach grl_file : grl_grl_files
-  base_outfile = 'grl_metakernel_' + fs.replace_suffix(fs.name(grl_file), '')
-  outfiles = custom_target(
-    base_outfile,
-    input : ['grl_metakernel_gen.py', grl_file, grl_lib_files],
-    output : [base_outfile + '.h', base_outfile + '.c'],
-    command : [
-      prog_python, '@INPUT0@', '--out-h', '@OUTPUT0@',
-      '--out-c', '@OUTPUT1@', grl_lib_args, '@INPUT1@',
-    ],
-  )
-  grl_metakernel_h += outfiles[0]
-  grl_metakernel_c += outfiles[1]
-endforeach
-
-grl_genX_libs = []
-foreach t : [['125', 'gfx125', 'dg2'], ['200', 'gfx20', 'lnl'],
-             ['300', 'gfx30', 'ptl'], ]
-  verX10 = t[0]
-  genX_prefix = t[1]
-  platform = t[2]
-
-  grl_compiled_cl_kernels = []
-  foreach k : grl_kernels
-    # get_cl_files dumps out filename:entrypoint:libfile1,libfile2,libfile3
-    cl_file = k.split(':')[0]
-    entrypoint = k.split(':')[1]
-    library_files = k.split(':')[2]
-    kernel_prefix = '_'.join([
-      genX_prefix,
-      fs.replace_suffix(cl_file, '').replace('gpu/', '').replace('/', '_'),
-      entrypoint
-    ])
-    input_args = [ files(cl_file), ]
-    if library_files != ''
-      foreach lib_file : library_files.split(',')
-        input_args += [ lib_file ]
-      endforeach
-    endif
-    prepended_input_args = []
-    foreach input_arg : input_args
-      prepended_input_args += ['--in', input_arg]
-    endforeach
-    outfile = kernel_prefix + '.h'
-    grl_compiled_cl_kernels += custom_target(
-      outfile,
-      input : cl_file,
-      output : outfile,
-      command : [
-        prog_intel_clc, '-p', platform, '--prefix', kernel_prefix,
-        '-e', entrypoint, prepended_input_args, '-o', '@OUTPUT@', '--',
-        '-cl-std=cl2.0', '-D__OPENCL_VERSION__=200',
-        '-DMAX_HW_SIMD_WIDTH=16', '-DMAX_WORKGROUP_SIZE=16',
-        '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
-        '-I' + join_paths(meson.current_source_dir(), 'gpu'),
-        '-I' + join_paths(meson.current_source_dir(), 'include'),
-      ],
-      env: ['MESA_SHADER_CACHE_DISABLE=true',
-            'MESA_SPIRV_LOG_LEVEL=error'],
-      depends : dep_prog_intel_clc
-    )
-  endforeach
-
-  grl_cl_kernel_c = custom_target(
-    'grl_@0@_cl_kernel.c'.format(genX_prefix),
-    input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
-    output : 'grl_@0@_cl_kernel.c'.format(genX_prefix),
-    command : [
-      prog_python, '@INPUT0@', '--out-c', '@OUTPUT@',
-      grl_lib_args, '--prefix', genX_prefix, files(grl_grl_files),
-    ],
-  )
-
-  grl_genX_libs += static_library(
-    'grl_@0@'.format(genX_prefix),
-    [grl_cl_kernel_h, grl_compiled_cl_kernels, grl_cl_kernel_c,
-     grl_genX_files, grl_metakernel_c, grl_metakernel_h],
-    include_directories : [
-      inc_include, inc_src,
-      inc_intel,
-    ],
-    c_args : [
-      no_override_init_args, sse2_args,
-      '-DGFX_VERx10=@0@'.format(verX10),
-    ],
-    cpp_args : [
-      sse2_args,
-      '-DGFX_VERx10=@0@'.format(verX10),
-    ],
-    dependencies : [
-      dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers,
-      idep_vulkan_runtime_headers, idep_anv_headers, idep_genxml,
-    ],
-    gnu_symbol_visibility : 'hidden',
-  )
-endforeach
-
-libgrl_deps = [
-  dep_valgrind,
-  idep_nir_headers,
-  idep_vulkan_util_headers,
-  idep_vulkan_wsi_headers,
-]
-
-libgrl = static_library(
-  'grl',
-  [grl_cl_kernel_h],
-  include_directories : [
-    inc_include, inc_src, inc_intel,
-  ],
-  link_whole : [grl_genX_libs],
-  dependencies : [libgrl_deps, idep_anv_headers],
-)
-idep_grl = declare_dependency(
-  link_with : libgrl,
-  dependencies : libgrl_deps,
-  sources : [grl_metakernel_h, grl_cl_kernel_h],
-  include_directories : include_directories('include', 'gpu'),
-)
diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build
index 03e91c57e76..f2d1d0fbca7 100644
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -39,22 +39,10 @@ idep_anv_headers = declare_dependency(
 
 bvh_spv = []
 if with_intel_vk_rt
-  if with_intel_bvh_grl
-    subdir('grl')
-    optional_libgrl = [libgrl]
-    anv_flags += '-DANV_SUPPORT_RT_GRL=1'
-  else
-    subdir('bvh')
-    idep_grl = null_dep
-    optional_libgrl = []
-    anv_flags += '-DANV_SUPPORT_RT_GRL=0'
-  endif
+  subdir('bvh')
   anv_flags += '-DANV_SUPPORT_RT=1'
 else
-  idep_grl = null_dep
-  optional_libgrl = []
   anv_flags += '-DANV_SUPPORT_RT=0'
-  anv_flags += '-DANV_SUPPORT_RT_GRL=0'
 endif
 
 intel_icd = custom_target(
@@ -111,15 +99,9 @@ anv_per_hw_ver_files = files(
   'genX_simple_shader.c',
 )
 if with_intel_vk_rt
-  if with_intel_bvh_grl
-    anv_per_hw_ver_files += files(
-      'genX_acceleration_structure_grl.c',
-    )
-  else
-    anv_per_hw_ver_files += files(
-      'genX_acceleration_structure.c',
-    )
-  endif
+  anv_per_hw_ver_files += files(
+    'genX_acceleration_structure.c',
+  )
 endif
 
 foreach _gfx_ver : ['90', '110', '120', '125', '200', '300']
@@ -135,7 +117,7 @@ foreach _gfx_ver : ['90', '110', '120', '125', '200', '300']
       dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml,
       idep_vulkan_util_headers, idep_vulkan_wsi_headers,
       idep_vulkan_runtime_headers, idep_mesautil,
-      idep_intel_driver_ds_headers, idep_grl,
+      idep_intel_driver_ds_headers,
       idep_intel_shaders, idep_intel_blorp,
     ],
   )
@@ -271,7 +253,7 @@ libvulkan_intel = shared_library(
   include_directories : [
     inc_include, inc_src, inc_intel,
   ],
-  link_whole : [libanv_common, libanv_per_hw_ver_libs] + optional_libgrl,
+  link_whole : [libanv_common, libanv_per_hw_ver_libs],
   link_with : [
     libisl, libintel_perf,
   ],
@@ -313,7 +295,7 @@ if with_tests
     link_with : [
       libanv_per_hw_ver_libs, libintel_common, libisl,
      libintel_perf,
-    ] + optional_libgrl,
+    ],
    dependencies : [
      dep_thread, dep_dl, dep_m, anv_deps, idep_nir,
      idep_vulkan_util, idep_vulkan_wsi, idep_vulkan_runtime,