intel: remove GRL/intel-clc
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35227>
parent 44bff7eb05
commit f0e18c475b
93 changed files with 10 additions and 40555 deletions
meson.build | 17

@@ -307,29 +307,12 @@ with_any_broadcom = [
with_intel_vk_rt = get_option('intel-rt') \
  .disable_auto_if(not with_intel_vk) \
  .disable_if(get_option('intel-bvh-grl') and \
              host_machine.cpu_family() != 'x86_64', \
              error_message : 'Intel Ray Tracing is only supported on x86_64') \
  .allowed()

with_intel_bvh_grl = get_option('intel-bvh-grl')

if get_option('intel-clc') != 'system' and \
   get_option('precomp-compiler') != 'system' and \
   with_intel_bvh_grl
  # Require intel-clc with Anv & Iris (for internal shaders)
  with_intel_clc = get_option('intel-clc') == 'enabled' or \
                   get_option('precomp-compiler') == 'enabled' or \
                   with_intel_bvh_grl
else
  with_intel_clc = false
endif

with_any_intel = [
  with_gallium_crocus,
  with_gallium_i915,
  with_gallium_iris,
  with_intel_clc,
  with_intel_hasvk,
  with_intel_tools,
  with_intel_vk,
@@ -294,7 +294,7 @@ option(
  type : 'array',
  value : [],
  choices : [
    'device-select', 'intel-nullhw', 'overlay', 'screenshot',
    'device-select', 'intel-nullhw', 'overlay', 'screenshot',
    'vram-report-limit',
  ],
  description : 'List of vulkan layers to build'

@@ -693,13 +693,6 @@ option(
  description : 'Build the intel-clc compiler or use a system version.'
)

option(
  'intel-bvh-grl',
  type : 'boolean',
  value : false,
  description : 'Build the BVH structure using GRL.'
)

option(
  'install-intel-clc',
  type : 'boolean',
@@ -87,7 +87,6 @@ with_nir_headers_only = (
  with_gallium_rusticl,
  with_microsoft_clc,
  with_spirv_to_dxil,
  with_intel_clc,
  with_clc,
  with_drivers_clc,
  get_option('intel-elk'),
|
@@ -1,632 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_compiler.h"
|
||||
#include "brw_kernel.h"
|
||||
#include "brw_nir.h"
|
||||
#include "elk/elk_nir.h"
|
||||
#include "compiler/brw_disasm.h"
|
||||
#include "compiler/clc/clc.h"
|
||||
#include "compiler/glsl_types.h"
|
||||
#include "compiler/nir/nir_serialize.h"
|
||||
#include "compiler/spirv/spirv_info.h"
|
||||
#include "dev/intel_debug.h"
|
||||
#include "util/build_id.h"
|
||||
#include "util/disk_cache.h"
|
||||
#include "util/macros.h"
|
||||
#include "util/mesa-sha1.h"
|
||||
#include "util/u_dynarray.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <getopt.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
/* Shader functions */
|
||||
#define SPIR_V_MAGIC_NUMBER 0x07230203
|
||||
|
||||
static struct disk_cache *
|
||||
get_disk_cache(struct brw_compiler *compiler)
|
||||
{
|
||||
#ifdef ENABLE_SHADER_CACHE
|
||||
char renderer[14];
|
||||
ASSERTED int len = snprintf(renderer, sizeof(renderer), "brw_clc_%04x",
|
||||
compiler->devinfo->pci_device_id);
|
||||
assert(len == sizeof(renderer) - 2);
|
||||
|
||||
const struct build_id_note *note =
|
||||
build_id_find_nhdr_for_addr(get_disk_cache);
|
||||
if (note == NULL) {
|
||||
fprintf(stderr, "Failed to find build-id\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
unsigned build_id_len = build_id_length(note);
|
||||
if (build_id_len < 20) {
|
||||
fprintf(stderr, "build-id too short. It needs to be a SHA\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
struct mesa_sha1 sha1_ctx;
|
||||
uint8_t sha1[20];
|
||||
_mesa_sha1_init(&sha1_ctx);
|
||||
_mesa_sha1_update(&sha1_ctx, build_id_data(note), build_id_len);
|
||||
_mesa_sha1_final(&sha1_ctx, sha1);
|
||||
|
||||
char timestamp[41];
|
||||
_mesa_sha1_format(timestamp, sha1);
|
||||
|
||||
const uint64_t driver_flags = brw_get_compiler_config_value(compiler);
|
||||
|
||||
return disk_cache_create(renderer, timestamp, driver_flags);
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
compiler_log(void *data, unsigned *id, const char *fmt, ...)
|
||||
{
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
if (INTEL_DEBUG(DEBUG_CS))
|
||||
vfprintf(stderr, fmt, args);
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
static void
|
||||
msg_callback(void *priv, const char *msg)
|
||||
{
|
||||
(void)priv;
|
||||
fprintf(stderr, "%s", msg);
|
||||
}
|
||||
|
||||
static void
|
||||
print_u32_data(FILE *fp, const char *prefix, const char *arr_name,
|
||||
const uint32_t *data, size_t len)
|
||||
{
|
||||
assert(len % 4 == 0);
|
||||
fprintf(fp, "static const uint32_t %s_%s[] = {", prefix, arr_name);
|
||||
for (unsigned i = 0; i < (len / 4); i++) {
|
||||
if (i % 4 == 0)
|
||||
fprintf(fp,"\n ");
|
||||
|
||||
fprintf(fp, " 0x%08" PRIx32 ",", data[i]);
|
||||
}
|
||||
fprintf(fp, "\n};\n");
|
||||
}
|
||||
|
||||
static void
|
||||
print_u8_data(FILE *fp, const char *prefix, const char *arr_name,
|
||||
const uint8_t *data, size_t len)
|
||||
{
|
||||
fprintf(fp, "static const uint8_t %s_%s[] = {", prefix, arr_name);
|
||||
for (unsigned i = 0; i < len; i++) {
|
||||
if (i % 16 == 0)
|
||||
fprintf(fp,"\n ");
|
||||
|
||||
fprintf(fp, " 0x%02" PRIx8 ",", data[i]);
|
||||
}
|
||||
fprintf(fp, "\n};\n");
|
||||
}
|
||||
|
||||
static const char *
|
||||
reloc_type_str(enum brw_shader_reloc_type type)
|
||||
{
|
||||
switch (type) {
|
||||
#define CASE(e) case e: return #e;
|
||||
CASE(BRW_SHADER_RELOC_TYPE_U32)
|
||||
CASE(BRW_SHADER_RELOC_TYPE_MOV_IMM)
|
||||
#undef CASE
|
||||
default:
|
||||
unreachable("Unknown relocation type");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
print_cs_prog_data_fields(FILE *fp, const char *prefix, const char *pad,
|
||||
const struct brw_cs_prog_data *cs_prog_data)
|
||||
{
|
||||
#define PROG_DATA_FIELD(fmt, field) \
|
||||
fprintf(fp, "%s." #field " = " fmt ",\n", pad, cs_prog_data->field)
|
||||
|
||||
#define PROG_DATA_BOOL_FIELD(field) \
|
||||
fprintf(fp, "%s." #field " = %s,\n", pad, \
|
||||
cs_prog_data->field ? "true" : "false")
|
||||
|
||||
PROG_DATA_FIELD("%u", base.nr_params);
|
||||
assert(cs_prog_data->base.stage == MESA_SHADER_COMPUTE);
|
||||
fprintf(fp, "%s.base.stage = MESA_SHADER_COMPUTE,\n", pad);
|
||||
assert(cs_prog_data->base.zero_push_reg == 0);
|
||||
assert(cs_prog_data->base.push_reg_mask_param == 0);
|
||||
PROG_DATA_FIELD("%u", base.curb_read_length);
|
||||
PROG_DATA_FIELD("%u", base.total_scratch);
|
||||
PROG_DATA_FIELD("%u", base.total_shared);
|
||||
PROG_DATA_FIELD("%u", base.program_size);
|
||||
PROG_DATA_FIELD("%u", base.const_data_size);
|
||||
PROG_DATA_FIELD("%u", base.const_data_offset);
|
||||
PROG_DATA_FIELD("%u", base.num_relocs);
|
||||
fprintf(fp, "%s.base.relocs = %s_relocs,\n", pad, prefix);
|
||||
PROG_DATA_FIELD("%u", base.grf_used);
|
||||
PROG_DATA_FIELD("%u", base.printf_info_count);
|
||||
fprintf(fp, "%s.base.printf_info = (u_printf_info *)%s_printfs,\n", pad, prefix);
|
||||
assert(!cs_prog_data->base.has_ubo_pull);
|
||||
assert(cs_prog_data->base.dispatch_grf_start_reg == 0);
|
||||
assert(!cs_prog_data->base.use_alt_mode);
|
||||
assert(cs_prog_data->base.param == 0);
|
||||
PROG_DATA_BOOL_FIELD(base.uses_atomic_load_store);
|
||||
fprintf(fp, "%s.local_size = { %u, %u, %u },\n", pad,
|
||||
cs_prog_data->local_size[0],
|
||||
cs_prog_data->local_size[1],
|
||||
cs_prog_data->local_size[2]);
|
||||
fprintf(fp, "%s.prog_offset = { %u, %u, %u },\n", pad,
|
||||
cs_prog_data->prog_offset[0],
|
||||
cs_prog_data->prog_offset[1],
|
||||
cs_prog_data->prog_offset[2]);
|
||||
PROG_DATA_FIELD("%u", prog_mask);
|
||||
PROG_DATA_FIELD("%u", prog_spilled);
|
||||
PROG_DATA_BOOL_FIELD(uses_barrier);
|
||||
PROG_DATA_BOOL_FIELD(uses_num_work_groups);
|
||||
assert(!cs_prog_data->uses_inline_data);
|
||||
assert(!cs_prog_data->uses_btd_stack_ids);
|
||||
PROG_DATA_FIELD("%u", push.per_thread.dwords);
|
||||
PROG_DATA_FIELD("%u", push.per_thread.regs);
|
||||
PROG_DATA_FIELD("%u", push.per_thread.size);
|
||||
PROG_DATA_FIELD("%u", push.cross_thread.dwords);
|
||||
PROG_DATA_FIELD("%u", push.cross_thread.regs);
|
||||
PROG_DATA_FIELD("%u", push.cross_thread.size);
|
||||
|
||||
#undef PROG_DATA_FIELD
|
||||
#undef PROG_DATA_BOOL_FIELD
|
||||
}
|
||||
|
||||
static void
|
||||
print_kernel(FILE *fp, const char *prefix,
|
||||
const struct brw_kernel *kernel,
|
||||
const struct brw_isa_info *isa)
|
||||
{
|
||||
struct mesa_sha1 sha1_ctx;
|
||||
_mesa_sha1_init(&sha1_ctx);
|
||||
|
||||
#define SHA1_UPDATE_VALUE(val) \
|
||||
_mesa_sha1_update(&sha1_ctx, &val, sizeof(val))
|
||||
|
||||
fprintf(fp, "#include \"intel/compiler/brw_kernel.h\"\n");
|
||||
fprintf(fp, "\n");
|
||||
|
||||
fprintf(fp, "static const struct brw_shader_reloc %s_relocs[] = {\n",
|
||||
prefix);
|
||||
for (unsigned i = 0; i < kernel->prog_data.base.num_relocs; i++) {
|
||||
const struct brw_shader_reloc *reloc = &kernel->prog_data.base.relocs[i];
|
||||
fprintf(fp, " { %"PRIu32", %s, %"PRIu32", %"PRIu32" },\n",
|
||||
reloc->id, reloc_type_str(reloc->type),
|
||||
reloc->offset, reloc->delta);
|
||||
}
|
||||
fprintf(fp, "};\n");
|
||||
_mesa_sha1_update(&sha1_ctx, kernel->prog_data.base.relocs,
|
||||
kernel->prog_data.base.num_relocs *
|
||||
sizeof(kernel->prog_data.base.relocs[0]));
|
||||
|
||||
fprintf(fp, "static const u_printf_info %s_printfs[] = {\n",
|
||||
prefix);
|
||||
for (unsigned i = 0; i < kernel->prog_data.base.printf_info_count; i++) {
|
||||
const u_printf_info *printf_info = &kernel->prog_data.base.printf_info[i];
|
||||
fprintf(fp, " {\n");
|
||||
fprintf(fp, " .num_args = %"PRIu32",\n", printf_info->num_args);
|
||||
fprintf(fp, " .arg_sizes = (unsigned []) {\n");
|
||||
for (unsigned a = 0; a < printf_info->num_args; a++)
|
||||
fprintf(fp, " %"PRIu32",\n", printf_info->arg_sizes[a]);
|
||||
fprintf(fp, " },\n");
|
||||
fprintf(fp, " .string_size = %"PRIu32",\n", printf_info->string_size);
|
||||
fprintf(fp, " .strings = (char []) {");
|
||||
for (unsigned c = 0; c < printf_info->string_size; c++) {
|
||||
if (c % 8 == 0 )
|
||||
fprintf(fp, "\n ");
|
||||
fprintf(fp, "0x%02hhx, ", printf_info->strings[c]);
|
||||
}
|
||||
fprintf(fp, "\n },\n");
|
||||
fprintf(fp, " },\n");
|
||||
}
|
||||
fprintf(fp, "};\n");
|
||||
|
||||
/* Get rid of the pointers before we hash */
|
||||
struct brw_cs_prog_data cs_prog_data = kernel->prog_data;
|
||||
cs_prog_data.base.relocs = NULL;
|
||||
assert(cs_prog_data.base.param == NULL);
|
||||
_mesa_sha1_update(&sha1_ctx, &cs_prog_data, sizeof(cs_prog_data));
|
||||
|
||||
SHA1_UPDATE_VALUE(kernel->args_size);
|
||||
SHA1_UPDATE_VALUE(kernel->arg_count);
|
||||
_mesa_sha1_update(&sha1_ctx, kernel->args,
|
||||
kernel->arg_count * sizeof(kernel->args[0]));
|
||||
|
||||
fprintf(fp, "static const struct brw_kernel_arg_desc %s_args[] = {\n",
|
||||
prefix);
|
||||
for (unsigned i = 0; i < kernel->arg_count; i++) {
|
||||
fprintf(fp, " { %d, %d },\n",
|
||||
kernel->args[i].offset, kernel->args[i].size);
|
||||
}
|
||||
fprintf(fp, "};\n\n");
|
||||
|
||||
_mesa_sha1_update(&sha1_ctx, kernel->code,
|
||||
kernel->prog_data.base.program_size);
|
||||
|
||||
fprintf(fp, "#if 0 /* BEGIN KERNEL ASSEMBLY */\n");
|
||||
fprintf(fp, "\n");
|
||||
brw_disassemble_with_errors(isa, kernel->code, 0, NULL, fp);
|
||||
fprintf(fp, "\n");
|
||||
fprintf(fp, "#endif /* END KERNEL ASSEMBLY */\n");
|
||||
print_u32_data(fp, prefix, "code", kernel->code,
|
||||
kernel->prog_data.base.program_size);
|
||||
|
||||
fprintf(fp, "static const struct brw_kernel %s = {\n", prefix);
|
||||
fprintf(fp, " .prog_data = {\n");
|
||||
print_cs_prog_data_fields(fp, prefix, " ", &kernel->prog_data);
|
||||
fprintf(fp, " },\n");
|
||||
fprintf(fp, " .args_size = %d,\n", (int)kernel->args_size);
|
||||
fprintf(fp, " .arg_count = %d,\n", (int)kernel->arg_count);
|
||||
fprintf(fp, " .args = %s_args,\n", prefix);
|
||||
fprintf(fp, " .code = %s_code,\n", prefix);
|
||||
fprintf(fp, "};\n");
|
||||
|
||||
unsigned char sha1[20];
|
||||
_mesa_sha1_final(&sha1_ctx, sha1);
|
||||
char sha1_str[41];
|
||||
_mesa_sha1_format(sha1_str, sha1);
|
||||
fprintf(fp, "const char *%s_sha1 = \"%s\";\n", prefix, sha1_str);
|
||||
}
|
||||
|
||||
static void
|
||||
print_usage(char *exec_name, FILE *f)
|
||||
{
|
||||
fprintf(f,
|
||||
"Usage: %s [options] -- [clang args]\n"
|
||||
"Options:\n"
|
||||
" -h --help Print this help.\n"
|
||||
" -e, --entrypoint <name> Specify the entry-point name.\n"
|
||||
" -L, --llvm17-wa Enable LLVM 17 workarounds for opaque pointers"
|
||||
" -p, --platform <name> Specify the target platform name.\n"
|
||||
" --prefix <prefix> Prefix for variable names in generated C code.\n"
|
||||
" -o, --out <filename> Specify the output filename.\n"
|
||||
" -i, --in <filename> Specify one input filename. Accepted multiple times.\n"
|
||||
" -s, --spv <filename> Specify the output filename for spirv.\n"
|
||||
" -n, --nir Specify whether to output serialized NIR instead of ISA.\n"
|
||||
" -g, --gfx-version <ver> Specify the Gfx version used for NIR output.\n"
|
||||
" -t, --text <filename> Specify the output filename for the parsed text\n"
|
||||
" -v, --verbose Print more information during compilation.\n"
|
||||
" -M, --llvm-version Print LLVM version.\n"
|
||||
, exec_name);
|
||||
}
|
||||
|
||||
#define OPT_PREFIX 1000
|
||||
|
||||
struct intel_clc_params {
|
||||
char *entry_point;
|
||||
char *platform;
|
||||
char *outfile;
|
||||
char *spv_outfile;
|
||||
char *txt_outfile;
|
||||
char *prefix;
|
||||
|
||||
unsigned gfx_version;
|
||||
|
||||
bool output_nir;
|
||||
bool print_info;
|
||||
bool llvm17_wa;
|
||||
|
||||
void *mem_ctx;
|
||||
|
||||
struct intel_device_info devinfo;
|
||||
};
|
||||
|
||||
#include "compiler/spirv/nir_spirv.h"
|
||||
|
||||
static int
|
||||
output_isa(const struct intel_clc_params *params, struct clc_binary *binary)
|
||||
{
|
||||
struct brw_kernel kernel = {};
|
||||
char *error_str;
|
||||
int ret = 0;
|
||||
|
||||
struct brw_isa_info _isa, *isa = &_isa;
|
||||
brw_init_isa_info(isa, ¶ms->devinfo);
|
||||
|
||||
struct brw_compiler *compiler = brw_compiler_create(params->mem_ctx,
|
||||
¶ms->devinfo);
|
||||
compiler->spilling_rate = 11;
|
||||
compiler->shader_debug_log = compiler_log;
|
||||
compiler->shader_perf_log = compiler_log;
|
||||
struct disk_cache *disk_cache = get_disk_cache(compiler);
|
||||
|
||||
if (!brw_kernel_from_spirv(compiler, disk_cache, &kernel, NULL, params->mem_ctx,
|
||||
binary->data, binary->size,
|
||||
params->entry_point, &error_str)) {
|
||||
fprintf(stderr, "Compile failed: %s\n", error_str);
|
||||
ret = -1;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (params->print_info) {
|
||||
fprintf(stdout, "kernel info:\n");
|
||||
fprintf(stdout, " uses_barrier : %u\n", kernel.prog_data.uses_barrier);
|
||||
fprintf(stdout, " uses_num_work_groups : %u\n", kernel.prog_data.uses_num_work_groups);
|
||||
fprintf(stdout, " uses_inline_data : %u\n", kernel.prog_data.uses_inline_data);
|
||||
fprintf(stdout, " local_size : %ux%ux%u\n",
|
||||
kernel.prog_data.local_size[0],
|
||||
kernel.prog_data.local_size[1],
|
||||
kernel.prog_data.local_size[2]);
|
||||
fprintf(stdout, " curb_read_length : %u\n", kernel.prog_data.base.curb_read_length);
|
||||
fprintf(stdout, " total_scratch : %u\n", kernel.prog_data.base.total_scratch);
|
||||
fprintf(stdout, " total_shared : %u\n", kernel.prog_data.base.total_shared);
|
||||
fprintf(stdout, " program_size : %u\n", kernel.prog_data.base.program_size);
|
||||
fprintf(stdout, " const_data_size : %u\n", kernel.prog_data.base.const_data_size);
|
||||
fprintf(stdout, " uses_atomic_load_store : %u\n", kernel.prog_data.base.uses_atomic_load_store);
|
||||
fprintf(stdout, " dispatch_grf_start_reg : %u\n", kernel.prog_data.base.dispatch_grf_start_reg);
|
||||
}
|
||||
|
||||
char *prefix = params->prefix;
|
||||
char prefix_tmp[256];
|
||||
if (prefix == NULL) {
|
||||
bool is_pt_5 = (params->devinfo.verx10 % 10) == 5;
|
||||
snprintf(prefix_tmp, sizeof(prefix_tmp), "gfx%d%s_clc_%s",
|
||||
params->devinfo.ver, is_pt_5 ? "5" : "", params->entry_point);
|
||||
prefix = prefix_tmp;
|
||||
}
|
||||
|
||||
if (params->outfile != NULL) {
|
||||
FILE *fp = fopen(params->outfile, "w");
|
||||
print_kernel(fp, prefix, &kernel, isa);
|
||||
fclose(fp);
|
||||
} else {
|
||||
print_kernel(stdout, prefix, &kernel, isa);
|
||||
}
|
||||
|
||||
exit:
|
||||
disk_cache_destroy(disk_cache);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void
|
||||
print_llvm_version(FILE *out)
|
||||
{
|
||||
fprintf(out, "%s\n", MESA_LLVM_VERSION_STRING);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int exit_code = 0;
|
||||
|
||||
process_intel_debug_variable();
|
||||
|
||||
static struct option long_options[] ={
|
||||
{"help", no_argument, 0, 'h'},
|
||||
{"entrypoint", required_argument, 0, 'e'},
|
||||
{"platform", required_argument, 0, 'p'},
|
||||
{"prefix", required_argument, 0, OPT_PREFIX},
|
||||
{"in", required_argument, 0, 'i'},
|
||||
{"out", required_argument, 0, 'o'},
|
||||
{"spv", required_argument, 0, 's'},
|
||||
{"text", required_argument, 0, 't'},
|
||||
{"llvm-version", no_argument, 0, 'M'},
|
||||
{"verbose", no_argument, 0, 'v'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
|
||||
struct intel_clc_params params = {};
|
||||
|
||||
struct util_dynarray clang_args;
|
||||
struct util_dynarray input_files;
|
||||
|
||||
struct clc_binary spirv_obj = {0};
|
||||
struct clc_parsed_spirv parsed_spirv_data = {0};
|
||||
struct disk_cache *disk_cache = NULL;
|
||||
|
||||
params.mem_ctx = ralloc_context(NULL);
|
||||
|
||||
util_dynarray_init(&clang_args, params.mem_ctx);
|
||||
util_dynarray_init(&input_files, params.mem_ctx);
|
||||
|
||||
int ch;
|
||||
while ((ch = getopt_long(argc, argv, "he:p:s:t:i:o:Mv", long_options, NULL)) != -1)
|
||||
{
|
||||
switch (ch)
|
||||
{
|
||||
case 'h':
|
||||
print_usage(argv[0], stdout);
|
||||
goto end;
|
||||
case 'e':
|
||||
params.entry_point = optarg;
|
||||
break;
|
||||
case 'p':
|
||||
params.platform = optarg;
|
||||
break;
|
||||
case 'o':
|
||||
params.outfile = optarg;
|
||||
break;
|
||||
case 'i':
|
||||
util_dynarray_append(&input_files, char *, optarg);
|
||||
break;
|
||||
case 's':
|
||||
params.spv_outfile = optarg;
|
||||
break;
|
||||
case 't':
|
||||
params.txt_outfile = optarg;
|
||||
break;
|
||||
case 'v':
|
||||
params.print_info = true;
|
||||
break;
|
||||
case 'M':
|
||||
print_llvm_version(stdout);
|
||||
return EXIT_SUCCESS;
|
||||
case OPT_PREFIX:
|
||||
params.prefix = optarg;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Unrecognized option \"%s\".\n", optarg);
|
||||
print_usage(argv[0], stderr);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = optind; i < argc; i++) {
|
||||
util_dynarray_append(&clang_args, char *, argv[i]);
|
||||
}
|
||||
|
||||
if (util_dynarray_num_elements(&input_files, char *) == 0) {
|
||||
fprintf(stderr, "No input file(s).\n");
|
||||
print_usage(argv[0], stderr);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
struct clc_logger logger = {
|
||||
.error = msg_callback,
|
||||
.warning = msg_callback,
|
||||
};
|
||||
|
||||
size_t total_size = 0;
|
||||
char *all_inputs = NULL;
|
||||
util_dynarray_foreach(&input_files, char *, infile) {
|
||||
int fd = open(*infile, O_RDONLY);
|
||||
if (fd < 0) {
|
||||
fprintf(stderr, "Failed to open %s\n", *infile);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
off_t len = lseek(fd, 0, SEEK_END);
|
||||
size_t new_size = total_size + len;
|
||||
all_inputs = reralloc_size(params.mem_ctx, all_inputs, new_size + 1);
|
||||
if (!all_inputs) {
|
||||
fprintf(stderr, "Failed to allocate memory\n");
|
||||
goto fail;
|
||||
}
|
||||
lseek(fd, 0, SEEK_SET);
|
||||
read(fd, all_inputs + total_size, len);
|
||||
close(fd);
|
||||
total_size = new_size;
|
||||
all_inputs[total_size] = '\0';
|
||||
}
|
||||
|
||||
if (params.txt_outfile) {
|
||||
FILE *fp = fopen(params.txt_outfile, "w");
|
||||
fwrite(all_inputs, total_size, 1, fp);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
const char *allowed_spirv_extensions[] = {
|
||||
"SPV_EXT_shader_atomic_float_add",
|
||||
"SPV_EXT_shader_atomic_float_min_max",
|
||||
"SPV_KHR_float_controls",
|
||||
"SPV_INTEL_subgroups",
|
||||
NULL,
|
||||
};
|
||||
|
||||
struct clc_compile_args clc_args = {
|
||||
.source = {
|
||||
.name = "intel_clc_files",
|
||||
.value = all_inputs,
|
||||
},
|
||||
.features = {
|
||||
.fp16 = true,
|
||||
.intel_subgroups = true,
|
||||
.subgroups = true,
|
||||
.subgroups_ifp = true,
|
||||
},
|
||||
.args = util_dynarray_begin(&clang_args),
|
||||
.num_args = util_dynarray_num_elements(&clang_args, char *),
|
||||
.allowed_spirv_extensions = allowed_spirv_extensions,
|
||||
};
|
||||
|
||||
if (!clc_compile_c_to_spirv(&clc_args, &logger, &spirv_obj, NULL)) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (params.spv_outfile) {
|
||||
FILE *fp = fopen(params.spv_outfile, "w");
|
||||
fwrite(spirv_obj.data, spirv_obj.size, 1, fp);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
glsl_type_singleton_init_or_ref();
|
||||
|
||||
if (params.platform == NULL) {
|
||||
fprintf(stderr, "No target platform name specified.\n");
|
||||
print_usage(argv[0], stderr);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
int pci_id = intel_device_name_to_pci_device_id(params.platform);
|
||||
if (pci_id < 0) {
|
||||
fprintf(stderr, "Invalid target platform name: %s\n", params.platform);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (!intel_get_device_info_for_build(pci_id, ¶ms.devinfo)) {
|
||||
fprintf(stderr, "Failed to get device information.\n");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (params.devinfo.verx10 < 125) {
|
||||
fprintf(stderr, "Platform currently not supported.\n");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (params.entry_point == NULL) {
|
||||
fprintf(stderr, "No entry-point name specified.\n");
|
||||
print_usage(argv[0], stderr);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (!clc_parse_spirv(&spirv_obj, &logger, &parsed_spirv_data))
|
||||
goto fail;
|
||||
|
||||
const struct clc_kernel_info *kernel_info = NULL;
|
||||
for (unsigned i = 0; i < parsed_spirv_data.num_kernels; i++) {
|
||||
if (strcmp(parsed_spirv_data.kernels[i].name, params.entry_point) == 0) {
|
||||
kernel_info = &parsed_spirv_data.kernels[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (kernel_info == NULL) {
|
||||
fprintf(stderr, "Kernel entrypoint %s not found\n", params.entry_point);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
exit_code = output_isa(¶ms, &spirv_obj);
|
||||
|
||||
glsl_type_singleton_decref();
|
||||
|
||||
goto end;
|
||||
|
||||
fail:
|
||||
exit_code = 1;
|
||||
|
||||
end:
|
||||
disk_cache_destroy(disk_cache);
|
||||
clc_free_parsed_spirv(&parsed_spirv_data);
|
||||
clc_free_spirv(&spirv_obj);
|
||||
ralloc_free(params.mem_ctx);
|
||||
|
||||
return exit_code;
|
||||
}
|
||||
|
|
@@ -169,42 +169,6 @@ idep_intel_compiler_brw = declare_dependency(
|
|||
],
|
||||
)
|
||||
|
||||
# For now this tool is only going to be used by Anv
|
||||
if with_intel_bvh_grl
|
||||
if get_option('intel-clc') == 'system' or get_option('precomp-compiler') == 'system'
|
||||
prog_intel_clc = find_program('intel_clc', native : true)
|
||||
dep_prog_intel_clc = []
|
||||
elif with_intel_clc
|
||||
prog_intel_clc = executable(
|
||||
'intel_clc',
|
||||
[
|
||||
'intel_clc.c',
|
||||
'brw_kernel.c',
|
||||
|
||||
# Use just the nir_options part of ELK instead of fully linking.
|
||||
'elk/elk_nir_options.h',
|
||||
'elk/elk_nir_options.c',
|
||||
'elk/elk_spirv.c',
|
||||
],
|
||||
link_with : [libisl],
|
||||
include_directories : [inc_include, inc_src, inc_intel],
|
||||
c_args : [pre_args, no_override_init_args],
|
||||
cpp_args : ['-Werror=vla'],
|
||||
link_args : [ld_args_build_id],
|
||||
dependencies : [idep_nir, idep_mesaclc, idep_mesautil, idep_intel_dev,
|
||||
idep_intel_compiler_brw],
|
||||
# If we can run host binaries directly, just build intel_clc for the host.
|
||||
# Most commonly this happens when doing a cross compile from an x86_64 build
|
||||
# machine to an x86 host
|
||||
native : not meson.can_run_host_binaries(),
|
||||
install : get_option('install-intel-clc') or get_option('install-precomp-compiler'),
|
||||
)
|
||||
dep_prog_intel_clc = [prog_intel_clc]
|
||||
endif
|
||||
else
|
||||
dep_prog_intel_clc = []
|
||||
endif
|
||||
|
||||
if with_tests
|
||||
test(
|
||||
'intel_compiler_brw_tests',
|
||||
|
|
|
|||
|
|
@@ -1016,29 +1016,8 @@ get_buffer_format_features2(const struct intel_device_info *devinfo,
|
|||
flags |= VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT;
|
||||
|
||||
if (devinfo->has_ray_tracing) {
|
||||
#if ANV_SUPPORT_RT_GRL
|
||||
switch (vk_format) {
|
||||
case VK_FORMAT_R32G32_SFLOAT:
|
||||
case VK_FORMAT_R32G32B32_SFLOAT:
|
||||
case VK_FORMAT_R16G16_SFLOAT:
|
||||
case VK_FORMAT_R16G16B16A16_SFLOAT:
|
||||
case VK_FORMAT_R16G16_SNORM:
|
||||
case VK_FORMAT_R16G16B16A16_SNORM:
|
||||
case VK_FORMAT_R16G16B16A16_UNORM:
|
||||
case VK_FORMAT_R16G16_UNORM:
|
||||
case VK_FORMAT_R8G8B8A8_UNORM:
|
||||
case VK_FORMAT_R8G8_UNORM:
|
||||
case VK_FORMAT_R8G8B8A8_SNORM:
|
||||
case VK_FORMAT_R8G8_SNORM:
|
||||
flags |= VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#else
|
||||
if (vk_acceleration_struct_vtx_format_supported(vk_format))
|
||||
flags |= VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -2415,7 +2415,7 @@ anv_queue_post_submit(struct anv_queue *queue, VkResult submit_result)
|
|||
result = vk_queue_set_lost(&queue->vk, "sync wait failed");
|
||||
}
|
||||
|
||||
#if ANV_SUPPORT_RT && !ANV_SUPPORT_RT_GRL
|
||||
#if ANV_SUPPORT_RT
|
||||
/* The recorded bvh is dumped to files upon command buffer completion */
|
||||
if (INTEL_DEBUG_BVH_ANY)
|
||||
anv_dump_bvh_to_files(queue->device);
|
||||
|
|
|
|||
(File diff suppressed because it is too large.)
|
|
@@ -31,10 +31,6 @@
|
|||
|
||||
#include "vk_standard_sample_locations.h"
|
||||
|
||||
#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT_GRL
|
||||
#include "grl/genX_grl.h"
|
||||
#endif
|
||||
|
||||
#include "genX_mi_builder.h"
|
||||
|
||||
#include "vk_util.h"
|
||||
|
|
@@ -895,13 +891,8 @@ genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
|
|||
assert(pdevice->info.verx10 == GFX_VERx10);
|
||||
|
||||
#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
|
||||
#if ANV_SUPPORT_RT_GRL
|
||||
genX(grl_load_rt_uuid)(pdevice->rt_uuid);
|
||||
pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
|
||||
#else
|
||||
STATIC_ASSERT(sizeof(ANV_RT_UUID_MACRO) == VK_UUID_SIZE);
|
||||
memcpy(pdevice->rt_uuid, ANV_RT_UUID_MACRO, VK_UUID_SIZE);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);
|
||||
|
|
|
|||
|
|
@@ -2040,12 +2040,7 @@ void genX(CmdCopyQueryPoolResults)(
|
|||
|
||||
#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
|
||||
|
||||
#if ANV_SUPPORT_RT_GRL
|
||||
#include "grl/include/GRLRTASCommon.h"
|
||||
#include "grl/grl_metakernel_postbuild_info.h"
|
||||
#else
|
||||
#include "bvh/anv_bvh.h"
|
||||
#endif
|
||||
|
||||
void
|
||||
genX(CmdWriteAccelerationStructuresPropertiesKHR)(
|
||||
|
|
@@ -2064,66 +2059,19 @@ genX(CmdWriteAccelerationStructuresPropertiesKHR)(
|
|||
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
|
||||
|
||||
#if !ANV_SUPPORT_RT_GRL
|
||||
anv_add_pending_pipe_bits(cmd_buffer,
|
||||
ANV_PIPE_END_OF_PIPE_SYNC_BIT |
|
||||
ANV_PIPE_DATA_CACHE_FLUSH_BIT,
|
||||
"read BVH data using CS");
|
||||
#endif
|
||||
|
||||
if (append_query_clear_flush(
|
||||
cmd_buffer, pool,
|
||||
"CmdWriteAccelerationStructuresPropertiesKHR flush query clears") ||
|
||||
!ANV_SUPPORT_RT_GRL)
|
||||
"CmdWriteAccelerationStructuresPropertiesKHR flush query clears"))
|
||||
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
|
||||
|
||||
struct mi_builder b;
|
||||
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
|
||||
|
||||
#if ANV_SUPPORT_RT_GRL
|
||||
for (uint32_t i = 0; i < accelerationStructureCount; i++) {
|
||||
ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]);
|
||||
struct anv_address query_addr =
|
||||
anv_address_add(anv_query_address(pool, firstQuery + i), 8);
|
||||
|
||||
switch (queryType) {
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
|
||||
genX(grl_postbuild_info_compacted_size)(cmd_buffer,
|
||||
vk_acceleration_structure_get_va(accel),
|
||||
anv_address_physical(query_addr));
|
||||
break;
|
||||
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
|
||||
genX(grl_postbuild_info_current_size)(cmd_buffer,
|
||||
vk_acceleration_structure_get_va(accel),
|
||||
anv_address_physical(query_addr));
|
||||
break;
|
||||
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
|
||||
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
|
||||
genX(grl_postbuild_info_serialized_size)(cmd_buffer,
|
||||
vk_acceleration_structure_get_va(accel),
|
||||
anv_address_physical(query_addr));
|
||||
break;
|
||||
|
||||
default:
|
||||
unreachable("unhandled query type");
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: Figure out why MTL needs ANV_PIPE_DATA_CACHE_FLUSH_BIT in order
|
||||
* to not lose the availability bit.
|
||||
*/
|
||||
anv_add_pending_pipe_bits(cmd_buffer,
|
||||
ANV_PIPE_END_OF_PIPE_SYNC_BIT |
|
||||
ANV_PIPE_DATA_CACHE_FLUSH_BIT,
|
||||
"after write acceleration struct props");
|
||||
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
|
||||
|
||||
for (uint32_t i = 0; i < accelerationStructureCount; i++)
|
||||
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true);
|
||||
|
||||
#else
|
||||
for (uint32_t i = 0; i < accelerationStructureCount; i++) {
|
||||
ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]);
|
||||
struct anv_address query_addr =
|
||||
|
|
@@ -2163,6 +2111,5 @@ genX(CmdWriteAccelerationStructuresPropertiesKHR)(
|
|||
mi_builder_set_write_check(&b1, (i == (accelerationStructureCount - 1)));
|
||||
emit_query_mi_availability(&b1, anv_query_address(pool, firstQuery + i), true);
|
||||
}
|
||||
#endif /* ANV_SUPPORT_RT_GRL */
|
||||
}
|
||||
#endif /* GFX_VERx10 >= 125 && ANV_SUPPORT_RT */
|
||||
|
|
|
|||
src/intel/vulkan/grl/.gitignore (vendored) | 1
|
|
@@ -1 +0,0 @@
|
|||
parsetab.py
|
||||
|
|
@@ -1,54 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef ANV_GRL_H
|
||||
#define ANV_GRL_H
|
||||
|
||||
#include "grl/grl_cl_kernel.h"
|
||||
#include "genxml/gen_macros.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct anv_cmd_buffer;
|
||||
struct anv_kernel_arg;
|
||||
|
||||
void
|
||||
genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
|
||||
enum grl_cl_kernel kernel,
|
||||
const uint32_t *global_size,
|
||||
uint32_t arg_count,
|
||||
const struct anv_kernel_arg *args);
|
||||
|
||||
void
|
||||
genX(grl_load_rt_uuid)(uint8_t *out_uuid);
|
||||
|
||||
uint32_t
|
||||
genX(grl_max_scratch_size)(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* ANV_GRL_H */
|
||||
|
|
@@ -1,113 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "anv_private.h"
|
||||
#include "genX_grl.h"
|
||||
|
||||
static struct anv_shader_bin *
|
||||
get_shader_bin(struct anv_device *device,
|
||||
enum grl_cl_kernel kernel)
|
||||
{
|
||||
const char *key = genX(grl_get_cl_kernel_sha1)(kernel);
|
||||
int key_len = strlen(key);
|
||||
|
||||
bool cache_hit = false;
|
||||
struct anv_shader_bin *bin =
|
||||
anv_device_search_for_kernel(device, device->internal_cache,
|
||||
key, key_len, &cache_hit);
|
||||
if (bin != NULL)
|
||||
return bin;
|
||||
|
||||
uint32_t dummy_param[32];
|
||||
struct brw_kernel kernel_data;
|
||||
genX(grl_get_cl_kernel)(&kernel_data, kernel);
|
||||
|
||||
assert(kernel_data.prog_data.base.nr_params <= ARRAY_SIZE(dummy_param));
|
||||
kernel_data.prog_data.base.param = dummy_param;
|
||||
|
||||
struct anv_push_descriptor_info empty_push_desc_info = {};
|
||||
struct anv_pipeline_bind_map bind_map = {
|
||||
.kernel_args_size = kernel_data.args_size,
|
||||
.kernel_arg_count = kernel_data.arg_count,
|
||||
.kernel_args = (struct brw_kernel_arg_desc *)kernel_data.args,
|
||||
};
|
||||
|
||||
struct anv_shader_upload_params upload_params = {
|
||||
.stage = MESA_SHADER_KERNEL,
|
||||
.key_data = key,
|
||||
.key_size = key_len,
|
||||
.kernel_data = kernel_data.code,
|
||||
.kernel_size = kernel_data.prog_data.base.program_size,
|
||||
.prog_data = &kernel_data.prog_data.base,
|
||||
.prog_data_size = sizeof(kernel_data.prog_data),
|
||||
.bind_map = &bind_map,
|
||||
.push_desc_info = &empty_push_desc_info,
|
||||
};
|
||||
|
||||
bin = anv_device_upload_kernel(device, device->internal_cache,
|
||||
&upload_params);
|
||||
|
||||
/* The cache already has a reference and it's not going anywhere so there
|
||||
* is no need to hold a second reference.
|
||||
*/
|
||||
anv_shader_bin_unref(device, bin);
|
||||
|
||||
return bin;
|
||||
}
|
||||
|
||||
void
|
||||
genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
|
||||
enum grl_cl_kernel kernel,
|
||||
const uint32_t *global_size,
|
||||
uint32_t arg_count,
|
||||
const struct anv_kernel_arg *args)
|
||||
{
|
||||
struct anv_device *device = cmd_buffer->device;
|
||||
|
||||
const struct intel_l3_weights w =
|
||||
intel_get_default_l3_weights(device->info, true, true);
|
||||
|
||||
struct anv_kernel ak = {
|
||||
.bin = get_shader_bin(device, kernel),
|
||||
.l3_config = intel_get_l3_config(device->info, w),
|
||||
};
|
||||
|
||||
genX(cmd_buffer_dispatch_kernel)(cmd_buffer, &ak, global_size,
|
||||
arg_count, args);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
genX(grl_max_scratch_size)(void)
|
||||
{
|
||||
uint32_t scratch_size = 0;
|
||||
|
||||
for (uint32_t i = 0; i < GRL_CL_KERNEL_MAX; i++) {
|
||||
struct brw_kernel kernel_data;
|
||||
genX(grl_get_cl_kernel)(&kernel_data, i);
|
||||
|
||||
scratch_size = MAX2(kernel_data.prog_data.base.total_scratch,
|
||||
scratch_size);
|
||||
}
|
||||
|
||||
return scratch_size;
|
||||
}
|
||||
|
|
@@ -1,40 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "genX_grl.h"
|
||||
#include "include/GRLGen12.h"
|
||||
|
||||
#include "vulkan/vulkan_core.h"
|
||||
|
||||
extern "C" void
|
||||
genX(grl_load_rt_uuid)(uint8_t *out_uuid);
|
||||
|
||||
extern "C" void
|
||||
genX(grl_load_rt_uuid)(uint8_t *out_uuid)
|
||||
{
|
||||
assert(sizeof(GRL::RTAS::GEN12::BVH_MAGIC) == VK_UUID_SIZE);
|
||||
memcpy(out_uuid, GRL::RTAS::GEN12::BVH_MAGIC, VK_UUID_SIZE);
|
||||
}
|
||||
|
|
@@ -1,450 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared.h"
|
||||
#include "intrinsics.h"
|
||||
#ifndef __OPENCL_VERSION__
|
||||
#include "stdio.h"
|
||||
#endif
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
GRL_NAMESPACE_BEGIN(RTAS)
|
||||
/* ====== QUAD ENCODING config ====== */
|
||||
|
||||
#define QUAD_GEOMID_BITS 27 // dxr limit is 2^24 geos... we have headroom
|
||||
#define QUAD_PRIMID_DIFF_BITS (32 - QUAD_GEOMID_BITS)
|
||||
#define QUAD_GEOMID_MASK ((1<<QUAD_GEOMID_BITS)-1)
|
||||
|
||||
#define QUAD_PRIMID_BITS 29 // dxr limit is 2^29 prims total within one blas
|
||||
#define QUAD_PRIMID_MASK ((1<<QUAD_PRIMID_BITS)-1)
|
||||
|
||||
#define INSTANCE_ID_BITS 24
|
||||
#define INSTANCE_ID_MASK ((1<<INSTANCE_ID_BITS)-1)
|
||||
|
||||
// JDB TODO: Make this a separate, dedicated structure.. Aliasing a float4 AABB as a primref is needlessly obfuscated
|
||||
|
||||
typedef struct AABB PrimRef;
|
||||
|
||||
GRL_INLINE void AABB_init(struct AABB *aabb)
|
||||
{
|
||||
aabb->lower = (float4)(INFINITY, INFINITY, INFINITY, 0);
|
||||
aabb->upper = -(float4)(INFINITY, INFINITY, INFINITY, 0);
|
||||
}
|
||||
|
||||
GRL_INLINE uint PRIMREF_geomID( PrimRef* aabb)
|
||||
{
|
||||
const uint v = as_uint(aabb->lower.w);
|
||||
return v & QUAD_GEOMID_MASK;
|
||||
}
|
||||
|
||||
GRL_INLINE uint PRIMREF_primID0( PrimRef* aabb)
|
||||
{
|
||||
return as_uint( aabb->upper.w ) & QUAD_PRIMID_MASK;
|
||||
}
|
||||
|
||||
GRL_INLINE uint PRIMREF_primID1( PrimRef* aabb)
|
||||
{
|
||||
const uint v = as_uint(aabb->lower.w);
|
||||
const uint primID0 = as_uint(aabb->upper.w) & QUAD_PRIMID_MASK;
|
||||
const uint deltaID = v >> QUAD_GEOMID_BITS;
|
||||
const uint primID1 = primID0 + deltaID;
|
||||
return primID1;
|
||||
}
|
||||
|
||||
GRL_INLINE uint PRIMREF_geomFlags( PrimRef* aabb )
|
||||
{
|
||||
const uint v = as_uint( aabb->upper.w );
|
||||
return (v >> QUAD_PRIMID_BITS) ;
|
||||
}
|
||||
|
||||
GRL_INLINE uint PRIMREF_instanceIndex( PrimRef* aabb )
|
||||
{
|
||||
return as_uint(aabb->lower.w) & INSTANCE_ID_MASK;
|
||||
}
|
||||
|
||||
GRL_INLINE uchar PRIMREF_instanceMask( PrimRef* aabb )
|
||||
{
|
||||
return as_uint(aabb->lower.w) >> INSTANCE_ID_BITS;
|
||||
}
|
||||
|
||||
GRL_INLINE void PRIMREF_setProceduralMetaData( PrimRef* primref, uint geomID, uint primID, uint geomFlags )
|
||||
{
|
||||
/* encode geomID, primID */
|
||||
uint flags = (geomFlags << QUAD_PRIMID_BITS);
|
||||
primref->lower.w = as_float( geomID );
|
||||
primref->upper.w = as_float( primID | flags );
|
||||
}
|
||||
|
||||
GRL_INLINE void PRIMREF_setQuadMetaData( PrimRef* primref, uint primID0, uint primID1, uint geomID, uint geomFlags )
|
||||
{
|
||||
const uint primID_diff = primID1 - primID0;
|
||||
uint flags = geomFlags << QUAD_PRIMID_BITS;
|
||||
|
||||
primref->lower.w = as_float( geomID | (primID_diff << QUAD_GEOMID_BITS) );
|
||||
primref->upper.w = as_float( (primID0 | flags) );
|
||||
}
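[Editor's aside, not part of the removed GRL sources: a minimal self-contained C sketch of the quad metadata bit layout defined above, assuming only the QUAD_GEOMID_BITS/QUAD_PRIMID_BITS values and the packing performed by PRIMREF_setQuadMetaData; the example values (geomID 12345, primIDs 678/679, flags 2) are hypothetical.]

/* Standalone round-trip check of the quad primref encoding:
 * lower.w carries geomID plus the primID delta, upper.w carries
 * primID0 plus the geometry flags. */
#include <assert.h>
#include <stdint.h>

#define QUAD_GEOMID_BITS 27
#define QUAD_GEOMID_MASK ((1u << QUAD_GEOMID_BITS) - 1)
#define QUAD_PRIMID_BITS 29
#define QUAD_PRIMID_MASK ((1u << QUAD_PRIMID_BITS) - 1)

int main(void)
{
    uint32_t geomID = 12345, primID0 = 678, primID1 = 679, geomFlags = 2;

    /* Pack, mirroring PRIMREF_setQuadMetaData. */
    uint32_t lower_w = geomID | ((primID1 - primID0) << QUAD_GEOMID_BITS);
    uint32_t upper_w = primID0 | (geomFlags << QUAD_PRIMID_BITS);

    /* Unpack, mirroring PRIMREF_geomID/primID0/primID1/geomFlags. */
    assert((lower_w & QUAD_GEOMID_MASK) == geomID);
    assert((upper_w & QUAD_PRIMID_MASK) == primID0);
    assert((upper_w & QUAD_PRIMID_MASK) + (lower_w >> QUAD_GEOMID_BITS) == primID1);
    assert((upper_w >> QUAD_PRIMID_BITS) == geomFlags);
    return 0;
}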
|
||||
|
||||
GRL_INLINE void PRIMREF_setAABB( PrimRef* primref, float3 lower, float3 upper )
|
||||
{
|
||||
primref->lower.xyz = lower.xyz;
|
||||
primref->upper.xyz = upper.xyz;
|
||||
}
|
||||
|
||||
GRL_INLINE PrimRef PRIMREF_set_instance( float3 lower, float3 upper, uint instanceIndex, uint instanceMask, uint rootOffset, bool is_procedural )
|
||||
{
|
||||
PrimRef new_ref;
|
||||
new_ref.lower.xyz = lower;
|
||||
new_ref.lower.w = as_float(instanceIndex | (instanceMask << 24));
|
||||
new_ref.upper.xyz = upper;
|
||||
new_ref.upper.w = as_float(rootOffset + (is_procedural? 0x80000000 : 0));
|
||||
return new_ref;
|
||||
}
|
||||
|
||||
GRL_INLINE bool PRIMREF_isProceduralInstance( PrimRef* primref )
|
||||
{
|
||||
return (as_uint(primref->upper.w) & 0x80000000) != 0;
|
||||
}
|
||||
|
||||
GRL_INLINE uint PRIMREF_instanceRootNodeOffset(PrimRef* primref)
|
||||
{
|
||||
return (as_uint(primref->upper.w) & 0x7fffffff);
|
||||
}
|
||||
|
||||
GRL_INLINE float3 PRIMREF_lower( PrimRef* primref )
|
||||
{
|
||||
return primref->lower.xyz;
|
||||
}
|
||||
GRL_INLINE float3 PRIMREF_upper( PrimRef* primref )
|
||||
{
|
||||
return primref->upper.xyz;
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB_extend(struct AABB *aabb, struct AABB *v)
|
||||
{
|
||||
aabb->lower = min(aabb->lower, v->lower);
|
||||
aabb->upper = max(aabb->upper, v->upper);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB_extend_point(struct AABB *aabb, const float4 p)
|
||||
{
|
||||
aabb->lower = min(aabb->lower, p);
|
||||
aabb->upper = max(aabb->upper, p);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB_extendlu(struct AABB *aabb, const float4 lower, const float4 upper)
|
||||
{
|
||||
aabb->lower = min(aabb->lower, lower);
|
||||
aabb->upper = max(aabb->upper, upper);
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB AABB_enlarge(struct AABB *aabb, const float v)
|
||||
{
|
||||
struct AABB box;
|
||||
box.lower = aabb->lower - (float4)v;
|
||||
box.upper = aabb->upper + (float4)v;
|
||||
return box;
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB_intersect(struct AABB *aabb, struct AABB *v)
|
||||
{
|
||||
aabb->lower = max(aabb->lower, v->lower);
|
||||
aabb->upper = min(aabb->upper, v->upper);
|
||||
}
|
||||
|
||||
GRL_INLINE float4 AABB_size(struct AABB *aabb)
|
||||
{
|
||||
return aabb->upper - aabb->lower;
|
||||
}
|
||||
|
||||
GRL_INLINE float4 AABB_centroid2(struct AABB *aabb)
|
||||
{
|
||||
return aabb->lower + aabb->upper;
|
||||
}
|
||||
|
||||
GRL_INLINE float AABB_halfArea(struct AABB *aabb)
|
||||
{
|
||||
const float4 d = AABB_size(aabb);
|
||||
return halfarea(d.xyz);
|
||||
}
|
||||
|
||||
GRL_INLINE float AABB_intersecion_size(struct AABB* aabb, struct AABB* v)
|
||||
{
|
||||
struct AABB temp = *aabb;
|
||||
AABB_intersect(&temp, v);
|
||||
float4 len = AABB_size(&temp);
|
||||
float ret = 0.0f;
|
||||
if (len.x >= 0.0f && len.y >= 0.0f && len.z >= 0.0f) {
|
||||
float3 v = { len.x, len.y, len.z };
|
||||
ret = halfarea(v);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
GRL_INLINE bool AABB_subset(struct AABB* small, struct AABB* big)
|
||||
{
|
||||
const int4 b0 = small->lower >= big->lower;
|
||||
const int4 b1 = small->upper <= big->upper;
|
||||
const int4 b = b0 & b1;
|
||||
return b.x & b.y & b.z;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB AABBfromAABB3f(const struct AABB3f box)
|
||||
{
|
||||
struct AABB box4d = {
|
||||
{box.lower[0], box.lower[1], box.lower[2], 0.0f},
|
||||
{box.upper[0], box.upper[1], box.upper[2], 0.0f}
|
||||
};
|
||||
return box4d;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB3f AABB3fFromAABB(const struct AABB box)
|
||||
{
|
||||
struct AABB3f box3d = {
|
||||
{box.lower[0], box.lower[1], box.lower[2]},
|
||||
{box.upper[0], box.upper[1], box.upper[2]}
|
||||
};
|
||||
return box3d;
|
||||
}
|
||||
|
||||
GRL_INLINE bool AABB_verify(struct AABB* aabb)
|
||||
{
|
||||
bool error = false;
|
||||
if (aabb->lower.x > aabb->upper.x)
|
||||
error = true;
|
||||
if (aabb->lower.y > aabb->upper.y)
|
||||
error = true;
|
||||
if (aabb->lower.z > aabb->upper.z)
|
||||
error = true;
|
||||
if (!isfinite(aabb->lower.x))
|
||||
error = true;
|
||||
if (!isfinite(aabb->lower.y))
|
||||
error = true;
|
||||
if (!isfinite(aabb->lower.z))
|
||||
error = true;
|
||||
if (!isfinite(aabb->upper.x))
|
||||
error = true;
|
||||
if (!isfinite(aabb->upper.y))
|
||||
error = true;
|
||||
if (!isfinite(aabb->upper.z))
|
||||
error = true;
|
||||
return error;
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB_print(struct AABB* aabb)
|
||||
{
|
||||
printf("AABB {\n area = %f\n lower = %f\n upper = %f\n geomID = %i primID0 = %i primID1 = %i\n aabb->lower.w = %x aabb->upper.w = %x }\n",
|
||||
AABB_halfArea(aabb),
|
||||
aabb->lower.xyz,
|
||||
aabb->upper.xyz,
|
||||
PRIMREF_geomID(aabb),
|
||||
PRIMREF_primID0(aabb),
|
||||
PRIMREF_primID1(aabb),
|
||||
as_uint(aabb->lower.w),
|
||||
as_uint(aabb->upper.w));
|
||||
}
|
||||
|
||||
#ifdef __OPENCL_VERSION__
|
||||
|
||||
GRL_INLINE PrimRef PrimRef_sub_group_shuffle(PrimRef* primRef, const uint slotID)
|
||||
{
|
||||
PrimRef shuffledPrimref;
|
||||
shuffledPrimref.lower.x = intel_sub_group_shuffle(primRef->lower.x, slotID);
|
||||
shuffledPrimref.lower.y = intel_sub_group_shuffle(primRef->lower.y, slotID);
|
||||
shuffledPrimref.lower.z = intel_sub_group_shuffle(primRef->lower.z, slotID);
|
||||
shuffledPrimref.lower.w = intel_sub_group_shuffle(primRef->lower.w, slotID);
|
||||
shuffledPrimref.upper.x = intel_sub_group_shuffle(primRef->upper.x, slotID);
|
||||
shuffledPrimref.upper.y = intel_sub_group_shuffle(primRef->upper.y, slotID);
|
||||
shuffledPrimref.upper.z = intel_sub_group_shuffle(primRef->upper.z, slotID);
|
||||
shuffledPrimref.upper.w = intel_sub_group_shuffle(primRef->upper.w, slotID);
|
||||
return shuffledPrimref;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB AABB_sub_group_broadcast(struct AABB *aabb, const uint slotID)
|
||||
{
|
||||
struct AABB bounds;
|
||||
bounds.lower.x = sub_group_broadcast(aabb->lower.x, slotID);
|
||||
bounds.lower.y = sub_group_broadcast(aabb->lower.y, slotID);
|
||||
bounds.lower.z = sub_group_broadcast(aabb->lower.z, slotID);
|
||||
bounds.lower.w = 0;
|
||||
bounds.upper.x = sub_group_broadcast(aabb->upper.x, slotID);
|
||||
bounds.upper.y = sub_group_broadcast(aabb->upper.y, slotID);
|
||||
bounds.upper.z = sub_group_broadcast(aabb->upper.z, slotID);
|
||||
bounds.upper.w = 0;
|
||||
return bounds;
|
||||
}
|
||||
GRL_INLINE struct AABB AABB_sub_group_shuffle(struct AABB* aabb, const uint slotID)
|
||||
{
|
||||
struct AABB bounds;
|
||||
bounds.lower.x = intel_sub_group_shuffle(aabb->lower.x, slotID);
|
||||
bounds.lower.y = intel_sub_group_shuffle(aabb->lower.y, slotID);
|
||||
bounds.lower.z = intel_sub_group_shuffle(aabb->lower.z, slotID);
|
||||
bounds.lower.w = 0;
|
||||
bounds.upper.x = intel_sub_group_shuffle(aabb->upper.x, slotID);
|
||||
bounds.upper.y = intel_sub_group_shuffle(aabb->upper.y, slotID);
|
||||
bounds.upper.z = intel_sub_group_shuffle(aabb->upper.z, slotID);
|
||||
bounds.upper.w = 0;
|
||||
return bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE uint AABB_sub_group_shuffle_coordPerLane(struct AABB* aabb, const uint slotID)
|
||||
{
|
||||
float coordData[8] = {
|
||||
sub_group_broadcast(aabb->lower.x, slotID),
|
||||
sub_group_broadcast(aabb->lower.y, slotID),
|
||||
sub_group_broadcast(aabb->lower.z, slotID),
|
||||
sub_group_broadcast(aabb->lower.w, slotID),
|
||||
sub_group_broadcast(aabb->upper.x, slotID),
|
||||
sub_group_broadcast(aabb->upper.y, slotID),
|
||||
sub_group_broadcast(aabb->upper.z, slotID),
|
||||
sub_group_broadcast(aabb->upper.w, slotID) };
|
||||
|
||||
uint coordDataFiltered;
|
||||
const uint lane = get_sub_group_local_id();
|
||||
if (lane < 8) coordDataFiltered = as_uint(coordData[lane]);
|
||||
return coordDataFiltered;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB AABB_sub_group_reduce(struct AABB *aabb)
|
||||
{
|
||||
struct AABB bounds;
|
||||
bounds.lower.x = sub_group_reduce_min(aabb->lower.x);
|
||||
bounds.lower.y = sub_group_reduce_min(aabb->lower.y);
|
||||
bounds.lower.z = sub_group_reduce_min(aabb->lower.z);
|
||||
bounds.lower.w = 0;
|
||||
bounds.upper.x = sub_group_reduce_max(aabb->upper.x);
|
||||
bounds.upper.y = sub_group_reduce_max(aabb->upper.y);
|
||||
bounds.upper.z = sub_group_reduce_max(aabb->upper.z);
|
||||
bounds.upper.w = 0;
|
||||
return bounds;
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE struct AABB AABB_sub_group_reduce_N6( struct AABB* aabb )
|
||||
{
|
||||
float3 l = aabb->lower.xyz;
|
||||
float3 u = aabb->upper.xyz;
|
||||
l = min( l, intel_sub_group_shuffle_down( l, l, 4 ) );
|
||||
l = min( l, intel_sub_group_shuffle_down( l, l, 2 ) );
|
||||
l = min( l, intel_sub_group_shuffle_down( l, l, 1 ) );
|
||||
u = max( u, intel_sub_group_shuffle_down( u, u, 4 ) );
|
||||
u = max( u, intel_sub_group_shuffle_down( u, u, 2 ) );
|
||||
u = max( u, intel_sub_group_shuffle_down( u, u, 1 ) );
|
||||
|
||||
struct AABB bounds;
|
||||
bounds.lower.x = l.x;
|
||||
bounds.lower.y = l.y;
|
||||
bounds.lower.z = l.z;
|
||||
bounds.lower.w = 0;
|
||||
bounds.upper.x = u.x;
|
||||
bounds.upper.y = u.y;
|
||||
bounds.upper.z = u.z;
|
||||
bounds.upper.w = 0;
|
||||
return bounds;
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE struct AABB AABB_work_group_reduce(struct AABB *aabb)
|
||||
{
|
||||
struct AABB bounds;
|
||||
bounds.lower.x = work_group_reduce_min(aabb->lower.x);
|
||||
bounds.lower.y = work_group_reduce_min(aabb->lower.y);
|
||||
bounds.lower.z = work_group_reduce_min(aabb->lower.z);
|
||||
bounds.upper.x = work_group_reduce_max(aabb->upper.x);
|
||||
bounds.upper.y = work_group_reduce_max(aabb->upper.y);
|
||||
bounds.upper.z = work_group_reduce_max(aabb->upper.z);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB AABB_sub_group_scan_exclusive_min_max(struct AABB *aabb)
|
||||
{
|
||||
struct AABB bounds;
|
||||
bounds.lower.x = sub_group_scan_exclusive_min(aabb->lower.x);
|
||||
bounds.lower.y = sub_group_scan_exclusive_min(aabb->lower.y);
|
||||
bounds.lower.z = sub_group_scan_exclusive_min(aabb->lower.z);
|
||||
bounds.lower.w = 0;
|
||||
bounds.upper.x = sub_group_scan_exclusive_max(aabb->upper.x);
|
||||
bounds.upper.y = sub_group_scan_exclusive_max(aabb->upper.y);
|
||||
bounds.upper.z = sub_group_scan_exclusive_max(aabb->upper.z);
|
||||
bounds.upper.w = 0;
|
||||
return bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB AABB_sub_group_scan_inclusive_min_max(struct AABB *aabb)
|
||||
{
|
||||
struct AABB bounds;
|
||||
bounds.lower.x = sub_group_scan_inclusive_min(aabb->lower.x);
|
||||
bounds.lower.y = sub_group_scan_inclusive_min(aabb->lower.y);
|
||||
bounds.lower.z = sub_group_scan_inclusive_min(aabb->lower.z);
|
||||
bounds.lower.w = 0;
|
||||
bounds.upper.x = sub_group_scan_inclusive_max(aabb->upper.x);
|
||||
bounds.upper.y = sub_group_scan_inclusive_max(aabb->upper.y);
|
||||
bounds.upper.z = sub_group_scan_inclusive_max(aabb->upper.z);
|
||||
bounds.upper.w = 0;
|
||||
return bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB_global_atomic_merge(global struct AABB *global_aabb, struct AABB *aabb)
|
||||
{
|
||||
atomic_min((volatile __global float *)&global_aabb->lower + 0, aabb->lower.x);
|
||||
atomic_min((volatile __global float *)&global_aabb->lower + 1, aabb->lower.y);
|
||||
atomic_min((volatile __global float *)&global_aabb->lower + 2, aabb->lower.z);
|
||||
atomic_max((volatile __global float *)&global_aabb->upper + 0, aabb->upper.x);
|
||||
atomic_max((volatile __global float *)&global_aabb->upper + 1, aabb->upper.y);
|
||||
atomic_max((volatile __global float *)&global_aabb->upper + 2, aabb->upper.z);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB_global_atomic_merge_lu(global struct AABB* global_aabb, float3 lower, float3 upper )
|
||||
{
|
||||
atomic_min((volatile __global float*) & global_aabb->lower + 0, lower.x);
|
||||
atomic_min((volatile __global float*) & global_aabb->lower + 1, lower.y);
|
||||
atomic_min((volatile __global float*) & global_aabb->lower + 2, lower.z);
|
||||
atomic_max((volatile __global float*) & global_aabb->upper + 0, upper.x);
|
||||
atomic_max((volatile __global float*) & global_aabb->upper + 1, upper.y);
|
||||
atomic_max((volatile __global float*) & global_aabb->upper + 2, upper.z);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB_global_atomic_merge_sub_group_lu(uniform global struct AABB* aabb, float3 lower, float3 upper)
|
||||
{
|
||||
uint lane = get_sub_group_local_id();
|
||||
float l[3];
|
||||
l[0] = sub_group_reduce_min(lower.x);
|
||||
l[1] = sub_group_reduce_min(lower.y);
|
||||
l[2] = sub_group_reduce_min(lower.z);
|
||||
float u[3];
|
||||
u[0] = sub_group_reduce_max(upper.x);
|
||||
u[1] = sub_group_reduce_max(upper.y);
|
||||
u[2] = sub_group_reduce_max(upper.z);
|
||||
|
||||
if (lane < 3)
|
||||
{
|
||||
atomic_min((global float*)&aabb->lower + lane, l[lane]);
|
||||
atomic_max((global float*)&aabb->upper + lane, u[lane]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void AABB_local_atomic_merge(local struct AABB *aabb, const float4 lower, const float4 upper)
|
||||
{
|
||||
if (lower.x < aabb->lower.x)
|
||||
atomic_min((local float *)&aabb->lower + 0, lower.x);
|
||||
if (lower.y < aabb->lower.y)
|
||||
atomic_min((local float *)&aabb->lower + 1, lower.y);
|
||||
if (lower.z < aabb->lower.z)
|
||||
atomic_min((local float *)&aabb->lower + 2, lower.z);
|
||||
if (upper.x > aabb->upper.x)
|
||||
atomic_max((local float *)&aabb->upper + 0, upper.x);
|
||||
if (upper.y > aabb->upper.y)
|
||||
atomic_max((local float *)&aabb->upper + 1, upper.y);
|
||||
if (upper.z > aabb->upper.z)
|
||||
atomic_max((local float *)&aabb->upper + 2, upper.z);
|
||||
}
|
||||
#endif
|
||||
|
||||
GRL_NAMESPACE_END(RTAS)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@@ -1,840 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
#include "GRLStructs.h"
|
||||
#include "shared.h"
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
|
||||
typedef struct Geo GRL_RAYTRACING_GEOMETRY_DESC;
|
||||
|
||||
typedef struct GRL_RAYTRACING_AABB
|
||||
{
|
||||
float MinX;
|
||||
float MinY;
|
||||
float MinZ;
|
||||
float MaxX;
|
||||
float MaxY;
|
||||
float MaxZ;
|
||||
} GRL_RAYTRACING_AABB;
|
||||
|
||||
GRL_INLINE void GLR_set_raytracing_aabb(GRL_RAYTRACING_AABB* dest, struct AABB* source)
|
||||
{
|
||||
dest->MinX = source->lower.x;
|
||||
dest->MinY = source->lower.y;
|
||||
dest->MinZ = source->lower.z;
|
||||
dest->MaxX = source->upper.x;
|
||||
dest->MaxY = source->upper.y;
|
||||
dest->MaxZ = source->upper.z;
|
||||
}
|
||||
|
||||
GRL_INLINE uint3 GRL_load_triangle(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint triID)
|
||||
{
|
||||
global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer;
|
||||
uint index_format = geomDesc->Desc.Triangles.IndexFormat;
|
||||
|
||||
if (index_format == INDEX_FORMAT_R32_UINT)
|
||||
{
|
||||
const uint* data = (const uint*)(indices + triID * 3 * 4);
|
||||
return (uint3)(data[0], data[1], data[2]);
|
||||
}
|
||||
else if (index_format == INDEX_FORMAT_NONE)
|
||||
{
|
||||
return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2);
|
||||
}
|
||||
else
|
||||
{
|
||||
const ushort* data = (const ushort*)(indices + triID * 3 * 2);
|
||||
return (uint3)(data[0], data[1], data[2]);
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE uint3 GRL_load_indices_from_buffer(global char* indices, const uint index_format, const uint triID)
|
||||
{
|
||||
if (index_format == INDEX_FORMAT_R32_UINT)
|
||||
{
|
||||
return load_uint3_L1C_L3C((global uint3*)(indices + triID * 3 * 4), 0);
|
||||
}
|
||||
else if (index_format == INDEX_FORMAT_NONE)
|
||||
{
|
||||
return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2);
|
||||
}
|
||||
else
|
||||
{
|
||||
const ushort* data = (const ushort*)(indices + triID * 3 * 2);
|
||||
return (uint3)(data[0], data[1], data[2]);
|
||||
}
|
||||
}
|
||||
|
||||
// Load all 3 indices from one triangle, and a single index from another
|
||||
GRL_INLINE uint4 GRL_load_quad_indices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint triID, uint triID_1, ushort fourth_vert)
|
||||
{
|
||||
global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer;
|
||||
uint index_format = geomDesc->Desc.Triangles.IndexFormat;
|
||||
|
||||
if (index_format == INDEX_FORMAT_R32_UINT)
|
||||
{
|
||||
const uint* data0 = (const uint*)(indices + triID * 3 * 4);
|
||||
const uint* data1 = (const uint*)(indices + triID_1 * 3 * 4);
|
||||
return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]);
|
||||
}
|
||||
else if (index_format == INDEX_FORMAT_NONE)
|
||||
{
|
||||
return (uint4)(triID * 3, triID * 3 + 1, triID * 3 + 2, triID_1 * 3 + fourth_vert);
|
||||
}
|
||||
else
|
||||
{
|
||||
const ushort* data0 = (const ushort*)(indices + triID * 3 * 2);
|
||||
const ushort* data1 = (const ushort*)(indices + triID_1 * 3 * 2);
|
||||
return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]);
|
||||
}
|
||||
}
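A quad leaf is built from a pair of triangles: the first triangle contributes all three indices and the second contributes only its fourth_vert-th index, presumably the one vertex it does not share with the first triangle. A minimal host-side C sketch of that packing, with illustrative index values:

#include <stdio.h>
#include <stdint.h>

typedef struct { uint32_t v[3]; } Tri;

/* Mirrors the R32 path above: tri0 contributes all three indices,
 * tri1 contributes only its 'fourth_vert'-th index. */
static void pack_quad(const Tri *t0, const Tri *t1, unsigned fourth_vert, uint32_t out[4])
{
    out[0] = t0->v[0];
    out[1] = t0->v[1];
    out[2] = t0->v[2];
    out[3] = t1->v[fourth_vert];
}

int main(void)
{
    Tri t0 = { { 0, 1, 2 } };     /* hypothetical triangle pair sharing edge (1,2) */
    Tri t1 = { { 2, 1, 3 } };
    uint32_t quad[4];
    pack_quad(&t0, &t1, 2, quad); /* vertex 3 of t1 is the unshared one */
    printf("%u %u %u %u\n", (unsigned)quad[0], (unsigned)quad[1],
           (unsigned)quad[2], (unsigned)quad[3]); /* 0 1 2 3 */
    return 0;
}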
|
||||
|
||||
GRL_INLINE void GRL_set_Type(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, GeometryType type)
|
||||
{
|
||||
geomDesc->Type = type;
|
||||
}
|
||||
|
||||
GRL_INLINE GeometryType GRL_get_Type(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Type;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_Flags(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint8_t flags)
|
||||
{
|
||||
geomDesc->Flags = flags;
|
||||
}
|
||||
|
||||
GRL_INLINE uint8_t GRL_get_Flags(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Flags;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_triangles_Transform(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t transform)
|
||||
{
|
||||
geomDesc->Desc.Triangles.pTransformBuffer = transform;
|
||||
}
|
||||
|
||||
GRL_INLINE gpuva_t GRL_get_triangles_Transform(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Triangles.pTransformBuffer;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_triangles_IndexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, IndexFormat format)
|
||||
{
|
||||
geomDesc->Desc.Triangles.IndexFormat = format;
|
||||
}
|
||||
|
||||
GRL_INLINE IndexFormat GRL_get_triangles_IndexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Triangles.IndexFormat;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_triangles_VertexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, VertexFormat format)
|
||||
{
|
||||
geomDesc->Desc.Triangles.VertexFormat = format;
|
||||
}
|
||||
|
||||
GRL_INLINE VertexFormat GRL_get_triangles_VertexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Triangles.VertexFormat;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_triangles_IndexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
|
||||
{
|
||||
geomDesc->Desc.Triangles.IndexCount = count;
|
||||
}
|
||||
|
||||
GRL_INLINE dword GRL_get_triangles_IndexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Triangles.IndexCount;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
|
||||
{
|
||||
geomDesc->Desc.Triangles.VertexCount = count;
|
||||
}
|
||||
|
||||
GRL_INLINE dword GRL_get_triangles_VertexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Triangles.VertexCount;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_triangles_IndexBuffer(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t buffer)
|
||||
{
|
||||
geomDesc->Desc.Triangles.pIndexBuffer = buffer;
|
||||
}
|
||||
|
||||
GRL_INLINE gpuva_t GRL_get_triangles_IndexBuffer(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Triangles.pIndexBuffer;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_triangles_VertexBuffer_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address)
|
||||
{
|
||||
geomDesc->Desc.Triangles.pVertexBuffer = address;
|
||||
}
|
||||
|
||||
GRL_INLINE gpuva_t GRL_get_triangles_VertexBuffer_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Triangles.pVertexBuffer;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_triangles_VertexBuffer_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, unsigned long stride)
|
||||
{
|
||||
geomDesc->Desc.Triangles.VertexBufferByteStride = stride;
|
||||
}
|
||||
|
||||
GRL_INLINE unsigned long GRL_get_triangles_VertexBuffer_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Triangles.VertexBufferByteStride;
|
||||
}
|
||||
|
||||
GRL_INLINE unsigned long GRL_get_triangles_IndexFormatSizeInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return (unsigned long)(geomDesc->Desc.Triangles.IndexFormat);
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_procedurals_AABBCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
|
||||
{
|
||||
geomDesc->Desc.Procedural.AABBCount = count;
|
||||
}
|
||||
|
||||
GRL_INLINE dword GRL_get_procedurals_AABBCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Procedural.AABBCount;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_procedurals_AABBs_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address)
|
||||
{
|
||||
geomDesc->Desc.Procedural.pAABBs_GPUVA = address;
|
||||
}
|
||||
|
||||
GRL_INLINE gpuva_t GRL_get_procedurals_AABBs_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Procedural.pAABBs_GPUVA;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_procedurals_AABBs_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, qword stride)
|
||||
{
|
||||
geomDesc->Desc.Procedural.AABBByteStride = stride;
|
||||
}
|
||||
|
||||
GRL_INLINE qword GRL_get_procedurals_AABBs_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
|
||||
{
|
||||
return geomDesc->Desc.Procedural.AABBByteStride;
|
||||
}
|
||||
|
||||
GRL_INLINE uint GRL_is_procedural(GRL_RAYTRACING_GEOMETRY_DESC* desc)
|
||||
{
|
||||
return desc->Type == (unsigned char)GEOMETRY_TYPE_PROCEDURAL;
|
||||
}
|
||||
|
||||
GRL_INLINE uint GRL_is_triangle(GRL_RAYTRACING_GEOMETRY_DESC* desc)
|
||||
{
|
||||
return desc->Type != (unsigned char)GEOMETRY_TYPE_PROCEDURAL;
|
||||
}
|
||||
|
||||
GRL_INLINE unsigned int GRL_get_ShaderIndex_Mask(GRL_RAYTRACING_GEOMETRY_DESC* desc)
|
||||
{
|
||||
return 0x00FFFFFF;
|
||||
}
|
||||
|
||||
GRL_INLINE dword GRL_atomic_add_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* desc, dword value)
|
||||
{
|
||||
return atomic_add((global uint*) & desc->Desc.Triangles.VertexCount, value);
|
||||
}
|
||||
|
||||
GRL_INLINE unsigned int GRL_get_primitive_count(GRL_RAYTRACING_GEOMETRY_DESC* desc)
|
||||
{
|
||||
if (GRL_is_triangle(desc))
|
||||
{
|
||||
if (desc->Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE)
|
||||
{
|
||||
return desc->Desc.Triangles.VertexCount / 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
return desc->Desc.Triangles.IndexCount / 3;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return desc->Desc.Procedural.AABBCount;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable // to load half values
|
||||
|
||||
GRL_INLINE float snorm_to_float(short v)
|
||||
{
|
||||
return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 32767.0f))); // FIXME: do we have intrinsic for this?
|
||||
}
|
||||
|
||||
GRL_INLINE float snorm8_to_float(signed char v)
|
||||
{
|
||||
return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 127.0f))); // FIXME: do we have intrinsic for this?
|
||||
}
|
||||
|
||||
GRL_INLINE float unorm_to_float(unsigned short v)
|
||||
{
|
||||
return min(1.0f, max(0.0f, ((float)v) * (1.0f / 65535.0f))); // FIXME: do we have intrinsic for this?
|
||||
}
|
||||
|
||||
//only lower 10 bits of v are used
|
||||
GRL_INLINE float unorm10_to_float(unsigned v)
|
||||
{
|
||||
const unsigned short mask = (unsigned short)((1u << 10u) - 1u);
|
||||
const unsigned short v10 = (unsigned short)v & mask;
|
||||
return min(1.0f, max(0.0f, ((float)v10) * (1.0f / 1023.0f))); // FIXME: do we have intrinsic for this?
|
||||
}
|
||||
|
||||
GRL_INLINE float unorm8_to_float(unsigned char v)
|
||||
{
|
||||
return min(1.0f, max(0.0f, ((float)v) * (1.0f / 255.0f))); // FIXME: do we have intrinsic for this?
|
||||
}
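The conversions above are plain scale-and-clamp formulas. A small host-side C restatement (the function names and test values here are illustrative, not from the original file) makes the edge cases easy to check, e.g. that SNORM16 -32768 clamps to -1.0 and that the G channel of a packed R10G10B10A2 value comes from bits 10..19:

#include <stdio.h>

/* Reference restatements of the snorm16 / unorm10 formulas above. */
static float snorm16_ref(short v)
{
    float f = (float)v * (1.0f / 32767.0f);
    return f < -1.0f ? -1.0f : (f > 1.0f ? 1.0f : f);
}

static float unorm10_ref(unsigned v)
{
    unsigned v10 = v & 0x3FFu;           /* only the low 10 bits are used */
    return (float)v10 * (1.0f / 1023.0f);
}

int main(void)
{
    /* -32768 * (1/32767) is slightly below -1, so the clamp matters. */
    printf("%f %f %f\n", snorm16_ref(-32768), snorm16_ref(0), snorm16_ref(32767));

    /* G channel of a packed R10G10B10A2 value: shift by 10 first. */
    unsigned packed = (1023u << 20) | (512u << 10); /* hypothetical B=1.0, G~0.5, R=0 */
    printf("%f\n", unorm10_ref(packed >> 10));
    return 0;
}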
|
||||
|
||||
GRL_INLINE float4 GRL_load_vertex(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint vtxID)
|
||||
{
|
||||
float4 v = (float4)(0, 0, 0, 0);
|
||||
global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer;
|
||||
uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride;
|
||||
uint vertex_format = geomDesc->Desc.Triangles.VertexFormat;
|
||||
|
||||
if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
|
||||
{
|
||||
const float* data = (const float*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(data[0], data[1], data[2], 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
|
||||
{
|
||||
const float* data = (const float*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(data[0], data[1], 0.0f, 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
|
||||
{
|
||||
const half* data = (const half*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(data[0], data[1], data[2], 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
|
||||
{
|
||||
const half* data = (const half*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(data[0], data[1], 0.0f, 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
|
||||
{
|
||||
const short* data = (const short*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(snorm_to_float(data[0]),
|
||||
snorm_to_float(data[1]),
|
||||
snorm_to_float(data[2]),
|
||||
0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
|
||||
{
|
||||
const short* data = (const short*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(snorm_to_float(data[0]),
|
||||
snorm_to_float(data[1]),
|
||||
0.0f,
|
||||
0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
|
||||
{
|
||||
const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(unorm_to_float(data[0]),
|
||||
unorm_to_float(data[1]),
|
||||
unorm_to_float(data[2]),
|
||||
0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
|
||||
{
|
||||
const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(unorm_to_float(data[0]),
|
||||
unorm_to_float(data[1]),
|
||||
0.0f,
|
||||
0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
|
||||
{
|
||||
const unsigned data = *(const unsigned*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(unorm10_to_float(data),
|
||||
unorm10_to_float((data >> 10)),
|
||||
unorm10_to_float((data >> 20)),
|
||||
0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
|
||||
{
|
||||
const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(unorm8_to_float(data[0]),
|
||||
unorm8_to_float(data[1]),
|
||||
unorm8_to_float(data[2]),
|
||||
0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
|
||||
{
|
||||
const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(unorm8_to_float(data[0]),
|
||||
unorm8_to_float(data[1]),
|
||||
0.0f,
|
||||
0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
|
||||
{
|
||||
const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(snorm8_to_float(data[0]),
|
||||
snorm8_to_float(data[1]),
|
||||
snorm8_to_float(data[2]),
|
||||
0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
|
||||
{
|
||||
const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
|
||||
v = (float4)(snorm8_to_float(data[0]),
|
||||
snorm8_to_float(data[1]),
|
||||
0.0f,
|
||||
0.0f);
|
||||
}
|
||||
|
||||
/* perform vertex transformation */
|
||||
if (geomDesc->Desc.Triangles.pTransformBuffer)
|
||||
{
|
||||
global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer;
|
||||
const float x = xfm[0] * v.x + xfm[1] * v.y + xfm[2] * v.z + xfm[3];
|
||||
const float y = xfm[4] * v.x + xfm[5] * v.y + xfm[6] * v.z + xfm[7];
|
||||
const float z = xfm[8] * v.x + xfm[9] * v.y + xfm[10] * v.z + xfm[11];
|
||||
v = (float4)(x, y, z, 0.0f);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
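The optional transform step above treats pTransformBuffer as a row-major 3x4 affine matrix applied as p' = M * (p, 1). A small C sketch of the same arithmetic, with a hypothetical translation-only matrix:

#include <assert.h>

/* Row-major 3x4 affine transform, as in the vertex transform above. */
static void xform_point(const float xfm[12], const float in[3], float out[3])
{
    for (int r = 0; r < 3; ++r)
        out[r] = xfm[r * 4 + 0] * in[0] +
                 xfm[r * 4 + 1] * in[1] +
                 xfm[r * 4 + 2] * in[2] +
                 xfm[r * 4 + 3];
}

int main(void)
{
    /* Identity rotation plus a translation of (1, 2, 3) - illustrative values. */
    const float xfm[12] = { 1, 0, 0, 1,
                            0, 1, 0, 2,
                            0, 0, 1, 3 };
    float p[3] = { 5, 6, 7 }, q[3];
    xform_point(xfm, p, q);
    assert(q[0] == 6.0f && q[1] == 8.0f && q[2] == 10.0f);
    return 0;
}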
|
||||
|
||||
GRL_INLINE void GRL_load_triangle_vertices(global char* vertices, const uint vertex_format, const uint vertex_stride, global float* transform_buffer, const uint vtx0ID, const uint vtx1ID, const uint vtx2ID, float4* out)
|
||||
{
|
||||
if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
|
||||
{
|
||||
const float3 data0 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx0ID * vertex_stride), 0));
|
||||
const float3 data1 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx1ID * vertex_stride), 0));
|
||||
const float3 data2 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx2ID * vertex_stride), 0));
|
||||
out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f);
|
||||
out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f);
|
||||
out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
|
||||
{
|
||||
const float* data0 = (const float*)(vertices + vtx0ID * vertex_stride);
|
||||
const float* data1 = (const float*)(vertices + vtx1ID * vertex_stride);
|
||||
const float* data2 = (const float*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f);
|
||||
out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f);
|
||||
out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
|
||||
{
|
||||
const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride);
|
||||
const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride);
|
||||
const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f);
|
||||
out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f);
|
||||
out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
|
||||
{
|
||||
const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride);
|
||||
const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride);
|
||||
const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f);
|
||||
out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f);
|
||||
out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
|
||||
{
|
||||
const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride);
|
||||
const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride);
|
||||
const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]), 0.0f);
|
||||
out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]), 0.0f);
|
||||
out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]), 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
|
||||
{
|
||||
const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride);
|
||||
const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride);
|
||||
const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f, 0.0f);
|
||||
out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f, 0.0f);
|
||||
out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f, 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
|
||||
{
|
||||
const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride);
|
||||
const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride);
|
||||
const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]), 0.0f);
|
||||
out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]), 0.0f);
|
||||
out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]), 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
|
||||
{
|
||||
const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride);
|
||||
const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride);
|
||||
const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f, 0.0f);
|
||||
out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f, 0.0f);
|
||||
out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f, 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
|
||||
{
|
||||
const unsigned data0 = *(const unsigned*)(vertices + vtx0ID * vertex_stride);
|
||||
const unsigned data1 = *(const unsigned*)(vertices + vtx1ID * vertex_stride);
|
||||
const unsigned data2 = *(const unsigned*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(unorm10_to_float(data0), unorm10_to_float(data0 >> 10), unorm10_to_float(data0 >> 20), 0.0f);
|
||||
out[1] = (float4)(unorm10_to_float(data1), unorm10_to_float(data1 >> 10), unorm10_to_float(data1 >> 20), 0.0f);
|
||||
out[2] = (float4)(unorm10_to_float(data2), unorm10_to_float(data2 >> 10), unorm10_to_float(data2 >> 20), 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
|
||||
{
|
||||
const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
|
||||
const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
|
||||
const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]), 0.0f);
|
||||
out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]), 0.0f);
|
||||
out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]), 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
|
||||
{
|
||||
const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
|
||||
const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
|
||||
const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f, 0.0f);
|
||||
out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f, 0.0f);
|
||||
out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f, 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
|
||||
{
|
||||
const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
|
||||
const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
|
||||
const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]), 0.0f);
|
||||
out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]), 0.0f);
|
||||
out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]), 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
|
||||
{
|
||||
const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
|
||||
const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
|
||||
const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
|
||||
out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f, 0.0f);
|
||||
out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f, 0.0f);
|
||||
out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
/* perform vertex transformation */
|
||||
if (transform_buffer)
|
||||
{
|
||||
global float* xfm = (global float*)transform_buffer;
|
||||
for (uint i = 0; i < 3; ++i)
|
||||
{
|
||||
const float x = xfm[0] * out[i].x + xfm[1] * out[i].y + xfm[2] * out[i].z + xfm[3];
|
||||
const float y = xfm[4] * out[i].x + xfm[5] * out[i].y + xfm[6] * out[i].z + xfm[7];
|
||||
const float z = xfm[8] * out[i].x + xfm[9] * out[i].y + xfm[10] * out[i].z + xfm[11];
|
||||
out[i] = (float4)(x, y, z, 0.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_load_quad_vertices_no_stride(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
float3* out0, float3* out1, float3* out2, float3* out3,
|
||||
const uint4 vtxID, const uint vertex_format, global char* vertices)
|
||||
{
|
||||
float3 v0, v1, v2, v3;
|
||||
|
||||
if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
|
||||
{
|
||||
const float* data0 = (const float*)(vertices + vtxID.x);
|
||||
const float* data1 = (const float*)(vertices + vtxID.y);
|
||||
const float* data2 = (const float*)(vertices + vtxID.z);
|
||||
const float* data3 = (const float*)(vertices + vtxID.w);
|
||||
v0 = (float3)(data0[0], data0[1], data0[2]);
|
||||
v1 = (float3)(data1[0], data1[1], data1[2]);
|
||||
v2 = (float3)(data2[0], data2[1], data2[2]);
|
||||
v3 = (float3)(data3[0], data3[1], data3[2]);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
|
||||
{
|
||||
const float* data0 = (const float*)(vertices + vtxID.x);
|
||||
const float* data1 = (const float*)(vertices + vtxID.y);
|
||||
const float* data2 = (const float*)(vertices + vtxID.z);
|
||||
const float* data3 = (const float*)(vertices + vtxID.w);
|
||||
v0 = (float3)(data0[0], data0[1], 0.0f);
|
||||
v1 = (float3)(data1[0], data1[1], 0.0f);
|
||||
v2 = (float3)(data2[0], data2[1], 0.0f);
|
||||
v3 = (float3)(data3[0], data3[1], 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
|
||||
{
|
||||
const half* data0 = (const half*)(vertices + vtxID.x);
|
||||
const half* data1 = (const half*)(vertices + vtxID.y);
|
||||
const half* data2 = (const half*)(vertices + vtxID.z);
|
||||
const half* data3 = (const half*)(vertices + vtxID.w);
|
||||
v0 = (float3)(data0[0], data0[1], data0[2]);
|
||||
v1 = (float3)(data1[0], data1[1], data1[2]);
|
||||
v2 = (float3)(data2[0], data2[1], data2[2]);
|
||||
v3 = (float3)(data3[0], data3[1], data3[2]);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
|
||||
{
|
||||
const half* data0 = (const half*)(vertices + vtxID.x);
|
||||
const half* data1 = (const half*)(vertices + vtxID.y);
|
||||
const half* data2 = (const half*)(vertices + vtxID.z);
|
||||
const half* data3 = (const half*)(vertices + vtxID.w);
|
||||
v0 = (float3)(data0[0], data0[1], 0.0f);
|
||||
v1 = (float3)(data1[0], data1[1], 0.0f);
|
||||
v2 = (float3)(data2[0], data2[1], 0.0f);
|
||||
v3 = (float3)(data3[0], data3[1], 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
|
||||
{
|
||||
const short* data0 = (const short*)(vertices + vtxID.x);
|
||||
const short* data1 = (const short*)(vertices + vtxID.y);
|
||||
const short* data2 = (const short*)(vertices + vtxID.z);
|
||||
const short* data3 = (const short*)(vertices + vtxID.w);
|
||||
v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]));
|
||||
v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]));
|
||||
v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]));
|
||||
v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), snorm_to_float(data3[2]));
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
|
||||
{
|
||||
const short* data0 = (const short*)(vertices + vtxID.x);
|
||||
const short* data1 = (const short*)(vertices + vtxID.y);
|
||||
const short* data2 = (const short*)(vertices + vtxID.z);
|
||||
const short* data3 = (const short*)(vertices + vtxID.w);
|
||||
v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f);
|
||||
v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f);
|
||||
v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f);
|
||||
v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
|
||||
{
|
||||
const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x);
|
||||
const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y);
|
||||
const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z);
|
||||
const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w);
|
||||
v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]));
|
||||
v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]));
|
||||
v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]));
|
||||
v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), unorm_to_float(data3[2]));
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
|
||||
{
|
||||
const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x);
|
||||
const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y);
|
||||
const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z);
|
||||
const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w);
|
||||
v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f);
|
||||
v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f);
|
||||
v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f);
|
||||
v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
|
||||
{
|
||||
const unsigned data0 = *(const unsigned*)(vertices + vtxID.x);
|
||||
const unsigned data1 = *(const unsigned*)(vertices + vtxID.y);
|
||||
const unsigned data2 = *(const unsigned*)(vertices + vtxID.z);
|
||||
const unsigned data3 = *(const unsigned*)(vertices + vtxID.w);
|
||||
v0 = (float3)(unorm10_to_float(data0), unorm10_to_float((data0 >> 10)), unorm10_to_float((data0 >> 20)));
|
||||
v1 = (float3)(unorm10_to_float(data1), unorm10_to_float((data1 >> 10)), unorm10_to_float((data1 >> 20)));
|
||||
v2 = (float3)(unorm10_to_float(data2), unorm10_to_float((data2 >> 10)), unorm10_to_float((data2 >> 20)));
|
||||
v3 = (float3)(unorm10_to_float(data3), unorm10_to_float((data3 >> 10)), unorm10_to_float((data3 >> 20)));
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
|
||||
{
|
||||
const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x);
|
||||
const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y);
|
||||
const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z);
|
||||
const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w);
|
||||
v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]));
|
||||
v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]));
|
||||
v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]));
|
||||
v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), unorm8_to_float(data3[2]));
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
|
||||
{
|
||||
const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x);
|
||||
const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y);
|
||||
const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z);
|
||||
const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w);
|
||||
v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f);
|
||||
v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f);
|
||||
v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f);
|
||||
v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), 0.0f);
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
|
||||
{
|
||||
const signed char* data0 = (const signed char*)(vertices + vtxID.x);
|
||||
const signed char* data1 = (const signed char*)(vertices + vtxID.y);
|
||||
const signed char* data2 = (const signed char*)(vertices + vtxID.z);
|
||||
const signed char* data3 = (const signed char*)(vertices + vtxID.w);
|
||||
v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]));
|
||||
v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]));
|
||||
v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]));
|
||||
v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), snorm8_to_float(data3[2]));
|
||||
}
|
||||
else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
|
||||
{
|
||||
const signed char* data0 = (const signed char*)(vertices + vtxID.x);
|
||||
const signed char* data1 = (const signed char*)(vertices + vtxID.y);
|
||||
const signed char* data2 = (const signed char*)(vertices + vtxID.z);
|
||||
const signed char* data3 = (const signed char*)(vertices + vtxID.w);
|
||||
v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f);
|
||||
v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f);
|
||||
v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f);
|
||||
v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), 0.0f);
|
||||
}
|
||||
else
|
||||
{
|
||||
v0 = (float3)(0.0f, 0.0f, 0.0f);
|
||||
v1 = (float3)(0.0f, 0.0f, 0.0f);
|
||||
v2 = (float3)(0.0f, 0.0f, 0.0f);
|
||||
v3 = (float3)(0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
|
||||
/* perform vertex transformation */
|
||||
if (geomDesc->Desc.Triangles.pTransformBuffer)
|
||||
{
|
||||
global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer;
|
||||
|
||||
v0.xyz = (float3)(
|
||||
xfm[0] * v0.x + xfm[1] * v0.y + xfm[2] * v0.z + xfm[3],
|
||||
xfm[4] * v0.x + xfm[5] * v0.y + xfm[6] * v0.z + xfm[7],
|
||||
xfm[8] * v0.x + xfm[9] * v0.y + xfm[10] * v0.z + xfm[11]
|
||||
);
|
||||
|
||||
v1.xyz = (float3)(
|
||||
xfm[0] * v1.x + xfm[1] * v1.y + xfm[2] * v1.z + xfm[3],
|
||||
xfm[4] * v1.x + xfm[5] * v1.y + xfm[6] * v1.z + xfm[7],
|
||||
xfm[8] * v1.x + xfm[9] * v1.y + xfm[10] * v1.z + xfm[11]
|
||||
);
|
||||
|
||||
v2.xyz = (float3)(
|
||||
xfm[0] * v2.x + xfm[1] * v2.y + xfm[2] * v2.z + xfm[3],
|
||||
xfm[4] * v2.x + xfm[5] * v2.y + xfm[6] * v2.z + xfm[7],
|
||||
xfm[8] * v2.x + xfm[9] * v2.y + xfm[10] * v2.z + xfm[11]
|
||||
);
|
||||
|
||||
v3.xyz = (float3)(
|
||||
xfm[0] * v3.x + xfm[1] * v3.y + xfm[2] * v3.z + xfm[3],
|
||||
xfm[4] * v3.x + xfm[5] * v3.y + xfm[6] * v3.z + xfm[7],
|
||||
xfm[8] * v3.x + xfm[9] * v3.y + xfm[10] * v3.z + xfm[11]
|
||||
);
|
||||
}
|
||||
|
||||
*out0 = v0;
|
||||
*out1 = v1;
|
||||
*out2 = v2;
|
||||
*out3 = v3;
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void GRL_load_quad_vertices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
float3* out0, float3* out1, float3* out2, float3* out3,
|
||||
uint4 vtxID)
|
||||
{
|
||||
global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer;
|
||||
uint vertex_format = geomDesc->Desc.Triangles.VertexFormat;
|
||||
uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride;
|
||||
|
||||
vtxID *= vertex_stride;
|
||||
|
||||
GRL_load_quad_vertices_no_stride(geomDesc, out0, out1, out2, out3,
|
||||
vtxID, vertex_format, vertices);
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE GRL_RAYTRACING_AABB GRL_load_aabb(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint primID)
|
||||
{
|
||||
global char* aabb0 = (global char*)geomDesc->Desc.Procedural.pAABBs_GPUVA;
|
||||
global char* aabb = aabb0 + (primID * geomDesc->Desc.Procedural.AABBByteStride);
|
||||
return *(global GRL_RAYTRACING_AABB*)aabb;
|
||||
}
|
||||
|
||||
// same as for d3d12
|
||||
typedef struct GRL_RAYTRACING_INSTANCE_DESC
|
||||
{
|
||||
float Transform[12];
|
||||
// unsigned int InstanceID : 24;
|
||||
// unsigned int InstanceMask : 8;
|
||||
uint32_t DW0;
|
||||
// unsigned int InstanceContributionToHitGroupIndex : 24;
|
||||
// unsigned int Flags : 8;
|
||||
uint32_t DW1;
|
||||
global char* AccelerationStructure;
|
||||
} GRL_RAYTRACING_INSTANCE_DESC;
|
||||
|
||||
GRL_INLINE float GRL_get_transform(const GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column)
|
||||
{
|
||||
return d->Transform[row * 4 + column];
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t GRL_get_instanceID(const GRL_RAYTRACING_INSTANCE_DESC* d)
|
||||
{
|
||||
return d->DW0 & ((1 << 24) - 1);
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t GRL_get_InstanceMask(const GRL_RAYTRACING_INSTANCE_DESC* d)
|
||||
{
|
||||
return d->DW0 >> 24;
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t GRL_get_InstanceContributionToHitGroupIndex(const GRL_RAYTRACING_INSTANCE_DESC* d)
|
||||
{
|
||||
return d->DW1 & ((1 << 24) - 1);
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t GRL_get_InstanceFlags(const GRL_RAYTRACING_INSTANCE_DESC* d)
|
||||
{
|
||||
return d->DW1 >> 24;
|
||||
}
|
||||
|
||||
GRL_INLINE gpuva_t GRL_get_AccelerationStructure(const GRL_RAYTRACING_INSTANCE_DESC* d)
|
||||
{
|
||||
return (gpuva_t)d->AccelerationStructure;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_transform(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column, float value)
|
||||
{
|
||||
d->Transform[row * 4 + column] = value;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_instanceID(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t id)
|
||||
{
|
||||
d->DW0 &= 255 << 24;
|
||||
d->DW0 |= id & ((1 << 24) - 1);
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_InstanceMask(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t mask)
|
||||
{
|
||||
d->DW0 &= ((1 << 24) - 1);
|
||||
d->DW0 |= mask << 24;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_InstanceContributionToHitGroupIndex(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t contribution)
|
||||
{
|
||||
d->DW1 &= 255 << 24;
|
||||
d->DW1 |= contribution & ((1 << 24) - 1);
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_InstanceFlags(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t flags)
|
||||
{
|
||||
d->DW1 &= ((1 << 24) - 1);
|
||||
d->DW1 |= flags << 24;
|
||||
}
|
||||
|
||||
GRL_INLINE void GRL_set_AccelerationStructure(GRL_RAYTRACING_INSTANCE_DESC* d, gpuva_t address)
|
||||
{
|
||||
d->AccelerationStructure = (global char*)address;
|
||||
}
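As the commented bitfields indicate, DW0 packs InstanceID in the low 24 bits and InstanceMask in the top 8, and DW1 packs InstanceContributionToHitGroupIndex and Flags the same way. A minimal host-side C round-trip using the same masks as the accessors above (the field values are illustrative):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t DW0 = 0, DW1 = 0;
    uint32_t id = 0x123456, mask = 0xAB, contrib = 0x42, flags = 0x01;

    /* set_instanceID / set_InstanceMask */
    DW0 = (DW0 & (0xFFu << 24)) | (id & 0xFFFFFFu);
    DW0 = (DW0 & 0xFFFFFFu)     | (mask << 24);

    /* set_InstanceContributionToHitGroupIndex / set_InstanceFlags */
    DW1 = (DW1 & (0xFFu << 24)) | (contrib & 0xFFFFFFu);
    DW1 = (DW1 & 0xFFFFFFu)     | (flags << 24);

    /* get_* accessors recover the packed fields */
    assert((DW0 & 0xFFFFFFu) == id);
    assert((DW0 >> 24) == mask);
    assert((DW1 & 0xFFFFFFu) == contrib);
    assert((DW1 >> 24) == flags);
    return 0;
}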
|
||||
File diff suppressed because it is too large
|
|
@@ -1,198 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module atomic_update;
|
||||
|
||||
kernel_module atomic_update ("atomic_update.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
kernel init_refit_scratch < kernelFunction = "init_refit_scratch" >;
|
||||
kernel traverse_aabbs_quad < kernelFunction = "traverse_aabbs_quad" >;
|
||||
kernel write_inner_nodes < kernelFunction = "write_inner_nodes" >;
|
||||
kernel build_fatleaf_table < kernelFunction = "build_fatleaf_table" >;
|
||||
kernel build_innernode_table < kernelFunction = "build_innernode_table" >;
|
||||
|
||||
kernel update_single_group_quads < kernelFunction = "update_single_group_quads" >;
|
||||
|
||||
kernel build_fatleaf_table_new_update < kernelFunction = "build_fatleaf_table_new_update" >;
|
||||
kernel fixup_quad_table < kernelFunction = "fixup_quad_table" >;
|
||||
kernel traverse_aabbs_new_update < kernelFunction = "traverse_aabbs_new_update" >;
|
||||
kernel traverse_aabbs_new_update_single_geo < kernelFunction = "traverse_aabbs_new_update_single_geo" >;
|
||||
}
|
||||
|
||||
import struct MKBuilderState "structs.grl";
|
||||
|
||||
// this metakernel only initializes registers for use in a batching loop by "init_refit_scratch"
|
||||
metakernel init_refit_scratch_metakernel_registers()
|
||||
{
|
||||
REG0.hi = 0;
|
||||
REG1 = 3;
|
||||
REG2 = 63;
|
||||
REG3 = 4;
|
||||
REG4 = 2;
|
||||
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
}
|
||||
|
||||
metakernel init_refit_scratch( qword bvh_base, qword scratch)//, dword max_inner_nodes )
|
||||
{
|
||||
REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
|
||||
define C_3 REG1;
|
||||
define C_63 REG2;
|
||||
define C_4 REG3;
|
||||
define C_2 REG4;
|
||||
|
||||
REG0 = REG0 - C_3; // nodedataCurr - fixed offset
|
||||
REG0 = REG0 + C_63; // + 63
|
||||
REG0 = REG0 >> C_4; // >> 4
|
||||
REG0 = REG0 >> C_2; // >> 2 == >> 6 == /64
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
|
||||
dispatch_indirect init_refit_scratch//( (max_inner_nodes+63)/64, 1, 1 )
|
||||
args(bvh_base,scratch);
|
||||
|
||||
}
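The register arithmetic above is a ceiling division: starting from the node count loaded at offset 12 (minus the fixed offset of 3), it computes (n + 63) / 64 workgroups with an add and two shifts, since these metakernels use shifts rather than division. A small C sketch of the equivalence (the helper name is illustrative):

#include <assert.h>

static unsigned refit_scratch_wgs(unsigned nodeDataCurr)
{
    unsigned n = nodeDataCurr - 3;   /* fixed offset, as in the metakernel */
    return ((n + 63) >> 4) >> 2;     /* == (n + 63) / 64 */
}

int main(void)
{
    for (unsigned nodes = 3; nodes < 10000; ++nodes)
        assert(refit_scratch_wgs(nodes) == (nodes - 3 + 63) / 64);
    return 0;
}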
|
||||
|
||||
metakernel build_node_tables( qword bvh_base )
|
||||
{
|
||||
REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
|
||||
REG1 = 2;
|
||||
REG2 = 63;
|
||||
REG3 = 4;
|
||||
REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
|
||||
|
||||
REG0 = REG0 - REG4; // nodedataCurr - fixed offset
|
||||
REG0 = REG0 + REG2; // + 63
|
||||
REG0 = REG0 >> REG3; // >> 4
|
||||
REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect build_fatleaf_table//( (max_inner_nodes+63)/64, 1, 1 )
|
||||
args(bvh_base);
|
||||
dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
|
||||
args(bvh_base);
|
||||
}
|
||||
|
||||
metakernel build_node_tables_new_update( MKBuilderState state, qword bvh_base )
|
||||
{
|
||||
REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
|
||||
REG1 = 2;
|
||||
REG2 = 63;
|
||||
REG3 = 4;
|
||||
REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
|
||||
|
||||
REG0 = REG0 - REG4; // nodedataCurr - fixed offset
|
||||
REG0 = REG0 + REG2; // + 63
|
||||
REG0 = REG0 >> REG3; // >> 4
|
||||
REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect build_fatleaf_table_new_update//( (max_inner_nodes+63)/64, 1, 1 )
|
||||
args(state.build_globals, bvh_base);
|
||||
dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
|
||||
args(bvh_base);
|
||||
}
|
||||
|
||||
metakernel fixup_quad_table( qword bvh_base )
|
||||
{
|
||||
dispatch fixup_quad_table(2,1,1)
|
||||
args(bvh_base);
|
||||
}
|
||||
|
||||
// this metakernel only initializes registers for use in a batching loop by "traverse_aabbs_quad" and "write_inner_nodes"
|
||||
metakernel init_traverse_aabbs_quad_and_write_inner_nodes()
|
||||
{
|
||||
REG0.hi = 0;
|
||||
REG1 = 1;
|
||||
REG2 = 31;
|
||||
REG3 = 4;
|
||||
REG4 = 2;
|
||||
REG5 = 7;
|
||||
REG6 = 255;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
}
|
||||
|
||||
metakernel traverse_aabbs_quad( qword bvh_base, qword scratch, qword geos)//, dword max_inner_nodes )
|
||||
{
|
||||
|
||||
REG0.lo = load_dword( bvh_base + 64 ); // TODO: Don't hardcode!
|
||||
define C_1 REG1;
|
||||
define C_31 REG2;
|
||||
define C_4 REG3;
|
||||
|
||||
REG0 = REG0 + C_31; // + 31
|
||||
REG0 = REG0 >> C_4; // >> 4
|
||||
REG0 = REG0 >> C_1; // >> 1 == >> 5 == /32
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
|
||||
dispatch_indirect traverse_aabbs_quad//( (max_inner_nodes+32)/32, 1, 1 )
|
||||
args(bvh_base,scratch,geos);
|
||||
}
|
||||
|
||||
metakernel write_inner_nodes( qword bvh_base, qword scratch )//, dword max_inner_nodes )
|
||||
{
|
||||
REG0.lo = load_dword( bvh_base + 68 ); // TODO: Don't hardcode!
|
||||
define C_1 REG1;
|
||||
define C_2 REG4;
|
||||
define C_7 REG5;
|
||||
|
||||
REG0 = REG0 + C_7; // + 7
|
||||
REG0 = REG0 >> C_2; // >> 2
|
||||
REG0 = REG0 >> C_1; // >> 1 ==> >> 3 (/8)
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
|
||||
dispatch_indirect write_inner_nodes//( (max_inner_nodes+7)/8, 1, 1 )
|
||||
args(bvh_base,scratch);
|
||||
}
|
||||
|
||||
metakernel update_single_group_quads( qword bvh_base, qword geos, qword aabbs )
|
||||
{
|
||||
dispatch update_single_group_quads(1,1,1) //( (max_inner_nodes+1)/2, 1, 1 )
|
||||
args(bvh_base,geos,aabbs);
|
||||
}
|
||||
|
||||
metakernel traverse_aabbs_new_update( qword bvh_base, qword geos, qword scratch )
|
||||
{
|
||||
REG0.lo = load_dword( bvh_base + 84 ); // TODO: Don't hardcode!
|
||||
define C_255 REG6;
|
||||
define C_4 REG3;
|
||||
|
||||
REG0 = REG0 + C_255; // + 255
|
||||
REG0 = REG0 >> C_4; // >> 4
|
||||
REG0 = REG0 >> C_4; // >> 4 == >> 8 == /256
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
|
||||
dispatch_indirect traverse_aabbs_new_update//( (max_inner_nodes+255)/256, 1, 1 )
|
||||
args(bvh_base, geos, scratch);
|
||||
}
|
||||
|
||||
metakernel traverse_aabbs_new_update_single_geo( qword bvh_base, qword vertices, qword geos, qword scratch, dword vertex_format )
|
||||
{
|
||||
REG0.lo = load_dword( bvh_base + 84 ); // TODO: Don't hardcode!
|
||||
define C_255 REG6;
|
||||
define C_4 REG3;
|
||||
|
||||
REG0 = REG0 + C_255; // + 255
|
||||
REG0 = REG0 >> C_4; // >> 4
|
||||
REG0 = REG0 >> C_4; // >> 4 == >> 8 == /256
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
|
||||
dispatch_indirect traverse_aabbs_new_update_single_geo//( (max_inner_nodes+255)/256, 1, 1 )
|
||||
args(bvh_base, vertices, geos, scratch, vertex_format);
|
||||
}
|
||||
|
|
@@ -1,265 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
//
|
||||
// This file contains structure definitions shared by GRL OCL kernels and host code
|
||||
//
|
||||
|
||||
#include "GRLGen12.h"
|
||||
#pragma once
|
||||
|
||||
#define BFS_NUM_BINS 16
|
||||
#define BFS_NUM_VCONTEXTS 256
|
||||
#define BFS_MAX_DEPTH 32
|
||||
|
||||
#define TRIVIAL_BUILD_THRESHOLD 6
|
||||
#define SINGLE_WG_BUILD_THRESHOLD 256
|
||||
|
||||
#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
|
||||
|
||||
|
||||
typedef uchar vcontext_id_t;
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
GRL_NAMESPACE_BEGIN(RTAS)
|
||||
GRL_NAMESPACE_BEGIN(GPUBVHBuilder)
|
||||
|
||||
struct BFS_Split
|
||||
{
|
||||
float sah;
|
||||
int dim;
|
||||
int pos;
|
||||
};
|
||||
|
||||
|
||||
struct BFS_BinInfo
|
||||
{
|
||||
float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
|
||||
// The 6 are lower(xyz) and -upper(xyz)
|
||||
// bins use negated-max so that we can use vectorized mins instead of min/max pairs
|
||||
uint counts[3 * BFS_NUM_BINS];
|
||||
};
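The comment above describes storing each bin as (lower.xyz, -upper.xyz) so that merging bounds needs only min operations. A small C sketch of that trick with illustrative values:

#include <assert.h>

static float minf(float a, float b) { return a < b ? a : b; }

int main(void)
{
    /* One bin: three lower components followed by three negated upper components. */
    float bin[6] = { 1e30f, 1e30f, 1e30f,   1e30f, 1e30f, 1e30f };
    float lo[3]  = { -1.0f, 2.0f, 0.5f };
    float hi[3]  = {  3.0f, 4.0f, 0.75f };

    for (int i = 0; i < 3; ++i) {
        bin[i]     = minf(bin[i],     lo[i]);   /* min of lowers */
        bin[i + 3] = minf(bin[i + 3], -hi[i]);  /* min of negated uppers == max of uppers */
    }
    assert(-bin[3] == 3.0f && -bin[4] == 4.0f && -bin[5] == 0.75f);
    return 0;
}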
|
||||
|
||||
enum_uint8(SAHBuildFlags)
|
||||
{
|
||||
SAH_FLAG_NEED_BACKPOINTERS = 1, // identifies a mixed internal node where each child can have a different type
|
||||
SAH_FLAG_NEED_MASKS = 2
|
||||
};
|
||||
|
||||
struct SAHBuildGlobals
|
||||
{
|
||||
qword p_primref_index_buffers;
|
||||
qword p_primrefs_buffer;
|
||||
qword p_bvh2;
|
||||
qword p_globals; // TODO: deprecate this
|
||||
qword p_bvh_base;
|
||||
gpuva_t p_qnode_root_buffer;
|
||||
|
||||
dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
|
||||
dword num_primrefs;
|
||||
dword leaf_size;
|
||||
dword leaf_type;
|
||||
|
||||
dword root_buffer_num_produced;
|
||||
dword root_buffer_num_produced_hi;
|
||||
dword root_buffer_num_consumed;
|
||||
dword root_buffer_num_consumed_hi;
|
||||
dword root_buffer_num_to_consume;
|
||||
dword root_buffer_num_to_consume_hi;
|
||||
};
|
||||
|
||||
struct SAHBuildBuffersInfo
|
||||
{
|
||||
gpuva_t p_globals;
|
||||
gpuva_t p_primref_index_buffers;
|
||||
gpuva_t p_primrefs_buffer;
|
||||
gpuva_t p_bvh2;
|
||||
gpuva_t p_bvh_base;
|
||||
gpuva_t p_qnode_root_buffer;
|
||||
dword sah_globals_flags;
|
||||
dword _pad;
|
||||
gpuva_t _pad2;
|
||||
};
|
||||
|
||||
typedef union LRBounds
|
||||
{
|
||||
struct
|
||||
{
|
||||
struct AABB3f left_centroid_bounds;
|
||||
struct AABB3f left_geom_bounds;
|
||||
struct AABB3f right_centroid_bounds;
|
||||
struct AABB3f right_geom_bounds;
|
||||
} boxes;
|
||||
struct
|
||||
{
|
||||
float Array[24];
|
||||
} scalars;
|
||||
} LRBounds;
|
||||
|
||||
|
||||
struct VContext
|
||||
{
|
||||
uint dispatch_primref_begin; // range of primrefs for this task
|
||||
uint dispatch_primref_end;
|
||||
uint bvh2_root; // BVH2 root node for this task
|
||||
uint tree_depth; // depth of this node in the tree
|
||||
uint num_left; // primref counts
|
||||
uint num_right;
|
||||
uint lr_mask; // lower 8b : left mask. upper 8b : right mask
|
||||
uint batch_index;
|
||||
|
||||
// pass1 global working state and output
|
||||
struct BFS_Split split;
|
||||
struct BFS_BinInfo global_bin_info;
|
||||
|
||||
// pass2 global working state and output
|
||||
LRBounds lr_bounds;
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct BFSDispatchRecord
|
||||
{
|
||||
ushort batch_index;
|
||||
ushort context_id;
|
||||
};
|
||||
|
||||
|
||||
struct BFSDispatchQueue
|
||||
{
|
||||
uint num_dispatches;
|
||||
uint wg_count[BFS_NUM_VCONTEXTS];
|
||||
struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
|
||||
};
|
||||
|
||||
struct BFS1SpillStackEntry
|
||||
{
|
||||
uint primref_begin;
|
||||
uint primref_end;
|
||||
uint bvh2_root;
|
||||
ushort tree_depth;
|
||||
ushort batch_index;
|
||||
};
|
||||
|
||||
struct BFS1SpillStack
|
||||
{
|
||||
uint size;
|
||||
struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
|
||||
};
|
||||
|
||||
struct QNodeGlobalRootBufferEntry
|
||||
{
|
||||
uint bvh2_node;
|
||||
uint qnode;
|
||||
uint build_idx;
|
||||
uint _pad;
|
||||
};
|
||||
|
||||
struct QNodeGlobalRootBuffer
|
||||
{
|
||||
uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
|
||||
struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
|
||||
};
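curr_entries_offset selects which half of 'entries' is currently being consumed, leaving the other half free for newly produced entries; flipping it between 0 and QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM swaps the two halves. A minimal C sketch of that indexing:

#include <assert.h>

#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384

int main(void)
{
    unsigned curr_entries_offset = 0;

    /* Consume from the current half, produce into the other half. */
    unsigned consume_base = curr_entries_offset;
    unsigned produce_base = curr_entries_offset ^ QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM;
    assert(produce_base == QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM);

    /* After the pass, the produced half becomes the half to consume next. */
    curr_entries_offset = produce_base;
    assert((curr_entries_offset ^ QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) == consume_base);
    return 0;
}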
|
||||
|
||||
struct DFSDispatchRecord
|
||||
{
|
||||
uint primref_base;
|
||||
uint bvh2_base;
|
||||
uint batch_index;
|
||||
ushort num_primrefs;
|
||||
ushort tree_depth;
|
||||
};
|
||||
|
||||
|
||||
struct DFSDispatchQueue
|
||||
{
|
||||
struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
|
||||
};
|
||||
|
||||
#define VCONTEXT_STATE_EXECUTING 0
|
||||
#define VCONTEXT_STATE_UNALLOCATED 1
|
||||
|
||||
union SchedulerUnion
|
||||
{
|
||||
struct VContextScheduler
|
||||
{
|
||||
/////////////////////////////////////////////////////////////
|
||||
// State data used for communication with command streamer
|
||||
// NOTE: This part must match definition in 'new_sah_builder.grl'
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword num_bfs_wgs;
|
||||
dword num_dfs_wgs;
|
||||
|
||||
dword scheduler_postsync;
|
||||
dword _pad1;
|
||||
|
||||
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
|
||||
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
|
||||
|
||||
dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
|
||||
dword batched_build_loop_mask; // value is 0 if #builds <= #contexts. else 1 command streamer uses this as a loop condition
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
|
||||
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
|
||||
|
||||
dword vcontext_state[BFS_NUM_VCONTEXTS];
|
||||
|
||||
struct BFSDispatchQueue bfs_queue;
|
||||
struct DFSDispatchQueue dfs_queue;
|
||||
|
||||
struct VContext contexts[BFS_NUM_VCONTEXTS];
|
||||
|
||||
struct BFS1SpillStack bfs2_spill_stack;
|
||||
} vContextScheduler;
|
||||
|
||||
struct QnodeScheduler
|
||||
{
|
||||
dword num_qnode_grb_curr_entries;
|
||||
dword num_qnode_grb_new_entries;
|
||||
|
||||
dword scheduler_postsync;
|
||||
dword _pad1;
|
||||
|
||||
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
|
||||
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
|
||||
|
||||
dword batched_builds_to_process;
|
||||
dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
|
||||
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
|
||||
|
||||
struct QNodeGlobalRootBuffer qnode_global_root_buffer;
|
||||
} qnodeScheduler;
|
||||
};
|
||||
|
||||
|
||||
struct BVH2Node
|
||||
{
|
||||
struct AABB3f box;
|
||||
uint meta_u; // leaf: primref start. inner: offset from node to its first child
|
||||
uint meta_ss;
|
||||
//ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
|
||||
//uchar is_inner; // 1 if inner, 0 if leaf
|
||||
//uchar mask;
|
||||
};
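The commented-out fields suggest meta_ss packs a 16-bit meta_s, an 8-bit is_inner flag and an 8-bit mask into one dword; the exact bit order is an assumption here, read off those comments rather than the code. A minimal C round-trip under that assumption:

#include <assert.h>
#include <stdint.h>

/* Assumed layout: meta_s in bits 0..15, is_inner in bits 16..23, mask in bits 24..31. */
static uint32_t pack_meta_ss(uint16_t meta_s, uint8_t is_inner, uint8_t mask)
{
    return (uint32_t)meta_s | ((uint32_t)is_inner << 16) | ((uint32_t)mask << 24);
}

int main(void)
{
    uint32_t v = pack_meta_ss(5, 1, 0xFF);
    assert((v & 0xFFFFu) == 5);          /* leaf: primref count / inner: child offset */
    assert(((v >> 16) & 0xFFu) == 1);    /* is_inner */
    assert((v >> 24) == 0xFF);           /* mask */
    return 0;
}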
|
||||
|
||||
struct BVH2
|
||||
{
|
||||
uint num_nodes;
|
||||
uint _pad[7]; // align to 32B
|
||||
};
|
||||
|
||||
|
||||
GRL_NAMESPACE_END(GPUBVHBuilder)
|
||||
GRL_NAMESPACE_END(RTAS)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@@ -1,206 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module leaf_builder;
|
||||
|
||||
kernel_module leaf_kernels ("bvh_build_leaf.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel opencl_kernel_primref_to_quads < kernelFunction="primref_to_quads" >;
|
||||
kernel opencl_kernel_primref_to_procedurals < kernelFunction="primref_to_procedurals" >;
|
||||
kernel opencl_kernel_create_HW_instance_nodes < kernelFunction="create_HW_instance_nodes" >;
|
||||
kernel opencl_kernel_create_HW_instance_nodes_pointers < kernelFunction="create_HW_instance_nodes_pointers" >;
|
||||
}
|
||||
|
||||
import struct MKBuilderState "structs.grl";
|
||||
import struct MKSizeEstimate "structs.grl";
|
||||
|
||||
const Instances_GROUPSIZE = 16;
|
||||
|
||||
metakernel buildLeafDXR_instances(
|
||||
MKBuilderState state,
|
||||
qword build_primref_index_buffers,
|
||||
qword srcInstanceDescrArray,
|
||||
dword stride,
|
||||
dword offset,
|
||||
dword numPrims)
|
||||
{
|
||||
define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE;
|
||||
dispatch opencl_kernel_create_HW_instance_nodes(num_groups,1,1) args(
|
||||
state.build_globals,
|
||||
build_primref_index_buffers,
|
||||
state.build_primref_buffer,
|
||||
state.bvh_buffer,
|
||||
srcInstanceDescrArray,
|
||||
stride,
|
||||
offset);
|
||||
}
|
||||
|
||||
metakernel buildLeafDXR_instances_indirect(
|
||||
MKBuilderState state,
|
||||
qword build_primref_index_buffers,
|
||||
qword srcInstanceDescrArray,
|
||||
qword indirectBuildRangeInfo,
|
||||
dword stride,
|
||||
dword offset)
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // Instances_GROUPSIZE - 1
|
||||
C_4 = 4; // log_2(Instances_GROUPSIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_create_HW_instance_nodes args(
|
||||
state.build_globals,
|
||||
build_primref_index_buffers,
|
||||
state.build_primref_buffer,
|
||||
state.bvh_buffer,
|
||||
srcInstanceDescrArray,
|
||||
stride,
|
||||
offset);
|
||||
}
|
||||
|
||||
metakernel buildLeafDXR_instances_pointers(
|
||||
MKBuilderState state,
|
||||
qword build_primref_index_buffers,
|
||||
qword srcInstanceDescrArrayPtr,
|
||||
dword stride,
|
||||
dword offset,
|
||||
dword numPrims)
|
||||
{
|
||||
define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE;
|
||||
dispatch opencl_kernel_create_HW_instance_nodes_pointers(num_groups,1,1) args(
|
||||
state.build_globals,
|
||||
build_primref_index_buffers,
|
||||
state.build_primref_buffer,
|
||||
state.bvh_buffer,
|
||||
srcInstanceDescrArrayPtr,
|
||||
stride,
|
||||
offset);
|
||||
}
|
||||
|
||||
metakernel buildLeafDXR_instances_pointers_indirect(
|
||||
MKBuilderState state,
|
||||
qword build_primref_index_buffers,
|
||||
qword srcInstanceDescrArrayPtr,
|
||||
qword indirectBuildRangeInfo,
|
||||
dword stride,
|
||||
dword offset)
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // Instances_GROUPSIZE - 1
|
||||
C_4 = 4; // log_2(Instances_GROUPSIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_create_HW_instance_nodes_pointers args(
|
||||
state.build_globals,
|
||||
build_primref_index_buffers,
|
||||
state.build_primref_buffer,
|
||||
state.bvh_buffer,
|
||||
srcInstanceDescrArrayPtr,
|
||||
stride,
|
||||
offset);
|
||||
}
|
||||
|
||||
metakernel buildLeafDXR_procedurals(
|
||||
MKBuilderState state,
|
||||
qword build_primref_index_buffers,
|
||||
dword stride,
|
||||
dword offset,
|
||||
qword p_numPrimitives)
|
||||
{
|
||||
define C_1 REG0;
|
||||
define REG_PRIMS_PER_WG REG1;
|
||||
define REG_PRIMS_PER_WG_SHR REG2;
|
||||
|
||||
C_1 = 1;
|
||||
REG_PRIMS_PER_WG = 16;
|
||||
REG_PRIMS_PER_WG_SHR = 4;// We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements)
|
||||
|
||||
define reg_numPrimitives REG3;
|
||||
define reg_num_wgs REG4;
|
||||
|
||||
reg_numPrimitives = load_dword(p_numPrimitives);
|
||||
reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG;
|
||||
reg_num_wgs = reg_num_wgs - C_1;
|
||||
reg_num_wgs = reg_num_wgs >> REG_PRIMS_PER_WG_SHR;
|
||||
|
||||
DISPATCHDIM_X = reg_num_wgs;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_primref_to_procedurals args(
|
||||
state.build_globals,
|
||||
state.build_primref_buffer,
|
||||
build_primref_index_buffers,
|
||||
state.bvh_buffer,
|
||||
state.geomDesc_buffer,
|
||||
stride,
|
||||
offset);
|
||||
}
|
||||
|
||||
metakernel buildLeafDXR_quads(
|
||||
MKBuilderState state,
|
||||
qword build_primref_index_buffers,
|
||||
dword stride,
|
||||
dword offset,
|
||||
qword p_numPrimitives,
|
||||
dword allow_update)
|
||||
{
|
||||
define C_1 REG0;
|
||||
define REG_PRIMS_PER_WG REG1;
|
||||
define SHIFT REG2;
|
||||
|
||||
C_1 = 1;
|
||||
REG_PRIMS_PER_WG = 32;
|
||||
SHIFT = 4;// We cannot use div, so we use shift right instead (the >>4 here plus the >>1 below = div by 32 elements)
|
||||
|
||||
define reg_numPrimitives REG3;
|
||||
define reg_num_wgs REG4;
|
||||
|
||||
reg_numPrimitives = load_dword(p_numPrimitives);
|
||||
reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG;
|
||||
reg_num_wgs = reg_num_wgs - C_1;
|
||||
reg_num_wgs = reg_num_wgs >> SHIFT;
|
||||
reg_num_wgs = reg_num_wgs >> C_1;
|
||||
|
||||
DISPATCHDIM_X = reg_num_wgs;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_primref_to_quads args(
|
||||
state.build_globals,
|
||||
state.build_primref_buffer,
|
||||
build_primref_index_buffers,
|
||||
state.bvh_buffer,
|
||||
state.geomDesc_buffer,
|
||||
stride,
|
||||
offset,
|
||||
allow_update);
|
||||
}
@@ -1,229 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module build_primref;
|
||||
|
||||
kernel_module primref_kernels ("bvh_build_primref.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel opencl_kernel_primrefs_from_DXR_instances < kernelFunction="primrefs_from_DXR_instances" >;
|
||||
kernel opencl_kernel_primrefs_from_DXR_instances_indirect < kernelFunction="primrefs_from_DXR_instances_indirect" >;
|
||||
kernel opencl_kernel_primrefs_from_DXR_instances_pointers < kernelFunction="primrefs_from_DXR_instances_pointers" >;
|
||||
kernel opencl_kernel_primrefs_from_DXR_instances_pointers_indirect < kernelFunction="primrefs_from_DXR_instances_pointers_indirect" >;
|
||||
|
||||
kernel opencl_kernel_triangles_to_primrefs < kernelFunction="triangles_to_primrefs" >;
|
||||
kernel opencl_kernel_triangles_to_primrefs_indirect < kernelFunction="triangles_to_primrefs_indirect" >;
|
||||
kernel opencl_kernel_procedurals_to_primrefs < kernelFunction="procedurals_to_primrefs" >;
|
||||
kernel opencl_kernel_procedurals_to_primrefs_indirect < kernelFunction="procedurals_to_primrefs_indirect" >;
|
||||
}
|
||||
|
||||
import struct MKBuilderState "structs.grl";
|
||||
import struct MKSizeEstimate "structs.grl";
|
||||
|
||||
|
||||
const PrimirefsFromInstances_GROUPSIZE = 16;
|
||||
|
||||
metakernel buildPrimirefsFromInstances(
|
||||
qword instanceDescBuff,
|
||||
MKSizeEstimate estimate,
|
||||
MKBuilderState build_state,
|
||||
dword allowUpdate)
|
||||
{
|
||||
define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE);
|
||||
dispatch opencl_kernel_primrefs_from_DXR_instances(num_groups,1,1) args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
instanceDescBuff,
|
||||
estimate.numPrimitives,
|
||||
build_state.build_primref_buffer,
|
||||
allowUpdate);
|
||||
}
|
||||
|
||||
metakernel buildPrimirefsFromInstancesIndirect(
|
||||
qword instanceDescBuff,
|
||||
qword indirectBuildRangeInfo,
|
||||
MKBuilderState build_state,
|
||||
dword allowUpdate)
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
|
||||
C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_indirect args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
instanceDescBuff,
|
||||
indirectBuildRangeInfo,
|
||||
build_state.build_primref_buffer,
|
||||
allowUpdate);
|
||||
}
|
||||
|
||||
metakernel buildPrimirefsFromInstancesArrOfPtrs(
|
||||
qword instanceDescPtrArrayBuff,
|
||||
MKSizeEstimate estimate,
|
||||
MKBuilderState build_state,
|
||||
dword allowUpdate)
|
||||
{
|
||||
define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE);
|
||||
dispatch opencl_kernel_primrefs_from_DXR_instances_pointers(num_groups,1,1) args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
instanceDescPtrArrayBuff,
|
||||
estimate.numPrimitives,
|
||||
build_state.build_primref_buffer,
|
||||
allowUpdate);
|
||||
}
|
||||
|
||||
metakernel buildPrimirefsFromInstancesArrOfPtrsIndirect(
|
||||
qword instanceDescPtrArrayBuff,
|
||||
qword indirectBuildRangeInfo,
|
||||
MKSizeEstimate estimate,
|
||||
MKBuilderState build_state,
|
||||
dword allowUpdate)
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
|
||||
C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_pointers_indirect args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
instanceDescPtrArrayBuff,
|
||||
build_state.build_primref_buffer,
|
||||
indirectBuildRangeInfo,
|
||||
allowUpdate);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
metakernel primrefs_from_tris(
|
||||
MKBuilderState build_state,
|
||||
MKSizeEstimate estimate,
|
||||
qword geo_ptr,
|
||||
dword geom_id,
|
||||
dword geom_flags,
|
||||
dword num_prims)
|
||||
{
|
||||
define num_threads ((num_prims+15)/16);
|
||||
dispatch opencl_kernel_triangles_to_primrefs(num_threads,1,1) args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
build_state.build_primref_buffer,
|
||||
geo_ptr,
|
||||
(geom_id & 0x00ffffff) + (geom_flags<<24),
|
||||
num_prims);
|
||||
}
|
||||
|
||||
metakernel primrefs_from_tris_indirect(
|
||||
MKBuilderState build_state,
|
||||
MKSizeEstimate estimate,
|
||||
qword geo_ptr,
|
||||
qword indirectBuildRangeInfo,
|
||||
dword geom_id,
|
||||
dword geom_flags)
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
|
||||
C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_triangles_to_primrefs_indirect args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
build_state.build_primref_buffer,
|
||||
geo_ptr,
|
||||
indirectBuildRangeInfo,
|
||||
(geom_id & 0x00ffffff) + (geom_flags << 24));
|
||||
}
|
||||
|
||||
metakernel primrefs_from_proc(
|
||||
MKBuilderState build_state,
|
||||
MKSizeEstimate estimate,
|
||||
qword geo_ptr,
|
||||
dword geom_id,
|
||||
dword geom_flags,
|
||||
dword num_prims)
|
||||
{
|
||||
define num_threads ((num_prims+15)/16);
|
||||
dispatch opencl_kernel_procedurals_to_primrefs(num_threads,1,1) args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
build_state.build_primref_buffer,
|
||||
geo_ptr,
|
||||
(geom_id & 0x00ffffff) + (geom_flags<<24),
|
||||
num_prims);
|
||||
}
|
||||
|
||||
metakernel primrefs_from_proc_indirect(
|
||||
MKBuilderState build_state,
|
||||
MKSizeEstimate estimate,
|
||||
qword geo_ptr,
|
||||
qword indirectBuildRangeInfo,
|
||||
dword geom_id,
|
||||
dword geom_flags)
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
|
||||
C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_procedurals_to_primrefs_indirect args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
build_state.build_primref_buffer,
|
||||
geo_ptr,
|
||||
indirectBuildRangeInfo,
|
||||
(geom_id & 0x00ffffff) + (geom_flags<<24));
|
||||
}
|
||||
@@ -1,324 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module build_refit;
|
||||
|
||||
kernel_module morton_kernels ("bvh_build_refit.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel update_instance_leaves < kernelFunction="update_instance_leaves" >;
|
||||
kernel refit_indirect_sg < kernelFunction="Refit_indirect_sg" >;
|
||||
kernel update_instance_leaves_indirect < kernelFunction="update_instance_leaves_indirect" >;
|
||||
|
||||
|
||||
}
|
||||
|
||||
const INSTANCE_LEAF_GROUP_SIZE = 16;
|
||||
const REFIT_GROUP_SIZE = 8;
|
||||
|
||||
metakernel update_instance_leaves(
|
||||
qword bvh,
|
||||
qword dxrInstancesArray,
|
||||
qword dxrInstancesPtrArray,
|
||||
qword instance_leaf_aabbs,
|
||||
dword num_instances )
|
||||
{
|
||||
define num_groups (num_instances + INSTANCE_LEAF_GROUP_SIZE - 1) / INSTANCE_LEAF_GROUP_SIZE;
|
||||
|
||||
dispatch update_instance_leaves(num_groups, 1, 1) args(
|
||||
bvh,
|
||||
dxrInstancesArray,
|
||||
dxrInstancesPtrArray,
|
||||
instance_leaf_aabbs);
|
||||
}
|
||||
|
||||
metakernel update_instance_leaves_indirect(
|
||||
qword bvh,
|
||||
qword dxrInstancesArray,
|
||||
qword dxrInstancesPtrArray,
|
||||
qword instance_leaf_aabbs,
|
||||
qword indirectBuildRangeInfo)
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // INSTANCE_LEAF_GROUP_SIZE - 1
|
||||
C_4 = 4; // log_2(INSTANCE_LEAF_GROUP_SIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / INSTANCE_LEAF_GROUP_SIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect update_instance_leaves_indirect args(
|
||||
bvh,
|
||||
dxrInstancesArray,
|
||||
dxrInstancesPtrArray,
|
||||
instance_leaf_aabbs,
|
||||
indirectBuildRangeInfo);
|
||||
}
|
||||
|
||||
/*
|
||||
metakernel refit(
|
||||
qword bvh,
|
||||
qword geomDesc,
|
||||
qword instance_aabbs,
|
||||
dword dispatchSize )
|
||||
{
|
||||
define num_groups (dispatchSize + REFIT_GROUP_SIZE - 1) / REFIT_GROUP_SIZE;
|
||||
|
||||
dispatch refit(num_groups, 1, 1) args(
|
||||
bvh,
|
||||
geomDesc,
|
||||
instance_aabbs);
|
||||
}
|
||||
|
||||
const REFIT_SIMD_SIZE = 8;
|
||||
const REFIT_SIMD_SIZE_SHIFT = 3;
|
||||
|
||||
metakernel refit_indirect(
|
||||
qword bvh,
|
||||
qword bvh_inner_nodes_start_value,
|
||||
qword bvh_inner_nodes_end,
|
||||
qword geomDesc,
|
||||
qword instance_aabbs )
|
||||
{
|
||||
define cRoundingSIMD REG4;
|
||||
define TWO REG3;
|
||||
define ONE REG5;
|
||||
cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
|
||||
|
||||
TWO = 2;
|
||||
ONE = 1;
|
||||
|
||||
REG0 = bvh_inner_nodes_start_value;
|
||||
REG1 = load_dword(bvh_inner_nodes_end);
|
||||
REG1.hi = 0;
|
||||
REG2 = REG1 - REG0;
|
||||
REG2 = REG2 + cRoundingSIMD;
|
||||
REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
|
||||
REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
|
||||
|
||||
DISPATCHDIM_X = REG2.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect refit_indirect args(
|
||||
bvh,
|
||||
geomDesc,
|
||||
instance_aabbs);
|
||||
|
||||
}
|
||||
*/
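// Illustrative C sketch (hypothetical helper name): the commented-out refit
// path above divides the inner-node range by REFIT_SIMD_SIZE (8), but the
// command streamer only accepts power-of-two shift amounts, so the required
// ">> 3" is expressed as ">> 2" followed by ">> 1". The two forms are
// equivalent, as this sketch demonstrates.
#include <assert.h>
#include <stdint.h>

static uint32_t refit_group_count(uint32_t inner_node_range)
{
    uint32_t rounded = inner_node_range + (8 - 1); /* cRoundingSIMD      */
    uint32_t direct  = rounded >> 3;               /* what is wanted     */
    uint32_t split   = (rounded >> 2) >> 1;        /* what the GRL emits */
    assert(direct == split);
    return split;
}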
|
||||
|
||||
metakernel refit_indirect_sg(
|
||||
qword bvh,
|
||||
qword bvh_inner_nodes_start_value,
|
||||
qword bvh_inner_nodes_end,
|
||||
qword geomDesc,
|
||||
qword instance_aabbs )
|
||||
{
|
||||
|
||||
REG0 = bvh_inner_nodes_start_value;
|
||||
REG1.lo = load_dword(bvh_inner_nodes_end);
|
||||
REG1.hi = 0;
|
||||
REG2 = REG1 - REG0;
|
||||
|
||||
DISPATCHDIM_X = REG2.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect refit_indirect_sg args(
|
||||
bvh,
|
||||
geomDesc,
|
||||
instance_aabbs);
|
||||
|
||||
}
|
||||
/*
|
||||
////////////////////////////////////////////////////////////////
|
||||
// constructing treelets
|
||||
// phase 1: mark nodes that will be roots of bottom treelets
|
||||
// also for each node leave a number of startpoints that are under it and max depth of the path from the node
|
||||
metakernel find_refit_treelets(
|
||||
qword bvh,
|
||||
qword treelet_node_data,
|
||||
qword scratch_startpoints,
|
||||
qword startpointAlloc,
|
||||
qword bvh_inner_nodes_start_value,
|
||||
qword bvh_inner_nodes_end )
|
||||
{
|
||||
define cRoundingSIMD REG4;
|
||||
define TWO REG3;
|
||||
define ONE REG5;
|
||||
cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
|
||||
|
||||
TWO = 2;
|
||||
ONE = 1;
|
||||
|
||||
REG0 = bvh_inner_nodes_start_value;
|
||||
REG1.lo = load_dword(bvh_inner_nodes_end);
|
||||
REG1.hi = 0;
|
||||
REG2 = REG1 - REG0;
|
||||
REG2 = REG2 + cRoundingSIMD;
|
||||
REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
|
||||
REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
|
||||
|
||||
DISPATCHDIM_X = REG2.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect find_refit_treelets args(
|
||||
bvh,
|
||||
treelet_node_data,
|
||||
scratch_startpoints,
|
||||
startpointAlloc);
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// constructing treelets
|
||||
// phase 2 totally parallel, run threads up to assign startpoints to given treelet
|
||||
//
|
||||
metakernel assign_refit_startpoints_to_treelets(
|
||||
qword bvh,
|
||||
qword treelet_node_data,
|
||||
qword scratch_startpoints,
|
||||
qword bvh_inner_nodes_start_value,
|
||||
qword bvh_inner_nodes_end )
|
||||
{
|
||||
define cRoundingSIMD REG4;
|
||||
define TWO REG3;
|
||||
define ONE REG5;
|
||||
cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
|
||||
|
||||
TWO = 2;
|
||||
ONE = 1;
|
||||
|
||||
REG0 = bvh_inner_nodes_start_value;
|
||||
REG1.lo = load_dword(bvh_inner_nodes_end);
|
||||
REG1.hi = 0;
|
||||
REG2 = REG1 - REG0;
|
||||
REG2 = REG2 + cRoundingSIMD;
|
||||
REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
|
||||
REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
|
||||
|
||||
DISPATCHDIM_X = REG2.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect assign_refit_startpoints_to_treelets args(
|
||||
bvh,
|
||||
treelet_node_data,
|
||||
scratch_startpoints);
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// constructing treelets
|
||||
// phase 3 local work: group per treelet, sort the startpoints in treelets ?// by length of the path
|
||||
metakernel finalize_treelets_in_groups(
|
||||
qword bvh,
|
||||
qword scratch_startpoints,
|
||||
qword ptrNumTreelets )
|
||||
{
|
||||
REG0 = load_qword(ptrNumTreelets);
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect finalize_treelets_in_groups args(
|
||||
bvh,
|
||||
scratch_startpoints);
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Updating treelets
|
||||
// phase 1 update vertex and generate boxes for vertices
|
||||
//
|
||||
|
||||
const PER_GROUP_ELEMENTS_ROUNDING = 15;
|
||||
const PER_GROUP_ELEMENTS_SHIFT = 4;
|
||||
|
||||
metakernel init_treelets_refit(qword pSquashGroupsCountToReset)
|
||||
{
|
||||
REG1 = 0;
|
||||
store_qword(pSquashGroupsCountToReset, REG1);
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
//REG4 = PER_GROUP_ELEMENTS_SHIFT;
|
||||
//REG5.hi = PER_GROUP_ELEMENTS_ROUNDING;
|
||||
//REG5.lo = 0;
|
||||
}
|
||||
|
||||
metakernel update_quads(
|
||||
qword scratch_box,
|
||||
qword bvh,
|
||||
qword input,
|
||||
dword numPrimsDividedBy32,
|
||||
qword bigSquashInput)
|
||||
{
|
||||
//REG0 = load_qword(quads_nodes_begin_end_pair);
|
||||
//REG1.hi = REG0.lo; // this holds inner nodes begin
|
||||
//REG2 = REG0 - REG1;
|
||||
//REG2 = REG2 + REG5;
|
||||
//REG2 = REG2 >> REG4;
|
||||
//DISPATCHDIM_X = REG2.hi;
|
||||
|
||||
dispatch refit_quads(numPrimsDividedBy32, 1, 1) args(
|
||||
bvh,
|
||||
input,
|
||||
scratch_box,
|
||||
numPrimsDividedBy32,
|
||||
bigSquashInput );
|
||||
}
|
||||
|
||||
//
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// phase 1 or 2 - update primitives as well as bottom up refit internal nodes
|
||||
// in single dispatch (in single group per tree)
|
||||
metakernel refit_tree_by_group_including_quads(
|
||||
qword squashed_inputs,
|
||||
dword numBuilds
|
||||
)
|
||||
{
|
||||
dispatch refit_tree_per_group(numBuilds, 1, 1) args(
|
||||
squashed_inputs);
|
||||
}
|
||||
//
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// phase 2 bottom up refit internal nodes
|
||||
//
|
||||
metakernel refit_treelet_per_group(
|
||||
qword bigSquashInput,
|
||||
qword ptrNumTreelets)
|
||||
{
|
||||
DISPATCHDIM_X = load_dword(ptrNumTreelets);
|
||||
|
||||
dispatch_indirect refit_treelet_per_group args(
|
||||
bigSquashInput);
|
||||
}
|
||||
//
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
#endif
|
||||
*/
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,357 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "api_interface.h"
|
||||
#include "common.h"
|
||||
#include "instance.h"
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(32, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel
|
||||
primref_to_quads(global struct Globals *globals,
|
||||
global struct AABB *primref,
|
||||
global char *primref_index,
|
||||
global char *bvh_mem,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
|
||||
const uint stride,
|
||||
const uint offset,
|
||||
const uint allow_update)
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );
|
||||
uint quadIndicesStart = bvh->quadIndicesDataStart;
|
||||
|
||||
const uint numPrimitives = globals->numPrimitives;
|
||||
uint i = get_group_id( 0 ) * get_local_size( 0 ) + get_local_id(0);
|
||||
if (i < numPrimitives)
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
|
||||
const uint primrefID = *(uint *)(primref_index + i * stride + offset);
|
||||
|
||||
const uint geomID = PRIMREF_geomID(&primref[primrefID]);
|
||||
const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
|
||||
const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
|
||||
const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);
|
||||
|
||||
const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
|
||||
const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);
|
||||
|
||||
const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);
|
||||
|
||||
uint vertex_stride = geomDesc[geomID].Desc.Triangles.VertexBufferByteStride;
|
||||
|
||||
const uint4 indices = q.a;
|
||||
|
||||
const uint mask = 0xff; // FIXME: hardcoded mask
|
||||
float3 vtx0, vtx1, vtx2, vtx3;
|
||||
GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);
|
||||
|
||||
uint j0 = q.lb.x;
|
||||
uint j1 = q.lb.y;
|
||||
uint j2 = q.lb.z;
|
||||
uint shaderIndex = (mask << 24) | geomID;
|
||||
uint geomIndex = geomID | (geomFlags << 30);
|
||||
uint primIndex0 = primID0;
|
||||
const uint delta = primID1 - primID0;
|
||||
const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4));
|
||||
uint primIndex1Delta = delta | (j << 16) | (1 << 22);
|
||||
|
||||
uint4 pack0 = (uint4)(shaderIndex, geomIndex, primIndex0, primIndex1Delta);
|
||||
float4 pack1 = (float4)(vtx0.x, vtx0.y, vtx0.z, vtx1.x);
|
||||
float4 pack2 = (float4)(vtx1.y, vtx1.z, vtx2.x, vtx2.y);
|
||||
float4 pack3 = (float4)(vtx2.z, vtx3.x, vtx3.y, vtx3.z);
|
||||
|
||||
global uint4* dst = (global uint4*)&quads[i];
|
||||
store_uint4_L1WB_L3WB(dst, 0, pack0);
|
||||
store_uint4_L1WB_L3WB(dst, 1, as_uint4(pack1));
|
||||
store_uint4_L1WB_L3WB(dst, 2, as_uint4(pack2));
|
||||
store_uint4_L1WB_L3WB(dst, 3, as_uint4(pack3));
|
||||
|
||||
if(allow_update)
|
||||
{
|
||||
global uint4* vertex_indice_ptr = (global uint4*)(((char*)bvh) + (64u * quadIndicesStart + 32 * i));
|
||||
|
||||
uint4 pack_indices = (uint4) ( indices.x , indices.y, indices.z, indices.w );
|
||||
|
||||
store_uint4_L1WB_L3WB( vertex_indice_ptr, 0, pack0 );
|
||||
store_uint4_L1WB_L3WB( vertex_indice_ptr, 1, pack_indices * vertex_stride);
|
||||
}
|
||||
|
||||
if (i == 0)
|
||||
bvh->quadLeafCur += numPrimitives ;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );
|
||||
|
||||
const uint numPrimitives = globals->numPrimitives;
|
||||
const uint startID = get_group_id( 0 ) * get_local_size( 0 );
|
||||
const uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);
|
||||
|
||||
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
|
||||
{
|
||||
const uint primrefID = *(uint *)(primref_index + i * stride + offset);
|
||||
|
||||
const uint geomID = PRIMREF_geomID(&primref[primrefID]);
|
||||
const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
|
||||
const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
|
||||
const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);
|
||||
|
||||
const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
|
||||
const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);
|
||||
|
||||
const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);
|
||||
|
||||
const uint4 indices = q.a;
|
||||
const uint mask = 0xff; // FIXME: hardcoded mask
|
||||
float3 vtx0, vtx1, vtx2, vtx3;
|
||||
GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);
|
||||
|
||||
setQuad(&quads[i], (float4)(vtx0,0), (float4)(vtx1,0), (float4)(vtx2,0), (float4)(vtx3,0), q.lb.x, q.lb.y, q.lb.z, geomID, primID0, primID1, mask, geomFlags );
|
||||
}
|
||||
|
||||
if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0)
|
||||
bvh->quadLeafCur += numPrimitives ;
|
||||
#endif
|
||||
}
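/* Illustrative C sketch (field names hypothetical): the layout of the first
 * 16 bytes (pack0) of the quad leaf written above, inferred from the kernel
 * itself. A reading aid, not authoritative hardware documentation. */
#include <stdint.h>

struct quad_leaf_header {
    uint32_t shader_index;      /* (mask << 24) | geomID        */
    uint32_t geom_index;        /* (geomFlags << 30) | geomID   */
    uint32_t prim_index0;       /* primID of the first triangle */
    uint32_t prim_index1_delta; /* packed as below              */
};

static uint32_t pack_prim_index1_delta(uint32_t primID0, uint32_t primID1,
                                       uint32_t j0, uint32_t j1, uint32_t j2)
{
    /* j packs q.lb.x/y/z, two bits each, exactly as in primref_to_quads() */
    uint32_t j = (j0 << 0) | (j1 << 2) | (j2 << 4);
    return (primID1 - primID0) | (j << 16) | (1u << 22);
}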
|
||||
|
||||
GRL_INLINE void create_procedural_leaf(global struct Globals *globals,
|
||||
global struct AABB *primref,
|
||||
local uint *primrefids,
|
||||
uint numProcedurals,
|
||||
struct QBVHNodeN *qnode,
|
||||
global char *bvh_mem,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc)
|
||||
{
|
||||
if (get_local_id(0) >= 8)
|
||||
return;
|
||||
|
||||
global struct BVHBase* bvh_base = (global struct BVHBase*)bvh_mem;
|
||||
|
||||
/* first read geomID of all primitives */
|
||||
uint primrefID = -1;
|
||||
uint geomID = -1;
|
||||
uint geomFlags = 0;
|
||||
if (get_local_id(0) < numProcedurals)
|
||||
{
|
||||
primrefID = primrefids[get_local_id(0)];
|
||||
geomID = PRIMREF_geomID(&primref[primrefID]);
|
||||
geomFlags = PRIMREF_geomFlags( &primref[primrefID] );
|
||||
}
|
||||
|
||||
// cannot sort by geomID as bounds in parent node are then wrong
|
||||
//ulong geomID_primrefID = (((ulong)geomID) << 32) | ((ulong)primrefID);
|
||||
//geomID_primrefID = sort8_ascending_ulong(geomID_primrefID);
|
||||
//geomID = geomID_primrefID >> 32;
|
||||
//primrefID = geomID_primrefID;
|
||||
|
||||
/* We have to split at geomID boundaries into multiple leaves. This
|
||||
* block calculates the lane where a leaf starts and ends. */
|
||||
const uint geomIDprev = intel_sub_group_shuffle_up(0xFFFFFFFFu, geomID, 1u);
|
||||
const uint geomIDnext = intel_sub_group_shuffle_down(geomID, 0xFFFFFFFFu, 1u);
|
||||
const uint leaf_start = geomIDprev != geomID;
|
||||
const uint leaf_end = geomIDnext != geomID;
|
||||
const uint leaf_start_next = intel_sub_group_shuffle_down(leaf_start, 0u, 1u);
|
||||
|
||||
/* This computes which leaf a lane processes. E.g. from geomID =

|
||||
* [3,3,4,4,4,0] we get leaf_id = [0,0,1,1,1,2] */
|
||||
//const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); // FIXME: exclusive?
|
||||
|
||||
/* This computes the n'th primitive a lane processes inside its
|
||||
* leaf. For the example above we compute leaf_prim =
|
||||
* [0,1,0,1,2,0]. */
|
||||
const uint leaf_prim = get_local_id(0) - sub_group_scan_inclusive_max(leaf_start ? get_local_id(0) : 0);
|
||||
|
||||
/* from here on we allocate data and write to memory, thus only
|
||||
* lanes that process a primitive should continue. */
|
||||
if (get_local_id(0) >= numProcedurals)
|
||||
return;
|
||||
|
||||
/* Here we allocate a single memory block for each required
|
||||
* ProceduralLeaf node. We do this from a single lane to ensure
|
||||
* the allocation is contiguous. */
|
||||
uint leaf_base_offset = 0;
|
||||
uint n_leafs = sub_group_reduce_add(leaf_start);
|
||||
if (get_local_id(0) == 0)
|
||||
leaf_base_offset = allocate_procedural_leaves( bvh_base, n_leafs );
|
||||
leaf_base_offset = sub_group_broadcast(leaf_base_offset, 0);
|
||||
|
||||
/* Compute the leaf offset for each lane. */
|
||||
uint leaf_offset = leaf_base_offset + sub_group_scan_inclusive_add(leaf_start) - 1;
|
||||
|
||||
struct ProceduralLeaf *pleaf = ((global struct ProceduralLeaf *)(bvh_mem)) + leaf_offset;
|
||||
|
||||
/* write the procedural leaf headers */
|
||||
if (leaf_end)
|
||||
{
|
||||
pleaf->leafDesc.shaderIndex_geomMask = 0xFF000000 | (geomID & 0x00FFFFFF); // FIXME: use accessor function. Future extensions may have shaderIndex != geomID
|
||||
pleaf->leafDesc.geomIndex_flags = geomID | (geomFlags<<30); // FIXME: Use setter function
|
||||
pleaf->DW1 = 0xFFFFFFF0 | (leaf_prim + 1); // !!!
|
||||
}
|
||||
/* write the procedural leaf primIDs */
|
||||
pleaf->_primIndex[leaf_prim] = PRIMREF_primID0(&primref[primrefID]);
|
||||
|
||||
/* update leaf node offset inside parent node */
|
||||
if (get_local_id(0) == 0)
|
||||
{
|
||||
QBVH6Node_set_offset(qnode, pleaf);
|
||||
QBVH6Node_set_type(qnode, NODE_TYPE_PROCEDURAL);
|
||||
}
|
||||
|
||||
/* Let parent node children point to proper procedural leaf block
|
||||
* and primitive. */
|
||||
qnode->childData[get_local_id(0)] = leaf_start_next | (leaf_prim << 2);
|
||||
}
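/* Illustrative C sketch: a scalar rendition of the per-lane bookkeeping in
 * create_procedural_leaf() above. For geomID = {3,3,4,4,4,0} it reproduces
 * the values from the comments: leaf_start = {1,0,1,0,0,1}, leaf index per
 * lane = {0,0,1,1,1,2} and leaf_prim = {0,1,0,1,2,0}. */
#include <stdio.h>

int main(void)
{
    unsigned geom_id[6] = { 3, 3, 4, 4, 4, 0 };
    unsigned leaf_id = 0, leaf_prim = 0;
    for (unsigned lane = 0; lane < 6; lane++) {
        int leaf_start = (lane == 0) || (geom_id[lane] != geom_id[lane - 1]);
        if (leaf_start) {
            leaf_id  += (lane != 0); /* a new leaf begins on this lane      */
            leaf_prim = 0;           /* restart the per-leaf primitive slot */
        }
        printf("lane %u: leaf_id %u leaf_prim %u\n", lane, leaf_id, leaf_prim);
        leaf_prim++;
    }
    return 0;
}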
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
GRL_ANNOTATE_BIG_REG_REQ
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
primref_to_procedurals(global struct Globals *globals,
|
||||
global struct AABB *primref,
|
||||
global char *primref_index,
|
||||
global char *bvh_mem,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
|
||||
const uint stride,
|
||||
const uint offset)
|
||||
{
|
||||
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
|
||||
|
||||
const uint numPrimitives = globals->numPrimitives;
|
||||
uint startID = get_group_id( 0 ) * get_local_size( 0 );
|
||||
uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);
|
||||
|
||||
uint offset1 = stride * globals->numPrimitives;
|
||||
if (stride == 8)
|
||||
offset1 = 4;
|
||||
|
||||
uint prev_start_back_pointer = startID == 0 ? -1 : *(uint *)(primref_index + (startID-1) * stride + offset1);
|
||||
/* start at leaf start */
|
||||
while (startID < numPrimitives)
|
||||
{
|
||||
const uint back_pointer = *(uint *)(primref_index + startID * stride + offset1);
|
||||
if (back_pointer != prev_start_back_pointer)
|
||||
break;
|
||||
startID++;
|
||||
}
|
||||
|
||||
uint prev_end_back_pointer = *(uint *)(primref_index + (endID-1) * stride + offset1);
|
||||
/* end at next leaf start */
|
||||
while (endID < numPrimitives)
|
||||
{
|
||||
const uint back_pointer = *(uint *)(primref_index + endID * stride + offset1);
|
||||
if (back_pointer != prev_end_back_pointer)
|
||||
break;
|
||||
endID++;
|
||||
}
|
||||
|
||||
local uint procedurals[16];
|
||||
|
||||
for (uint lid = startID + get_local_id(0); lid < endID + get_local_id(0);)
|
||||
{
|
||||
/* load leaf start points and back_pointer */
|
||||
const uint primrefID = *(uint *)(primref_index + lid * stride + offset);
|
||||
uint back_pointer = *(uint *)(primref_index + lid * stride + offset1);
|
||||
uint prev_back_pointer = get_local_id(0) == 0 ? -1 : *(uint *)(primref_index + (lid-1) * stride + offset1);
|
||||
|
||||
const uint leaf_start = back_pointer != prev_back_pointer;
|
||||
uint leaf_start_back_pointer = sub_group_broadcast(back_pointer, 0);
|
||||
|
||||
/* compute number of primitives inside the leaf starting at lid */
|
||||
const uint leaf_id = sub_group_scan_inclusive_add(leaf_start);
|
||||
uint numPrimitives = 0;
|
||||
if (back_pointer == leaf_start_back_pointer && lid < endID)
|
||||
numPrimitives = sub_group_reduce_add(1);
|
||||
numPrimitives = sub_group_broadcast(numPrimitives, 0);
|
||||
|
||||
procedurals[get_local_id(0)] = primrefID;
|
||||
|
||||
struct QBVHNodeN *qnode = (struct QBVHNodeN *)bvh_mem + back_pointer;
|
||||
|
||||
create_procedural_leaf(globals, primref, procedurals, numPrimitives, qnode, bvh_mem, geomDesc);
|
||||
|
||||
lid += numPrimitives;
|
||||
}
|
||||
}
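/* Illustrative C sketch (hypothetical helper name): the window adjustment at
 * the top of primref_to_procedurals(). A leaf is a maximal run of primitives
 * sharing one back_pointer, so each work group widens its [start, end) range
 * until both ends sit on leaf boundaries and no leaf is split between two
 * groups. 'bp' stands in for the strided back-pointer loads above. */
static void snap_to_leaf_bounds(const unsigned *bp, unsigned n,
                                unsigned *start, unsigned *end)
{
    while (*start > 0 && *start < n && bp[*start] == bp[*start - 1])
        (*start)++; /* skip the tail of a leaf started by the previous group */
    while (*end < n && bp[*end] == bp[*end - 1])
        (*end)++;   /* finish the leaf the window currently ends inside      */
}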
|
||||
|
||||
GRL_INLINE void create_HW_instance_leaf(
|
||||
global struct BVHBase* bvh,
|
||||
global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc,
|
||||
uint dstLeafId,
|
||||
uint instanceIndex,
|
||||
uint rootNodeByteOffset,
|
||||
uint instanceMask)
|
||||
{
|
||||
/* convert DXR instance to instance leaf node */
|
||||
global struct HwInstanceLeaf* leaves = (__global struct HwInstanceLeaf*)BVHBase_quadLeaves(bvh);
|
||||
HwInstanceLeaf_Constructor(&leaves[dstLeafId], instDesc, instanceIndex, rootNodeByteOffset, instanceMask);
|
||||
}
|
||||
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel create_HW_instance_nodes(
|
||||
global const struct Globals *globals,
|
||||
global char *primref_index,
|
||||
global struct AABB *primref,
|
||||
global struct BVHBase *bvh,
|
||||
global struct GRL_RAYTRACING_INSTANCE_DESC *src_instances,
|
||||
uint32_t stride,
|
||||
uint32_t offset)
|
||||
{
|
||||
uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
|
||||
uint num_prims = globals->numPrimitives;
|
||||
if (dstLeafId >= num_prims)
|
||||
return;
|
||||
if( dstLeafId == 0 )
|
||||
bvh->instanceLeafEnd += 2*num_prims;
|
||||
|
||||
/* get instance ID */
|
||||
const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
|
||||
const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
|
||||
const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
|
||||
const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
|
||||
create_HW_instance_leaf(bvh, &src_instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel create_HW_instance_nodes_pointers(
|
||||
global const struct Globals *globals,
|
||||
global char *primref_index,
|
||||
global struct AABB *primref,
|
||||
global struct BVHBase *bvh,
|
||||
global void *instances_in,
|
||||
uint32_t stride,
|
||||
uint32_t offset)
|
||||
{
|
||||
uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
|
||||
uint num_prims = globals->numPrimitives;
|
||||
if (dstLeafId >= num_prims)
|
||||
return;
|
||||
if (dstLeafId == 0)
|
||||
bvh->instanceLeafEnd += 2 * num_prims;
|
||||
|
||||
global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
|
||||
(global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
|
||||
|
||||
/* get instance ID */
|
||||
const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
|
||||
const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
|
||||
const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
|
||||
const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
|
||||
create_HW_instance_leaf(bvh, instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
|
||||
}
|
||||
@@ -1,556 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "api_interface.h"
|
||||
#include "common.h"
|
||||
|
||||
#define GRID_SIZE 1024
|
||||
|
||||
/*
|
||||
This presplit item stores, for each primitive, the number of splits to
perform (its priority) and the primref index.
|
||||
*/
|
||||
|
||||
struct PresplitItem
|
||||
{
|
||||
unsigned int index;
|
||||
float priority;
|
||||
};
|
||||
|
||||
/*
|
||||
|
||||
This function splits a line v0->v1 at position pos in dimension dim
|
||||
and merges the bounds for the left and right line segments into
|
||||
lbounds and rbounds.
|
||||
|
||||
*/
|
||||
|
||||
GRL_INLINE void splitLine(const uint dim,
|
||||
const float pos,
|
||||
const float4 v0,
|
||||
const float4 v1,
|
||||
struct AABB *lbounds,
|
||||
struct AABB *rbounds)
|
||||
{
|
||||
const float v0d = v0[dim];
|
||||
const float v1d = v1[dim];
|
||||
|
||||
/* this point is on left side */
|
||||
if (v0d <= pos)
|
||||
AABB_extend_point(lbounds, v0);
|
||||
|
||||
/* this point is on right side */
|
||||
if (v0d >= pos)
|
||||
AABB_extend_point(rbounds, v0);
|
||||
|
||||
/* the edge crosses the splitting location */
|
||||
if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d))
|
||||
{
|
||||
const float f = (pos - v0d) / (v1d - v0d);
|
||||
const float4 c = f * (v1 - v0) + v0;
|
||||
AABB_extend_point(lbounds, c);
|
||||
AABB_extend_point(rbounds, c);
|
||||
}
|
||||
}
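/* Illustrative C example: the interpolation splitLine() uses when an edge
 * crosses the split plane. Clipping the edge x = 1 .. x = 5 against the plane
 * pos = 2 gives f = (2-1)/(5-1) = 0.25 and a crossing point at x = 2, which
 * is added to both the left and the right bounds. */
#include <stdio.h>

int main(void)
{
    float v0 = 1.0f, v1 = 5.0f, pos = 2.0f;
    float f = (pos - v0) / (v1 - v0); /* 0.25                               */
    float c = f * (v1 - v0) + v0;     /* 2.0, same formula as splitLine()   */
    printf("f = %f, crossing = %f\n", f, c);
    return 0;
}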
|
||||
|
||||
/*
|
||||
|
||||
This function splits a clipped triangle v0,v1,v2 with bounds prim at
|
||||
position pos in dimension dim and merges the bounds for the left and
|
||||
right clipped triangle fragments into lbounds and rbounds.
|
||||
|
||||
*/
|
||||
|
||||
GRL_INLINE void splitTriangle(struct AABB *prim,
|
||||
const uint dim,
|
||||
const float pos,
|
||||
const float4 v0,
|
||||
const float4 v1,
|
||||
const float4 v2,
|
||||
struct AABB *lbounds,
|
||||
struct AABB *rbounds)
|
||||
{
|
||||
/* clip each triangle edge */
|
||||
splitLine(dim, pos, v0, v1, lbounds, rbounds);
|
||||
splitLine(dim, pos, v1, v2, lbounds, rbounds);
|
||||
splitLine(dim, pos, v2, v0, lbounds, rbounds);
|
||||
|
||||
/* the triangle itself was clipped already, thus clip against triangle bounds */
|
||||
AABB_intersect(lbounds, prim);
|
||||
AABB_intersect(rbounds, prim);
|
||||
}
|
||||
|
||||
float calculate_priority(struct AABB *prim, global GRL_RAYTRACING_GEOMETRY_DESC *geom)
|
||||
{
|
||||
/* calculate projected area of the first triangle */
|
||||
const uint primID0 = PRIMREF_primID0(prim);
|
||||
const uint3 tri0 = GRL_load_triangle(geom, primID0);
|
||||
const float4 av0 = GRL_load_vertex(geom, tri0.x);
|
||||
const float4 av1 = GRL_load_vertex(geom, tri0.y);
|
||||
const float4 av2 = GRL_load_vertex(geom, tri0.z);
|
||||
const float area_tri0 = areaProjectedTriangle(av0, av1, av2);
|
||||
|
||||
/* calculate projected area of second triangle */
|
||||
const uint primID1 = PRIMREF_primID1(prim);
|
||||
const uint3 tri1 = GRL_load_triangle(geom, primID1);
|
||||
const float4 bv0 = GRL_load_vertex(geom, tri1.x);
|
||||
const float4 bv1 = GRL_load_vertex(geom, tri1.y);
|
||||
const float4 bv2 = GRL_load_vertex(geom, tri1.z);
|
||||
const float area_tri1 = areaProjectedTriangle(bv0, bv1, bv2);
|
||||
|
||||
/* as priority we use the AABB area */
|
||||
const float area_aabb = AABB_halfArea(prim);
|
||||
float priority = area_aabb;
|
||||
|
||||
/* prefer triangles with a large potential SAH gain. */
|
||||
const float area_tris = area_tri0 + area_tri1;
|
||||
const float area_ratio = min(4.0f, area_aabb / max(1E-12f, area_tris));
|
||||
priority *= area_ratio;
|
||||
|
||||
/* ignore too small primitives */
|
||||
//const float4 size = AABB_size(prim);
|
||||
//const float max_size = max(size.x,max(size.y,size.z));
|
||||
//if (max_size < 0.5f*max_scene_size/GRID_SIZE)
|
||||
// priority = 0.0f;
|
||||
|
||||
return priority;
|
||||
}
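/* Illustrative C sketch (scalar): the priority heuristic above, restated
 * without the primref/geometry plumbing. A primref whose box area is much
 * larger than the projected area of its two triangles gets up to a 4x boost,
 * so the presplitter spends its split budget where splitting can reduce
 * surface area the most. */
static float split_priority(float aabb_half_area, float tri_area0, float tri_area1)
{
    float tris  = tri_area0 + tri_area1;
    float denom = tris > 1e-12f ? tris : 1e-12f;   /* max(1E-12, tris) */
    float ratio = aabb_half_area / denom;
    ratio = ratio < 4.0f ? ratio : 4.0f;           /* min(4.0, ratio)  */
    return aabb_half_area * ratio;
}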
|
||||
|
||||
/*
|
||||
|
||||
This kernel calculates for each primitive an estimated splitting priority.
|
||||
|
||||
*/
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel compute_num_presplits(global struct Globals *globals,
|
||||
global struct BVHBase* bvh_base,
|
||||
global struct AABB *primref,
|
||||
global struct PresplitItem *presplit,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc)
|
||||
{
|
||||
//assert(sizeof(PresplitItem) == sizeof_PresplitItem);
|
||||
|
||||
/* calculate the range of primitives each work group should process */
|
||||
const uint numPrimitives = globals->numPrimitives;
|
||||
const uint startID = (get_group_id(0) + 0) * numPrimitives / get_num_groups(0);
|
||||
const uint endID = (get_group_id(0) + 1) * numPrimitives / get_num_groups(0);
|
||||
|
||||
/* get scene bounding box size */
|
||||
const float3 scene_size = AABB3f_size(&bvh_base->Meta.bounds);
|
||||
const float max_scene_size = max(scene_size.x, max(scene_size.y, scene_size.z));
|
||||
|
||||
/* each work group iterates over its range of primitives */
|
||||
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
|
||||
{
|
||||
const uint geomID = PRIMREF_geomID(&primref[i]);
|
||||
|
||||
/* splitting heuristic for triangles */
|
||||
if (GRL_is_triangle(&geomDesc[geomID]))
|
||||
{
|
||||
presplit[i].index = i;
|
||||
presplit[i].priority = calculate_priority(&primref[i], &geomDesc[geomID]);
|
||||
}
|
||||
|
||||
/* splitting of procedurals is not supported */
|
||||
else if (GRL_is_procedural(&geomDesc[geomID]))
|
||||
{
|
||||
presplit[i].index = i;
|
||||
presplit[i].priority = 0.0f;
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
//assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0)
|
||||
globals->numOriginalPrimitives = globals->numPrimitives;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
This kernel computes the sum of all priorities.
|
||||
|
||||
*/
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
priority_sum(global struct Globals *globals,
|
||||
global struct PresplitItem *presplit,
|
||||
uint numPrimitivesToSplit)
|
||||
{
|
||||
const uint N = globals->numPrimitives;
|
||||
const uint j = get_local_id(0);
|
||||
const uint J = get_local_size(0);
|
||||
const uint BLOCKSIZE = (N + J - 1) / J;
|
||||
const uint start = min((j + 0) * BLOCKSIZE, N);
|
||||
const uint end = min((j + 1) * BLOCKSIZE, N);
|
||||
|
||||
float prioritySum = 0;
|
||||
for (uint i = start; i < end; i++)
|
||||
prioritySum += presplit[i].priority;
|
||||
|
||||
prioritySum = work_group_reduce_add(prioritySum);
|
||||
globals->presplitPrioritySum = prioritySum;
|
||||
|
||||
#if 0
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
float scale = 1.0f;
|
||||
for (uint i = 0; i < 10; i++)
|
||||
{
|
||||
//if (j == 0)
|
||||
//printf("prioritySum = %f\n",scale*prioritySum);
|
||||
|
||||
uint numSplits = 0;
|
||||
for (uint i = start; i < end; i++)
|
||||
numSplits += presplit[i].priority / (scale*prioritySum)*numPrimitivesToSplit;
|
||||
|
||||
numSplits = work_group_reduce_add(numSplits);
|
||||
|
||||
if (numSplits > numPrimitivesToSplit)
|
||||
break;
|
||||
|
||||
//if (j == 0)
|
||||
// printf("numSplits = %i (%i)\n",numSplits,numPrimitivesToSplit);
|
||||
|
||||
globals->presplitPrioritySum = scale * prioritySum;
|
||||
scale -= 0.05f;
|
||||
}
|
||||
#endif
|
||||
}
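/* Illustrative C sketch (hypothetical helper name): the blocking used by
 * priority_sum(). Each of the J work items reduces one contiguous slice of
 * the presplit array; the partial sums are then combined with
 * work_group_reduce_add(). */
static void priority_sum_block(unsigned j, unsigned J, unsigned N,
                               unsigned *start, unsigned *end)
{
    unsigned block = (N + J - 1) / J;                     /* BLOCKSIZE */
    *start = (j + 0) * block; if (*start > N) *start = N;
    *end   = (j + 1) * block; if (*end   > N) *end   = N;
}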
|
||||
|
||||
GRL_INLINE void heapify_down(struct AABB *array, uint size)
|
||||
{
|
||||
/* we start at the root */
|
||||
uint cur_node_id = 0;
|
||||
struct AABB *cur_node = array;
|
||||
|
||||
while (true)
|
||||
{
|
||||
int larger_node_id = cur_node_id;
|
||||
struct AABB *larger_node = cur_node;
|
||||
|
||||
/* check if left child is largest */
|
||||
const int left_node_id = 2 * cur_node_id + 1;
|
||||
struct AABB *left_node = &array[left_node_id];
|
||||
if (left_node_id < size && AABB_halfArea(left_node) > AABB_halfArea(larger_node))
|
||||
{
|
||||
larger_node_id = left_node_id;
|
||||
larger_node = left_node;
|
||||
}
|
||||
|
||||
/* check if right child is largest */
|
||||
const int right_node_id = 2 * cur_node_id + 2;
|
||||
struct AABB *right_node = &array[right_node_id];
|
||||
if (right_node_id < size && AABB_halfArea(right_node) > AABB_halfArea(larger_node))
|
||||
{
|
||||
larger_node_id = right_node_id;
|
||||
larger_node = right_node;
|
||||
}
|
||||
|
||||
/* if the current node is the largest, the heap property is fulfilled and we are done */
|
||||
if (larger_node_id == cur_node_id)
|
||||
break;
|
||||
|
||||
/* otherwise we swap cur and largest */
|
||||
struct AABB tmp = *cur_node;
|
||||
*cur_node = *larger_node;
|
||||
*larger_node = tmp;
|
||||
|
||||
/* we continue downwards with the largest node */
|
||||
cur_node_id = larger_node_id;
|
||||
cur_node = larger_node;
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void heapify_up(struct AABB *array, uint cur_node_id)
|
||||
{
|
||||
/* stop if we start at the root */
|
||||
if (cur_node_id == 0)
|
||||
return;
|
||||
|
||||
struct AABB *cur_node = &array[cur_node_id];
|
||||
|
||||
/* we loop until we reach the root node */
|
||||
while (cur_node_id)
|
||||
{
|
||||
/* get parent node */
|
||||
uint parent_node_id = (cur_node_id - 1) / 2;
|
||||
struct AABB *parent_node = &array[parent_node_id];
|
||||
|
||||
/* if the parent is larger than the current node, the heap property is fulfilled and we can terminate */
|
||||
if (AABB_halfArea(parent_node) > AABB_halfArea(cur_node))
|
||||
break;
|
||||
|
||||
/* otherwise we swap cur and parent */
|
||||
struct AABB tmp = *cur_node;
|
||||
*cur_node = *parent_node;
|
||||
*parent_node = tmp;
|
||||
|
||||
/* and continue upwards */
|
||||
cur_node_id = parent_node_id;
|
||||
cur_node = parent_node;
|
||||
}
|
||||
}
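/* Illustrative C note: heapify_down()/heapify_up() maintain a binary max-heap
 * keyed on AABB_halfArea, stored flat in an array with the usual
 * implicit-tree index arithmetic: */
static inline int heap_parent(int i) { return (i - 1) / 2; }
static inline int heap_left(int i)   { return 2 * i + 1;   }
static inline int heap_right(int i)  { return 2 * i + 2;   }
/* Note that perform_presplits() below only exercises the heap when USE_HEAP
 * is enabled; the default path (USE_HEAP == 0) appends split halves to the
 * array and walks it in rounds instead. */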
|
||||
|
||||
/* splits a quad primref */
|
||||
GRL_INLINE void splitQuadPrimRef(global GRL_RAYTRACING_GEOMETRY_DESC *geom,
|
||||
struct AABB *cur, uint dim, float fsplit,
|
||||
struct AABB *left, struct AABB *right)
|
||||
{
|
||||
/* left and right bounds to compute */
|
||||
AABB_init(left);
|
||||
AABB_init(right);
|
||||
|
||||
/* load first triangle and split it */
|
||||
const uint primID0 = PRIMREF_primID0(cur);
|
||||
const uint3 tri0 = GRL_load_triangle(geom, primID0);
|
||||
const float4 av0 = GRL_load_vertex(geom, tri0.x);
|
||||
const float4 av1 = GRL_load_vertex(geom, tri0.y);
|
||||
const float4 av2 = GRL_load_vertex(geom, tri0.z);
|
||||
splitTriangle(cur, dim, fsplit, av0, av1, av2, left, right);
|
||||
|
||||
/* load second triangle and split it */
|
||||
const uint primID1 = PRIMREF_primID1(cur);
|
||||
const uint3 tri1 = GRL_load_triangle(geom, primID1);
|
||||
const float4 bv0 = GRL_load_vertex(geom, tri1.x);
|
||||
const float4 bv1 = GRL_load_vertex(geom, tri1.y);
|
||||
const float4 bv2 = GRL_load_vertex(geom, tri1.z);
|
||||
splitTriangle(cur, dim, fsplit, bv0, bv1, bv2, left, right);
|
||||
|
||||
/* copy the PrimRef payload into left and right */
|
||||
left->lower.w = cur->lower.w;
|
||||
left->upper.w = cur->upper.w;
|
||||
right->lower.w = cur->lower.w;
|
||||
right->upper.w = cur->upper.w;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
This kernel performs the actual pre-splitting. It selects split
|
||||
locations based on an implicit octree over the scene.
|
||||
|
||||
*/
|
||||
|
||||
#define USE_HEAP 0
|
||||
#define HEAP_SIZE 32u
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
//__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel
|
||||
perform_presplits(global struct Globals *globals,
|
||||
global struct BVHBase* bvh_base,
|
||||
global struct AABB *primref,
|
||||
global struct PresplitItem *presplit,
|
||||
global char *bvh_mem,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
|
||||
uint numPrimitivesToSplit)
|
||||
{
|
||||
/* calculate the range of primitives each work group should process */
|
||||
const uint numPrimitives = globals->numPrimitives;
|
||||
int pstart = globals->numOriginalPrimitives - numPrimitivesToSplit;
|
||||
pstart = max(0, pstart);
|
||||
const uint numPrimitivesToProcess = globals->numPrimitives - pstart;
|
||||
const uint startID = (get_group_id(0) + 0) * numPrimitivesToProcess / get_num_groups(0);
|
||||
const uint endID = (get_group_id(0) + 1) * numPrimitivesToProcess / get_num_groups(0);
|
||||
|
||||
/* calculates the 3D grid */
|
||||
float4 grid_base;
|
||||
grid_base.xyz = AABB3f_load_lower( &bvh_base->Meta.bounds );
|
||||
grid_base.w = 0;
|
||||
|
||||
float4 grid_extend;
|
||||
grid_extend.xyz = AABB3f_size(&bvh_base->Meta.bounds);
|
||||
grid_extend.w=0;
|
||||
|
||||
grid_extend = max(grid_extend.x, max(grid_extend.y, grid_extend.z));
|
||||
const float4 grid_scale = select(GRID_SIZE / grid_extend, 0.0f, grid_extend == 0.0f);
|
||||
const float inv_grid_size = 1.0f / GRID_SIZE;
|
||||
|
||||
/* we have to update centroid bounds */
|
||||
struct AABB centroidBounds;
|
||||
AABB_init(&centroidBounds);
|
||||
|
||||
/* initialize heap */
|
||||
struct AABB heap[HEAP_SIZE];
|
||||
uint heap_size = 0;
|
||||
|
||||
/* each work group iterates over its range of primitives */
|
||||
for (uint j = startID + get_local_id(0); j < endID; j += get_local_size(0))
|
||||
{
|
||||
/* array is in ascending order */
|
||||
//const uint ID = numPrimitives-1-j;
|
||||
const uint ID = pstart + j;
|
||||
const float prob = presplit[ID].priority;
|
||||
const uint i = presplit[ID].index;
|
||||
const uint geomID = PRIMREF_geomID(&primref[i]);
|
||||
|
||||
/* do not split primitives with low splitting priority */
|
||||
if (prob <= 0.0f)
|
||||
continue;
|
||||
|
||||
/* we support splitting only for triangles */
|
||||
if (!GRL_is_triangle(&geomDesc[geomID]))
|
||||
continue;
|
||||
|
||||
/* compute number of split primitives to produce */
|
||||
uint numSplitPrims = prob / globals->presplitPrioritySum * numPrimitivesToSplit;
|
||||
numSplitPrims = min(HEAP_SIZE, numSplitPrims);
|
||||
|
||||
/* stop if no splits need to be performed */
|
||||
if (numSplitPrims <= 1)
|
||||
continue;
|
||||
|
||||
/* add primref to heap */
|
||||
heap[0] = primref[i];
|
||||
heap_size = 1;
|
||||
uint heap_pos = 0;
|
||||
|
||||
/* iterate until all splits are done */
|
||||
uint prims = 1;
|
||||
uint last_heap_size = heap_size;
|
||||
while (prims < numSplitPrims)
|
||||
{
|
||||
/* map the primitive bounds to the grid */
|
||||
const float4 lower = heap[heap_pos].lower;
|
||||
const float4 upper = heap[heap_pos].upper;
|
||||
const float4 glower = (lower - grid_base) * grid_scale + 0.2f;
|
||||
const float4 gupper = (upper - grid_base) * grid_scale - 0.2f;
|
||||
uint4 ilower = convert_uint4_rtz(glower);
|
||||
uint4 iupper = convert_uint4_rtz(gupper);
|
||||
|
||||
/* this ignores dimensions that are empty */
|
||||
if (glower.x >= gupper.x)
|
||||
iupper.x = ilower.x;
|
||||
if (glower.y >= gupper.y)
|
||||
iupper.y = ilower.y;
|
||||
if (glower.z >= gupper.z)
|
||||
iupper.z = ilower.z;
|
||||
|
||||
/* Now we compute a morton code for the lower and upper grid
|
||||
* coordinates. */
|
||||
const uint lower_code = bitInterleave3D(ilower);
|
||||
const uint upper_code = bitInterleave3D(iupper);
|
||||
|
||||
/* if all bits are equal then we cannot split */
|
||||
if (lower_code == upper_code)
|
||||
{
|
||||
#if !USE_HEAP
|
||||
prims++; // !!!!!!!
|
||||
|
||||
heap_pos++;
|
||||
if (heap_pos == last_heap_size)
|
||||
{
|
||||
heap_pos = 0;
|
||||
last_heap_size = heap_size;
|
||||
}
|
||||
continue;
|
||||
#else
|
||||
if (heap_size == 1)
|
||||
break;
|
||||
|
||||
const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1);
|
||||
primref[offset] = heap[heap_pos];
|
||||
|
||||
presplit[offset].index = offset;
|
||||
presplit[offset].priority = calculate_priority(&heap[heap_pos], &geomDesc[geomID]);
|
||||
|
||||
heap[0] = heap[--heap_size];
|
||||
heapify_down(heap, heap_size);
|
||||
continue;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* We find the bit position of the first differing bit from the
|
||||
* top down. This bit indicates a split position inside an
|
||||
* implicit octree. */
|
||||
const uint diff = 31 - clz(lower_code ^ upper_code);
|
||||
|
||||
/* compute octree level and dimension to perform the split in */
|
||||
const uint level = diff / 3;
|
||||
const uint dim = diff % 3;
|
||||
|
||||
/* now we compute the grid position of the split */
|
||||
const uint isplit = iupper[dim] & ~((1 << level) - 1);
|
||||
|
||||
/* compute world space position of split */
|
||||
const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend[dim];
|
||||
|
||||
/* split primref into left and right part */
|
||||
struct AABB left, right;
|
||||
splitQuadPrimRef(&geomDesc[geomID], &heap[heap_pos], dim, fsplit, &left, &right);
|
||||
prims++;
|
||||
|
||||
/* update centroid bounds */
|
||||
AABB_extend_point(&centroidBounds, AABB_centroid2(&left));
AABB_extend_point(&centroidBounds, AABB_centroid2(&right));
|
||||
|
||||
#if !USE_HEAP
|
||||
|
||||
heap[heap_pos] = left;
|
||||
heap[heap_size] = right;
|
||||
heap_size++;
|
||||
|
||||
heap_pos++;
|
||||
if (heap_pos == last_heap_size)
|
||||
{
|
||||
heap_pos = 0;
|
||||
last_heap_size = heap_size;
|
||||
}
|
||||
#else
|
||||
|
||||
/* insert left element into heap */
|
||||
heap[0] = left;
|
||||
heapify_down(heap, heap_size);
|
||||
|
||||
/* insert right element into heap */
|
||||
heap[heap_size] = right;
|
||||
heapify_up(heap, heap_size);
|
||||
|
||||
heap_size++;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* copy primitives to the primref array */
|
||||
primref[i] = heap[0];
|
||||
|
||||
presplit[ID].index = i;
|
||||
presplit[ID].priority = calculate_priority(&heap[0], &geomDesc[geomID]);
|
||||
|
||||
for (uint k = 1; k < heap_size; k++)
|
||||
{
|
||||
const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1);
|
||||
primref[offset] = heap[k];
|
||||
|
||||
presplit[offset].index = offset;
|
||||
presplit[offset].priority = calculate_priority(&heap[k], &geomDesc[geomID]);
|
||||
}
|
||||
}
|
||||
|
||||
/* merge centroid bounds into global bounds */
|
||||
centroidBounds = AABB_sub_group_reduce(&centroidBounds);
|
||||
if (get_sub_group_local_id() == 0)
|
||||
AABB_global_atomic_merge(&globals->centroidBounds, &centroidBounds);
|
||||
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
/* update number of primitives on finish */
|
||||
if (Globals_OnFinish(globals))
|
||||
{
|
||||
globals->numPrimitives = globals->numPrimitives + globals->numSplittedPrimitives;
|
||||
globals->numSplittedPrimitives = 0;
|
||||
|
||||
/* update first build record */ // FIXME: should be done in builder itself
|
||||
global struct BuildRecord *record = (global struct BuildRecord *)(bvh_mem + bvh_base->quadLeafStart*64);
|
||||
record->end = globals->numPrimitives;
|
||||
}
|
||||
}
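/* Illustrative C sketch: the split-plane selection used by
 * perform_presplits(), isolated from the kernel. Given the Morton codes of a
 * primitive's lower and upper grid corners, the highest differing bit
 * identifies both the octree level and the axis of the implicit-octree plane
 * separating them. __builtin_clz is a GCC/Clang stand-in for the OpenCL clz()
 * used above; the caller must guarantee lower_code != upper_code. */
#include <stdint.h>

struct split_plane { uint32_t level, dim, isplit; };

static struct split_plane choose_split(uint32_t lower_code, uint32_t upper_code,
                                       const uint32_t iupper[3])
{
    uint32_t diff = 31 - __builtin_clz(lower_code ^ upper_code);
    struct split_plane s;
    s.level  = diff / 3;                                /* octree depth    */
    s.dim    = diff % 3;                                /* split axis      */
    s.isplit = iupper[s.dim] & ~((1u << s.level) - 1u); /* grid coordinate */
    return s;
}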
|
||||
@@ -1,674 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "api_interface.h"
|
||||
#include "common.h"
|
||||
#include "instance.h"
|
||||
|
||||
#include "bvh_build_primref.h"
|
||||
|
||||
//#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable
|
||||
//int sub_group_non_uniform_any(int predicate);
|
||||
|
||||
#define WINDOW_SIZE 16
|
||||
|
||||
/* Representation of two merged triangles. */
|
||||
struct QuadIndices
|
||||
{
|
||||
uint primID0, primID1;
|
||||
uint v0, v1, v2, v3;
|
||||
};
|
||||
|
||||
/*
|
||||
|
||||
This function calculates a PrimRef from a merged quad and writes
|
||||
this PrimRef to memory.
|
||||
|
||||
*/
|
||||
GRL_INLINE void create_prim_ref(const uint geomID,
|
||||
const struct QuadIndices quad,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
|
||||
struct AABB *geometryBounds,
|
||||
struct AABB *centroidBounds,
|
||||
global uint *numPrimitives,
|
||||
global struct AABB *primref)
|
||||
{
|
||||
|
||||
/* load quad vertices */
|
||||
const float4 vtx0 = GRL_load_vertex(geomDesc, quad.v0); // FIXME: these multiple load_vertex calls should get merged
|
||||
const float4 vtx1 = GRL_load_vertex(geomDesc, quad.v1);
|
||||
const float4 vtx2 = GRL_load_vertex(geomDesc, quad.v2);
|
||||
const float4 vtx3 = GRL_load_vertex(geomDesc, quad.v3);
|
||||
|
||||
/* calculate bounds for quad */
|
||||
float4 lower = min(min(vtx0, vtx1), min(vtx2, vtx3));
|
||||
float4 upper = max(max(vtx0, vtx1), max(vtx2, vtx3));
|
||||
|
||||
/* extend geometry and centroid bounds */
|
||||
const float4 centroid2 = lower + upper;
|
||||
AABB_extendlu(geometryBounds, lower, upper);
|
||||
AABB_extendlu(centroidBounds, centroid2, centroid2);
|
||||
|
||||
PrimRef ref;
|
||||
PRIMREF_setAABB( &ref, lower.xyz, upper.xyz );
|
||||
PRIMREF_setQuadMetaData( &ref, quad.primID0, quad.primID1, geomID, GRL_get_Flags( geomDesc ) );
|
||||
|
||||
/* store primref to memory */
|
||||
const uint offset = atomic_add_global(numPrimitives, 1);
|
||||
primref[offset] = ref;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
This function calculates a PrimRef from a procedural and writes
|
||||
this PrimRef to memory.
|
||||
|
||||
*/
|
||||
GRL_INLINE void create_prim_ref_procedural(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
|
||||
const uint geomID,
|
||||
const uint primID,
|
||||
struct AABB *geometryBounds,
|
||||
struct AABB *centroidBounds,
|
||||
global uint *numPrimitives,
|
||||
global struct AABB *primref)
|
||||
{
|
||||
/* load aabb from memory */
|
||||
struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
|
||||
|
||||
/* extend geometry and centroid bounds */
|
||||
float4 lower = (float4)(aabb.MinX, aabb.MinY, aabb.MinZ, 0.0f);
|
||||
float4 upper = (float4)(aabb.MaxX, aabb.MaxY, aabb.MaxZ, 0.0f);
|
||||
const float4 centroid2 = lower + upper;
|
||||
AABB_extendlu(geometryBounds, lower, upper);
|
||||
AABB_extendlu(centroidBounds, centroid2, centroid2);
|
||||
|
||||
/* encode geomID, primID */
|
||||
uint geomFlags = GRL_get_Flags(&geomDesc[geomID]);
|
||||
|
||||
PrimRef ref;
|
||||
PRIMREF_setAABB( &ref, lower.xyz, upper.xyz );
|
||||
PRIMREF_setProceduralMetaData( &ref, geomID, primID, geomFlags );
|
||||
|
||||
/* store primref to memory */
|
||||
const uint offset = atomic_add_global(numPrimitives, 1);
|
||||
primref[offset] = ref;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
This function performs a binary search to calculate the geomID and
|
||||
primID of the i'th primitive of the scene. The search uses a
|
||||
prefix_sum array that stores, for each location j, the total
|
||||
number of primitives of all meshes k with k < j.
|
||||
|
||||
*/
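/*
   Worked example (hypothetical values): with prefix_sum = {0, 4, 9} and
   prefix_sum_size = 3, a query for i = 6 narrows [l, r) down to l = 1,
   so primitive 6 belongs to mesh 1 and gets primID = 6 - prefix_sum[1] = 2.
*/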
|
||||
|
||||
struct GeomPrimID
|
||||
{
|
||||
uint geomID, primID;
|
||||
};
|
||||
|
||||
struct GeomPrimID binary_search_geomID_primID(global uint *prefix_sum, const uint prefix_sum_size, const uint i)
|
||||
{
|
||||
uint l = 0;
|
||||
uint r = prefix_sum_size;
|
||||
uint k = 0;
|
||||
|
||||
while (r - l > 1)
|
||||
{
|
||||
const uint m = (l + r) / 2;
|
||||
k = prefix_sum[m];
|
||||
if (k <= i)
|
||||
{
|
||||
l = m;
|
||||
}
|
||||
else if (i < k)
|
||||
{
|
||||
r = m;
|
||||
}
|
||||
}
|
||||
|
||||
struct GeomPrimID id;
|
||||
id.geomID = l;
|
||||
id.primID = i - prefix_sum[l];
|
||||
return id;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Checks if a vertex contains only finite floating point numbers.
|
||||
|
||||
*/
|
||||
|
||||
GRL_INLINE bool isfinite_vertex(float4 vtx)
|
||||
{
|
||||
return isfinite(vtx.x) && isfinite(vtx.y) && isfinite(vtx.z);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Create primrefs from array of instance descriptors.
|
||||
*/
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
|
||||
primrefs_from_DXR_instances(global struct Globals *globals,
|
||||
global struct BVHBase* bvh,
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
|
||||
uint numInstances,
|
||||
global struct AABB *primrefs,
|
||||
uint allowUpdate)
|
||||
{
|
||||
const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
|
||||
if (instanceIndex < numInstances)
|
||||
{
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
|
||||
|
||||
primrefs_from_instances(
|
||||
globals,
|
||||
bvh,
|
||||
instance,
|
||||
instanceIndex,
|
||||
primrefs,
|
||||
0,
|
||||
allowUpdate);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Create primrefs from array of instance descriptors.
|
||||
*/
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
void kernel
|
||||
primrefs_from_DXR_instances_indirect(global struct Globals *globals,
|
||||
global struct BVHBase* bvh,
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
|
||||
global struct IndirectBuildRangeInfo* indirect_data,
|
||||
global struct AABB *primrefs,
|
||||
uint allowUpdate)
|
||||
{
|
||||
// TODO: On DG2, we have 8 dwords of 'inline data' which can be pushed
|
||||
// directly to the kernel. The rest of the kernel args are pulled using
|
||||
// loads from memory. It may be more efficient to put 'numInstances' and
|
||||
// 'allowUpdate' into 'globals'
|
||||
|
||||
const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
|
||||
|
||||
if (instanceIndex < indirect_data->primitiveCount)
|
||||
{
|
||||
instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
|
||||
(((global char*)instances) + indirect_data->primitiveOffset);
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
|
||||
primrefs_from_instances(
|
||||
globals,
|
||||
bvh,
|
||||
instance,
|
||||
instanceIndex,
|
||||
primrefs,
|
||||
0,
|
||||
allowUpdate);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Create primrefs from array of pointers to instance descriptors.
|
||||
*/
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
|
||||
primrefs_from_DXR_instances_pointers(global struct Globals *globals,
|
||||
global struct BVHBase* bvh,
|
||||
global void *instances_in,
|
||||
uint numInstances,
|
||||
global struct AABB *primrefs,
|
||||
uint allowUpdate)
|
||||
{
|
||||
global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
|
||||
(global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
|
||||
|
||||
const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
|
||||
if (instanceIndex < numInstances)
|
||||
{
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
|
||||
|
||||
primrefs_from_instances(
|
||||
globals,
|
||||
bvh,
|
||||
instance,
|
||||
instanceIndex,
|
||||
primrefs,
|
||||
0,
|
||||
allowUpdate);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Create primrefs from array of pointers to instance descriptors.
|
||||
*/
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
void kernel
|
||||
primrefs_from_DXR_instances_pointers_indirect(global struct Globals *globals,
|
||||
global struct BVHBase* bvh,
|
||||
global void *instances_in,
|
||||
global struct AABB *primrefs,
|
||||
global struct IndirectBuildRangeInfo* indirect_data,
|
||||
uint allowUpdate)
|
||||
{
|
||||
global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
|
||||
(global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
|
||||
|
||||
const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
|
||||
|
||||
if (instanceIndex < indirect_data->primitiveCount)
|
||||
{
|
||||
instances = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)
|
||||
(((global char*)instances) + indirect_data->primitiveOffset);
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
|
||||
|
||||
primrefs_from_instances(
|
||||
globals,
|
||||
bvh,
|
||||
instance,
|
||||
instanceIndex,
|
||||
primrefs,
|
||||
0,
|
||||
allowUpdate);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
bool can_pair( uint3 a, uint3 b )
|
||||
{
|
||||
bool match0 = any( a.xxx == b.xyz ) ? 1 : 0;
|
||||
bool match1 = any( a.yyy == b.xyz ) ? 1 : 0;
|
||||
bool match2 = any( a.zzz == b.xyz ) ? 1 : 0;
|
||||
return (match0 + match1 + match2) >= 2;
|
||||
}
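/*
   Example (illustrative): a = (0,1,2) and b = (2,1,3) share the two vertex
   indices 1 and 2, so can_pair(a, b) returns true and the two triangles may
   be merged into one quad.
*/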
|
||||
|
||||
void reduce_bounds(
|
||||
float3 lower,
|
||||
float3 upper,
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* bvh )
|
||||
{
|
||||
|
||||
// reduce centroid bounds... make sure to exclude lanes with invalid AABBs
|
||||
float3 cent = lower + upper;
|
||||
float3 cent_lower = select( (float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper);
|
||||
float3 cent_upper = select(-(float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper);
|
||||
|
||||
// reduce geo bounds
|
||||
AABB3f_atomic_merge_global_sub_group_lu( &bvh->Meta.bounds, lower, upper );
|
||||
AABB_global_atomic_merge_sub_group_lu(&globals->centroidBounds, cent_lower, cent_upper );
|
||||
}
|
||||
|
||||
|
||||
struct TriState
|
||||
{
|
||||
bool valid;
|
||||
uint prim_index;
|
||||
uint pairing;
|
||||
uint3 indices;
|
||||
float3 lower;
|
||||
float3 upper;
|
||||
};
|
||||
|
||||
#define NOT_PAIRED 0xffffffff
|
||||
|
||||
void load_triangle_data(uniform global char* index_buffer,
|
||||
uniform const uint index_format,
|
||||
uniform global char* vertex_buffer,
|
||||
uniform const uint vertex_format,
|
||||
uniform const uint vertex_stride,
|
||||
uniform global float* transform_buffer,
|
||||
uniform uint total_vert_count,
|
||||
struct TriState* state,
|
||||
float4* v)
|
||||
{
|
||||
state->indices = GRL_load_indices_from_buffer(index_buffer, index_format, state->prim_index );
|
||||
|
||||
const uint last_vertex = total_vert_count - 1;
|
||||
const uint x = min(state->indices.x, last_vertex);
|
||||
const uint y = min(state->indices.y, last_vertex);
|
||||
const uint z = min(state->indices.z, last_vertex);
|
||||
|
||||
GRL_load_triangle_vertices(vertex_buffer, vertex_format, vertex_stride, transform_buffer, x, y, z, v);
|
||||
}
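/*
   Note: the indices are clamped to last_vertex only to keep the vertex fetch
   in bounds; load_triangle() below still rejects the primitive if any original
   index was out of range, so the clamp never produces a bogus valid triangle.
*/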
|
||||
|
||||
struct TriState load_triangle( uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
uniform uint base,
|
||||
uniform uint num_prims,
|
||||
uniform uint total_vert_count )
|
||||
{
|
||||
|
||||
struct TriState state;
|
||||
state.pairing = NOT_PAIRED;
|
||||
state.valid = false;
|
||||
state.prim_index = base + get_sub_group_local_id();
|
||||
state.lower = (float3)(INFINITY, INFINITY, INFINITY);
|
||||
state.upper = -(float3)(INFINITY, INFINITY, INFINITY);
|
||||
|
||||
if (state.prim_index < num_prims)
|
||||
{
|
||||
state.valid = true;
|
||||
float4 v[3];
|
||||
load_triangle_data((global char*)geomDesc->Desc.Triangles.pIndexBuffer,
|
||||
geomDesc->Desc.Triangles.IndexFormat,
|
||||
(global char*)geomDesc->Desc.Triangles.pVertexBuffer,
|
||||
geomDesc->Desc.Triangles.VertexFormat,
|
||||
geomDesc->Desc.Triangles.VertexBufferByteStride,
|
||||
(global float*)geomDesc->Desc.Triangles.pTransformBuffer,
|
||||
total_vert_count,
|
||||
&state,
|
||||
v);
|
||||
|
||||
if (state.indices.x >= total_vert_count || state.indices.y >= total_vert_count || state.indices.z >= total_vert_count ||
|
||||
!isfinite_vertex(v[0]) || !isfinite_vertex(v[1]) || !isfinite_vertex(v[2]) ||
|
||||
state.indices.x == state.indices.y || state.indices.x == state.indices.z || state.indices.y == state.indices.z)
|
||||
{
|
||||
state.valid = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
state.lower.xyz = min(v[2].xyz, min(v[1].xyz, v[0].xyz));
|
||||
state.upper.xyz = max(v[2].xyz, max(v[1].xyz, v[0].xyz));
|
||||
}
|
||||
}
|
||||
return state;
|
||||
}
|
||||
|
||||
void broadcast_triangles_local( struct TriState* state )
|
||||
{
|
||||
varying uint my_prim = state->prim_index;
|
||||
varying uint my_pairing = state->pairing;
|
||||
varying float3 my_lower = state->lower;
|
||||
varying float3 my_upper = state->upper;
|
||||
varying bool valid = state->valid;
|
||||
varying uint3 indices = state->indices;
|
||||
|
||||
for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++)
|
||||
{
|
||||
// don't broadcast invalid prims
|
||||
if ( !sub_group_broadcast( valid, broadcast_lane ) )
|
||||
continue;
|
||||
|
||||
uint broadcast_pairing = sub_group_broadcast(my_pairing, broadcast_lane);
|
||||
uint broadcast_prim = sub_group_broadcast(my_prim, broadcast_lane);
|
||||
|
||||
if (broadcast_pairing == NOT_PAIRED)
|
||||
{
|
||||
// if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it
|
||||
bool pairable = false;
|
||||
uint3 other_indices = sub_group_broadcast_uint3( indices, broadcast_lane );
|
||||
if (broadcast_prim != my_prim && my_pairing == NOT_PAIRED && valid )
|
||||
{
|
||||
pairable = can_pair( indices, other_indices );
|
||||
}
|
||||
|
||||
|
||||
uint pairable_lane = ctz(intel_sub_group_ballot(pairable));
|
||||
if (valid && pairable_lane < get_sub_group_size())
|
||||
{
|
||||
// pair the broadcast primitive with the first lane that can accept it
|
||||
float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane);
|
||||
float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane);
|
||||
if (get_sub_group_local_id() == pairable_lane)
|
||||
{
|
||||
my_pairing = broadcast_prim;
|
||||
my_lower.xyz = min(my_lower.xyz, broadcast_lower);
|
||||
my_upper.xyz = max(my_upper.xyz, broadcast_upper);
|
||||
}
|
||||
|
||||
// pair the broadcast primitive with the lane that was just paired to it
|
||||
uint pairable_prim = sub_group_broadcast(my_pairing, pairable_lane);
|
||||
if (get_sub_group_local_id() == broadcast_lane)
|
||||
{
|
||||
my_pairing = pairable_prim;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//
|
||||
// if this lane was already paired with the broadcasting tri
|
||||
// in an earlier loop iteration, then record the pairing in this lane's registers
|
||||
float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane);
|
||||
float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane);
|
||||
if (broadcast_pairing == my_prim)
|
||||
{
|
||||
my_pairing = broadcast_prim;
|
||||
my_lower.xyz = min(my_lower.xyz, broadcast_lower);
|
||||
my_upper.xyz = max(my_upper.xyz, broadcast_upper);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
state->pairing = my_pairing;
|
||||
state->lower = my_lower;
|
||||
state->upper = my_upper;
|
||||
}
|
||||
|
||||
|
||||
void broadcast_triangles_nonlocal(struct TriState* state, const struct TriState* other )
|
||||
{
|
||||
varying uint my_prim = state->prim_index;
|
||||
varying uint my_pairing = state->pairing;
|
||||
varying float3 my_lower = state->lower;
|
||||
varying float3 my_upper = state->upper;
|
||||
varying bool valid = state->valid;
|
||||
varying uint3 indices = state->indices;
|
||||
|
||||
for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++)
|
||||
{
|
||||
// don't broadcast invalid prims
|
||||
if (!sub_group_broadcast(other->valid, broadcast_lane))
|
||||
continue;
|
||||
|
||||
uint broadcast_pairing = sub_group_broadcast(other->pairing, broadcast_lane);
|
||||
uint broadcast_prim = sub_group_broadcast(other->prim_index, broadcast_lane);
|
||||
|
||||
if (broadcast_pairing == NOT_PAIRED)
|
||||
{
|
||||
// if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it
|
||||
bool pairable = false;
|
||||
if ( my_pairing == NOT_PAIRED && valid )
|
||||
{
|
||||
uint3 other_indices = sub_group_broadcast_uint3(other->indices, broadcast_lane);
|
||||
pairable = can_pair(indices, other_indices);
|
||||
}
|
||||
|
||||
// pair the broadcast primitive with the first lane that can accept it
|
||||
uint pairable_mask = intel_sub_group_ballot(pairable);
|
||||
if (valid && (ctz(pairable_mask) == get_sub_group_local_id()))
|
||||
{
|
||||
my_pairing = broadcast_prim;
|
||||
my_lower.xyz = min(my_lower.xyz, sub_group_broadcast_float3(other->lower.xyz, broadcast_lane));
|
||||
my_upper.xyz = max(my_upper.xyz, sub_group_broadcast_float3(other->upper.xyz, broadcast_lane));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
state->pairing = my_pairing;
|
||||
state->lower = my_lower;
|
||||
state->upper = my_upper;
|
||||
}
|
||||
|
||||
GRL_INLINE void do_triangles_to_primrefs(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global struct AABB* primref,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
uint geomID_and_flags,
|
||||
const uint num_prims)
|
||||
{
|
||||
uint geomID = geomID_and_flags & 0x00ffffff;
|
||||
uint geom_flags = geomID_and_flags >> 24;
|
||||
uint prim_base = get_group_id(0) * get_local_size(0);
|
||||
uint total_vert_count = GRL_get_triangles_VertexCount(geomDesc);
|
||||
|
||||
struct TriState tri = load_triangle( geomDesc, prim_base, num_prims, total_vert_count );
|
||||
broadcast_triangles_local( &tri );
|
||||
|
||||
|
||||
// a lane produces output if it stays unpaired (my_pairing == NOT_PAIRED)
|
||||
// or if it holds the smaller primitive index of a merged pair
|
||||
bool will_write = (tri.pairing > tri.prim_index) && tri.valid;
|
||||
uint write_mask = intel_sub_group_ballot(will_write);
|
||||
uint write_offs = subgroup_bit_prefix_exclusive( write_mask );
|
||||
uint write_count = popcount(write_mask);
|
||||
|
||||
// allocate space in primref buffer
|
||||
uint write_base;
|
||||
if( get_sub_group_local_id() == 0 )
|
||||
write_base = atomic_add_global( &globals->numPrimitives, write_count );
|
||||
write_offs += sub_group_broadcast( write_base, 0 );
|
||||
|
||||
uint primID0 = tri.prim_index;
|
||||
uint primID1 = (tri.pairing != NOT_PAIRED) ? tri.pairing : tri.prim_index;
|
||||
|
||||
if (will_write)
|
||||
{
|
||||
PrimRef ref;
|
||||
PRIMREF_setAABB(&ref, tri.lower.xyz, tri.upper.xyz);
|
||||
PRIMREF_setQuadMetaData(&ref, primID0, primID1, geomID, geom_flags);
|
||||
uint8 val = (uint8)(
|
||||
as_uint(ref.lower.x), as_uint(ref.lower.y), as_uint(ref.lower.z), as_uint(ref.lower.w),
|
||||
as_uint(ref.upper.x), as_uint(ref.upper.y), as_uint(ref.upper.z), as_uint(ref.upper.w));
|
||||
store_uint8_L1WB_L3WB((global uint8*)(primref + write_offs), 0, val);
|
||||
}
|
||||
|
||||
reduce_bounds( tri.lower, tri.upper, globals, bvh );
|
||||
}
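/*
   Compaction sketch (illustrative values): in a 16-wide subgroup where only
   lanes {1, 4, 5} set will_write, write_mask = 0x32, write_count = 3 and
   subgroup_bit_prefix_exclusive gives those lanes offsets 0, 1 and 2 within
   the block that lane 0 reserves with a single atomic_add_global.
*/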
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
triangles_to_primrefs(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global struct AABB* primref,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
uint geomID_and_flags,
|
||||
uint num_prims
|
||||
)
|
||||
{
|
||||
do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel
|
||||
triangles_to_primrefs_indirect(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global struct AABB* primref,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
global struct IndirectBuildRangeInfo* indirect_data,
|
||||
uint geomID_and_flags)
|
||||
{
|
||||
const uint num_prims = indirect_data->primitiveCount;
|
||||
do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
|
||||
}
|
||||
|
||||
GRL_INLINE void do_procedurals_to_primrefs(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global struct AABB* primref,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
uint geomID_and_flags,
|
||||
const uint num_prims)
|
||||
{
|
||||
uint geomID = geomID_and_flags & 0x00ffffff;
|
||||
uint geomFlags = geomID_and_flags >> 24;
|
||||
|
||||
uint primID = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();
|
||||
|
||||
bool create_primref = false;
|
||||
float3 lower = (float3)(INFINITY, INFINITY, INFINITY);
|
||||
float3 upper = -(float3)(INFINITY, INFINITY, INFINITY);
|
||||
if (primID < num_prims)
|
||||
{
|
||||
/* check if procedural is valid */
|
||||
struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(geomDesc, primID);
|
||||
const bool valid_min = isfinite(aabb.MinX) && isfinite(aabb.MinY) && isfinite(aabb.MinZ);
|
||||
const bool valid_max = isfinite(aabb.MaxX) && isfinite(aabb.MaxY) && isfinite(aabb.MaxZ);
|
||||
if (valid_min & valid_max)
|
||||
{
|
||||
/* load aabb from memory */
|
||||
float3 l = (float3)(aabb.MinX, aabb.MinY, aabb.MinZ);
|
||||
float3 u = (float3)(aabb.MaxX, aabb.MaxY, aabb.MaxZ);
|
||||
|
||||
// handle inverted boxes by taking the per-component min/max of the two corners
|
||||
lower = min( l, u );
|
||||
upper = max( l, u );
|
||||
|
||||
create_primref = true;
|
||||
}
|
||||
}
|
||||
|
||||
uint write_mask = intel_sub_group_ballot(create_primref);
|
||||
uint write_offs = subgroup_bit_prefix_exclusive(write_mask);
|
||||
uint write_count = popcount(write_mask);
|
||||
|
||||
// allocate space in primref buffer
|
||||
uint write_base;
|
||||
if (get_sub_group_local_id() == 0)
|
||||
write_base = atomic_add_global(&globals->numPrimitives, write_count);
|
||||
write_offs += sub_group_broadcast(write_base, 0);
|
||||
|
||||
// write the primref
|
||||
if (create_primref)
|
||||
{
|
||||
PrimRef ref;
|
||||
PRIMREF_setAABB(&ref, lower.xyz, upper.xyz);
|
||||
PRIMREF_setProceduralMetaData(&ref, geomID, primID, geomFlags);
|
||||
primref[write_offs] = ref;
|
||||
}
|
||||
|
||||
reduce_bounds(lower, upper, globals, bvh);
|
||||
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
procedurals_to_primrefs(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global struct AABB* primref,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
uint geomID_and_flags,
|
||||
uint num_prims
|
||||
)
|
||||
{
|
||||
do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel
|
||||
procedurals_to_primrefs_indirect(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global struct AABB* primref,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
global const struct IndirectBuildRangeInfo* indirect_data,
|
||||
uint geomID_and_flags
|
||||
)
|
||||
{
|
||||
const uint num_prims = indirect_data->primitiveCount;
|
||||
do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
|
||||
}
|
||||
|
|
@ -1,246 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#if 0
|
||||
/*
|
||||
|
||||
Create primrefs from array of instance descriptors.
|
||||
|
||||
*/
|
||||
|
||||
void store_instance_primref(
|
||||
global struct BVHBase* top_bvh,
|
||||
global struct Globals* globals,
|
||||
global PrimRef* primrefs,
|
||||
bool alloc_primref,
|
||||
PrimRef new_primref )
|
||||
{
|
||||
uint allocatePrimref = alloc_primref ? 1 : 0;
|
||||
uint index = 0;
|
||||
uint numAllocations = sub_group_reduce_add(allocatePrimref);
|
||||
|
||||
if (get_sub_group_local_id() == 0)
|
||||
{
|
||||
index = atomic_add_global(&globals->numPrimitives, numAllocations);
|
||||
}
|
||||
|
||||
index = sub_group_broadcast(index, 0);
|
||||
index = index + sub_group_scan_exclusive_add(allocatePrimref);
|
||||
|
||||
if (allocatePrimref)
|
||||
{
|
||||
primrefs[index] = new_primref;
|
||||
}
|
||||
|
||||
struct AABB centroidBounds;
|
||||
centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref);
|
||||
struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
|
||||
struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(¢roidBounds);
|
||||
|
||||
if (get_sub_group_local_id() == 0)
|
||||
{
|
||||
AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz);
|
||||
AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Compute the transformed blas AABB. Returns false if the instance has no acceleration structure
|
||||
bool create_instance_primref(
|
||||
PrimRef* ref_out,
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
|
||||
global struct BVHBase* bvh,
|
||||
uint instanceMask,
|
||||
uint instanceIndex
|
||||
)
|
||||
{
|
||||
struct AABB3f bbox;
|
||||
bool alloc_primref = false;
|
||||
uint rootNodeOffset = NO_NODE_OFFSET;
|
||||
if (bvh != 0)
|
||||
{
|
||||
alloc_primref = true;
|
||||
AABB3f AS_bounds = BVHBase_GetRootAABB(bvh);
|
||||
|
||||
const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]);
|
||||
const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]);
|
||||
|
||||
if (!valid_min || !valid_max || instanceMask == 0)
|
||||
{
|
||||
// degenerate instance case
|
||||
|
||||
// TODO this should be under if ( allocate backpointers )
|
||||
{
|
||||
// we have to allocate the primref because this instance can later be updated to a non-degenerate one
|
||||
// take the origin of the instance as a bounding box.
|
||||
|
||||
bbox.lower[0] = instance->Transform[3];
|
||||
bbox.lower[1] = instance->Transform[7];
|
||||
bbox.lower[2] = instance->Transform[11];
|
||||
bbox.upper[0] = instance->Transform[3];
|
||||
bbox.upper[1] = instance->Transform[7];
|
||||
bbox.upper[2] = instance->Transform[11];
|
||||
instanceMask = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
rootNodeOffset = BVH_ROOT_NODE_OFFSET;
|
||||
float transformOverhead = 0.0f;
|
||||
bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead);
|
||||
}
|
||||
}
|
||||
|
||||
*ref_out = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, 0);
|
||||
return alloc_primref;
|
||||
}
|
||||
|
||||
GRL_INLINE void primrefs_from_instances(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* top_bvh,
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
|
||||
uint instanceIndex,
|
||||
global struct AABB* primrefs)
|
||||
{
|
||||
bool alloc_primref = false;
|
||||
PrimRef new_primref;
|
||||
AABB_init(&new_primref);
|
||||
|
||||
if (instance)
|
||||
{
|
||||
uint mask = GRL_get_InstanceMask(instance);
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)instance->AccelerationStructure;
|
||||
alloc_primref = create_instance_primref(&new_primref, instance, bvh, mask, instanceIndex);
|
||||
}
|
||||
|
||||
store_instance_primref(top_bvh, globals, primrefs, alloc_primref, new_primref);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
GRL_INLINE void primrefs_from_instances(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* top_bvh,
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
|
||||
uint instanceIndex,
|
||||
global struct AABB* primrefs,
|
||||
global GRL_RAYTRACING_AABB* procedural_aabb,
|
||||
uint allowUpdate
|
||||
)
|
||||
{
|
||||
struct AABB3f bbox;
|
||||
uint allocatePrimref = 0;
|
||||
|
||||
uint rootNodeOffset = NO_NODE_OFFSET;
|
||||
uint instanceMask = 0;
|
||||
|
||||
bool is_procedural = (procedural_aabb != 0);
|
||||
|
||||
if( instance )
|
||||
{
|
||||
instanceMask = GRL_get_InstanceMask(instance) ;
|
||||
if ( is_procedural )
|
||||
{
|
||||
// procedural instance primref
|
||||
allocatePrimref = 1;
|
||||
|
||||
float3 lower = (float3)(procedural_aabb->MinX, procedural_aabb->MinY, procedural_aabb->MinZ);
|
||||
float3 upper = (float3)(procedural_aabb->MaxX, procedural_aabb->MaxY, procedural_aabb->MaxZ);
|
||||
|
||||
if (instanceMask == 0 || any(lower > upper))
|
||||
{
|
||||
bbox.lower[0] = instance->Transform[3];
|
||||
bbox.lower[1] = instance->Transform[7];
|
||||
bbox.lower[2] = instance->Transform[11];
|
||||
bbox.upper[0] = instance->Transform[3];
|
||||
bbox.upper[1] = instance->Transform[7];
|
||||
bbox.upper[2] = instance->Transform[11];
|
||||
instanceMask = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
bbox = transform_aabb(lower, upper, instance->Transform);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// HW-instance primref
|
||||
|
||||
global struct BVHBase* bvh = instance ?
|
||||
(global struct BVHBase*)instance->AccelerationStructure :
|
||||
0;
|
||||
|
||||
if (bvh != 0)
|
||||
{
|
||||
AABB3f AS_bounds = BVHBase_GetRootAABB(bvh);
|
||||
|
||||
const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]);
|
||||
const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]);
|
||||
|
||||
|
||||
if (valid_min && valid_max && instanceMask != 0)
|
||||
{
|
||||
allocatePrimref = 1;
|
||||
rootNodeOffset = BVH_ROOT_NODE_OFFSET;
|
||||
float transformOverhead = 0.0f;
|
||||
bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead);
|
||||
}
|
||||
else if (allowUpdate)
|
||||
{
|
||||
// degenerate instance case
|
||||
// we have to allocate the primref because this instance can later be updated to a non-degenerate one
|
||||
// take the origin of the instance as a bounding box.
|
||||
allocatePrimref = 1;
|
||||
bbox.lower[0] = instance->Transform[3];
|
||||
bbox.lower[1] = instance->Transform[7];
|
||||
bbox.lower[2] = instance->Transform[11];
|
||||
bbox.upper[0] = instance->Transform[3];
|
||||
bbox.upper[1] = instance->Transform[7];
|
||||
bbox.upper[2] = instance->Transform[11];
|
||||
instanceMask = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint index = 0;
|
||||
uint numAllocations = sub_group_reduce_add(allocatePrimref);
|
||||
|
||||
if (get_sub_group_local_id() == 0)
|
||||
{
|
||||
index = atomic_add_global(&globals->numPrimitives, numAllocations);
|
||||
}
|
||||
|
||||
index = sub_group_broadcast(index, 0);
|
||||
index = index + sub_group_scan_exclusive_add(allocatePrimref);
|
||||
|
||||
struct AABB new_primref;
|
||||
struct AABB centroidBounds;
|
||||
if (allocatePrimref)
|
||||
{
|
||||
new_primref = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, is_procedural);
|
||||
primrefs[index] = new_primref;
|
||||
centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref);
|
||||
}
|
||||
else
|
||||
{
|
||||
AABB_init(&new_primref);
|
||||
AABB_init(¢roidBounds);
|
||||
}
|
||||
|
||||
|
||||
struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
|
||||
struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(¢roidBounds);
|
||||
|
||||
if (get_sub_group_local_id() == 0)
|
||||
{
|
||||
AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz);
|
||||
AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds);
|
||||
}
|
||||
}
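/*
   Summary (descriptive, not normative): the procedural path transforms the
   caller-supplied AABB by the instance transform; the HW-instance path
   transforms the blas root AABB. Degenerate or masked-out instances collapse
   to a point at the instance origin with instanceMask = 0 (for HW instances
   only when allowUpdate is set, so that a later update can revive them).
*/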
|
||||
#endif
|
||||
|
|
@ -1,491 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "bvh_build_refit.h"
|
||||
#include "api_interface.h"
|
||||
#include "common.h"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__( (reqd_work_group_size( 16, 1, 1 )) )
|
||||
void kernel
|
||||
update_instance_leaves( global struct BVHBase* bvh,
|
||||
uint64_t dxrInstancesArray,
|
||||
uint64_t dxrInstancesPtr,
|
||||
global struct AABB3f* instance_aabb_scratch
|
||||
)
|
||||
{
|
||||
uint num_leaves = BVHBase_GetNumHWInstanceLeaves( bvh );
|
||||
uint id = get_local_id( 0 ) + get_local_size( 0 ) * get_group_id( 0 );
|
||||
if ( id >= num_leaves )
|
||||
return;
|
||||
|
||||
global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
|
||||
(global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
|
||||
global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
|
||||
(global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;
|
||||
|
||||
global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
|
||||
|
||||
/* iterate over all children of the instance node and get their bounds */
|
||||
|
||||
uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex( &leafs[id] );
|
||||
global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
|
||||
if ( dxrInstancesArray != NULL )
|
||||
instance = &instancesArray[instanceIdx];
|
||||
else
|
||||
instance = instancesPtrArray[instanceIdx];
|
||||
|
||||
struct AffineSpace3f xfm = AffineSpace3f_load_row_major( instance->Transform );
|
||||
global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
|
||||
struct AABB3f newSubtreeBounds = instanceBvh->Meta.bounds;
|
||||
struct AABB3f bbox = AABB3f_transform( xfm, newSubtreeBounds ); // JDB TODO: Use faster abs-matrix method
|
||||
|
||||
const bool valid_min = isfinite( bbox.lower[0] ) && isfinite( bbox.lower[1] ) && isfinite( bbox.lower[2] );
|
||||
const bool valid_max = isfinite( bbox.upper[0] ) && isfinite( bbox.upper[1] ) && isfinite( bbox.upper[2] );
|
||||
|
||||
uint mask = GRL_get_InstanceMask(instance);
|
||||
|
||||
uint offset = instanceBvh->rootNodeOffset;
|
||||
if ( !valid_min || !valid_max )
|
||||
{
|
||||
bbox.lower[0] = xfm.p.x;
|
||||
bbox.lower[1] = xfm.p.y;
|
||||
bbox.lower[2] = xfm.p.z;
|
||||
bbox.upper[0] = xfm.p.x;
|
||||
bbox.upper[1] = xfm.p.y;
|
||||
bbox.upper[2] = xfm.p.z;
|
||||
offset = NO_NODE_OFFSET;
|
||||
mask = 0;
|
||||
}
|
||||
|
||||
instance_aabb_scratch[id] = bbox;
|
||||
|
||||
HwInstanceLeaf_Constructor( &leafs[id], instance, instanceIdx, offset, mask ); // TODO: No instance opening for refittable BVH
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel
|
||||
update_instance_leaves(global struct BVHBase* bvh,
|
||||
uint64_t dxrInstancesArray,
|
||||
uint64_t dxrInstancesPtr,
|
||||
global struct AABB3f* instance_aabb_scratch
|
||||
)
|
||||
{
|
||||
uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
|
||||
uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
|
||||
if (id >= num_leaves)
|
||||
return;
|
||||
|
||||
DO_update_instance_leaves(
|
||||
bvh,
|
||||
dxrInstancesArray,
|
||||
dxrInstancesPtr,
|
||||
instance_aabb_scratch,
|
||||
id,
|
||||
0 );
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel
|
||||
update_instance_leaves_indirect(global struct BVHBase* bvh,
|
||||
uint64_t dxrInstancesArray,
|
||||
uint64_t dxrInstancesPtr,
|
||||
global struct AABB3f* instance_aabb_scratch,
|
||||
global struct IndirectBuildRangeInfo* indirect_data)
|
||||
{
|
||||
uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
|
||||
uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
|
||||
if (id >= num_leaves)
|
||||
return;
|
||||
|
||||
DO_update_instance_leaves(
|
||||
bvh,
|
||||
dxrInstancesArray + indirect_data->primitiveOffset,
|
||||
dxrInstancesPtr,
|
||||
instance_aabb_scratch,
|
||||
id,
|
||||
0 );
|
||||
}
|
||||
|
||||
#if 0
|
||||
/*
|
||||
|
||||
This kernel refits a BVH. The algorithm iterates over all BVH nodes
|
||||
to find all leaf nodes, which is where refitting starts. For these
|
||||
leaf nodes the bounds get recalculated and then propagated up the tree.
|
||||
|
||||
One kernel instance considers a range of inner nodes as startpoints.
|
||||
*/
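/*
   Range partition example (illustrative): with numInnerNodes = 100 and
   get_num_groups(0) = 4, group 1 gets startID = 25 and endID = 50, and its
   work items stride through that range in steps of get_local_size(0).
*/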
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(8, 1, 1))) void kernel refit(
|
||||
global struct BVHBase *bvh,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
|
||||
global struct AABB3f* instance_leaf_aabbs )
|
||||
{
|
||||
/* here we temporarily store the bounds for the children of a node */
|
||||
struct AABB childrenAABB[BVH_NODE_N6];
|
||||
|
||||
/* get pointer to inner nodes and back pointers */
|
||||
global struct QBVHNodeN *inner_nodes = BVHBase_rootNode(bvh);
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
|
||||
|
||||
/* construct range of nodes that each work group will process */
|
||||
const uint numInnerNodes = BVHBase_numNodes(bvh);
|
||||
const uint startID = (get_group_id(0) + 0) * numInnerNodes / get_num_groups(0);
|
||||
const uint endID = (get_group_id(0) + 1) * numInnerNodes / get_num_groups(0);
|
||||
|
||||
/* each workgroup iterates over its range of nodes */
|
||||
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
|
||||
{
|
||||
global struct QBVHNodeN* curNode = &inner_nodes[i];
|
||||
uint numChildren = refit_bottom(bvh, geosArray,
|
||||
instance_leaf_aabbs,
|
||||
curNode,
|
||||
childrenAABB,
|
||||
*InnerNode_GetBackPointer(backPointers, i));
|
||||
if (numChildren != 0)
|
||||
{
|
||||
/* update bounds of node */
|
||||
QBVHNodeN_setBounds(curNode, childrenAABB, numChildren);
|
||||
|
||||
/* refit upper parts of the BVH */
|
||||
// TODO: this will not work for mixed nodes
|
||||
refit_bottom_up(curNode, bvh, childrenAABB, numChildren);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(8, 1, 1)))
|
||||
void kernel Find_refit_treelets(
|
||||
global struct BVHBase* bvh,
|
||||
global TreeletNodeData* treelets,
|
||||
global uint* scratchStartpoints,
|
||||
global uint* startpointAlloc)
|
||||
{
|
||||
find_refit_treelets(bvh,
|
||||
treelets,
|
||||
scratchStartpoints,
|
||||
startpointAlloc);
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel Assign_refit_startpoints_to_treelets(
|
||||
global struct BVHBase* bvh,
|
||||
global TreeletNodeData* treelets,
|
||||
global uint* scratchStartpoints)
|
||||
{
|
||||
assign_refit_startpoints_to_treelets(bvh, treelets, scratchStartpoints);
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(128, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel Finalize_treelets_in_groups(
|
||||
global struct BVHBase* bvh,
|
||||
global uint* scratchStartpoints )
|
||||
{
|
||||
local uint depths[FINALIZE_TREELETS_SLM_DEPTHS_SPACE];
|
||||
|
||||
finalize_treelets_in_groups(bvh, scratchStartpoints, depths);
|
||||
}
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel Refit_quads_tree_per_group(global SquashedInput* psqinputs)
|
||||
{
|
||||
uint group_id = get_group_id(0);
|
||||
SquashedInput sqinput = psqinputs[group_id];
|
||||
global struct BVHBase* bvh = sqinput.pBvh;
|
||||
uint numLeaves = BVHBase_GetNumQuads(bvh);
|
||||
global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
|
||||
|
||||
global void* input = sqinput.pInput;
|
||||
global struct AABB* bbox_scratch = sqinput.bbox_scratch;
|
||||
|
||||
uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
|
||||
uint id = get_local_id(0);
|
||||
|
||||
for (uint leaf_id = id; leaf_id < numLeaves; leaf_id += get_local_size(0))
|
||||
{
|
||||
struct AABB theAABB;
|
||||
refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB);
|
||||
theAABB.lower.w = as_float(0xABBADEFFu);
|
||||
bbox_scratch[leafsIndexOffset + leaf_id] = theAABB;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(32, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel Refit_quads(
|
||||
global struct BVHBase* bvh,
|
||||
global void* input,
|
||||
global struct AABB* bbox_scratch,
|
||||
uint numGroupsExecuted,
|
||||
global SquashedInputGroupDesc* sqinput)
|
||||
{
|
||||
uint numLeafs = BVHBase_GetNumQuads(bvh);
|
||||
if (numLeafs == 0) return;
|
||||
global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
|
||||
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
|
||||
uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
|
||||
|
||||
uint numLeafsPerGr = (numLeafs + (numGroupsExecuted - 1)) / numGroupsExecuted;
|
||||
|
||||
uint id_start = get_group_id(0) * numLeafsPerGr + get_local_id(0);
|
||||
uint id_end = min(id_start + numLeafsPerGr, numLeafs);
|
||||
for (uint id = id_start; id < id_end; id+= get_local_size(0))
|
||||
{
|
||||
struct AABB theAABB;
|
||||
refit_bottom_child_quad(leafs + id, geosArray, &theAABB);
|
||||
theAABB.lower.w = as_float(0xABBADEFFu);
|
||||
bbox_scratch[leafsIndexOffset + id] = theAABB;
|
||||
}
|
||||
|
||||
if (get_group_id(0) == 0 && get_local_id(0) < 16)
|
||||
{
|
||||
|
||||
uint groupnr;
|
||||
uint treeletCnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
groupnr = atomic_add_global(&sqinput->totalNumGroups, treeletCnt);
|
||||
}
|
||||
groupnr = sub_group_broadcast(groupnr, 0);
|
||||
for (uint subtree = get_sub_group_local_id(); subtree < treeletCnt; subtree += get_sub_group_size())
|
||||
{
|
||||
uint gr = groupnr + subtree;
|
||||
//printf("tree %llx, treelet %d/%d, grId %d, numStartpoints %d\n", bvh, subtree,treeletCnt, gr, BVHBase_GetRefitTreeletDescs(bvh)[subtree].numStartpoints);
|
||||
sqinput[gr].bvh = (qword)bvh;
|
||||
sqinput[gr].scratch = (qword)bbox_scratch;
|
||||
sqinput[gr].groupInTree = subtree;
|
||||
}
|
||||
//if (get_local_id(0)==0 && treeletCnt > 1)
|
||||
//{
|
||||
// printf("tree %llx, tip treelet %d/%d = numStartpoints %d depth %d\n", bvh, treeletCnt, treeletCnt, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].numStartpoints, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].maxDepth);
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel
|
||||
Refit_tree_per_group_quad(
|
||||
global SquashedInput* psqinputs)
|
||||
{
|
||||
uint group_id = get_group_id(0);
|
||||
SquashedInput sqinput = psqinputs[group_id];
|
||||
global struct BVHBase* bvh = sqinput.pBvh;
|
||||
global struct AABB* bbox_scratch = sqinput.bbox_scratch;
|
||||
global void* pInput = sqinput.pInput;
|
||||
local Treelet_by_single_group_locals loc;
|
||||
|
||||
if (*BVHBase_GetRefitTreeletCntPtr(bvh) == 0)
|
||||
return;
|
||||
|
||||
#if REFIT_DEBUG_CHECKS
|
||||
uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
|
||||
if (bottoms_cnt != 1) {
|
||||
if (get_local_id(0) == 0)
|
||||
{
|
||||
printf("Error: this tree has more than 1 treelets!\n");
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* get pointer to inner nodes and back pointers */
|
||||
uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
|
||||
|
||||
// uniform per group
|
||||
uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh);
|
||||
|
||||
uint numLeafs = bvh->quadLeafCur - bvh->quadLeafStart;
|
||||
|
||||
if (numLeafs == 0) { return; }
|
||||
|
||||
uint numLeafsByOneThread = (numLeafs + (get_local_size(0) - 1)) / get_local_size(0);
|
||||
|
||||
update_quads(bvh, pInput, bbox_scratch, get_local_id(0), numLeafsByOneThread);
|
||||
|
||||
mem_fence_workgroup_default(); work_group_barrier(0);
|
||||
|
||||
RefitTreelet trltDsc = *pTrltDsc;
|
||||
|
||||
refit_treelet_by_single_group(
|
||||
bbox_scratch,
|
||||
&loc,
|
||||
bvh,
|
||||
trltDsc,
|
||||
false,
|
||||
true);
|
||||
|
||||
if (trltDsc.maxDepth > 0)
|
||||
{
|
||||
mem_fence_workgroup_default(); work_group_barrier(0);
|
||||
post_refit_encode_qnode_tree_per_group(bbox_scratch,bvh);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel
|
||||
Refit_treelet_per_group(
|
||||
global SquashedInputGroupDesc* sqinput)
|
||||
{
|
||||
uint group_id = get_group_id(0);
|
||||
global struct AABB* bbox_scratch = (global struct AABB* )sqinput[group_id].scratch;
|
||||
global struct BVHBase* bvh = (global struct BVHBase* )sqinput[group_id].bvh;
|
||||
group_id = sqinput[group_id].groupInTree;
|
||||
|
||||
/* get pointer to inner nodes and back pointers */
|
||||
uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
|
||||
|
||||
uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
|
||||
|
||||
// uniform per group
|
||||
uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh);
|
||||
|
||||
bool should_we_process_treetip = true;
|
||||
local Treelet_by_single_group_locals loc;
|
||||
local bool* l_should_we_process_treetip = (local bool*)&loc;
|
||||
#if REFIT_VERBOSE_LOG
|
||||
if (group_id != 0) return;
|
||||
#endif
|
||||
|
||||
if (bottoms_cnt > 1)
|
||||
{
|
||||
#if REFIT_VERBOSE_LOG
|
||||
for (; group_id < bottoms_cnt; group_id++)
|
||||
{
|
||||
if (get_local_id(0) == 0) { printf("\n ====== treelet %d ====== \n", group_id); }
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, memory_scope_device);
|
||||
#endif
|
||||
bool rootProcThread = refit_treelet_by_single_group(
|
||||
bbox_scratch,
|
||||
&loc,
|
||||
bvh,
|
||||
pTrltDsc[group_id],
|
||||
true,
|
||||
false);
|
||||
|
||||
// we have to make the last group that finishes go up and process the treetip
|
||||
if (rootProcThread)
|
||||
{
|
||||
|
||||
mem_fence_gpu_invalidate();
|
||||
uint finished_cnt = atomic_inc_global((global uint*) & bvh->refitTreeletCnt2);
|
||||
should_we_process_treetip = finished_cnt + 1 == bottoms_cnt;
|
||||
|
||||
* l_should_we_process_treetip = should_we_process_treetip;
|
||||
|
||||
if (should_we_process_treetip) mem_fence_gpu_invalidate();
|
||||
}
|
||||
#if REFIT_VERBOSE_LOG
|
||||
}
|
||||
#endif
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
|
||||
|
||||
should_we_process_treetip = *l_should_we_process_treetip;
|
||||
}
|
||||
|
||||
if (should_we_process_treetip)
|
||||
{
|
||||
//this group will process treetip
|
||||
if (get_local_id(0) == 0) { bvh->refitTreeletCnt2 = 0; }
|
||||
if (bottoms_cnt == 1) { bottoms_cnt = 0; }
|
||||
refit_treelet_by_single_group(
|
||||
bbox_scratch,
|
||||
&loc,
|
||||
bvh,
|
||||
pTrltDsc[bottoms_cnt],
|
||||
true,
|
||||
true);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
This kernel refits a BVH. The algorithm iterates over all BVH nodes
|
||||
to find all leaf nodes, which is where refitting starts. For these
|
||||
leaf nodes the bounds get recalculated and then propagated up the tree.
|
||||
|
||||
One kernel instance considers exactly one inner node as a startpoint,
|
||||
not a range of inner nodes.
|
||||
*/
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(8, 1, 1))) void kernel
|
||||
Refit_per_one_startpoint(
|
||||
global struct BVHBase* bvh,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
|
||||
global struct AABB3f* instance_leaf_aabbs )
|
||||
{
|
||||
/* here we temporarily store the bounds for the children of a node */
|
||||
struct AABB childrenAABB[BVH_NODE_N6];
|
||||
|
||||
/* get pointer to inner nodes and back pointers */
|
||||
global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
|
||||
|
||||
/* get the inner node that we will consider as a bottom startpoint */
|
||||
const uint numInnerNodes = BVHBase_numNodes(bvh);
|
||||
const uint innerNodeIdx = (get_group_id(0) + 0) * get_local_size(0) + get_local_id(0);
|
||||
|
||||
if (innerNodeIdx >= numInnerNodes) return;
|
||||
|
||||
global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
|
||||
uint numChildren = refit_bottom(
|
||||
bvh,
|
||||
geosArray,
|
||||
instance_leaf_aabbs,
|
||||
curNode,
|
||||
childrenAABB,
|
||||
*InnerNode_GetBackPointer(backPointers, innerNodeIdx));
|
||||
|
||||
if (numChildren != 0)
|
||||
{
|
||||
/* update bounds of node */
|
||||
QBVHNodeN_setBounds(curNode, childrenAABB, numChildren);
|
||||
|
||||
/* refit upper parts of the BVH */
|
||||
/* TODO: this will not work for mixed nodes */
|
||||
refit_bottom_up(curNode, bvh, childrenAABB, numChildren);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel
|
||||
Refit_indirect_sg(
|
||||
global struct BVHBase* bvh,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
|
||||
global struct AABB3f* instance_leaf_aabbs)
|
||||
{
|
||||
DO_Refit_per_one_startpoint_sg(bvh, geosArray, instance_leaf_aabbs, 0);
|
||||
|
||||
}
|
||||
|
|
@ -1,546 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include "api_interface.h"
|
||||
#include "instance.h"
|
||||
#include "GRLGen12.h"
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel
|
||||
DO_update_instance_leaves(global struct BVHBase* bvh,
|
||||
uint64_t dxrInstancesArray,
|
||||
uint64_t dxrInstancesPtr,
|
||||
global struct AABB3f* instance_aabb_scratch,
|
||||
uint id ,
|
||||
global struct GRL_RAYTRACING_AABB* procedural_box
|
||||
)
|
||||
{
|
||||
|
||||
global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
|
||||
(global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
|
||||
global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
|
||||
(global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;
|
||||
|
||||
global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);
|
||||
|
||||
|
||||
/* iterate over all children of the instance node and get their bounds */
|
||||
|
||||
uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex(&leafs[id]);
|
||||
global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
|
||||
if (dxrInstancesArray != NULL)
|
||||
instance = &instancesArray[instanceIdx];
|
||||
else
|
||||
instance = instancesPtrArray[instanceIdx];
|
||||
|
||||
uint mask = GRL_get_InstanceMask(instance);
|
||||
uint offset = NO_NODE_OFFSET;
|
||||
|
||||
struct AffineSpace3f xfm = AffineSpace3f_load_row_major(instance->Transform);
|
||||
struct AABB3f bbox;
|
||||
|
||||
if (procedural_box != 0)
|
||||
{
|
||||
bbox.lower[0] = procedural_box->MinX;
|
||||
bbox.lower[1] = procedural_box->MinY;
|
||||
bbox.lower[2] = procedural_box->MinZ;
|
||||
bbox.upper[0] = procedural_box->MaxX;
|
||||
bbox.upper[1] = procedural_box->MaxY;
|
||||
bbox.upper[2] = procedural_box->MaxZ;
|
||||
}
|
||||
else
|
||||
{
|
||||
global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
|
||||
bbox = instanceBvh->Meta.bounds;
|
||||
offset = BVH_ROOT_NODE_OFFSET;
|
||||
}
|
||||
|
||||
|
||||
const bool valid_min = isfinite(bbox.lower[0]) && isfinite(bbox.lower[1]) && isfinite(bbox.lower[2]);
|
||||
const bool valid_max = isfinite(bbox.upper[0]) && isfinite(bbox.upper[1]) && isfinite(bbox.upper[2]);
|
||||
|
||||
if (!valid_min || !valid_max )
|
||||
{
|
||||
bbox.lower[0] = xfm.p.x;
|
||||
bbox.lower[1] = xfm.p.y;
|
||||
bbox.lower[2] = xfm.p.z;
|
||||
bbox.upper[0] = xfm.p.x;
|
||||
bbox.upper[1] = xfm.p.y;
|
||||
bbox.upper[2] = xfm.p.z;
|
||||
offset = NO_NODE_OFFSET;
|
||||
mask = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
bbox = AABB3f_transform(xfm, bbox); // JDB TODO: Use faster abs-matrix method
|
||||
}
|
||||
|
||||
instance_aabb_scratch[id] = bbox;
|
||||
|
||||
HwInstanceLeaf_Constructor(&leafs[id], instance, instanceIdx, offset, mask); // TODO: No instance opening for refittable BVH
|
||||
}
|
||||
|
||||
/*
|
||||
This function starts at some BVH node and refits all nodes upwards
|
||||
to the root. At each node the algorithm only proceeds upwards if
|
||||
all children of the current node have already been processed. This
|
||||
is checked by incrementing an atomic counter each time a node is
|
||||
reached; the counter eventually reaches the number of children of
|
||||
the node.
|
||||
*/
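/*
   Backpointer layout assumed by the loop below (inferred from the bit masks
   used in this file, not an authoritative spec):
     bits 0..2  counter of children already refitted (incremented atomically)
     bits 3..5  total number of children
     bits 6..   index of the parent node, with 0x03FFFFFF marking the root
*/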
|
||||
|
||||
GRL_INLINE void refit_bottom_up(global struct QBVHNodeN *qnode_start, // start node to refit (already processed)
|
||||
global struct BVHBase *bvh, // pointer to BVH
|
||||
struct AABB *childrenAABB, // temporary data to use
|
||||
uint numChildrenTotal)
|
||||
{
|
||||
global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
|
||||
|
||||
/* compute the index of the start node */
|
||||
uint curNodeIndex = qnode_start - nodeData;
|
||||
|
||||
/* the start node got already processed, thus go to its parent node */
|
||||
curNodeIndex = *InnerNode_GetBackPointer(backPointers,curNodeIndex) >> 6;
|
||||
|
||||
/* end at root node */
|
||||
while (curNodeIndex != 0x03FFFFFF)
|
||||
{
|
||||
/* increment refit counter that counts refitted children of current node */
|
||||
const uint parentPointer = 1 + atomic_inc_global( (__global uint *) InnerNode_GetBackPointer(backPointers, curNodeIndex));
|
||||
|
||||
/* if all children got refitted, then continue */
|
||||
const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
|
||||
numChildrenTotal = (parentPointer >> 3) & 0x7;
|
||||
if (numChildrenRefitted != numChildrenTotal)
|
||||
return;
|
||||
|
||||
/* reset refit counter for next refit */
|
||||
*InnerNode_GetBackPointer(backPointers, curNodeIndex) &= 0xfffffff8;
|
||||
|
||||
/* get bounds of all children from child nodes directly */
|
||||
global struct QBVHNodeN *qnode = nodeData + curNodeIndex;
|
||||
global struct QBVHNodeN *qnode_child = (global struct QBVHNodeN *)QBVHNodeN_childrenPointer(qnode);
|
||||
for (uint k = 0; k < numChildrenTotal; k++)
|
||||
childrenAABB[k] = getAABB_QBVHNodeN(qnode_child + k);
|
||||
|
||||
/* update node bounds of all children */
|
||||
QBVHNodeN_setBounds(qnode, childrenAABB, numChildrenTotal);
|
||||
|
||||
write_mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
/* make parent node the current node */
|
||||
curNodeIndex = parentPointer >> 6;
|
||||
}
|
||||
|
||||
/* update QBVH6 bounds */
|
||||
struct AABB bounds;
|
||||
AABB_init(&bounds);
|
||||
|
||||
for (uint i = 0; i < numChildrenTotal; i++)
|
||||
AABB_extend(&bounds, &childrenAABB[i]);
|
||||
|
||||
setBVHBaseBounds(bvh, &bounds);
|
||||
}
|
||||
|
||||
|
||||
/* Sub-group variant of the bottom-up refit: each lane handles one child of the
   current node, and node bounds/masks are reduced across the sub-group. */
GRL_INLINE void SUBGROUP_refit_bottom_up(
    uniform global struct QBVHNodeN* qnode_start, // start node to refit (already processed)
    uniform global struct BVHBase* bvh,           // pointer to BVH
    varying struct AABB reduce_bounds,
    uniform uint numChildrenTotal,
    varying ushort lane,
    varying ushort head_lane)
{
    uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );

    /* compute the index of the start node */
    uniform uint curNodeIndex = qnode_start - nodeData;

    /* the start node got already processed, thus go to its parent node */
    curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6;

    varying struct AABB childrenAABB;

    /* end at root node */
    while ( curNodeIndex != 0x03FFFFFF )
    {
        mem_fence_gpu_invalidate();

        /* increment refit counter that counts refitted children of current node */
        uniform uint parentPointer = 1;
        if (lane == 0)
        {
            // acquire fence ensures that all previous writes complete before the atomic starts
            parentPointer += atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, curNodeIndex));
        }

        parentPointer = intel_sub_group_shuffle( parentPointer, head_lane );

        /* if all children got refitted, then continue */
        uniform uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
        numChildrenTotal = (parentPointer >> 3) & 0x7;
        if ( numChildrenRefitted != numChildrenTotal )
            return;

        /* reset refit counter for next refit */
        if (lane == 0)
        {
            *InnerNode_GetBackPointer(backPointers, curNodeIndex) = (parentPointer & 0xfffffff8);
        }

        /* get bounds of all children from child nodes directly */
        global struct QBVHNodeN* qnode = nodeData + curNodeIndex;
        global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );

        varying ushort child_idx = (lane < numChildrenTotal) ? lane : 0;
        childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );

        /* update node bounds of all children */
        reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
        reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, head_lane );

        subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildrenTotal, lane);

        /* update node mask */
        uchar childrenMask = qnode_child[child_idx].instMask;

        qnode->instMask = sub_group_reduce_or_N6(childrenMask);

        /* make parent node the current node */
        curNodeIndex = parentPointer >> 6;
    }

    /* update QBVH6 bounds */
    if( lane == 0 )
        setBVHBaseBounds( bvh, &reduce_bounds );
}

GRL_INLINE void quadCopyVertices(
    const struct QuadLeaf* pQuad,
    struct QuadLeaf* newQuad)
{
    const uint4* s = (const uint4*) & (pQuad->v[0][0]);
    uint4* d = (uint4*) & (newQuad->v[0][0]);
    const uint8* s2 = (const uint8*)(s+1);
    uint8* d2 = (uint8*)(d+1);
    *d = *s;
    *d2 = *s2;
}


GRL_INLINE void get_updated_quad(
    global const struct QuadLeaf* pQuad,
    global GRL_RAYTRACING_GEOMETRY_DESC* geomDescs,
    struct QuadLeaf* newQuad)
{
    struct QuadLeaf tempQuad;

    // fetch non vtx data;
    {
        uint4* tempQuad4U = (uint4*)&tempQuad;
        global const uint4* pQuad4U = (global const uint4*)pQuad;
        *tempQuad4U = *pQuad4U;
    }

    /* get the geomID and primID0/1 for both quad triangles */
    const uint geomID = PrimLeaf_GetGeoIndex(&tempQuad.leafDesc);
    const uint primID0 = tempQuad.primIndex0;
    const uint primID1 = tempQuad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&tempQuad);
    ushort fourth_vert = 0;

    if (primID1 != primID0)
    {
        ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(&tempQuad);
        fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
        fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
    }

    global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDescs + geomID;

    uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);

    // read the indices of the 4 verts we want
    float3 vtx0, vtx1, vtx2, vtx3;
    GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);

    QuadLeaf_SetVertices(&tempQuad, vtx0, vtx1, vtx2, vtx3);

    *newQuad = tempQuad;
}

// This computes child bounding boxes for inner nodes whose children are *all* leaves.
// Mixed nodes are updated later by the bottom-up pass.
|
||||
GRL_INLINE uint refit_bottom( global struct BVHBase* bvh,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
global struct AABB3f* instance_leaf_aabbs,
|
||||
global struct QBVHNodeN* curNode,
|
||||
struct AABB *childrenAABB,
|
||||
uint backPointer)
|
||||
{
|
||||
uint numChildren = 0;
|
||||
|
||||
/* we start refit at leaf nodes, this case is for quad nodes */
|
||||
if (curNode->type == BVH_QUAD_NODE)
|
||||
{
|
||||
global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);
|
||||
|
||||
/* iterate over all quads of the quad node and get their bounds */
|
||||
numChildren = (backPointer >> 3) & 0x7;
|
||||
for (uint k = 0; k < numChildren; k++)
|
||||
{
|
||||
struct QuadLeaf Q;
|
||||
get_updated_quad(&quads[k], geomDesc, &Q);
|
||||
quadCopyVertices(&Q, &quads[k]);
|
||||
childrenAABB[k] = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad
|
||||
}
|
||||
}
|
||||
|
||||
/* we start refit at leaf nodes, this case is for procedural nodes */
|
||||
else if (curNode->type == BVH_PROCEDURAL_NODE)
|
||||
{
|
||||
global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);
|
||||
|
||||
/* iterate over all children of the procedural node and get their bounds */
|
||||
numChildren = (backPointer >> 3) & 0x7;
|
||||
for (uint k = 0; k < numChildren; k++)
|
||||
{
|
||||
/* extract geomID and primID from leaf */
|
||||
const uint startPrim = QBVHNodeN_startPrim(curNode, k);
|
||||
const uint geomID = ProceduralLeaf_geomIndex(leaf);
|
||||
const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
|
||||
|
||||
/* read bounds from geometry descriptor */
|
||||
struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
|
||||
childrenAABB[k].lower.x = aabb.MinX;
|
||||
childrenAABB[k].lower.y = aabb.MinY;
|
||||
childrenAABB[k].lower.z = aabb.MinZ;
|
||||
childrenAABB[k].upper.x = aabb.MaxX;
|
||||
childrenAABB[k].upper.y = aabb.MaxY;
|
||||
childrenAABB[k].upper.z = aabb.MaxZ;
|
||||
|
||||
/* advance leaf pointer to next child */
|
||||
leaf += QBVHNodeN_blockIncr(curNode, k);
|
||||
}
|
||||
}
|
||||
|
||||
/* we start refit at leaf nodes, this case is for instance nodes */
|
||||
else if (curNode->type == BVH_INSTANCE_NODE)
|
||||
{
|
||||
global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
|
||||
global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
|
||||
|
||||
/* iterate over all children of the instance node and get their bounds */
|
||||
numChildren = (backPointer >> 3) & 0x7;
|
||||
for (uint k = 0; k < numChildren; k++)
|
||||
{
|
||||
uint leafindex = (instancesLeaves + k) - leafBase;
|
||||
childrenAABB[k].lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
|
||||
childrenAABB[k].upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
|
||||
}
|
||||
}
|
||||
|
||||
return numChildren;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// This computes child bounding boxes for inner nodes whose children are *all* leaves.
// Mixed nodes are updated later by the bottom-up pass.
|
||||
GRL_INLINE uint SUBGROUP_refit_bottom(
|
||||
uniform global struct BVHBase* bvh,
|
||||
uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
uniform global struct AABB3f* instance_leaf_aabbs,
|
||||
uniform global struct QBVHNodeN* curNode,
|
||||
uniform uint backPointer,
|
||||
varying struct AABB* childrenAABB,
|
||||
varying uchar* childrenMask,
|
||||
varying ushort lane,
|
||||
global uchar* is_procedural_instance
|
||||
)
|
||||
{
|
||||
uniform uint numChildren = 0;
|
||||
bool enable_procedural_instance = (is_procedural_instance != 0);
|
||||
|
||||
/* we start refit at leaf nodes, this case is for quad nodes */
|
||||
if (curNode->type == BVH_QUAD_NODE)
|
||||
{
|
||||
/* iterate over all quads of the quad node and get their bounds */
|
||||
numChildren = (backPointer >> 3) & 0x7;
|
||||
|
||||
uniform global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);
|
||||
|
||||
struct QuadLeaf Q;
|
||||
if (lane < numChildren)
|
||||
{
|
||||
get_updated_quad(&quads[lane], geomDesc, &Q);
|
||||
|
||||
*childrenAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad
|
||||
|
||||
quadCopyVertices(&Q, &quads[lane]);
|
||||
*childrenMask = 0xff;
|
||||
}
|
||||
// FIXME: support leaves with more than one quad
|
||||
}
|
||||
|
||||
/* we start refit at leaf nodes, this case is for procedural nodes */
|
||||
else if (curNode->type == BVH_PROCEDURAL_NODE)
|
||||
{
|
||||
uniform global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);
|
||||
|
||||
|
||||
|
||||
/* iterate over all children of the procedural node and get their bounds */
|
||||
numChildren = (backPointer >> 3) & 0x7;
|
||||
|
||||
varying uint incr = (lane < numChildren) ? InternalNode_GetChildBlockIncr((struct InternalNode*)curNode, lane) : 0;
|
||||
incr = sub_group_scan_exclusive_add(incr);
|
||||
|
||||
if( lane < numChildren )
|
||||
{
|
||||
/* extract geomID and primID from leaf */
|
||||
varying uint start_prim = InternalNode_GetChildStartPrim((struct InternalNode*)curNode, lane );
|
||||
varying global struct ProceduralLeaf* my_leaf = leaf + incr;
|
||||
const uint geomID = ProceduralLeaf_geomIndex(my_leaf);
|
||||
const uint primID = ProceduralLeaf_primIndex(my_leaf, start_prim);
|
||||
|
||||
/* read bounds from geometry descriptor */
|
||||
struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
|
||||
childrenAABB->lower.x = aabb.MinX;
|
||||
childrenAABB->lower.y = aabb.MinY;
|
||||
childrenAABB->lower.z = aabb.MinZ;
|
||||
childrenAABB->upper.x = aabb.MaxX;
|
||||
childrenAABB->upper.y = aabb.MaxY;
|
||||
childrenAABB->upper.z = aabb.MaxZ;
|
||||
*childrenMask = 0xff;
|
||||
}
|
||||
}
|
||||
|
||||
/* we start refit at leaf nodes, this case is for instance nodes */
|
||||
else if ( !enable_procedural_instance && curNode->type == BVH_INSTANCE_NODE)
|
||||
{
|
||||
uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
|
||||
uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);
|
||||
|
||||
/* iterate over all children of the instance node and get their bounds and masks */
|
||||
numChildren = (backPointer >> 3) & 0x7;
|
||||
if( lane < numChildren )
|
||||
{
|
||||
uint leafindex = (instancesLeaves + lane) - leafBase;
|
||||
childrenAABB->lower.xyz = AABB3f_load_lower(&instance_leaf_aabbs[leafindex]);
|
||||
childrenAABB->upper.xyz = AABB3f_load_upper(&instance_leaf_aabbs[leafindex]);
|
||||
*childrenMask = HwInstanceLeaf_GetInstanceMask(&leafBase[leafindex]);
|
||||
}
|
||||
}
|
||||
else if (enable_procedural_instance && curNode->type == BVH_INTERNAL_NODE)
|
||||
{
|
||||
// Handle procedural-instance leaves
|
||||
// TODO: Generalize this! Should re-write the kernel to work with arbitrary mixed-mode leaves
|
||||
|
||||
numChildren = (backPointer >> 3) & 0x7;
|
||||
uint childType = BVH_INTERNAL_NODE;
|
||||
if ( lane < numChildren )
|
||||
{
|
||||
childType = InternalNode_GetChildType( (struct InternalNode*)curNode, lane );
|
||||
if (childType != BVH_INTERNAL_NODE)
|
||||
{
|
||||
uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer( curNode );
|
||||
uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
|
||||
uint leafindex = (instancesLeaves + lane) - leafBase;
|
||||
childrenAABB->lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
|
||||
childrenAABB->upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
|
||||
*childrenMask = HwInstanceLeaf_GetInstanceMask( &leafBase[leafindex] );
|
||||
|
||||
// see if the child has flipped from procedural to non-procedural and update the child type field as needed
|
||||
uint instanceIndex = HwInstanceLeaf_GetInstanceIndex( &leafBase[leafindex] );
|
||||
uint newChildType = is_procedural_instance[instanceIndex] ? BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
|
||||
if (newChildType != childType)
|
||||
{
|
||||
InternalNode_SetChildType( (struct InternalNode*)curNode, lane, newChildType );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// don't ascend the tree for a true internal node
|
||||
if (sub_group_all(childType == BVH_INTERNAL_NODE))
|
||||
numChildren = 0;
|
||||
}
|
||||
|
||||
return numChildren;
|
||||
}
|
||||
|
||||
#define SG_REFIT_WG_SIZE 8
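// Work-group size presumably used by the host when dispatching the per-startpoint
// sub-group refit kernel below (one sub-group per inner-node startpoint).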
|
||||
|
||||
void DO_Refit_per_one_startpoint_sg(
|
||||
global struct BVHBase* bvh,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
|
||||
global struct AABB3f* instance_leaf_aabbs,
|
||||
global uchar* is_procedural_instance )
|
||||
{
|
||||
/* get pointer to inner nodes and back pointers */
|
||||
global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
|
||||
|
||||
/* get the inner node that we will consider as a bottom startpoint */
|
||||
const uint numInnerNodes = BVHBase_numNodes(bvh);
|
||||
const uint innerNodeIdx = get_sub_group_global_id();
|
||||
|
||||
varying ushort lane = get_sub_group_local_id();
|
||||
|
||||
if (innerNodeIdx >= numInnerNodes) return;
|
||||
|
||||
varying struct AABB childrenAABB; // one child AABB per lane
|
||||
AABB_init(&childrenAABB);
|
||||
|
||||
varying uchar childrenMask = 0; // one child mask per lane
|
||||
|
||||
global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
|
||||
uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
|
||||
uint numChildren = SUBGROUP_refit_bottom(
|
||||
bvh,
|
||||
geosArray,
|
||||
instance_leaf_aabbs,
|
||||
curNode,
|
||||
backPointer,
|
||||
&childrenAABB,
|
||||
&childrenMask,
|
||||
lane,
|
||||
is_procedural_instance
|
||||
);
|
||||
|
||||
|
||||
if (numChildren != 0)
|
||||
{
|
||||
/* update bounds of node */
|
||||
struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&childrenAABB);
|
||||
reduce_bounds = AABB_sub_group_shuffle(&reduce_bounds, 0);
|
||||
subgroup_QBVHNodeN_setBounds(curNode, reduce_bounds, childrenAABB, numChildren, lane);
|
||||
|
||||
/* update mask of node */
|
||||
uchar mask = sub_group_reduce_or_N6(childrenMask);
|
||||
curNode->instMask = mask;
|
||||
|
||||
/* Leave this fence for now for all threads, if WG size is increased (tried 128) and fence is done
|
||||
only by the first thread (similar to morton phase1) the machine hangs. */
|
||||
mem_fence_gpu_invalidate();
|
||||
|
||||
/* refit upper parts of the BVH */
|
||||
/* TODO: this will not gonna work for mixed nodes */
|
||||
SUBGROUP_refit_bottom_up(curNode, bvh, reduce_bounds, numChildren, lane, 0);
|
||||
}
|
||||
}
|
||||
@@ -1,763 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "api_interface.h"
|
||||
#include "d3d12.h"
|
||||
#include "common.h"
|
||||
#include "mem_utils.h"
|
||||
#include "misc_shared.h"
|
||||
|
||||
#define offsetof(TYPE, ELEMENT) ((size_t)&(((TYPE *)0)->ELEMENT))
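// Defined locally because OpenCL C does not provide offsetof().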
|
||||
|
||||
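// Heuristic: roughly one work-group per 256 bytes of BVH allocation, plus a small constant.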
GRL_INLINE
|
||||
uint GroupCountForCopySize(uint size)
|
||||
{
|
||||
return (size >> 8) + 4;
|
||||
}
|
||||
|
||||
GRL_INLINE
|
||||
uint GroupCountForCopy(BVHBase* base)
|
||||
{
|
||||
return GroupCountForCopySize(base->Meta.allocationSize);
|
||||
}
|
||||
|
||||
GRL_INLINE void copyInstanceDescs(InstanceDesc* instances, D3D12_RAYTRACING_INSTANCE_DESC* descs, uint64_t numInstances)
|
||||
{
|
||||
for (uint64_t instanceIndex = get_local_id(0); instanceIndex < numInstances; instanceIndex += get_local_size(0))
|
||||
{
|
||||
for (uint row = 0; row < 3; row++)
|
||||
{
|
||||
for (uint column = 0; column < 4; column++)
|
||||
{
|
||||
D3D12_set_transform(&descs[instanceIndex], row, column, InstanceDesc_get_transform(&instances[instanceIndex], row, column));
|
||||
}
|
||||
}
|
||||
D3D12_set_instanceID(&descs[instanceIndex], InstanceDesc_get_instanceID(&instances[instanceIndex]));
|
||||
D3D12_set_InstanceMask(&descs[instanceIndex], InstanceDesc_get_InstanceMask(&instances[instanceIndex]));
|
||||
D3D12_set_InstanceContributionToHitGroupIndex(&descs[instanceIndex], InstanceDesc_get_InstanceContributionToHitGroupIndex(&instances[instanceIndex]));
|
||||
D3D12_set_InstanceFlags(&descs[instanceIndex], InstanceDesc_get_InstanceFlags(&instances[instanceIndex]));
|
||||
D3D12_set_AccelerationStructure(&descs[instanceIndex], InstanceDesc_get_AccelerationStructure(&instances[instanceIndex]));
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void createGeoDescs(GeoMetaData* geoMetaData, D3D12_RAYTRACING_GEOMETRY_DESC* descs, uint64_t numGeos, const uint64_t dataBufferStart)
|
||||
{
|
||||
if (get_local_id(0) == 0)
|
||||
{
|
||||
uint64_t previousGeoDataBufferEnd = dataBufferStart;
|
||||
for (uint64_t geoIndex = 0; geoIndex < numGeos; geoIndex += 1)
|
||||
{
|
||||
D3D12_set_Type(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Type));
|
||||
D3D12_set_Flags(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Flags));
|
||||
if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES)
|
||||
{
|
||||
// Every triangle is stored separately
|
||||
uint64_t vertexBufferSize = 9 * sizeof(float) * geoMetaData[geoIndex].PrimitiveCount;
|
||||
D3D12_set_triangles_Transform(&descs[geoIndex], 0);
|
||||
D3D12_set_triangles_IndexFormat(&descs[geoIndex], INDEX_FORMAT_NONE);
|
||||
D3D12_set_triangles_VertexFormat(&descs[geoIndex], VERTEX_FORMAT_R32G32B32_FLOAT);
|
||||
D3D12_set_triangles_IndexCount(&descs[geoIndex], 0);
|
||||
D3D12_set_triangles_VertexCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount * 3);
|
||||
D3D12_set_triangles_IndexBuffer(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
|
||||
D3D12_set_triangles_VertexBuffer_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
|
||||
D3D12_set_triangles_VertexBuffer_StrideInBytes(&descs[geoIndex], 3 * sizeof(float));
|
||||
previousGeoDataBufferEnd += vertexBufferSize;
|
||||
}
|
||||
else
|
||||
{
|
||||
D3D12_set_procedurals_AABBCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount);
|
||||
D3D12_set_procedurals_AABBs_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
|
||||
D3D12_set_procedurals_AABBs_StrideInBytes(&descs[geoIndex], sizeof(D3D12_RAYTRACING_AABB));
|
||||
previousGeoDataBufferEnd += sizeof(D3D12_RAYTRACING_AABB) * geoMetaData[geoIndex].PrimitiveCount;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void copyIndiciesAndVerticies(D3D12_RAYTRACING_GEOMETRY_DESC* desc, QuadLeaf* quad)
|
||||
{
|
||||
float* vertices = (float*)D3D12_get_triangles_VertexBuffer_StartAddress(desc);
|
||||
uint64_t firstTriangleIndex = quad->primIndex0;
|
||||
uint64_t numTriangles = QuadLeaf_IsSingleTriangle(quad) ? 1 : 2;
|
||||
|
||||
vertices[firstTriangleIndex * 9] = quad->v[0][0];
|
||||
vertices[firstTriangleIndex * 9 + 1] = quad->v[0][1];
|
||||
vertices[firstTriangleIndex * 9 + 2] = quad->v[0][2];
|
||||
|
||||
vertices[firstTriangleIndex * 9 + 3] = quad->v[1][0];
|
||||
vertices[firstTriangleIndex * 9 + 4] = quad->v[1][1];
|
||||
vertices[firstTriangleIndex * 9 + 5] = quad->v[1][2];
|
||||
|
||||
vertices[firstTriangleIndex * 9 + 6] = quad->v[2][0];
|
||||
vertices[firstTriangleIndex * 9 + 7] = quad->v[2][1];
|
||||
vertices[firstTriangleIndex * 9 + 8] = quad->v[2][2];
|
||||
|
||||
if (numTriangles == 2)
|
||||
{
|
||||
uint64_t secondTriangleIndex = firstTriangleIndex + QuadLeaf_GetPrimIndexDelta(quad);
|
||||
uint32_t packed_indices = QuadLeaf_GetSecondTriangleIndices(quad);
|
||||
for( size_t i=0; i<3; i++ )
|
||||
{
|
||||
uint32_t idx = packed_indices & 3 ; packed_indices >>= 2;
|
||||
for( size_t j=0; j<3; j++ )
|
||||
vertices[secondTriangleIndex * 9 + i * 3 + j] = quad->v[idx][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE
|
||||
void storeProceduralDesc(
|
||||
struct AABB procAABB,
|
||||
uint32_t primId,
|
||||
D3D12_RAYTRACING_GEOMETRY_DESC* geoDesc)
|
||||
{
|
||||
D3D12_RAYTRACING_AABB* proceduralDescs = (D3D12_RAYTRACING_AABB*)D3D12_get_procedurals_AABBs_StartAddress(geoDesc);
|
||||
D3D12_set_raytracing_aabb(&proceduralDescs[primId], &procAABB);
|
||||
}
|
||||
|
||||
GRL_INLINE
|
||||
void copyDataFromLProcedurals(
|
||||
BVHBase* base,
|
||||
D3D12_RAYTRACING_GEOMETRY_DESC* descs)
|
||||
{
|
||||
unsigned numProcedurals = BVHBase_GetNumProcedurals(base);
|
||||
InternalNode* innerNodes = BVHBase_GetInternalNodes(base);
|
||||
unsigned numInnerNodes = BVHBase_GetNumInternalNodes(base);
|
||||
|
||||
if (BVHBase_GetNumProcedurals(base) > 0) //< there's no point entering here if there are no procedurals
|
||||
{
|
||||
|
||||
        // iterate over all inner nodes to identify those with procedural children; we have to take AABBs from them
|
||||
for (uint32_t nodeI = get_local_id(0); nodeI < numInnerNodes; nodeI += get_local_size(0))
|
||||
{
|
||||
InternalNode* innerNode = innerNodes + nodeI;
|
||||
|
||||
if (innerNode->nodeType == NODE_TYPE_PROCEDURAL)
|
||||
{
|
||||
float* origin = innerNode->lower;
|
||||
|
||||
global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer((struct QBVHNodeN*)innerNode);
|
||||
|
||||
for (uint k = 0; k < 6; k++)
|
||||
{
|
||||
if (InternalNode_IsChildValid(innerNode, k))
|
||||
{
|
||||
struct AABB3f qbounds = {
|
||||
(float)(innerNode->lower_x[k]), (float)(innerNode->lower_y[k]), (float)(innerNode->lower_z[k]),
|
||||
(float)(innerNode->upper_x[k]), (float)(innerNode->upper_y[k]), (float)(innerNode->upper_z[k]) };
|
||||
|
||||
struct AABB dequantizedAABB;
|
||||
|
||||
dequantizedAABB.lower[0] = origin[0] + bitShiftLdexp(qbounds.lower[0], innerNode->exp_x - 8);
|
||||
dequantizedAABB.lower[1] = origin[1] + bitShiftLdexp(qbounds.lower[1], innerNode->exp_y - 8);
|
||||
dequantizedAABB.lower[2] = origin[2] + bitShiftLdexp(qbounds.lower[2], innerNode->exp_z - 8);
|
||||
dequantizedAABB.upper[0] = origin[0] + bitShiftLdexp(qbounds.upper[0], innerNode->exp_x - 8);
|
||||
dequantizedAABB.upper[1] = origin[1] + bitShiftLdexp(qbounds.upper[1], innerNode->exp_y - 8);
|
||||
dequantizedAABB.upper[2] = origin[2] + bitShiftLdexp(qbounds.upper[2], innerNode->exp_z - 8);
|
||||
|
||||
dequantizedAABB = conservativeAABB(&dequantizedAABB);
|
||||
/* extract geomID and primID from leaf */
|
||||
const uint startPrim = QBVHNodeN_startPrim((struct QBVHNodeN*) innerNode, k);
|
||||
const uint geomID = ProceduralLeaf_geomIndex(leaf);
|
||||
const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
|
||||
|
||||
storeProceduralDesc(dequantizedAABB, primID, descs + geomID);
|
||||
}
|
||||
/* advance leaf pointer to next child */
|
||||
leaf += QBVHNodeN_blockIncr((struct QBVHNodeN*)innerNode, k);
|
||||
}
|
||||
|
||||
}
|
||||
else if (innerNode->nodeType == NODE_TYPE_MIXED) { ERROR(); }
|
||||
else {/* do nothing for other internal node types, they can't have procedural child (directly)*/; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE
|
||||
void copyDataFromQuadLeaves(BVHBase* base,
|
||||
D3D12_RAYTRACING_GEOMETRY_DESC* descs)
|
||||
{
|
||||
QuadLeaf* quads = BVHBase_GetQuadLeaves(base);
|
||||
uint64_t numQuads = BVHBase_GetNumQuads(base);
|
||||
for (uint64_t quadIdx = get_local_id(0); quadIdx < numQuads; quadIdx += get_local_size(0))
|
||||
{
|
||||
uint64_t descIdx = PrimLeaf_GetGeoIndex(&quads[quadIdx].leafDesc);
|
||||
copyIndiciesAndVerticies(&descs[descIdx], &quads[quadIdx]);
|
||||
}
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel clone_indirect(global char* dest,
|
||||
global char* src)
|
||||
{
|
||||
BVHBase* base = (BVHBase*)src;
|
||||
uint64_t bvhSize = base->Meta.allocationSize;
|
||||
|
||||
uint numGroups = GroupCountForCopy(base);
|
||||
CopyMemory(dest, src, bvhSize, numGroups);
|
||||
}
|
||||
|
||||
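// compactT copies a BVH into a tightly packed layout: the inner nodes, instance leaves,
// quad leaves, procedural leaves, descriptors and refit/backpointer data are laid out
// back-to-back, and per-node child offsets are patched to match the new section starts.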
GRL_INLINE void compactT(global char* dest, global char* src, uint64_t compactedSize, uint skipCopy, uint groupCnt)
|
||||
{
|
||||
global BVHBase* baseSrc = (global BVHBase*)src;
|
||||
global BVHBase* baseDest = (global BVHBase*)dest;
|
||||
|
||||
uint32_t offset = sizeof(BVHBase);
|
||||
uint32_t numNodes = BVHBase_GetNumInternalNodes(baseSrc);
|
||||
uint32_t nodeSize = numNodes * sizeof(InternalNode);
|
||||
offset += nodeSize;
|
||||
|
||||
int quadChildFix = baseSrc->quadLeafStart;
|
||||
int procChildFix = baseSrc->proceduralDataStart;
|
||||
int instChildFix = baseSrc->instanceLeafStart;
|
||||
|
||||
// serialization already copies part of bvh base so skip this part
|
||||
CopyMemory(dest + skipCopy, src + skipCopy, sizeof(BVHBase) - skipCopy, groupCnt);
|
||||
baseDest->Meta.allocationSize = compactedSize;
|
||||
|
||||
if (baseSrc->Meta.instanceCount)
|
||||
{
|
||||
const uint32_t instLeafsSize = BVHBase_GetNumHWInstanceLeaves(baseSrc) * sizeof(HwInstanceLeaf);
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetHWInstanceLeaves(baseSrc), instLeafsSize, groupCnt);
|
||||
const uint instanceLeafStart = (uint)(offset / 64);
|
||||
baseDest->instanceLeafStart = instanceLeafStart;
|
||||
instChildFix -= instanceLeafStart;
|
||||
offset += instLeafsSize;
|
||||
baseDest->instanceLeafEnd = (uint)(offset / 64);
|
||||
}
|
||||
if (baseSrc->Meta.geoCount)
|
||||
{
|
||||
const uint quadLeafsSize = BVHBase_GetNumQuads(baseSrc) * sizeof(QuadLeaf);
|
||||
if (quadLeafsSize)
|
||||
{
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetQuadLeaves(baseSrc), quadLeafsSize, groupCnt);
|
||||
const uint quadLeafStart = (uint)(offset / 64);
|
||||
baseDest->quadLeafStart = quadLeafStart;
|
||||
quadChildFix -= quadLeafStart;
|
||||
offset += quadLeafsSize;
|
||||
baseDest->quadLeafCur = (uint)(offset / 64);
|
||||
}
|
||||
|
||||
const uint procLeafsSize = BVHBase_GetNumProcedurals(baseSrc) * sizeof(ProceduralLeaf);
|
||||
if (procLeafsSize)
|
||||
{
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetProceduralLeaves(baseSrc), procLeafsSize, groupCnt);
|
||||
const uint proceduralDataStart = (uint)(offset / 64);
|
||||
baseDest->proceduralDataStart = proceduralDataStart;
|
||||
procChildFix -= proceduralDataStart;
|
||||
offset += procLeafsSize;
|
||||
baseDest->proceduralDataCur = (uint)(offset / 64);
|
||||
}
|
||||
}
|
||||
// copy nodes with fixed child offsets
|
||||
global uint* nodeDest = (global uint*)(dest + sizeof(BVHBase));
|
||||
global InternalNode* nodeSrc = (global InternalNode*)BVHBase_GetInternalNodes(baseSrc);
|
||||
// used in mixed case
|
||||
char* instanceLeavesBegin = (char*)BVHBase_GetHWInstanceLeaves(baseSrc);
|
||||
char* instanceLeavesEnd = (char*)BVHBase_GetHWInstanceLeaves_End(baseSrc);
|
||||
uint localId = get_sub_group_local_id();
|
||||
for (uint i = get_group_id(0); i < numNodes; i += groupCnt)
|
||||
{
|
||||
uint nodePart = CacheLineSubgroupRead((const global char*)&nodeSrc[i]);
|
||||
char nodeType = as_char4(sub_group_broadcast(nodePart, offsetof(InternalNode, nodeType) / 4))[0];
|
||||
if (localId * 4 == offsetof(InternalNode, childOffset))
|
||||
{
|
||||
int childOffset = as_int(nodePart);
|
||||
if (nodeType == NODE_TYPE_MIXED)
|
||||
{
|
||||
char* childPtr = (char*)&nodeSrc[i] + 64 * childOffset;
|
||||
if (childPtr > instanceLeavesBegin && childPtr < instanceLeavesEnd)
|
||||
nodePart = as_int(childOffset - instChildFix);
|
||||
}
|
||||
else if (nodeType == NODE_TYPE_INSTANCE)
|
||||
nodePart = as_int(childOffset - instChildFix);
|
||||
else if (nodeType == NODE_TYPE_QUAD)
|
||||
nodePart = as_int(childOffset - quadChildFix);
|
||||
else if (nodeType == NODE_TYPE_PROCEDURAL)
|
||||
nodePart = as_int(childOffset - procChildFix);
|
||||
}
|
||||
nodeDest[i * 16 + localId] = nodePart;
|
||||
}
|
||||
|
||||
if (baseSrc->Meta.instanceCount)
|
||||
{
|
||||
const uint32_t instanceDescSize = baseSrc->Meta.instanceCount * sizeof(InstanceDesc);
|
||||
CopyMemory(dest + offset, src + baseSrc->Meta.instanceDescsStart, instanceDescSize, groupCnt);
|
||||
baseDest->Meta.instanceDescsStart = offset;
|
||||
offset += instanceDescSize;
|
||||
}
|
||||
if (baseSrc->Meta.geoCount)
|
||||
{
|
||||
const uint32_t geoMetaSize = baseSrc->Meta.geoCount * sizeof(GeoMetaData);
|
||||
CopyMemory(dest + offset, src + baseSrc->Meta.geoDescsStart, geoMetaSize, groupCnt);
|
||||
baseDest->Meta.geoDescsStart = offset;
|
||||
offset += (geoMetaSize + 63) & ~63; // align to 64
|
||||
}
|
||||
|
||||
uint backPointerDataStart = offset / 64;
|
||||
uint refitTreeletsDataStart = backPointerDataStart;
|
||||
uint refitStartPointDataStart = backPointerDataStart;
|
||||
uint dataEnd = backPointerDataStart;
|
||||
uint fatLeafTableStart = dataEnd;
|
||||
uint fatLeafCount = baseSrc->fatLeafCount;
|
||||
uint innerTableStart = dataEnd;
|
||||
uint innerCount = baseSrc->innerCount;
|
||||
|
||||
uint quadLeftoversCountNewAtomicUpdate = baseSrc->quadLeftoversCountNewAtomicUpdate;
|
||||
uint quadTableSizeNewAtomicUpdate = baseSrc->quadTableSizeNewAtomicUpdate;
|
||||
uint quadIndicesDataStart = dataEnd;
|
||||
|
||||
if (BVHBase_HasBackPointers(baseSrc))
|
||||
{
|
||||
#if 0 //
|
||||
const uint oldbackpontersDataStart = baseSrc->backPointerDataStart;
|
||||
const uint shift = oldbackpontersDataStart - backPointerDataStart;
|
||||
const uint refitStructsSize = ((BVHBase_GetRefitStructsDataSize(baseSrc)) + 63) & ~63;
|
||||
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), refitStructsSize, groupCnt);
|
||||
|
||||
refitTreeletsDataStart = baseSrc->refitTreeletsDataStart - shift;
|
||||
refitStartPointDataStart = baseSrc->refitStartPointDataStart - shift;
|
||||
dataEnd = baseSrc->BVHDataEnd - shift;
|
||||
#else // compacting version
|
||||
const uint backpointersSize = ((numNodes*sizeof(uint)) + 63) & ~63;
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), backpointersSize, groupCnt);
|
||||
offset += backpointersSize;
|
||||
|
||||
refitTreeletsDataStart = offset / 64;
|
||||
refitStartPointDataStart = offset / 64;
|
||||
|
||||
// TODO: remove treelets from .... everywhere
|
||||
const uint treeletExecutedCnt = *BVHBase_GetRefitTreeletCntPtr(baseSrc);
|
||||
|
||||
if (treeletExecutedCnt)
|
||||
{
|
||||
const uint treeletCnt = treeletExecutedCnt > 1 ? treeletExecutedCnt + 1 : 1;
|
||||
|
||||
refitTreeletsDataStart = offset / 64;
|
||||
const uint treeletsSize = ((treeletCnt * sizeof(RefitTreelet)) + 63) & ~63;
|
||||
RefitTreelet* destTreelets = (RefitTreelet*)(dest + offset);
|
||||
RefitTreelet* srcTreelets = BVHBase_GetRefitTreeletDescs(baseSrc);
|
||||
|
||||
uint numThreads = groupCnt * get_local_size(0);
|
||||
uint globalID = (get_group_id(0) * get_local_size(0)) + get_local_id(0);
|
||||
|
||||
for (uint i = globalID; i < treeletCnt; i += numThreads)
|
||||
{
|
||||
RefitTreelet dsc = srcTreelets[i];
|
||||
RefitTreeletTrivial* trivial_dsc = (RefitTreeletTrivial*)&dsc;
|
||||
if (trivial_dsc->numStartpoints == 1 && trivial_dsc->childrenOffsetOfTheNode > numNodes) {
|
||||
trivial_dsc->childrenOffsetOfTheNode -= quadChildFix;
|
||||
}
|
||||
destTreelets[i] = dsc;
|
||||
}
|
||||
|
||||
offset += treeletsSize;
|
||||
|
||||
refitStartPointDataStart = offset / 64;
|
||||
const uint startPointsSize = (BVHBase_GetRefitStartPointsSize(baseSrc) + 63) & ~63;
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetRefitStartPoints(baseSrc), startPointsSize, groupCnt);
|
||||
offset += startPointsSize;
|
||||
dataEnd = offset / 64;
|
||||
}
|
||||
|
||||
uint fatleafEntriesSize = ((fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63);
|
||||
fatLeafTableStart = offset / 64;
|
||||
if (fatleafEntriesSize) {
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), fatleafEntriesSize, groupCnt);
|
||||
}
|
||||
offset += fatleafEntriesSize;
|
||||
|
||||
// New atomic update
|
||||
if(baseSrc->quadIndicesDataStart > baseSrc->backPointerDataStart)
|
||||
{
|
||||
uint numQuads = BVHBase_GetNumQuads(baseSrc);
|
||||
uint quadTableMainBufferSize = (numQuads + 255) & ~255;
|
||||
uint quadLeftoversSize = (quadLeftoversCountNewAtomicUpdate + 255) & ~255;
|
||||
uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63);
|
||||
if (quadTableEntriesSize) {
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), quadTableEntriesSize, groupCnt);
|
||||
}
|
||||
offset += quadTableEntriesSize;
|
||||
|
||||
uint quadIndicesDataSize = ((numQuads * sizeof(QuadDataIndices) + 63) & ~63);
|
||||
quadIndicesDataStart = offset / 64;
|
||||
if (quadIndicesDataSize) {
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetQuadDataIndicesTable(baseSrc), quadIndicesDataSize, groupCnt);
|
||||
}
|
||||
offset += quadIndicesDataSize;
|
||||
}
|
||||
|
||||
uint innerEntriesSize = ((innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63);
|
||||
innerTableStart = offset / 64;
|
||||
if (innerEntriesSize) {
|
||||
CopyMemory(dest + offset, (global char*)BVHBase_GetInnerNodeTable(baseSrc), innerEntriesSize, groupCnt);
|
||||
}
|
||||
offset += innerEntriesSize;
|
||||
|
||||
dataEnd = offset / 64;
|
||||
#endif
|
||||
}
|
||||
|
||||
baseDest->backPointerDataStart = backPointerDataStart;
|
||||
baseDest->refitTreeletsDataStart = refitTreeletsDataStart;
|
||||
baseDest->refitStartPointDataStart = refitStartPointDataStart;
|
||||
baseDest->fatLeafTableStart = fatLeafTableStart ;
|
||||
baseDest->fatLeafCount = fatLeafCount;
|
||||
baseDest->innerTableStart = innerTableStart;
|
||||
baseDest->innerCount = innerCount;
|
||||
|
||||
baseDest->quadLeftoversCountNewAtomicUpdate = quadLeftoversCountNewAtomicUpdate;
|
||||
baseDest->quadTableSizeNewAtomicUpdate = quadTableSizeNewAtomicUpdate;
|
||||
baseDest->quadIndicesDataStart = quadIndicesDataStart;
|
||||
baseDest->BVHDataEnd = dataEnd;
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
void kernel compact(global char* dest,
|
||||
global char* src,
|
||||
uint groupCnt)
|
||||
{
|
||||
uint64_t compactedSize = compute_compacted_size((BVHBase*)src);
|
||||
compactT(dest, src, compactedSize, 0, groupCnt);
|
||||
}
|
||||
|
||||
// set the serialization header across all lanes: each lane gets one dword of the header, plus 64 bits of remaining data
|
||||
GRL_INLINE
|
||||
unsigned prepare_header(
|
||||
uint64_t headerSize,
|
||||
uint64_t instancePtrSize,
|
||||
uint64_t numInstances,
|
||||
uint64_t bvhSize,
|
||||
uint8_t* driverID,
|
||||
uint64_t reminder)
|
||||
{
|
||||
|
||||
unsigned loc_id = get_sub_group_local_id();
|
||||
|
||||
uint64_t SerializedSizeInBytesIncludingHeader = headerSize + instancePtrSize * numInstances + bvhSize;
|
||||
uint64_t DeserializedSizeInBytes = bvhSize;
|
||||
uint64_t InstanceHandleCount = numInstances;
|
||||
|
||||
char bvh_magic_str[] = BVH_MAGIC_MACRO;
|
||||
uint* bvh_magic_uint = (uint*)bvh_magic_str;
|
||||
|
||||
unsigned headerTempLanePiece;
|
||||
if (loc_id < 4) { headerTempLanePiece = *((unsigned*)&driverID[4*loc_id]); }
|
||||
else if (loc_id == 4) { headerTempLanePiece = bvh_magic_uint[0]; }
|
||||
else if (loc_id == 5) { headerTempLanePiece = bvh_magic_uint[1]; }
|
||||
else if (loc_id == 6) { headerTempLanePiece = bvh_magic_uint[2]; }
|
||||
else if (loc_id == 7) { headerTempLanePiece = bvh_magic_uint[3]; }
|
||||
else if (loc_id == 8) { headerTempLanePiece = (uint)SerializedSizeInBytesIncludingHeader; }
|
||||
else if (loc_id == 9) { headerTempLanePiece = (uint)(SerializedSizeInBytesIncludingHeader >> 32ul); }
|
||||
else if (loc_id == 10) { headerTempLanePiece = (uint)DeserializedSizeInBytes; }
|
||||
else if (loc_id == 11) { headerTempLanePiece = (uint)(DeserializedSizeInBytes >> 32ul); }
|
||||
else if (loc_id == 12) { headerTempLanePiece = (uint)InstanceHandleCount; }
|
||||
else if (loc_id == 13) { headerTempLanePiece = (uint)(InstanceHandleCount >> 32ul); }
|
||||
else if (loc_id == 14) { headerTempLanePiece = (uint)reminder; }
|
||||
else if (loc_id == 15) { headerTempLanePiece = (uint)(reminder >> 32ul); }
|
||||
|
||||
return headerTempLanePiece;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
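// serializeT writes the serialization header, then the per-instance BLAS pointers, then a
// compacted copy of the BVH. The header and the unaligned instance-pointer prefix are
// assembled as full 64B cache lines by borrowing the bytes that follow (TRICK A / TRICK B).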
GRL_INLINE
|
||||
void serializeT(
|
||||
global byte_align64B* dest,
|
||||
global byte_align64B* src,
|
||||
global uint8_t* driverID,
|
||||
uint groups_count)
|
||||
{
|
||||
SerializationHeader* header = (SerializationHeader*)dest;
|
||||
BVHBase* base = (BVHBase*)src;
|
||||
|
||||
const uint headerSize = sizeof(SerializationHeader);
|
||||
const uint numInstances = base->Meta.instanceCount;
|
||||
const uint instancePtrSize = sizeof(gpuva_t);
|
||||
const uint compactedSize = compute_compacted_size(base);
|
||||
uint local_id = get_sub_group_local_id();
|
||||
|
||||
// this is not 64byte aligned :(
|
||||
const uint offsetToBvh = headerSize + instancePtrSize * numInstances;
|
||||
|
||||
global InstanceDesc* src_instances = 0;
|
||||
|
||||
if (numInstances) {
|
||||
src_instances = (global InstanceDesc*)((uint64_t)base + base->Meta.instanceDescsStart);
|
||||
}
|
||||
|
||||
// effectively this part should end up as one 64B aligned 64B write
|
||||
if (get_group_id(0) == groups_count - 1)
|
||||
{
|
||||
Block64B headerPlus;
|
||||
|
||||
        // we patch the missing piece with the instance or bvh beginning (TRICK A and B)
|
||||
// we assume header is 56B.
|
||||
global uint64_t* srcPiece = (numInstances != 0) ? &src_instances[0].AccelerationStructureGPUVA : (global uint64_t*)src;
|
||||
|
||||
unsigned headerTemp;
|
||||
|
||||
headerTemp = prepare_header(
|
||||
headerSize,
|
||||
instancePtrSize,
|
||||
numInstances,
|
||||
compactedSize,
|
||||
driverID,
|
||||
*srcPiece);
|
||||
|
||||
CacheLineSubgroupWrite((global byte_align64B*)dest, headerTemp);
|
||||
}
|
||||
|
||||
if (numInstances > 0)
|
||||
{
|
||||
uint instancesOffset = headerSize;
|
||||
uint aligned_instance_ptrs_offset = ((instancesOffset + 63) >> 6) << 6;
|
||||
uint unaligned_prefixing_instance_cnt = (aligned_instance_ptrs_offset - instancesOffset) >> 3;
|
||||
unaligned_prefixing_instance_cnt = min(unaligned_prefixing_instance_cnt, numInstances);
|
||||
|
||||
global uint64_t* dst_instances = (global uint64_t*)(dest + instancesOffset);
|
||||
|
||||
// we've copied first instance onto a header, (see TRICK A)
|
||||
// now we have only instances start at aligned memory
|
||||
uint numAlignedInstances = numInstances - unaligned_prefixing_instance_cnt;
|
||||
dst_instances += unaligned_prefixing_instance_cnt;
|
||||
src_instances += unaligned_prefixing_instance_cnt;
|
||||
|
||||
if (numAlignedInstances)
|
||||
{
|
||||
// each 8 instances form a cacheline
|
||||
uint numCachelines = numAlignedInstances >> 3; //qwords -> 64Bs
|
||||
            // leftover qwords beyond the last full group of 8
|
||||
uint startReminder = numAlignedInstances & ~((1 << 3) - 1);
|
||||
uint numreminder = numAlignedInstances & ((1 << 3) - 1);
|
||||
|
||||
uint task_id = get_group_id(0);
|
||||
|
||||
while (task_id < numCachelines)
|
||||
{
|
||||
uint src_id = task_id * 8 + (local_id >> 1);
|
||||
uint* src_uncorected = (uint*)& src_instances[src_id].AccelerationStructureGPUVA;
|
||||
uint* src = ((local_id & 1) != 0) ? src_uncorected + 1 : src_uncorected;
|
||||
uint data = *src;
|
||||
|
||||
global char* dst = (global byte_align64B*)(dst_instances + (8 * task_id));
|
||||
CacheLineSubgroupWrite(dst, data);
|
||||
task_id += groups_count;
|
||||
}
|
||||
|
||||
if (task_id == numCachelines && local_id < 8 && numreminder > 0)
|
||||
{
|
||||
// this should write full cacheline
|
||||
|
||||
uint index = startReminder + local_id;
|
||||
// data will be taken from instances for lanes (local_id < numreminder)
|
||||
// copy srcbvh beginning as uint64_t for remaining lanes (TRICK B)
|
||||
global uint64_t* srcData = (local_id < numreminder) ?
|
||||
&src_instances[index].AccelerationStructureGPUVA :
|
||||
((global uint64_t*)src) + (local_id - numreminder);
|
||||
dst_instances[index] = *srcData;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// the parts above copied unaligned dst beginning of bvh (see TRICK B)
|
||||
uint32_t unalignedPartCopiedElsewhere = (64u - (offsetToBvh & (64u - 1u)))&(64u - 1u);
|
||||
|
||||
compactT(dest + offsetToBvh, src, compactedSize, unalignedPartCopiedElsewhere, groups_count);
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
void kernel serialize_indirect(
|
||||
global char* dest,
|
||||
global char* src,
|
||||
global uint8_t* driverID)
|
||||
{
|
||||
BVHBase* base = (BVHBase*)src;
|
||||
uint groups_count = GroupCountForCopy(base);
|
||||
serializeT(dest, src, driverID, groups_count);
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
void kernel serialize_for_input_dump_indirect(
|
||||
global struct OutputBatchPtrs* batchPtrs,
|
||||
global dword* dstOffset,
|
||||
global char* src,
|
||||
global uint8_t* driverID)
|
||||
{
|
||||
BVHBase* base = (BVHBase*)src;
|
||||
uint groups_count = GroupCountForCopy(base);
|
||||
global char* dest = (global char*)(batchPtrs->dataStart + *dstOffset);
|
||||
dest += (sizeof(OutputData) + 127) & ~127;
|
||||
serializeT(dest, src, driverID, groups_count);
|
||||
}
|
||||
|
||||
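// deserializeT rebuilds a BVH from its serialized form. For top-level structures the
// instance descriptors and HW instance leaves are re-pointed at the new BLAS addresses
// stored after the header; bottom-level structures are copied as-is.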
GRL_INLINE
|
||||
void deserializeT(
|
||||
global char* dest,
|
||||
global char* src,
|
||||
unsigned groupCnt)
|
||||
{
|
||||
SerializationHeader* header = (SerializationHeader*)src;
|
||||
|
||||
const uint64_t headerSize = sizeof(struct SerializationHeader);
|
||||
const uint64_t instancePtrSize = sizeof(gpuva_t);
|
||||
const uint64_t numInstances = header->InstanceHandleCount;
|
||||
const uint64_t offsetToBvh = headerSize + instancePtrSize * numInstances;
|
||||
const uint64_t bvhSize = header->DeserializedSizeInBytes;
|
||||
|
||||
if (numInstances)
|
||||
{
|
||||
const bool instances_mixed_with_inner_nodes = false;
|
||||
if (instances_mixed_with_inner_nodes)
|
||||
{
|
||||
// not implemented !
|
||||
// copy each node with 64byte granularity if node is instance, patch it mid-copy
|
||||
}
|
||||
else
|
||||
{
|
||||
BVHBase* srcBvhBase = (BVHBase*)(src + offsetToBvh);
|
||||
|
||||
// numHWInstances can be bigger (because of rebraiding) or smaller (because of inactive instances) than
|
||||
// numInstances (count of pointers and descriptors).
|
||||
uint offsetToHwInstances = srcBvhBase->instanceLeafStart << 6;
|
||||
uint numHwInstances = (srcBvhBase->instanceLeafEnd - srcBvhBase->instanceLeafStart) >> 1;
|
||||
|
||||
//
|
||||
// instances are in separate memory intervals
|
||||
// copy all the other data simple way
|
||||
//
|
||||
uint nodesEnd = srcBvhBase->Meta.instanceDescsStart;
|
||||
// copy before instance leafs
|
||||
CopyMemory(dest, (global char*)(src + offsetToBvh), offsetToHwInstances, groupCnt);
|
||||
|
||||
uint offsetPostInstances = srcBvhBase->instanceLeafEnd << 6;
|
||||
uint instanceDescStart = srcBvhBase->Meta.instanceDescsStart;
|
||||
uint sizePostInstances = instanceDescStart - offsetPostInstances;
|
||||
// copy after instance leafs before instance desc
|
||||
CopyMemory(dest + offsetPostInstances, (global char*)(src + offsetToBvh + offsetPostInstances), sizePostInstances, groupCnt);
|
||||
|
||||
uint instanceDescEnd = instanceDescStart + numInstances * sizeof(InstanceDesc);
|
||||
uint sizePostInstanceDescs = bvhSize - instanceDescEnd;
|
||||
// copy after instance desc
|
||||
CopyMemory(dest + instanceDescEnd, (global char*)(src + offsetToBvh + instanceDescEnd), sizePostInstanceDescs, groupCnt);
|
||||
|
||||
global gpuva_t* newInstancePtrs = (global gpuva_t*)(src + headerSize);
|
||||
global InstanceDesc* dstDesc = (global InstanceDesc*)(dest + instanceDescStart);
|
||||
global InstanceDesc* srcDesc = (global InstanceDesc*)(src + offsetToBvh + instanceDescStart);
|
||||
|
||||
// copy and patch instance descriptors
|
||||
for (uint64_t instanceIndex = get_group_id(0); instanceIndex < numInstances; instanceIndex += groupCnt)
|
||||
{
|
||||
InstanceDesc desc = srcDesc[instanceIndex];
|
||||
uint64_t newInstancePtr = newInstancePtrs[instanceIndex];
|
||||
desc.AccelerationStructureGPUVA = newInstancePtr; // patch it with new ptr;
|
||||
|
||||
dstDesc[instanceIndex] = desc;
|
||||
}
|
||||
|
||||
// copy and patch hw instance leafs
|
||||
global HwInstanceLeaf* dstInstleafs = (global HwInstanceLeaf*)(dest + offsetToHwInstances);
|
||||
global HwInstanceLeaf* srcInstleafs = (global HwInstanceLeaf*)(src + offsetToBvh + offsetToHwInstances);
|
||||
|
||||
for (uint hwLeafIndex = get_group_id(0); hwLeafIndex < numHwInstances; hwLeafIndex += groupCnt)
|
||||
{
|
||||
// pull the instance from srcBVH
|
||||
HwInstanceLeaf tmpInstleaf = srcInstleafs[hwLeafIndex];
|
||||
|
||||
uint swInstanceIndex = HwInstanceLeaf_GetInstanceIndex(&tmpInstleaf);
|
||||
uint64_t childBvhPtr = (uint64_t)newInstancePtrs[swInstanceIndex];
|
||||
uint64_t originalBvhPtr = (uint64_t)HwInstanceLeaf_GetBVH(&tmpInstleaf);
|
||||
|
||||
HwInstanceLeaf_SetBVH(&tmpInstleaf, childBvhPtr);
|
||||
uint64_t startNode = HwInstanceLeaf_GetStartNode(&tmpInstleaf);
|
||||
|
||||
if (startNode != 0) {
|
||||
uint64_t rootNodeOffset = startNode - originalBvhPtr;
|
||||
HwInstanceLeaf_SetStartNode(&tmpInstleaf, childBvhPtr + rootNodeOffset);
|
||||
}
|
||||
|
||||
dstInstleafs[hwLeafIndex] = tmpInstleaf;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
CopyMemory(dest, (global char*)(src + offsetToBvh), bvhSize, groupCnt);
|
||||
}
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel deserialize_indirect(
|
||||
global char* dest,
|
||||
global char* src)
|
||||
{
|
||||
SerializationHeader* header = (SerializationHeader*)src;
|
||||
const uint64_t bvhSize = header->DeserializedSizeInBytes;
|
||||
unsigned groupCnt = GroupCountForCopySize(bvhSize);
|
||||
deserializeT(dest, src, groupCnt);
|
||||
}
|
||||
|
||||
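// dxr_decode emits a DecodeHeader followed by D3D12 instance or geometry descriptors and,
// for bottom-level structures, the vertex / AABB data recovered from the leaf nodes.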
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel dxr_decode(global char* dest,
|
||||
global char* src)
|
||||
{
|
||||
|
||||
DecodeHeader* header = (DecodeHeader*)dest;
|
||||
BVHBase* base = (BVHBase*)src;
|
||||
|
||||
uint32_t numGeos = base->Meta.geoCount;
|
||||
uint32_t numInstances = base->Meta.instanceCount;
|
||||
|
||||
if (numInstances > 0)
|
||||
{
|
||||
header->Type = TOP_LEVEL;
|
||||
header->NumDesc = numInstances;
|
||||
|
||||
D3D12_RAYTRACING_INSTANCE_DESC* instanceDesc = (D3D12_RAYTRACING_INSTANCE_DESC*)(dest + sizeof(DecodeHeader));
|
||||
copyInstanceDescs((InstanceDesc*)((uint64_t)base + (uint64_t)base->Meta.instanceDescsStart),
|
||||
instanceDesc,
|
||||
numInstances);
|
||||
}
|
||||
else if (numGeos > 0)
|
||||
{
|
||||
header->Type = BOTTOM_LEVEL;
|
||||
header->NumDesc = numGeos;
|
||||
|
||||
D3D12_RAYTRACING_GEOMETRY_DESC* geomDescs = (D3D12_RAYTRACING_GEOMETRY_DESC*)(dest + sizeof(DecodeHeader));
|
||||
uint64_t data = (uint64_t)geomDescs + sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) * numGeos;
|
||||
createGeoDescs((GeoMetaData*)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart),
|
||||
geomDescs,
|
||||
numGeos,
|
||||
data);
|
||||
|
||||
work_group_barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
copyDataFromQuadLeaves(base,
|
||||
geomDescs);
|
||||
|
||||
copyDataFromLProcedurals(base,
|
||||
geomDescs);
|
||||
}
|
||||
else
|
||||
{
|
||||
header->Type = BOTTOM_LEVEL;
|
||||
header->NumDesc = 0;
|
||||
}
|
||||
}
|
||||
|
|
@@ -1,208 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
// @file bvh_debug.cl
|
||||
//
|
||||
// @brief routines to do basic integrity checks
|
||||
//
|
||||
// Notes:
|
||||
//
|
||||
|
||||
#include "GRLGen12.h"
|
||||
#include "intrinsics.h"
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
#include "GRLGen12IntegrityChecks.h"
|
||||
#include "api_interface.h"
|
||||
|
||||
#define ERROR_PRINTF 0
|
||||
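// commit_err atomically records the first error seen into the error slot, then either
// prints it or writes through the null pointer to force a page fault so the failure
// becomes immediately visible.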
GRL_INLINE bool commit_err(
|
||||
global uint* some_null,
|
||||
global BVHBase* bvh,
|
||||
global ERROR_INFO* err_info_slot,
|
||||
ERROR_INFO err)
|
||||
{
|
||||
if (err.type != error_t_no_error) {
|
||||
uint expected = error_t_no_error;
|
||||
atomic_compare_exchange_global(&err_info_slot->type, &expected, err.type);
|
||||
if (expected == error_t_no_error)
|
||||
{
|
||||
err_info_slot->offset_in_BVH = err.offset_in_BVH;
|
||||
err_info_slot->when = err.when;
|
||||
err_info_slot->reserved = 0xAAACCAAA;
|
||||
mem_fence_evict_to_memory();
|
||||
#if ERROR_PRINTF
|
||||
printf("bvh = 0x%llX, err.type = %X, err.offset_in_BVH = %d\n", bvh, err.type, err.offset_in_BVH);
|
||||
#else
|
||||
            // This is to trigger a page fault. Note we have to write directly to memory:
            // if the write stayed in L3, it would not fault until the line got evicted to memory.
|
||||
store_uint_L1UC_L3UC(some_null, 0, 0x0EEE0000 + err.type);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel check_tree_topology(
|
||||
global uint* some_null,
|
||||
global BVHBase* bvh,
|
||||
global ERROR_INFO* err,
|
||||
uint phase)
|
||||
{
|
||||
uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
|
||||
|
||||
if (err->type != error_t_no_error) return;
|
||||
|
||||
uint dummy1, dummy2, dummy3;
|
||||
ERROR_INFO reterr = check_tree_topology_helper(bvh, globalID, &dummy1, &dummy2, &dummy3, false);
|
||||
if (reterr.type == error_t_no_error)
|
||||
{
|
||||
reterr = check_backpointers(bvh, globalID);
|
||||
}
|
||||
if (reterr.type == error_t_no_error)
|
||||
{
|
||||
reterr = validate_atomic_update_structs(bvh, globalID);
|
||||
}
|
||||
reterr.when = phase;
|
||||
commit_err(some_null, bvh, err, reterr);
|
||||
}
|
||||
|
||||
GRL_INLINE bool IsValid48bPtr(qword ptr)
|
||||
{
|
||||
qword CANONIZED_BITS = 0xFFFFul << 48ul;
|
||||
qword canonized_part = ptr & CANONIZED_BITS;
|
||||
bool isIt = ptr != 0 && (
|
||||
canonized_part == 0 || canonized_part == CANONIZED_BITS);
|
||||
return isIt;
|
||||
}
|
||||
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel check_geos_before_quad_update(
|
||||
global BVHBase* bvh, //dest bvh
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
global uint* some_null,
|
||||
global ERROR_INFO* err,
|
||||
uint phase,
|
||||
uint numGeos,
|
||||
uint numThreads)
|
||||
{
|
||||
uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
|
||||
|
||||
if (err->type != error_t_no_error) return;
|
||||
|
||||
// first check sanity of geos
|
||||
ERROR_INFO geo_insanity_error = { error_t_input_geo_insane, 0 };
|
||||
|
||||
for (uint ID = globalID; ID < numGeos; ID += numThreads * get_sub_group_size())
|
||||
{
|
||||
        bool IsSane = IsValid48bPtr((qword)geomDesc);
|
||||
|
||||
if (IsSane) {
|
||||
GRL_RAYTRACING_GEOMETRY_DESC geo = geomDesc[globalID];
|
||||
IsSane = geo.Type < NUM_GEOMETRY_TYPES;
|
||||
if (IsSane) {
|
||||
if (geo.Type == GEOMETRY_TYPE_TRIANGLES) {
|
||||
if (geo.Desc.Triangles.IndexFormat >= INDEX_FORMAT_END) {
|
||||
IsSane = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (geo.Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE && geo.Desc.Triangles.IndexCount > 2)
|
||||
{
|
||||
IsSane = (geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END) &&
|
||||
IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) &&
|
||||
IsValid48bPtr((qword)geo.Desc.Triangles.pIndexBuffer);
|
||||
}
|
||||
else if (geo.Desc.Triangles.VertexCount > 2)
|
||||
{
|
||||
                            IsSane =
                                geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END &&
                                IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) != 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
geo_insanity_error.offset_in_BVH = ID;
|
||||
geo_insanity_error.when = phase;
|
||||
if (!IsSane) {
|
||||
commit_err(some_null, bvh, err, geo_insanity_error);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
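// Validates every quad leaf against the geometry descriptors: the geo index and both
// primitive indices must be in range for the referenced geometry.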
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel check_geos_vs_quads(
|
||||
global BVHBase* bvh,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
|
||||
global uint* some_null,
|
||||
global ERROR_INFO* err,
|
||||
uint phase,
|
||||
uint numGeos,
|
||||
uint numThreads)
|
||||
{
|
||||
uint numQuads = BVHBase_GetNumQuads(bvh);
|
||||
|
||||
QuadLeaf* quads = BVHBase_GetQuadLeaves(bvh);
|
||||
|
||||
uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
|
||||
uint qoffset = bvh->quadLeafStart;
|
||||
|
||||
if (err->type != error_t_no_error) return;
|
||||
|
||||
ERROR_INFO theErr = { error_t_no_error, 0 };
|
||||
|
||||
for (uint ID = globalID; ID < numQuads; ID += numThreads * get_sub_group_size())
|
||||
{
|
||||
ERROR_INFO quadErr = { error_t_quad_leaf_broken, qoffset + ID, phase };
|
||||
|
||||
QuadLeaf quad = quads[ID];
|
||||
|
||||
uint geoIdx = PrimLeaf_GetGeoIndex(&quad.leafDesc);
|
||||
|
||||
        if (geoIdx >= numGeos) { commit_err(some_null, bvh, err, quadErr); return; }
|
||||
|
||||
uint numPrimsInGeo = geomDesc[geoIdx].Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE ?
|
||||
geomDesc[geoIdx].Desc.Triangles.IndexCount / 3 :
|
||||
geomDesc[geoIdx].Desc.Triangles.VertexCount / 3;
|
||||
|
||||
if(quad.primIndex0 >= numPrimsInGeo) {
|
||||
commit_err(some_null, bvh, err, quadErr);
|
||||
return;
|
||||
}
|
||||
|
||||
if(!QuadLeaf_IsSingleTriangle(&quad) &&
|
||||
(quad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&quad) >= numPrimsInGeo))
|
||||
{
|
||||
commit_err(some_null, bvh, err, quadErr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel check_instances_linked_bvhs(
|
||||
global uint* some_null,
|
||||
global BVHBase* bvh,
|
||||
global ERROR_INFO* err,
|
||||
uint phase)
|
||||
{
|
||||
if (err->type != error_t_no_error) return;
|
||||
|
||||
uint instanceLeafStart = bvh->instanceLeafStart;
|
||||
uint instanceLeafEnd = bvh->instanceLeafEnd;
|
||||
uint numInstances = (instanceLeafEnd - instanceLeafStart) / 2;
|
||||
|
||||
uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
|
||||
|
||||
ERROR_INFO reterr = check_instances_linked_bvhs_helper(bvh, globalID, /*touchBlas*/true);
|
||||
reterr.when = phase;
|
||||
commit_err(some_null, bvh, err, reterr);
|
||||
}
|
||||
|
|
@@ -1,107 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

module bvh_on_gpu_checks;

kernel_module debug_kernels ("bvh_debug.cl")
{
    links lsc_intrinsics;
    kernel opencl_check_tree_topology < kernelFunction="check_tree_topology">;
    kernel opencl_check_instances_linked_bvhs < kernelFunction="check_instances_linked_bvhs">;
    kernel opencl_check_geos_before_quad_update < kernelFunction="check_geos_before_quad_update">;
    kernel opencl_check_geos_vs_quads < kernelFunction="check_geos_vs_quads">;
}


metakernel debug_checks_prepare_const_regs()
{
    define cRoundingSIMD REG4;
    define cInit0 REG5;
    define cShiftForSIMD REG3;
    cRoundingSIMD = (16-1);
    cShiftForSIMD = 4;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;
}

metakernel debug_checks_bvh_topology(
    qword some_null_ptr,
    qword bvh,
    qword bvh_inner_nodes_end,
    qword error_struct,
    dword when,
    dword bvh_inner_nodes_start_value )
{
    define cRoundingSIMD REG4;
    define cShiftForSIMD REG3;
    REG1.lo = load_dword(bvh_inner_nodes_end);
    REG0 = bvh_inner_nodes_start_value;
    REG1.hi = 0;
    REG2 = REG1 - REG0;
    REG2 = REG2 + cRoundingSIMD;
    REG2 = REG2 >> cShiftForSIMD;

    DISPATCHDIM_X = REG2.lo;

    dispatch_indirect opencl_check_tree_topology args(
        some_null_ptr,
        bvh,
        error_struct,
        when);
}

metakernel debug_check_instances_linked_bvhs(
    qword some_null_ptr,
    qword bvh,
    qword error_struct,
    dword numHWThreads,
    dword when)
{
    dispatch opencl_check_instances_linked_bvhs(numHWThreads,1,1) args(
        some_null_ptr,
        bvh,
        error_struct,
        when);
}

metakernel debug_check_geos_before_quad_update(
    qword bvh,
    qword geos,
    qword some_null_ptr,
    qword error_struct,
    dword when,
    dword numGeos,
    dword numHWThreads )
{
    dispatch opencl_check_geos_before_quad_update(numHWThreads,1,1) args(
        bvh,
        geos,
        some_null_ptr,
        error_struct,
        when,
        numGeos,
        numHWThreads );
}

metakernel debug_check_geos_vs_quads(
    qword bvh,
    qword geos,
    qword some_null_ptr,
    qword error_struct,
    dword when,
    dword numGeos,
    dword numHWThreads )
{
    dispatch opencl_check_geos_vs_quads(numHWThreads,1,1) args(
        bvh,
        geos,
        some_null_ptr,
        error_struct,
        when,
        numGeos,
        numHWThreads );
}

@@ -1,97 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "api_interface.h"
#include "d3d12.h"
#include "common.h"

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel compacted_size(global char *bvh_mem,
                                                                          global char *postbuild_info)
{
    BVHBase *base = (BVHBase *)bvh_mem;
    PostbuildInfoCompactedSize *postbuildInfoCompacted = (PostbuildInfoCompactedSize *)postbuild_info;

    postbuildInfoCompacted->CompactedSizeInBytes = compute_compacted_size(base);
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel current_size(global char *bvh_mem,
                                                                        global char *postbuild_info)
{
    BVHBase *base = (BVHBase *)bvh_mem;
    PostbuildInfoCurrentSize *postbuildInfoCurrent = (PostbuildInfoCurrentSize *)postbuild_info;

    postbuildInfoCurrent->CurrentSizeInBytes = base->Meta.allocationSize;
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel serialized_size(global char *bvh_mem,
                                                                           global char *postbuild_info)
{
    BVHBase *base = (BVHBase *)bvh_mem;
    PostbuildInfoSerializationDesc *postbuildInfoSerialization = (PostbuildInfoSerializationDesc *)postbuild_info;

    uint64_t headerSize = sizeof(SerializationHeader);
    uint64_t numInstances = base->Meta.instanceCount;

    postbuildInfoSerialization->SerializedSizeInBytes = sizeof(SerializationHeader) +
                                                        numInstances * sizeof(gpuva_t) +
                                                        compute_compacted_size(base);
    //base->Meta.allocationSize;
    postbuildInfoSerialization->NumBottomLevelAccelerationStructurePointers = numInstances;
}

void countTrianglesAndProcedurals(GeoMetaData *geoMetaData,
                                  uint64_t numGeos,
                                  uint64_t *numTriangles,
                                  uint64_t *numProcedurals)
{
    uint64_t numTrianglesLoc = 0;
    uint64_t numProceduralsLoc = 0;

    for (uint64_t geoIndex = get_local_id(0); geoIndex < numGeos; geoIndex += get_local_size(0))
    {
        if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES)
        {
            *numTriangles += geoMetaData[geoIndex].PrimitiveCount;
        }
        else
        {
            *numProcedurals += geoMetaData[geoIndex].PrimitiveCount;
        }
    }
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel decoded_size(global char *bvh_mem,
                                                                        global char *postbuild_info)
{
    BVHBase *base = (BVHBase *)bvh_mem;
    PostbuildInfoToolsVisualizationDesc *postbuildInfoDecoded = (PostbuildInfoToolsVisualizationDesc *)postbuild_info;

    uint64_t numTriangles = 0;
    uint64_t numProcedurals = 0;
    countTrianglesAndProcedurals((GeoMetaData *)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart),
                                 base->Meta.geoCount,
                                 &numTriangles,
                                 &numProcedurals);
    uint64_t numInstances = base->Meta.instanceCount;
    uint64_t numDescs = base->Meta.geoCount;
    uint64_t headerSize = sizeof(DecodeHeader);
    uint64_t descsSize = numDescs * sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) +
                         numInstances * sizeof(D3D12_RAYTRACING_INSTANCE_DESC);

    // Each triangle is stored separately - 3 vertices (9 floats) per triangle
    uint64_t triangleDataSize = 9 * sizeof(float);
    uint64_t proceduralDataSize = sizeof(D3D12_RAYTRACING_AABB);
    uint64_t geoDataSize = numTriangles * triangleDataSize + numProcedurals * proceduralDataSize;

    postbuildInfoDecoded->DecodedSizeInBytes = headerSize + descsSize + geoDataSize;
}

File diff suppressed because it is too large
Load diff
|
|
@@ -1,429 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#pragma once

#include "shared.h"
#include "intrinsics.h"
#include "AABB.h"
#include "AABB3f.h"
#include "qbvh6.h"

/* ====== BVH_BUILDER config ====== */

__constant const float cfg_intCost = 4.0f;
__constant const float cfg_travCost = 1.0f;
__constant const uint cfg_minLeafSize = BVH_LEAF_N_MIN;
__constant const uint cfg_maxLeafSize = BVH_LEAF_N_MAX;
__constant const uint cfg_maxDepth = BUILDRECORD_STACK_SIZE;

#define ENABLE_CONVERSION_CHECKS 0

#ifdef ENABLE_BIG_REG_ANNOTATION
#define GRL_ANNOTATE_BIG_REG_REQ __attribute__((annotate("num-thread-per-eu 4")))
#else
#define GRL_ANNOTATE_BIG_REG_REQ
#endif

#ifdef ENABLE_IGC_DO_NOT_SPILL
#define GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((annotate("igc-do-not-spill")))
#else
#define GRL_ANNOTATE_IGC_DO_NOT_SPILL
#endif

#define ERROR()

/* =================================================================================================================================================== */
/* =================================================================================================================================================== */
/* =================================================================================================================================================== */
/* =================================================================================================================================================== */

GRL_INLINE unsigned int getNumLeafPrims(unsigned int offset)
{
    return (offset & 0x7) - 3;
}

GRL_INLINE unsigned int getLeafOffset(unsigned int offset)
{
    return offset & (~0x7);
}

GRL_INLINE float4 triangleNormal(const float4 v0, const float4 v1, const float4 v2)
{
    const float4 a = v1 - v0;
    const float4 b = v2 - v0;
    return cross(a, b);
}

GRL_INLINE float areaTriangle(const float4 v0, const float4 v1, const float4 v2)
{
    const float4 normal = triangleNormal(v0, v1, v2);
    return length((float3)(normal.x, normal.y, normal.z)) * 0.5f;
}

GRL_INLINE float det2(const float2 a, const float2 b)
{
    return a.x * b.y - a.y * b.x;
}

GRL_INLINE float areaProjectedTriangle(const float4 v0, const float4 v1, const float4 v2)
{
    const float xy = 0.5f * fabs(det2(v1.xy - v0.xy, v2.xy - v0.xy));
    const float yz = 0.5f * fabs(det2(v1.yz - v0.yz, v2.yz - v0.yz));
    const float zx = 0.5f * fabs(det2(v1.zx - v0.zx, v2.zx - v0.zx));
    return xy + yz + zx;
}

typedef struct Block64B {
    char data[64];
} Block64B __attribute__((aligned(64)));

typedef char byte_align64B __attribute__((aligned(64)));

/* ====================================================================== */
/* ============================== GLOBALS =============================== */
/* ====================================================================== */

GRL_INLINE bool Globals_OnFinish(global struct Globals *globals)
{
    /* last active HW thread ? */
    if (get_local_id(0) == 0)
    {
        const uint sync = atomic_add(&globals->sync, 1);
        if (sync + 1 == get_num_groups(0))
        {
            globals->sync = 0;
            return true;
        }
    }
    return false;
}

GRL_INLINE uint BlockAllocator_BytesUsed(struct BlockAllocator *p)
{
    return p->cur - p->start;
};

GRL_INLINE uint BlockAllocator_Alloc(__global struct BlockAllocator *p, const uint size)
{
    return atomic_add(&p->cur, size);
}

GRL_INLINE uint BlockAllocator_Alloc_Single(__global struct BlockAllocator *p, const uint size)
{
    uint offset = 0;
    if (get_sub_group_local_id() == 0)
        offset = atomic_add(&p->cur, size);
    return sub_group_broadcast(offset, 0);
}

// node allocation returns an offset from beginning of BVH to allocated node
// in multiples of 64B
GRL_INLINE uint allocate_inner_nodes(global struct BVHBase* base, uint num_nodes )
{
    return atomic_add_global( &base->nodeDataCur, num_nodes );
}
GRL_INLINE uint allocate_procedural_leaves(global struct BVHBase* base, uint num_nodes)
{
    return atomic_add_global(&base->proceduralDataCur, num_nodes);
}

GRL_INLINE uint allocate_quad_leaves(global struct BVHBase* base, uint num_nodes)
{
    return atomic_add_global(&base->quadLeafCur, num_nodes);
}

#if 0
GRL_INLINE uint alloc_node_mem(global struct Globals *globals, const uint size)
{
    const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
    return BlockAllocator_Alloc(&globals->node_mem_allocator, aligned_size);
}

GRL_INLINE uint alloc_single_node_mem(global struct Globals *globals, const uint size)
{
    const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
    return BlockAllocator_Alloc_Single(&globals->node_mem_allocator, aligned_size);
}

GRL_INLINE uint alloc_quad_leaf_mem(global struct Globals *globals, const uint size)
{
    const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
    return BlockAllocator_Alloc(&globals->quad_mem_allocator, aligned_size);
}

GRL_INLINE uint alloc_procedural_leaf_mem(global struct Globals *globals, const uint size)
{
    const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
    return BlockAllocator_Alloc(&globals->procedural_mem_allocator, aligned_size);
}
#endif

GRL_INLINE global struct BuildRecord *getBuildRecords(char *bvh_mem, struct Globals *globals)
{
    return (global struct BuildRecord *)(bvh_mem + globals->build_record_start);
}

/* ======================================================================= */
/* ============================== TRIANGLE =============================== */
/* ======================================================================= */

/*GRL_INLINE void printTriangle(struct Triangle *t)
{
    printf("vtx[0] %d vtx[1] %d vtx[2] %d primID %d geomID %d \n",t->vtx[0],t->vtx[1],t->vtx[2],t->primID,t->geomID);
}*/

/* ==================================================================== */
/* ============================== SPLIT =============================== */
/* ==================================================================== */

GRL_INLINE void printSplit(struct Split *split)
{
    printf("split sah %f dim %d pos %d \n", split->sah, split->dim, split->pos);
}

/* ========================================================================== */
/* ============================== BUILDRECORD =============================== */
/* ========================================================================== */

GRL_INLINE void initBuildRecord(struct BuildRecord *buildRecord, uint start, uint end)
{
    AABB_init(&buildRecord->centroidBounds);
    buildRecord->start = start;
    buildRecord->end = end;
}

GRL_INLINE void extendBuildRecord(struct BuildRecord *buildRecord, struct AABB *primref)
{
    AABB_extend_point(&buildRecord->centroidBounds, AABB_centroid2(primref));
}

GRL_INLINE uint getBuildRecursionDepth(struct BuildRecord *buildRecord)
{
    return as_uint(buildRecord->centroidBounds.upper.w);
}

GRL_INLINE void setBuildRecursionDepth(struct BuildRecord *buildRecord, uint depth)
{
    buildRecord->centroidBounds.upper.w = as_float(depth);
}

GRL_INLINE uint getNumPrimsBuildRecord(struct BuildRecord *buildRecord)
{
    return buildRecord->end - buildRecord->start;
}

/* ========================================================================== */
/* =================== BinaryMortonCodeHierarchy ============================ */
/* ========================================================================== */

GRL_INLINE void BinaryMortonCodeHierarchy_init(struct BinaryMortonCodeHierarchy *record, uint start, uint end)
{
    record->range.start = start;
    record->range.end = end;
    record->leftChild = -1;
    record->rightChild = -1;
    // record->flag = 0;
}

GRL_INLINE uint BinaryMortonCodeHierarchy_getNumPrimitives(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID)
{
    /* leaf case */
    if (nodeID & (uint)(1 << 31))
        return 1;

    /* inner node case*/
    else
        return nodes[nodeID].range.end - nodes[nodeID].range.start + 1;
}

GRL_INLINE struct BinaryMortonCodeHierarchy BinaryMortonCodeHierarchy_getEntry(global struct BinaryMortonCodeHierarchy* nodes, uint nodeID)
{
    struct BinaryMortonCodeHierarchy entry;

    if (nodeID & (uint)(1 << 31)) {
        /* leaf case */
        uint rangeStart = nodeID ^ (uint)(1 << 31);
        BinaryMortonCodeHierarchy_init(&entry, rangeStart, rangeStart);
    }
    else {
        /* inner node case*/
        entry = nodes[nodeID];
    }

    return entry;
}

GRL_INLINE uint BinaryMortonCodeHierarchy_getRangeStart(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID)
{
    /* leaf case */
    if (nodeID & (uint)(1 << 31))
        return nodeID ^ (uint)(1 << 31);

    /* inner node case*/
    else
        return nodes[nodeID].range.start;
}

/* ==================================================================== */
/* ============================== RANGE =============================== */
/* ==================================================================== */

GRL_INLINE void printRange(struct Range *range)
{
    printf("start %d end %d \n", range->start, range->end);
}

GRL_INLINE bool equalRange(struct Range *range0, struct Range *range1)
{
    if (range0->start == range1->start &&
        range0->end == range1->end)
        return true;
    return false;
}

GRL_INLINE uint getSizeRange(struct Range *range)
{
    return range->end - range->start;
}

/* ==================================================================== */
/* ========================= ProceduralLeaf =========================== */
/* ==================================================================== */

#if 0
struct ProceduralLeaf
{
    uint shaderIndex_geomMask;
    uint geomIndex_flags;
    uint N_last;
    uint primIndex[13];
};
#endif

GRL_INLINE uint ProceduralLeaf_geomIndex(global struct ProceduralLeaf *This)
{
    return This->leafDesc.geomIndex_flags & 0x1FFFFFFF;
}

GRL_INLINE uint ProceduralLeaf_primIndex(global struct ProceduralLeaf *This, uint i)
{
    //assert(i < N);
    return This->_primIndex[i];
}

/* ==================================================================== */
/* =========================== TrianglePair =========================== */
/* ==================================================================== */

struct TrianglePair
{
    uint4 a;  // indices of the 4 verts to store in the quad
    uint3 lb; // index of the second triangle's verts in 'a'
};

GRL_INLINE struct TrianglePair TrianglePair_Constructor(uint3 tri0, uint primID0, uint3 tri1, uint primID1)
{
    struct TrianglePair q;
    q.a.x = tri0.x;
    q.a.y = tri0.y;
    q.a.z = tri0.z;
    q.a.w = tri0.z;

    uint3 b;
    b.x = tri1.x;
    b.y = tri1.y;
    b.z = tri1.z;

    q.lb = (uint3)(3);

    q.lb.x = (b.x == q.a.x) ? 0 : q.lb.x;
    q.lb.y = (b.y == q.a.x) ? 0 : q.lb.y;
    q.lb.z = (b.z == q.a.x) ? 0 : q.lb.z;

    q.lb.x = (b.x == q.a.y) ? 1 : q.lb.x;
    q.lb.y = (b.y == q.a.y) ? 1 : q.lb.y;
    q.lb.z = (b.z == q.a.y) ? 1 : q.lb.z;

    q.lb.x = (b.x == q.a.z) ? 2 : q.lb.x;
    q.lb.y = (b.y == q.a.z) ? 2 : q.lb.y;
    q.lb.z = (b.z == q.a.z) ? 2 : q.lb.z;

    q.lb.x = (primID0 != primID1) ? q.lb.x : 0;
    q.lb.y = (primID0 != primID1) ? q.lb.y : 0;
    q.lb.z = (primID0 != primID1) ? q.lb.z : 0;

    q.a.w = (q.lb.x == 3) ? b.x : q.a.w;
    q.a.w = (q.lb.y == 3) ? b.y : q.a.w;
    q.a.w = (q.lb.z == 3) ? b.z : q.a.w;

    return q;
}

GRL_INLINE float InstanceDesc_get_transform(const InstanceDesc *d, const uint32_t row, const uint32_t column)
{
    return d->Transform[row][column];
}

GRL_INLINE uint32_t InstanceDesc_get_instanceID(const InstanceDesc *d)
{
    return d->InstanceIDAndMask & (0x00FFFFFF);
}

GRL_INLINE uint32_t InstanceDesc_get_InstanceMask(const InstanceDesc *d)
{
    return d->InstanceIDAndMask >> 24;
}

GRL_INLINE uint32_t InstanceDesc_get_InstanceContributionToHitGroupIndex(const InstanceDesc *d)
{
    return d->InstanceContributionToHitGroupIndexAndFlags & ((1 << 24) - 1);
}

GRL_INLINE uint32_t InstanceDesc_get_InstanceFlags(const InstanceDesc *d)
{
    return d->InstanceContributionToHitGroupIndexAndFlags >> 24;
}

GRL_INLINE gpuva_t InstanceDesc_get_AccelerationStructure(const InstanceDesc *d)
{
    return d->AccelerationStructureGPUVA;
}

GRL_INLINE void InstanceDesc_set_transform(InstanceDesc *d, const uint32_t row, const uint32_t column, float value)
{
    d->Transform[row][column] = value;
}

GRL_INLINE void InstanceDesc_set_instanceID(InstanceDesc *d, const uint32_t id)
{
    d->InstanceIDAndMask &= 255 << 24;
    d->InstanceIDAndMask |= id & ((1 << 24) - 1);
}

GRL_INLINE void InstanceDesc_set_InstanceMask(InstanceDesc *d, const uint32_t mask)
{
    d->InstanceIDAndMask &= ((1 << 24) - 1);
    d->InstanceIDAndMask |= mask << 24;
}

GRL_INLINE void InstanceDesc_set_InstanceContributionToHitGroupIndex(InstanceDesc *d, const uint32_t contribution)
{
    d->InstanceContributionToHitGroupIndexAndFlags &= 255 << 24;
    d->InstanceContributionToHitGroupIndexAndFlags |= contribution & ((1 << 24) - 1);
}

GRL_INLINE void InstanceDesc_set_InstanceFlags(InstanceDesc *d, const uint32_t flags)
{
    d->InstanceContributionToHitGroupIndexAndFlags &= ((1 << 24) - 1);
    d->InstanceContributionToHitGroupIndexAndFlags |= flags << 24;
}

GRL_INLINE void InstanceDesc_set_AccelerationStructure(InstanceDesc *d, gpuva_t address)
{
    d->AccelerationStructureGPUVA = address;
}
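The InstanceDesc accessors above pack a 24-bit value (instance ID or hit-group contribution) and an 8-bit value (mask or flags) into a single dword, mirroring the DXR instance descriptor layout. A minimal stand-alone round-trip of that packing scheme in plain C; the helper name and test values are illustrative and not part of the removed headers:

#include <assert.h>
#include <stdint.h>

/* Illustrative only: same 24/8 split as InstanceIDAndMask above. */
static uint32_t pack_id_mask(uint32_t id, uint32_t mask)
{
    return (id & ((1u << 24) - 1)) | (mask << 24);
}

int main(void)
{
    uint32_t dw = pack_id_mask(0x123456, 0xAB);
    assert((dw & 0x00FFFFFF) == 0x123456); /* matches InstanceDesc_get_instanceID   */
    assert((dw >> 24)        == 0xAB);     /* matches InstanceDesc_get_InstanceMask */
    return 0;
}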
@@ -1,129 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

module copy; // In copy we assume output data structure to be DXR compatible

kernel clone_indirect < source="bvh_copy.cl", kernelFunction="clone_indirect" >
kernel compact < source="bvh_copy.cl", kernelFunction="compact" >
kernel serialize_indirect < source="bvh_copy.cl", kernelFunction="serialize_indirect" >
kernel serialize_for_input_dump_indirect < source="bvh_copy.cl", kernelFunction="serialize_for_input_dump_indirect" >
kernel deserialize_indirect < source="bvh_copy.cl", kernelFunction="deserialize_indirect" >
kernel dxr_decode < source="bvh_copy.cl", kernelFunction="dxr_decode" >

metakernel clone_indirect(
    qword dest,
    qword src,
    qword srcBVHsizedwordAddr)
{
    // this has to be compatible with in kernel GroupCountForCopy(...)
    define byteSize REG0;
    define numGroupsRqd REG1;
    define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
    define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
    define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
    byteSize = load_dword(srcBVHsizedwordAddr);
    numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
    numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;

    DISPATCHDIM_X = numGroupsRqd.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect clone_indirect args(
        dest,
        src);
}

metakernel compact(
    qword dest,
    qword src)
{
    dispatch compact(32,1,1) args(
        dest,
        src,
        32);
}

metakernel serialize_indirect(
    qword dest,
    qword src,
    qword driverID,
    qword srcBVHsizedwordAddr)
{
    define byteSize REG0;
    define numGroupsRqd REG1;
    define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
    define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
    define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
    byteSize = load_dword(srcBVHsizedwordAddr);
    numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
    numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
    DISPATCHDIM_X = numGroupsRqd.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect serialize_indirect args(
        dest,
        src,
        driverID);
}

metakernel serialize_for_input_dump_indirect(
    qword batchPtrs,
    qword dstOffset,
    qword src,
    qword driverID,
    qword srcBVHsizedwordAddr)
{
    define byteSize REG0;
    define numGroupsRqd REG1;
    define BYTE_PER_GROUP_CHUNK_SHIFT REG2; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
    define REMINDER_NUM_GROUPS REG3; REMINDER_NUM_GROUPS = 4;
    byteSize = load_dword(srcBVHsizedwordAddr);
    numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
    numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
    DISPATCHDIM_X = numGroupsRqd.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect serialize_for_input_dump_indirect args(
        batchPtrs,
        dstOffset,
        src,
        driverID);
}

metakernel deserialize_indirect(
    qword dest,
    qword src,
    qword srcBVHsizedwordAddr)
{
    define byteSize REG0;
    define numGroupsRqd REG1;
    define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
    define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
    define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
    byteSize = load_dword(srcBVHsizedwordAddr);
    numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
    numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
    DISPATCHDIM_X = numGroupsRqd.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect deserialize_indirect args(
        dest,
        src);
}

metakernel dxr_decode(
    qword dest,
    qword src)
{
    dispatch dxr_decode(1,1,1) args(
        dest,
        src);
}
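The copy metakernels above all derive their indirect dispatch width from the source BVH byte size with the same register sequence: shift right by BYTE_PER_GROUP_CHUNK_SHIFT (256 bytes per work-group chunk) and add a few remainder groups. A plain-C sketch of the equivalent arithmetic; the helper name is hypothetical, chosen only to echo the GroupCountForCopy comment in the module:

#include <stdint.h>

/* Illustrative host-side equivalent of:
 *   numGroupsRqd = (byteSize >> 8) + 4  */
static uint32_t group_count_for_copy(uint32_t bvh_byte_size)
{
    const uint32_t bytes_per_group_shift = 8; /* 256 B handled per work-group chunk */
    const uint32_t remainder_groups = 4;      /* slack groups for the buffer tail   */
    return (bvh_byte_size >> bytes_per_group_shift) + remainder_groups;
}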
@@ -1,525 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#pragma once
#include "GRLStructs.h"
#include "shared.h"

typedef global void *D3D12_GPU_VIRTUAL_ADDRESS;
typedef void *ID3D12StateObjectPrototype;

enum DXGI_FORMAT
{
    DXGI_FORMAT_UNKNOWN,
    DXGI_FORMAT_R32G32B32A32_TYPELESS,
    DXGI_FORMAT_R32G32B32A32_FLOAT,
    DXGI_FORMAT_R32G32B32A32_UINT,
    DXGI_FORMAT_R32G32B32A32_SINT,
    DXGI_FORMAT_R32G32B32_TYPELESS,
    DXGI_FORMAT_R32G32B32_FLOAT,
    DXGI_FORMAT_R32G32B32_UINT,
    DXGI_FORMAT_R32G32B32_SINT,
    DXGI_FORMAT_R16G16B16A16_TYPELESS,
    DXGI_FORMAT_R16G16B16A16_FLOAT,
    DXGI_FORMAT_R16G16B16A16_UNORM,
    DXGI_FORMAT_R16G16B16A16_UINT,
    DXGI_FORMAT_R16G16B16A16_SNORM,
    DXGI_FORMAT_R16G16B16A16_SINT,
    DXGI_FORMAT_R32G32_TYPELESS,
    DXGI_FORMAT_R32G32_FLOAT,
    DXGI_FORMAT_R32G32_UINT,
    DXGI_FORMAT_R32G32_SINT,
    DXGI_FORMAT_R32G8X24_TYPELESS,
    DXGI_FORMAT_D32_FLOAT_S8X24_UINT,
    DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS,
    DXGI_FORMAT_X32_TYPELESS_G8X24_UINT,
    DXGI_FORMAT_R10G10B10A2_TYPELESS,
    DXGI_FORMAT_R10G10B10A2_UNORM,
    DXGI_FORMAT_R10G10B10A2_UINT,
    DXGI_FORMAT_R11G11B10_FLOAT,
    DXGI_FORMAT_R8G8B8A8_TYPELESS,
    DXGI_FORMAT_R8G8B8A8_UNORM,
    DXGI_FORMAT_R8G8B8A8_UNORM_SRGB,
    DXGI_FORMAT_R8G8B8A8_UINT,
    DXGI_FORMAT_R8G8B8A8_SNORM,
    DXGI_FORMAT_R8G8B8A8_SINT,
    DXGI_FORMAT_R16G16_TYPELESS,
    DXGI_FORMAT_R16G16_FLOAT,
    DXGI_FORMAT_R16G16_UNORM,
    DXGI_FORMAT_R16G16_UINT,
    DXGI_FORMAT_R16G16_SNORM,
    DXGI_FORMAT_R16G16_SINT,
    DXGI_FORMAT_R32_TYPELESS,
    DXGI_FORMAT_D32_FLOAT,
    DXGI_FORMAT_R32_FLOAT,
    DXGI_FORMAT_R32_UINT,
    DXGI_FORMAT_R32_SINT,
    DXGI_FORMAT_R24G8_TYPELESS,
    DXGI_FORMAT_D24_UNORM_S8_UINT,
    DXGI_FORMAT_R24_UNORM_X8_TYPELESS,
    DXGI_FORMAT_X24_TYPELESS_G8_UINT,
    DXGI_FORMAT_R8G8_TYPELESS,
    DXGI_FORMAT_R8G8_UNORM,
    DXGI_FORMAT_R8G8_UINT,
    DXGI_FORMAT_R8G8_SNORM,
    DXGI_FORMAT_R8G8_SINT,
    DXGI_FORMAT_R16_TYPELESS,
    DXGI_FORMAT_R16_FLOAT,
    DXGI_FORMAT_D16_UNORM,
    DXGI_FORMAT_R16_UNORM,
    DXGI_FORMAT_R16_UINT,
    DXGI_FORMAT_R16_SNORM,
    DXGI_FORMAT_R16_SINT,
    DXGI_FORMAT_R8_TYPELESS,
    DXGI_FORMAT_R8_UNORM,
    DXGI_FORMAT_R8_UINT,
    DXGI_FORMAT_R8_SNORM,
    DXGI_FORMAT_R8_SINT,
    DXGI_FORMAT_A8_UNORM,
    DXGI_FORMAT_R1_UNORM,
    DXGI_FORMAT_R9G9B9E5_SHAREDEXP,
    DXGI_FORMAT_R8G8_B8G8_UNORM,
    DXGI_FORMAT_G8R8_G8B8_UNORM,
    DXGI_FORMAT_BC1_TYPELESS,
    DXGI_FORMAT_BC1_UNORM,
    DXGI_FORMAT_BC1_UNORM_SRGB,
    DXGI_FORMAT_BC2_TYPELESS,
    DXGI_FORMAT_BC2_UNORM,
    DXGI_FORMAT_BC2_UNORM_SRGB,
    DXGI_FORMAT_BC3_TYPELESS,
    DXGI_FORMAT_BC3_UNORM,
    DXGI_FORMAT_BC3_UNORM_SRGB,
    DXGI_FORMAT_BC4_TYPELESS,
    DXGI_FORMAT_BC4_UNORM,
    DXGI_FORMAT_BC4_SNORM,
    DXGI_FORMAT_BC5_TYPELESS,
    DXGI_FORMAT_BC5_UNORM,
    DXGI_FORMAT_BC5_SNORM,
    DXGI_FORMAT_B5G6R5_UNORM,
    DXGI_FORMAT_B5G5R5A1_UNORM,
    DXGI_FORMAT_B8G8R8A8_UNORM,
    DXGI_FORMAT_B8G8R8X8_UNORM,
    DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM,
    DXGI_FORMAT_B8G8R8A8_TYPELESS,
    DXGI_FORMAT_B8G8R8A8_UNORM_SRGB,
    DXGI_FORMAT_B8G8R8X8_TYPELESS,
    DXGI_FORMAT_B8G8R8X8_UNORM_SRGB,
    DXGI_FORMAT_BC6H_TYPELESS,
    DXGI_FORMAT_BC6H_UF16,
    DXGI_FORMAT_BC6H_SF16,
    DXGI_FORMAT_BC7_TYPELESS,
    DXGI_FORMAT_BC7_UNORM,
    DXGI_FORMAT_BC7_UNORM_SRGB,
    DXGI_FORMAT_AYUV,
    DXGI_FORMAT_Y410,
    DXGI_FORMAT_Y416,
    DXGI_FORMAT_NV12,
    DXGI_FORMAT_P010,
    DXGI_FORMAT_P016,
    DXGI_FORMAT_420_OPAQUE,
    DXGI_FORMAT_YUY2,
    DXGI_FORMAT_Y210,
    DXGI_FORMAT_Y216,
    DXGI_FORMAT_NV11,
    DXGI_FORMAT_AI44,
    DXGI_FORMAT_IA44,
    DXGI_FORMAT_P8,
    DXGI_FORMAT_A8P8,
    DXGI_FORMAT_B4G4R4A4_UNORM,
    DXGI_FORMAT_P208,
    DXGI_FORMAT_V208,
    DXGI_FORMAT_V408,
    DXGI_FORMAT_FORCE_UINT
};

typedef enum D3D12_RAYTRACING_GEOMETRY_FLAGS
{
    D3D12_RAYTRACING_GEOMETRY_FLAG_NONE = 0,
    D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE = 0x1,
    D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2
} D3D12_RAYTRACING_GEOMETRY_FLAGS;

typedef enum D3D12_RAYTRACING_GEOMETRY_TYPE
{
    D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES = 0,
    D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS = (D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES + 1)
} D3D12_RAYTRACING_GEOMETRY_TYPE;

typedef enum D3D12_RAYTRACING_INSTANCE_FLAGS
{
    D3D12_RAYTRACING_INSTANCE_FLAG_NONE = 0,
    D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1,
    D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2,
    D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_OPAQUE = 0x4,
    D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8
} D3D12_RAYTRACING_INSTANCE_FLAGS;

typedef struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE
{
    D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
    unsigned long StrideInBytes;
} D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE;

typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE
{
    D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
    unsigned long SizeInBytes;
} D3D12_GPU_VIRTUAL_ADDRESSRANGE;

typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE
{
    D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
    unsigned long SizeInBytes;
    unsigned long StrideInBytes;
} D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE;

typedef struct D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC
{
    D3D12_GPU_VIRTUAL_ADDRESS Transform;
    enum DXGI_FORMAT IndexFormat;
    enum DXGI_FORMAT VertexFormat;
    unsigned int IndexCount;
    unsigned int VertexCount;
    D3D12_GPU_VIRTUAL_ADDRESS IndexBuffer;
    struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE VertexBuffer;
} D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC;

typedef struct D3D12_RAYTRACING_AABB
{
    float MinX;
    float MinY;
    float MinZ;
    float MaxX;
    float MaxY;
    float MaxZ;
} D3D12_RAYTRACING_AABB;

GRL_INLINE void D3D12_set_raytracing_aabb(D3D12_RAYTRACING_AABB* dest, struct AABB* source)
{
    dest->MinX = source->lower.x;
    dest->MinY = source->lower.y;
    dest->MinZ = source->lower.z;
    dest->MaxX = source->upper.x;
    dest->MaxY = source->upper.y;
    dest->MaxZ = source->upper.z;
}

typedef struct D3D12_RAYTRACING_GEOMETRY_AABBS_DESC
{
    unsigned long AABBCount;
    D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE AABBs;
} D3D12_RAYTRACING_GEOMETRY_AABBS_DESC;

typedef struct D3D12_RAYTRACING_GEOMETRY_DESC
{
    D3D12_RAYTRACING_GEOMETRY_TYPE Type;
    D3D12_RAYTRACING_GEOMETRY_FLAGS Flags;
    //unsigned int ShaderIndex : 24; // extension
    //unsigned int Mask : 8; // extension
    //unsigned int ShaderIndex_Mask; // extension
    union {
        D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC Triangles;
        D3D12_RAYTRACING_GEOMETRY_AABBS_DESC AABBs;
    };
} D3D12_RAYTRACING_GEOMETRY_DESC;

GRL_INLINE void D3D12_set_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_TYPE type)
{
    geomDesc->Type = type;
}

GRL_INLINE D3D12_RAYTRACING_GEOMETRY_TYPE D3D12_get_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->Type;
}

GRL_INLINE void D3D12_set_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_FLAGS flags)
{
    geomDesc->Flags = flags;
}

GRL_INLINE D3D12_RAYTRACING_GEOMETRY_FLAGS D3D12_get_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->Flags;
}

GRL_INLINE void D3D12_set_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS transform)
{
    geomDesc->Triangles.Transform = transform;
}

GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->Triangles.Transform;
}

GRL_INLINE void D3D12_set_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, IndexFormat format)
{
    switch (format)
    {
    case INDEX_FORMAT_NONE:
        geomDesc->Triangles.IndexFormat = DXGI_FORMAT_UNKNOWN;
        break;
    case INDEX_FORMAT_R16_UINT:
        geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R16_UINT;
        break;
    case INDEX_FORMAT_R32_UINT:
        geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R32_UINT;
        break;
    }
}

GRL_INLINE IndexFormat D3D12_get_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    switch (geomDesc->Triangles.IndexFormat)
    {
    case DXGI_FORMAT_R16_UINT:
        return INDEX_FORMAT_R16_UINT;
    case DXGI_FORMAT_R32_UINT:
        return INDEX_FORMAT_R32_UINT;
    case DXGI_FORMAT_UNKNOWN:
    default:
        return INDEX_FORMAT_NONE;
    }
}

GRL_INLINE void D3D12_set_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, VertexFormat format)
{
    switch (format)
    {
    case VERTEX_FORMAT_R32G32_FLOAT:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32_FLOAT;
        break;
    case VERTEX_FORMAT_R32G32B32_FLOAT:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32B32_FLOAT;
        break;
    case VERTEX_FORMAT_R16G16_FLOAT:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_FLOAT;
        break;
    case VERTEX_FORMAT_R16G16B16A16_FLOAT:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_FLOAT;
        break;
    case VERTEX_FORMAT_R16G16_SNORM:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_SNORM;
        break;
    case VERTEX_FORMAT_R16G16B16A16_SNORM:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_SNORM;
        break;
    case VERTEX_FORMAT_R16G16B16A16_UNORM:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_UNORM;
        break;
    case VERTEX_FORMAT_R16G16_UNORM:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_UNORM;
        break;
    case VERTEX_FORMAT_R10G10B10A2_UNORM:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R10G10B10A2_UNORM;
        break;
    case VERTEX_FORMAT_R8G8B8A8_UNORM:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_UNORM;
        break;
    case VERTEX_FORMAT_R8G8_UNORM:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_UNORM;
        break;
    case VERTEX_FORMAT_R8G8B8A8_SNORM:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_SNORM;
        break;
    case VERTEX_FORMAT_R8G8_SNORM:
        geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_SNORM;
        break;
    }
}

GRL_INLINE VertexFormat D3D12_get_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    switch(geomDesc->Triangles.VertexFormat)
    {
    case DXGI_FORMAT_R32G32_FLOAT:
        return VERTEX_FORMAT_R32G32_FLOAT;
    case DXGI_FORMAT_R32G32B32_FLOAT:
        return VERTEX_FORMAT_R32G32B32_FLOAT;
    case DXGI_FORMAT_R16G16_FLOAT:
        return VERTEX_FORMAT_R16G16_FLOAT;
    case DXGI_FORMAT_R16G16B16A16_FLOAT:
        return VERTEX_FORMAT_R16G16B16A16_FLOAT;
    case DXGI_FORMAT_R16G16_SNORM:
        return VERTEX_FORMAT_R16G16_SNORM;
    case DXGI_FORMAT_R16G16B16A16_SNORM:
        return VERTEX_FORMAT_R16G16B16A16_SNORM;
    case DXGI_FORMAT_R16G16B16A16_UNORM:
        return VERTEX_FORMAT_R16G16B16A16_UNORM;
    case DXGI_FORMAT_R16G16_UNORM:
        return VERTEX_FORMAT_R16G16_UNORM;
    case DXGI_FORMAT_R10G10B10A2_UNORM:
        return VERTEX_FORMAT_R10G10B10A2_UNORM;
    case DXGI_FORMAT_R8G8B8A8_UNORM:
        return VERTEX_FORMAT_R8G8B8A8_UNORM;
    case DXGI_FORMAT_R8G8_UNORM:
        return VERTEX_FORMAT_R8G8_UNORM;
    case DXGI_FORMAT_R8G8B8A8_SNORM:
        return VERTEX_FORMAT_R8G8B8A8_SNORM;
    case DXGI_FORMAT_R8G8_SNORM:
        return VERTEX_FORMAT_R8G8_SNORM;
    default:
        return VERTEX_FORMAT_R32G32_FLOAT;
    }
}

GRL_INLINE void D3D12_set_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count)
{
    geomDesc->Triangles.IndexCount = count;
}

GRL_INLINE unsigned int D3D12_get_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->Triangles.IndexCount;
}

GRL_INLINE void D3D12_set_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count)
{
    geomDesc->Triangles.VertexCount = count;
}

GRL_INLINE unsigned int D3D12_get_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->Triangles.VertexCount;
}

GRL_INLINE void D3D12_set_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS buffer)
{
    geomDesc->Triangles.IndexBuffer = buffer;
}

GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->Triangles.IndexBuffer;
}

GRL_INLINE void D3D12_set_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address)
{
    geomDesc->Triangles.VertexBuffer.StartAddress = address;
}

GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->Triangles.VertexBuffer.StartAddress;
}

GRL_INLINE void D3D12_set_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride)
{
    geomDesc->Triangles.VertexBuffer.StrideInBytes = stride;
}

GRL_INLINE unsigned long D3D12_get_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->Triangles.VertexBuffer.StrideInBytes;
}

GRL_INLINE void D3D12_set_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long count)
{
    geomDesc->AABBs.AABBCount = count;
}

GRL_INLINE unsigned long D3D12_get_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->AABBs.AABBCount;
}

GRL_INLINE void D3D12_set_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address)
{
    geomDesc->AABBs.AABBs.StartAddress = address;
}

GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->AABBs.AABBs.StartAddress;
}

GRL_INLINE void D3D12_set_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride)
{
    geomDesc->AABBs.AABBs.StrideInBytes = stride;
}

GRL_INLINE unsigned long D3D12_get_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
{
    return geomDesc->AABBs.AABBs.StrideInBytes;
}

typedef struct D3D12_RAYTRACING_INSTANCE_DESC
{
    float Transform[12];
    // unsigned int InstanceID : 24;
    // unsigned int InstanceMask : 8;
    uint32_t DW0;
    // unsigned int InstanceContributionToHitGroupIndex : 24;
    // unsigned int Flags : 8;
    uint32_t DW1;
    global char *AccelerationStructure;
} D3D12_RAYTRACING_INSTANCE_DESC;

GRL_INLINE float D3D12_get_transform(const D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column)
{
    return d->Transform[row * 4 + column];
}

GRL_INLINE uint32_t D3D12_get_instanceID(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
    return d->DW0 & ((1 << 24) - 1);
}

GRL_INLINE uint32_t D3D12_get_InstanceMask(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
    return d->DW0 >> 24;
}

GRL_INLINE uint32_t D3D12_get_InstanceContributionToHitGroupIndex(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
    return d->DW1 & ((1 << 24) - 1);
}

GRL_INLINE uint32_t D3D12_get_InstanceFlags(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
    return d->DW1 >> 24;
}

GRL_INLINE gpuva_t D3D12_get_AccelerationStructure(const D3D12_RAYTRACING_INSTANCE_DESC *d)
{
    return (gpuva_t)d->AccelerationStructure;
}

GRL_INLINE void D3D12_set_transform(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column, float value)
{
    d->Transform[row * 4 + column] = value;
}

GRL_INLINE void D3D12_set_instanceID(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t id)
{
    d->DW0 &= 255 << 24;
    d->DW0 |= id & ((1 << 24) - 1);
}

GRL_INLINE void D3D12_set_InstanceMask(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t mask)
{
    d->DW0 &= ((1 << 24) - 1);
    d->DW0 |= mask << 24;
}

GRL_INLINE void D3D12_set_InstanceContributionToHitGroupIndex(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t contribution)
{
    d->DW1 &= 255 << 24;
    d->DW1 |= contribution & ((1 << 24) - 1);
}

GRL_INLINE void D3D12_set_InstanceFlags(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t flags)
{
    d->DW1 &= ((1 << 24) - 1);
    d->DW1 |= flags << 24;
}

GRL_INLINE void D3D12_set_AccelerationStructure(D3D12_RAYTRACING_INSTANCE_DESC *d, gpuva_t address)
{
    d->AccelerationStructure = (global char*)address;
}
@@ -1,59 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "api_interface.h"

__attribute__((reqd_work_group_size(1, 1, 1))) void kernel CopyGeom(
    global struct Geo *src,
    global struct Geo *dst,
    global float4 *vec,
    global ushort *indices,
    dword step)
{
    src = src + get_group_id(0);
    dst = dst + get_group_id(0);
    dst->Flags = src->Flags;
    dst->Type = src->Type;
    if (src->Type == GEOMETRY_TYPE_PROCEDURAL)
    {
        dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride;
        dst->Desc.Procedural.AABBCount = src->Desc.Procedural.AABBCount;
        dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride;
    }
    else
    {
        dst->Desc.Triangles.pTransformBuffer = src->Desc.Triangles.pTransformBuffer;
        if (step == 0)
            return;
        dst->Desc.Triangles.IndexCount = src->Desc.Triangles.IndexCount;
        if (step == 1)
            return;
        dst->Desc.Triangles.VertexCount = src->Desc.Triangles.VertexCount;
        if (step == 2)
            return;
        dst->Desc.Triangles.IndexFormat = src->Desc.Triangles.IndexFormat;
        if (step == 3)
            return;
        dst->Desc.Triangles.pIndexBuffer = src->Desc.Triangles.pIndexBuffer;
        if (step == 4)
            return;
        dst->Desc.Triangles.pVertexBuffer = src->Desc.Triangles.pVertexBuffer;
        if (step == 5)
            return;
        dst->Desc.Triangles.VertexBufferByteStride = src->Desc.Triangles.VertexBufferByteStride;

        dst->Desc.Triangles.VertexFormat = src->Desc.Triangles.VertexFormat;

        for (uint t = 0; t * 3 < dst->Desc.Triangles.IndexCount; t++)
        {
            uint3 tri = GRL_load_triangle(src, t);
            vec[t * 3] = GRL_load_vertex(src, tri[0]);
            vec[t * 3 + 1] = GRL_load_vertex(src, tri[1]);
            vec[t * 3 + 2] = GRL_load_vertex(src, tri[2]);
        }
    }
}
@@ -1,27 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

module api_interface_verify;

kernel copy_geom < source="grl_api_interface_verify.cl", kernelFunction="CopyGeom" >

metakernel ifc0_copy(
    qword src,
    qword dst,
    qword vec,
    qword srcIndices,
    dword numGroups,
    dword step)
{
    dispatch copy_geom(numGroups,1,1) args(
        src,
        dst,
        vec,
        srcIndices,
        step
        );
}
@ -1,723 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "api_interface.h"
|
||||
#include "common.h"
|
||||
#include "d3d12.h"
|
||||
#include "mem_utils.h"
|
||||
#include "misc_shared.h"
|
||||
|
||||
/// Align value to 128
|
||||
///
|
||||
/// @param value vale to align
|
||||
/// @return aligned value
|
||||
GRL_INLINE ulong AlignTo128(ulong value) { return ((value + 127) / 128) * 128; }
|
||||
|
||||
GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs) {
|
||||
return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch)));
|
||||
}
|
||||
|
||||
/// Finds max used byte in vertex buffer
|
||||
///
|
||||
/// @param indexBuffPtr pointer to index buffer
|
||||
/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers
|
||||
/// @param IndexCount number of indices in index buffer
|
||||
/// @param IndexFormat index format
|
||||
/// @param VertexCount number of vertices in vertex buffer
|
||||
/// @param VertexBufferByteStride vertex buffer byte stride
|
||||
__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel find_max_used_byte_in_buff(
|
||||
global void* indexBuffPtr,
|
||||
global uint* vertexBufferUsedByteEnd,
|
||||
dword IndexCount,
|
||||
dword IndexFormat,
|
||||
dword VertexCount,
|
||||
qword VertexBufferByteStride)
|
||||
{
|
||||
local uint sgMax[16];
|
||||
uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0);
|
||||
|
||||
if (IndexFormat != INDEX_FORMAT_NONE)
|
||||
{
|
||||
uint endByte = 0;
|
||||
if (glob_id < IndexCount)
|
||||
{
|
||||
if (IndexFormat == INDEX_FORMAT_R16_UINT)
|
||||
{
|
||||
global ushort* indexBuffPtrShort = (global ushort*) indexBuffPtr;
|
||||
endByte = indexBuffPtrShort[glob_id];
|
||||
}
|
||||
else
|
||||
{
|
||||
global uint* indexBuffPtrUint = (global uint*) indexBuffPtr;
|
||||
endByte = indexBuffPtrUint[glob_id];
|
||||
}
|
||||
}
|
||||
|
||||
endByte = sub_group_reduce_max(endByte);
|
||||
|
||||
if (get_sub_group_local_id() == 0) { sgMax[get_sub_group_id()] = endByte; }
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (get_sub_group_id() == 0)
|
||||
{
|
||||
endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]);
|
||||
if (get_sub_group_local_id() == 0)
|
||||
{
|
||||
endByte = min(endByte, VertexCount);
|
||||
if (endByte < VertexCount && IndexCount != 0)
|
||||
++endByte;
|
||||
endByte *= (dword)VertexBufferByteStride;
|
||||
atomic_max(vertexBufferUsedByteEnd, endByte);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (glob_id == 0)
|
||||
{
|
||||
uint endByte = VertexCount * VertexBufferByteStride;
|
||||
atomic_max(vertexBufferUsedByteEnd, endByte);
|
||||
}
|
||||
}
|
||||
|
||||
/// Allocates buffer for vertices
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers
|
||||
/// @param vertexBufferOffset pointer to offsets to vertex buffers
|
||||
/// @param numVertexBuffers number of vertex buffers
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel allocate_linear_offsets_for_vertex_buffers(
|
||||
global InputBatchPtrs* batchPtrs,
|
||||
global uint* vertexBufferUsedByteEnd,
|
||||
global uint* vertexBufferOffset,
|
||||
dword numVertexBuffers)
|
||||
{
|
||||
uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();
|
||||
|
||||
if (glob_id < numVertexBuffers)
|
||||
{
|
||||
uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]);
|
||||
uint position = atomic_add_global( &batchPtrs->vertexBuffersSize, numBytes);
|
||||
vertexBufferOffset[glob_id] = position;
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the dst data space for input dump of this batch
|
||||
///
|
||||
/// @param inputDumpMainBuffer pointer to main dump buffer
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param nonVertexSize size of non vertex data
|
||||
/// @param batchIdPtr pointer to batch id
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel allocate_data_space_for_inputs(
|
||||
global DebugBufferHeader* inputDumpMainBuffer,
|
||||
global InputBatchPtrs* batchPtrs,
|
||||
uint nonVertexSize,
|
||||
global qword* batchIdPtr)
|
||||
{
|
||||
if (get_sub_group_local_id() == 0)
|
||||
{
|
||||
uint vertexBufferSize = batchPtrs->vertexBuffersSize;
|
||||
uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize;
|
||||
|
||||
if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2))
|
||||
{
|
||||
inputDumpMainBuffer->overflow = 1;
|
||||
batchPtrs->dumpDst = 0;
|
||||
batchPtrs->globalDumpBuffer = 0;
|
||||
batchPtrs->nonVertexDataStart = 0;
|
||||
batchPtrs->totalSize = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
dword prevHead = inputDumpMainBuffer->gpuHead;
|
||||
dword newHead;
|
||||
bool circled;
|
||||
|
||||
do
|
||||
{
|
||||
circled = false;
|
||||
newHead = prevHead + sizeOfThisBatch;
|
||||
dword bufferBegin = prevHead;
|
||||
if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize)
|
||||
{
|
||||
circled = true;
|
||||
newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch;
|
||||
bufferBegin = inputDumpMainBuffer->headStart;
|
||||
}
|
||||
dword bufferEnd = newHead + sizeof(InputBatch);
|
||||
|
||||
uint tail;
|
||||
uint tail2 = 7;
|
||||
bool wait;
|
||||
do
|
||||
{
|
||||
wait = true;
|
||||
tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0);
|
||||
|
||||
// dead code, workaround so IGC won't move tail load out of loop
|
||||
if (tail > inputDumpMainBuffer->totalSize)
|
||||
{
|
||||
store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2);
|
||||
tail2 = tail;
|
||||
}
|
||||
|
||||
if( prevHead >= tail )
|
||||
{
|
||||
//colision example:
|
||||
// ----------T=======H------------
|
||||
// -------B=====E-----------------
|
||||
//
|
||||
if((bufferEnd < tail) || (bufferBegin >= prevHead))
|
||||
{
|
||||
wait = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//colision example:
|
||||
// ==========H-------T============
|
||||
// B==============E---------------
|
||||
// caution: we will never have H circled completely so that H == T
|
||||
if((bufferEnd < tail) && (bufferBegin >= prevHead))
|
||||
{
|
||||
wait = false;
|
||||
}
|
||||
}
|
||||
} while (wait);
|
||||
} while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead));
|
||||
|
||||
if (circled)
|
||||
{
|
||||
global InputBatch* endBufferOp = (global InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead);
|
||||
endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER;
|
||||
prevHead = inputDumpMainBuffer->headStart;
|
||||
}
|
||||
|
||||
global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead;
|
||||
batchPtrs->dumpDst = (qword)thisBatchDump;
|
||||
batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer;
|
||||
batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize);
|
||||
batchPtrs->totalSize = sizeOfThisBatch;
|
||||
|
||||
global InputBatch* batchOp = (global InputBatch*) thisBatchDump;
|
||||
batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH;
|
||||
batchOp->header.opHeader.endOfData = sizeOfThisBatch;
|
||||
batchOp->vertexBufferDataSize = vertexBufferSize;
|
||||
batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize;
|
||||
batchOp->batchId = *batchIdPtr;
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the dst data space for output dump of this batch
|
||||
///
|
||||
/// @param outputDumpMainBuffer pointer to main dump buffer
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param batchIdPtr pointer to batch id
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel allocate_data_space_for_outputs(
|
||||
global DebugBufferHeader* outputDumpMainBuffer,
|
||||
global OutputBatchPtrs* batchPtrs,
|
||||
global qword* batchIdPtr)
|
||||
{
|
||||
if (get_sub_group_local_id() == 0)
|
||||
{
|
||||
uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize;
|
||||
|
||||
if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2))
|
||||
{
|
||||
outputDumpMainBuffer->overflow = 1;
|
||||
batchPtrs->dumpDst = 0;
|
||||
batchPtrs->dataStart = 0;
|
||||
batchPtrs->totalSize = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
|
||||
dword newHead;
|
||||
bool circled;
|
||||
|
||||
do
|
||||
{
|
||||
//mem_fence_gpu_invalidate();
|
||||
//prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
|
||||
circled = false;
|
||||
newHead = prevHead + sizeOfThisBatch;
|
||||
dword bufferBegin = prevHead;
|
||||
if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize)
|
||||
{
|
||||
circled = true;
|
||||
newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch;
|
||||
bufferBegin = outputDumpMainBuffer->headStart;
|
||||
}
|
||||
dword bufferEnd = newHead + sizeof(OutputBatch);
|
||||
|
||||
uint tail;
|
||||
uint tail2 = 7;
|
||||
bool wait;
|
||||
do
|
||||
{
|
||||
wait = true;
|
||||
tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0);
|
||||
|
||||
// dead code, workaround so IGC won't move tail load out of loop
|
||||
if (tail > outputDumpMainBuffer->totalSize)
|
||||
{
|
||||
store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2);
|
||||
tail2 = tail;
|
||||
}
|
||||
|
||||
if( prevHead >= tail )
|
||||
{
|
||||
//colision example:
|
||||
// ----------T=======H------------
|
||||
// -------B=====E-----------------
|
||||
//
|
||||
if((bufferEnd < tail) || (bufferBegin >= prevHead))
|
||||
{
|
||||
wait = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//colision example:
|
||||
// ==========H-------T============
|
||||
// B==============E---------------
|
||||
// caution: we will never have H circled completely so that H == T
|
||||
if((bufferEnd < tail) && (bufferBegin >= prevHead))
|
||||
{
|
||||
wait = false;
|
||||
}
|
||||
}
|
||||
} while (wait);
|
||||
} while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead));
|
||||
|
||||
if (circled)
|
||||
{
|
||||
global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead);
|
||||
endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER;
|
||||
prevHead = outputDumpMainBuffer->headStart;
|
||||
}
|
||||
|
||||
global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead;
|
||||
batchPtrs->dumpDst = (qword)thisBatchDump;
|
||||
batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch)));
|
||||
batchPtrs->totalSize = sizeOfThisBatch;
|
||||
|
||||
global OutputBatch* batchOp = (global OutputBatch*) thisBatchDump;
|
||||
batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH;
|
||||
batchOp->header.opHeader.endOfData = sizeOfThisBatch;
|
||||
batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch));
|
||||
batchOp->batchId = *batchIdPtr;
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculates sum of output sizes
|
||||
///
|
||||
/// @param pbi pointer to post build infos
|
||||
/// @param destOffset offset in dest buffer
|
||||
/// @param numOutputs number of outputs
|
||||
/// @param batchPtrs batch pointers struct
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel calc_outputs_data_size(
|
||||
global PostbuildInfoSerializationDesc* pbi,
|
||||
global dword* destOffsets,
|
||||
qword numOutputs,
|
||||
global OutputBatchPtrs* batchPtrs)
|
||||
{
|
||||
uint offset = 0;
|
||||
for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH)
|
||||
{
|
||||
uint size = 0;
|
||||
if (i < numOutputs)
|
||||
{
|
||||
size = AlignTo128(pbi[i].SerializedSizeInBytes);
|
||||
size += AlignTo128(sizeof(OutputData));
|
||||
destOffsets[i] = offset + sub_group_scan_exclusive_add(size);
|
||||
}
|
||||
offset += sub_group_reduce_add(size);
|
||||
}
|
||||
if (get_sub_group_local_id() == 0)
|
||||
batchPtrs->dataSize = offset;
|
||||
}
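
// Illustration (not from the original sources): with MAX_HW_SIMD_WIDTH == 8 and
// aligned per-output sizes {128, 256, 128, 384} in lanes 0..3 (and 0 elsewhere),
// sub_group_scan_exclusive_add yields destOffsets {0, 128, 384, 512} and
// sub_group_reduce_add advances 'offset' by 896 before the next chunk of outputs.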
|
||||
|
||||
/// Adds output data operation to batch
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param destOffset offset in dest buffer
|
||||
/// @param src pointer to source bvh
|
||||
/// @param pbi pointer to post build info
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel write_output_data_op(
|
||||
global OutputBatchPtrs* batchPtrs,
|
||||
global dword* destOffset,
|
||||
qword src,
|
||||
global PostbuildInfoSerializationDesc* pbi)
|
||||
{
|
||||
if (batchPtrs->dataStart == 0)
|
||||
return;
|
||||
|
||||
global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset);
|
||||
out->header.operationType = OUTPUT_DUMP_OP_DATA;
|
||||
out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes);
|
||||
out->srcBvhPtr = src;
|
||||
}
|
||||
|
||||
/// Writes indices and transform or procedurals data
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param srcDesc description of source geometry
|
||||
/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer
|
||||
/// @param dstDescOffset offset to dest geo desc
|
||||
/// @param dstDataOffset offset to dest geo data
|
||||
/// @param numThreads number of threads
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel write_geo_data(
|
||||
global InputBatchPtrs* batchPtrs,
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc,
|
||||
global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
|
||||
global uint* pVertexBufferSize,
|
||||
qword dstDescOffset,
|
||||
qword dstDataOffset,
|
||||
dword numThreads)
|
||||
{
|
||||
if (batchPtrs->dumpDst == 0) return;
|
||||
|
||||
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
|
||||
|
||||
GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc;
|
||||
|
||||
global char* dstDataPtr = (global char*)(
|
||||
batchPtrs->nonVertexDataStart + dstDataOffset);
|
||||
|
||||
global char* srcDataPtr;
|
||||
global char* dstTransform;
|
||||
uint bytesToCopy = 0;
|
||||
|
||||
if (geoDescToStore.Type == GEOMETRY_TYPE_TRIANGLES)
|
||||
{
|
||||
uint sizeOfMatrix = 0;
|
||||
|
||||
if (geoDescToStore.Desc.Triangles.pTransformBuffer)
|
||||
{
|
||||
sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float));
|
||||
if (glob_id < 12)
|
||||
{
|
||||
global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer;
|
||||
global float* matrixDst = (global float*)dstDataPtr;
|
||||
matrixDst[glob_id] = matrixSrc[glob_id];
|
||||
if (glob_id == 0)
|
||||
{
|
||||
geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dstDataPtr += sizeOfMatrix;
|
||||
srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer;
|
||||
|
||||
bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount);
|
||||
|
||||
if (bytesToCopy && (glob_id == 0))
|
||||
{
|
||||
qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
|
||||
// here we store the offset relative to the global debug buffer
|
||||
geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
|
||||
geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
|
||||
geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride;
|
||||
}
|
||||
else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE && geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0)
|
||||
{
|
||||
if (geoDescToStore.Desc.Triangles.pVertexBuffer)
|
||||
{
|
||||
qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
|
||||
// here we store the offset relative to the global debug buffer
|
||||
geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
|
||||
}
|
||||
}
|
||||
else if (glob_id == 0)
|
||||
{
|
||||
geoDescToStore.Desc.Triangles.IndexCount = 0;
|
||||
geoDescToStore.Desc.Triangles.VertexCount = 0;
|
||||
geoDescToStore.Desc.Triangles.pVertexBuffer = 0;
|
||||
geoDescToStore.Desc.Triangles.pIndexBuffer = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
srcDataPtr = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA;
|
||||
bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount);
|
||||
if (glob_id == 0)
|
||||
{
|
||||
geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
if (bytesToCopy)
|
||||
{
|
||||
CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads);
|
||||
}
|
||||
|
||||
if (glob_id == 0)
|
||||
{
|
||||
global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr = (global GRL_RAYTRACING_GEOMETRY_DESC*)(
|
||||
batchPtrs->nonVertexDataStart + dstDescOffset);
|
||||
*dstDescPtr = geoDescToStore;
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds build operation to batch
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param buildOpOffset offset in dst buffer
|
||||
/// @param srcBvh address of src bvh (in case of update)
|
||||
/// @param dstBvhAddr address of dest bvh buffer
|
||||
/// @param offsetToEnd offset to end of this operation
|
||||
/// @param flags build flags
|
||||
/// @param numGeometries number of geometries in build
|
||||
/// @param numInstances number of instances in build
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel write_input_build_op(
|
||||
global InputBatchPtrs* batchPtrs,
|
||||
qword buildOpOffset,
|
||||
qword srcBvh,
|
||||
qword dstBvhAddr,
|
||||
dword offsetToEnd,
|
||||
dword flags,
|
||||
dword numGeometries,
|
||||
dword numInstances,
|
||||
dword instArrayOfPtrs)
|
||||
{
|
||||
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
|
||||
if (batchPtrs->dumpDst == 0 || glob_id != 0) return;
|
||||
|
||||
global InputBuild* buildOp = (global InputBuild*)(
|
||||
batchPtrs->nonVertexDataStart + buildOpOffset);
|
||||
buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD;
|
||||
buildOp->header.endOfData = offsetToEnd;
|
||||
buildOp->dstBvhPtr = dstBvhAddr;
|
||||
buildOp->srcBvhPtr = srcBvh;
|
||||
buildOp->flags = flags;
|
||||
buildOp->numGeos = numGeometries;
|
||||
buildOp->numInstances = numInstances;
|
||||
buildOp->instArrayOfPtrs = instArrayOfPtrs;
|
||||
}
|
||||
|
||||
/// Copies instance description
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param instanceDescArr inst desc source
|
||||
/// @param offset ptr to offset in dst buffer
|
||||
/// @param numInstances number of instances to copy
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
|
||||
copy_instance_descriptors_array(
|
||||
global InputBatchPtrs* batchPtrs,
|
||||
global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr,
|
||||
qword offset,
|
||||
dword numInstances)
|
||||
{
|
||||
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
|
||||
if (batchPtrs->dumpDst == 0) return;
|
||||
|
||||
global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC* )(
|
||||
batchPtrs->nonVertexDataStart + offset);
|
||||
|
||||
if (glob_id < numInstances)
|
||||
{
|
||||
dst[glob_id] = instanceDescArr[glob_id];
|
||||
}
|
||||
}
|
||||
|
||||
/// Copies instance description, array of pointers version
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param pInstanceDescPtrsArr inst desc source
|
||||
/// @param offset ptr to offset in dst buffer
|
||||
/// @param numInstances number of instances to copy
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
|
||||
copy_instance_descriptors_array_of_ptrs(
|
||||
global InputBatchPtrs* batchPtrs,
|
||||
global qword* pInstanceDescPtrsArr,
|
||||
qword offset,
|
||||
dword numInstances)
|
||||
{
|
||||
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
|
||||
if (batchPtrs->dumpDst == 0) return;
|
||||
|
||||
// save gpuva of instance descs for debug
|
||||
global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset);
|
||||
|
||||
global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)(
|
||||
batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset);
|
||||
global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr = (global GRL_RAYTRACING_INSTANCE_DESC **)pInstanceDescPtrsArr;
|
||||
|
||||
if (glob_id < numInstances)
|
||||
{
|
||||
gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id];
|
||||
dst[glob_id] = *(instanceDescPtrsArr[glob_id]);
|
||||
}
|
||||
}
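
// Resulting layout at 'offset' inside the non-vertex data area, as implied by the
// address arithmetic above (illustrative note, not from the original sources):
//   [ numInstances * sizeof(qword) ]                          original descriptor GPUVAs (debug aid)
//   [ padding up to a 128-byte boundary ]
//   [ numInstances * sizeof(GRL_RAYTRACING_INSTANCE_DESC) ]   dereferenced descriptor copies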
|
||||
|
||||
/// Adds copy operation to batch
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param offset ptr to offset in dst buffer
|
||||
/// @param src copy source pointer
|
||||
/// @param dst copy destination pointer
|
||||
/// @param copyOpType copy type
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel insert_copy_op(
|
||||
global InputBatchPtrs* batchPtrs,
|
||||
qword offset,
|
||||
global void* src,
|
||||
global void* dst,
|
||||
uint copyOpType)
|
||||
{
|
||||
uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
|
||||
if (batchPtrs->dumpDst == 0 || glob_id != 0) return;
|
||||
|
||||
global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset);
|
||||
|
||||
copyOp->header.operationType = copyOpType;
|
||||
copyOp->header.endOfData = AlignTo128(sizeof(InputCopy));
|
||||
copyOp->srcBvhPtr = (qword)src;
|
||||
copyOp->dstBvhPtr = (qword)dst;
|
||||
}
|
||||
|
||||
/// Copies vertex buffer
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param src input buffer
|
||||
/// @param offset ptr to offset in dst buffer
|
||||
/// @param size ptr to number of bytes to copy
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel copy_vertex_data(
|
||||
global InputBatchPtrs* batchPtrs,
|
||||
global const char* src,
|
||||
global const uint* offset,
|
||||
global const uint* size)
|
||||
{
|
||||
if (batchPtrs->dumpDst == 0) return;
|
||||
|
||||
global char *dst = (global char *)(GetVertexBuffersStart(batchPtrs) + *offset);
|
||||
uint numGroups = (*size >> 6) + 1;
|
||||
CopyMemory(dst, src, *size, numGroups);
|
||||
}
|
||||
|
||||
/// Generate unique batch id
|
||||
///
|
||||
/// @param batchIds array of unique batch ids
|
||||
/// @param index index of batch id to generate
|
||||
__attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void kernel generate_unique_batch_id(global unsigned long *batchIds, unsigned int index) {
|
||||
global unsigned int *counterPtrs = (global unsigned int *)batchIds;
|
||||
atomic_add(&counterPtrs[index * 2 + 1], 1);
|
||||
batchIds[index] |= (unsigned long)index;
|
||||
}
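
// Note (not from the original sources): each 64-bit batch id is treated as two
// dwords; on a little-endian device the atomic above increments the high dword as
// a per-slot counter while the low dword is OR'ed with the slot index, so repeated
// batches in the same slot stay unique while remaining identifiable by index.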
|
||||
|
||||
/// Sets batch as ready to read and moves cpuHead forward, inputs case
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param dumpMainBuffer pointer to main dump buffer
|
||||
__attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void kernel finish_batch_dump_inputs(
|
||||
global InputBatchPtrs* batchPtrs,
|
||||
global DebugBufferHeader* dumpMainBuffer)
|
||||
{
|
||||
if (batchPtrs->dumpDst == 0)
|
||||
return;
|
||||
|
||||
global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst;
|
||||
|
||||
dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
|
||||
|
||||
dword seven = 7;
|
||||
while (true)
|
||||
{
|
||||
dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
|
||||
if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
|
||||
{
|
||||
store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
|
||||
currentHead = seven;
|
||||
}
|
||||
|
||||
if (currentHead == myDstOffset)
|
||||
{
|
||||
mem_fence_evict_to_memory();
|
||||
dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
|
||||
break;
|
||||
}
|
||||
else if (myDstOffset == dumpMainBuffer->headStart)
|
||||
{
|
||||
global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead);
|
||||
if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER)
|
||||
{
|
||||
mem_fence_evict_to_memory();
|
||||
dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
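
// Note (not from the original sources): batches are published strictly in
// reservation order; each producer spins until cpuHead reaches its own offset
// (or, after a wrap, until the END_BUFFER marker is visible at the current head),
// evicts its writes to memory, and only then advances cpuHead past its batch.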
|
||||
|
||||
/// Sets batch as ready to read and moves cpuHead forward, outputs case
|
||||
///
|
||||
/// @param batchPtrs batch pointers struct
|
||||
/// @param dumpMainBuffer pointer to main dump buffer
|
||||
__attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void kernel finish_batch_dump_outputs(
|
||||
global OutputBatchPtrs* batchPtrs,
|
||||
global DebugBufferHeader* dumpMainBuffer)
|
||||
{
|
||||
if (batchPtrs->dumpDst == 0)
|
||||
return;
|
||||
|
||||
global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst;
|
||||
|
||||
dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
|
||||
|
||||
dword seven = 7;
|
||||
while (true)
|
||||
{
|
||||
dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
|
||||
if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
|
||||
{
|
||||
store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
|
||||
currentHead = seven;
|
||||
}
|
||||
|
||||
if (currentHead == myDstOffset)
|
||||
{
|
||||
mem_fence_evict_to_memory();
|
||||
dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
|
||||
break;
|
||||
}
|
||||
else if (myDstOffset == dumpMainBuffer->headStart)
|
||||
{
|
||||
global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead);
|
||||
if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER)
|
||||
{
|
||||
mem_fence_evict_to_memory();
|
||||
dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,252 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module input_dump;
|
||||
|
||||
kernel_module input_dumper("input_dump.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel opencl_kernel_find_max_used_byte_in_buff < kernelFunction="find_max_used_byte_in_buff" >;
|
||||
kernel opencl_kernel_allocate_linear_offsets_for_vertex_buffers < kernelFunction="allocate_linear_offsets_for_vertex_buffers" >;
|
||||
kernel opencl_kernel_allocate_data_space_for_inputs < kernelFunction="allocate_data_space_for_inputs" >;
|
||||
kernel opencl_kernel_allocate_data_space_for_outputs < kernelFunction="allocate_data_space_for_outputs" >;
|
||||
kernel opencl_kernel_calc_outputs_data_size < kernelFunction="calc_outputs_data_size" >;
|
||||
kernel opencl_kernel_write_output_data_op < kernelFunction="write_output_data_op" >;
|
||||
kernel opencl_kernel_write_geo_data < kernelFunction="write_geo_data" >;
|
||||
kernel opencl_kernel_write_input_build_op < kernelFunction="write_input_build_op" >;
|
||||
kernel opencl_kernel_copy_instance_descriptors_array < kernelFunction="copy_instance_descriptors_array" >;
|
||||
kernel opencl_kernel_copy_instance_descriptors_array_of_ptrs < kernelFunction="copy_instance_descriptors_array_of_ptrs" >;
|
||||
kernel opencl_kernel_insert_copy_op < kernelFunction="insert_copy_op" >;
|
||||
kernel opencl_kernel_copy_vertex_data < kernelFunction="copy_vertex_data" >;
|
||||
kernel opencl_kernel_generate_unique_batch_id < kernelFunction="generate_unique_batch_id" >;
|
||||
kernel opencl_kernel_finish_batch_dump_inputs < kernelFunction="finish_batch_dump_inputs" >;
|
||||
kernel opencl_kernel_finish_batch_dump_outputs < kernelFunction="finish_batch_dump_outputs" >;
|
||||
}
|
||||
|
||||
|
||||
metakernel find_max_used_byte_in_buff(
|
||||
qword indexBuffPtr,
|
||||
qword vertexBufferUsedByteEnd,
|
||||
dword IndexCount,
|
||||
dword IndexFormat,
|
||||
dword VertexCount,
|
||||
qword VertexBufferByteStride,
|
||||
dword numPhysThreads)
|
||||
{
|
||||
dispatch opencl_kernel_find_max_used_byte_in_buff(numPhysThreads, 1, 1) args(
|
||||
indexBuffPtr,
|
||||
vertexBufferUsedByteEnd,
|
||||
IndexCount,
|
||||
IndexFormat,
|
||||
VertexCount,
|
||||
VertexBufferByteStride);
|
||||
}
|
||||
|
||||
metakernel allocate_linear_offsets_for_vertex_buffers(
|
||||
qword batchPtrs,
|
||||
qword m_VertexBufferUsedByteEnd,
|
||||
qword m_VertexBufferOffset,
|
||||
dword numVertexBuffers,
|
||||
dword numPhysThreads)
|
||||
{
|
||||
dispatch opencl_kernel_allocate_linear_offsets_for_vertex_buffers(numPhysThreads, 1, 1) args(
|
||||
batchPtrs,
|
||||
m_VertexBufferUsedByteEnd,
|
||||
m_VertexBufferOffset,
|
||||
numVertexBuffers);
|
||||
}
|
||||
|
||||
metakernel allocate_data_space_for_inputs(
|
||||
qword inputDumpMainBuffer,
|
||||
qword batchPtrs,
|
||||
dword nonVertexSize,
|
||||
qword batchIdPtr)
|
||||
{
|
||||
dispatch opencl_kernel_allocate_data_space_for_inputs(1, 1, 1) args(
|
||||
inputDumpMainBuffer,
|
||||
batchPtrs,
|
||||
nonVertexSize,
|
||||
batchIdPtr);
|
||||
}
|
||||
|
||||
metakernel allocate_data_space_for_outputs(
|
||||
qword inputDumpMainBuffer,
|
||||
qword batchPtrs,
|
||||
qword batchIdPtr)
|
||||
{
|
||||
dispatch opencl_kernel_allocate_data_space_for_outputs(1, 1, 1) args(
|
||||
inputDumpMainBuffer,
|
||||
batchPtrs,
|
||||
batchIdPtr);
|
||||
}
|
||||
|
||||
metakernel calc_outputs_data_size(
|
||||
qword pbi,
|
||||
qword destOffsets,
|
||||
qword numOutputs,
|
||||
qword batchPtrs)
|
||||
{
|
||||
dispatch opencl_kernel_calc_outputs_data_size(1, 1, 1) args(
|
||||
pbi,
|
||||
destOffsets,
|
||||
numOutputs,
|
||||
batchPtrs);
|
||||
}
|
||||
|
||||
metakernel write_output_data_op(
|
||||
qword batchPtrs,
|
||||
qword destOffset,
|
||||
qword src,
|
||||
qword pbi)
|
||||
{
|
||||
dispatch opencl_kernel_write_output_data_op(1, 1, 1) args(
|
||||
batchPtrs,
|
||||
destOffset,
|
||||
src,
|
||||
pbi);
|
||||
}
|
||||
|
||||
metakernel write_geo_data(
|
||||
qword batchPtrs,
|
||||
qword srcDesc,
|
||||
qword pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
|
||||
qword pVertexBufferSize,
|
||||
qword dstDescOffset,
|
||||
qword dstDataOffset,
|
||||
dword numThreads)
|
||||
{
|
||||
dispatch opencl_kernel_write_geo_data(numThreads, 1, 1) args(
|
||||
batchPtrs,
|
||||
srcDesc,
|
||||
pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
|
||||
pVertexBufferSize,
|
||||
dstDescOffset,
|
||||
dstDataOffset,
|
||||
numThreads);
|
||||
}
|
||||
|
||||
metakernel write_input_build_op(
|
||||
qword batchPtrs,
|
||||
qword buildOpOffset,
|
||||
qword srcBvh,
|
||||
qword dstBvhAddr,
|
||||
dword offsetToEnd,
|
||||
dword flags,
|
||||
dword numGeometries,
|
||||
dword numInstances,
|
||||
dword instArrayOfPtrs)
|
||||
|
||||
{
|
||||
dispatch opencl_kernel_write_input_build_op(1, 1, 1) args(
|
||||
batchPtrs,
|
||||
buildOpOffset,
|
||||
srcBvh,
|
||||
dstBvhAddr,
|
||||
offsetToEnd,
|
||||
flags,
|
||||
numGeometries,
|
||||
numInstances,
|
||||
instArrayOfPtrs);
|
||||
}
|
||||
|
||||
metakernel copy_instance_descriptors_array(
|
||||
qword batchPtrs,
|
||||
qword instanceDescArr,
|
||||
qword offset,
|
||||
dword numInstances,
|
||||
dword numPhysThreads)
|
||||
{
|
||||
dispatch opencl_kernel_copy_instance_descriptors_array(numPhysThreads, 1, 1) args(
|
||||
batchPtrs,
|
||||
instanceDescArr,
|
||||
offset,
|
||||
numInstances);
|
||||
}
|
||||
|
||||
metakernel copy_instance_descriptors_array_of_ptrs(
|
||||
qword batchPtrs,
|
||||
qword instanceDescArrPtrs,
|
||||
qword offset,
|
||||
dword numInstances,
|
||||
dword numPhysThreads)
|
||||
{
|
||||
dispatch opencl_kernel_copy_instance_descriptors_array_of_ptrs(numPhysThreads, 1, 1) args(
|
||||
batchPtrs,
|
||||
instanceDescArrPtrs,
|
||||
offset,
|
||||
numInstances);
|
||||
}
|
||||
|
||||
metakernel insert_copy_op(
|
||||
qword batchPtrs,
|
||||
qword offset,
|
||||
qword src,
|
||||
qword dst,
|
||||
dword type)
|
||||
{
|
||||
dispatch opencl_kernel_insert_copy_op(1, 1, 1) args(
|
||||
batchPtrs,
|
||||
offset,
|
||||
src,
|
||||
dst,
|
||||
type);
|
||||
}
|
||||
|
||||
metakernel copy_vertex_data(
|
||||
qword desc,
|
||||
qword src,
|
||||
qword offset,
|
||||
qword size)
|
||||
{
|
||||
define byteSize REG0;
|
||||
define numGroupsRqd REG1;
|
||||
define shift REG2;
|
||||
define minimum REG3;
|
||||
|
||||
shift = 6;
|
||||
minimum = 1;
|
||||
byteSize = load_dword(size);
|
||||
numGroupsRqd = byteSize >> shift;
|
||||
numGroupsRqd = numGroupsRqd + minimum;
|
||||
DISPATCHDIM_X = numGroupsRqd.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_copy_vertex_data args(
|
||||
desc,
|
||||
src,
|
||||
offset,
|
||||
size);
|
||||
}
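
// Illustration (not from the original sources): the indirect dispatch launches one
// work group per 64 bytes of vertex data, e.g. a 1000-byte buffer gives
// (1000 >> 6) + 1 = 16 groups; copy_vertex_data recomputes the same
// (*size >> 6) + 1 value so the kernel and the dispatch agree on the group count.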
|
||||
|
||||
metakernel generate_unique_batch_id(
|
||||
qword batchIds,
|
||||
dword batchIndex)
|
||||
{
|
||||
dispatch opencl_kernel_generate_unique_batch_id(1, 1, 1) args(
|
||||
batchIds,
|
||||
batchIndex);
|
||||
}
|
||||
|
||||
metakernel finish_batch_dump_inputs(
|
||||
qword batchPtrs,
|
||||
qword dumpMainBuffer)
|
||||
{
|
||||
dispatch opencl_kernel_finish_batch_dump_inputs(1, 1, 1) args(
|
||||
batchPtrs,
|
||||
dumpMainBuffer);
|
||||
}
|
||||
|
||||
metakernel finish_batch_dump_outputs(
|
||||
qword batchPtrs,
|
||||
qword dumpMainBuffer)
|
||||
{
|
||||
dispatch opencl_kernel_finish_batch_dump_outputs(1, 1, 1) args(
|
||||
batchPtrs,
|
||||
dumpMainBuffer);
|
||||
}
|
||||
|
|
@ -1,183 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared.h"
|
||||
#include "affinespace.h"
|
||||
#include "api_interface.h"
|
||||
#include "qbvh6.h"
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
|
||||
GRL_INLINE uint32_t HwInstanceLeafPart1_getInstanceIndex(struct HwInstanceLeaf *I)
|
||||
{
|
||||
return I->part1.instanceIndex;
|
||||
}
|
||||
|
||||
GRL_INLINE void encodeDW0_HwInstanceLeafPart0(
|
||||
uint32_t shaderIndex,
|
||||
uint32_t geomMask,
|
||||
uint4 *dst)
|
||||
{
|
||||
(*dst).x = (shaderIndex & ((1 << 24) - 1)) |
|
||||
(geomMask << 24);
|
||||
}
|
||||
|
||||
GRL_INLINE void encodeDW1_HwInstanceLeafPart0(
|
||||
uint32_t instanceContributionToHitGroupIndex,
|
||||
uint32_t notProcedural,
|
||||
uint32_t geomFlags,
|
||||
uint4* dst)
|
||||
{
|
||||
(*dst).y = (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) |
|
||||
((notProcedural & 1) << (24 + 5)) |
|
||||
((geomFlags & 3) << (24 + 5 + 1));
|
||||
}
|
||||
|
||||
GRL_INLINE void encodeDW2DW3_HwInstanceLeafPart0(
|
||||
uint64_t rootNodePtr,
|
||||
uint32_t instFlags,
|
||||
uint4* dst)
|
||||
{
|
||||
uint64_t flags = instFlags;
|
||||
uint DW2 = (uint)rootNodePtr;
|
||||
uint DW3 = ((uint)(rootNodePtr >> 32ul) & 0xffff);
|
||||
DW3 |= flags << 16ull;
|
||||
(*dst).z = DW2;
|
||||
(*dst).w = DW3;
|
||||
}
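
// Resulting packing, as implied by the shifts above (note, not from the original
// sources): DW2 holds the low 32 bits of rootNodePtr, DW3 holds pointer bits 32..47
// in its low 16 bits and the instance flags in its high 16 bits, i.e. a 48-bit node
// address plus a 16-bit flag field.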
|
||||
|
||||
GRL_INLINE void HwInstanceLeafPart0_setDW0(struct HwInstanceLeaf *I,
|
||||
uint32_t shaderIndex,
|
||||
uint32_t geomMask)
|
||||
{
|
||||
I->part0.DW0 =
|
||||
(shaderIndex & ((1 << 24) - 1)) |
|
||||
(geomMask << 24);
|
||||
}
|
||||
|
||||
GRL_INLINE void HwInstanceLeafPart0_setDW1(struct HwInstanceLeaf *I,
|
||||
uint32_t instanceContributionToHitGroupIndex,
|
||||
uint32_t notProcedural,
|
||||
uint32_t geomFlags)
|
||||
{
|
||||
I->part0.DW1 =
|
||||
(instanceContributionToHitGroupIndex & ((1 << 24) - 1)) |
|
||||
((notProcedural & 1) << (24 + 5)) |
|
||||
((geomFlags & 3) << (24 + 5 + 1));
|
||||
}
|
||||
|
||||
GRL_INLINE void HwInstanceLeafPart1_setDW0DW1(struct HwInstanceLeaf *I,
|
||||
global char *pBvhPtr)
|
||||
{
|
||||
I->part1.DW0_DW1 = ((uint64_t)pBvhPtr) & (((uint64_t)1 << 48) - 1);
|
||||
}
|
||||
|
||||
GRL_INLINE void HwInstanceLeafPart0_setDW2DW3(struct HwInstanceLeaf *I,
|
||||
uint64_t rootNodePtr,
|
||||
uint32_t instFlags)
|
||||
{
|
||||
uint64_t flags = instFlags;
|
||||
flags = flags << 48ull;
|
||||
uint64_t ptr = rootNodePtr & 0x0000ffffffffffff;
|
||||
I->part0.DW2_DW3 = ptr + flags;
|
||||
}
|
||||
|
||||
GRL_INLINE void HwInstanceLeaf_Constructor(global struct HwInstanceLeaf* leaf,
|
||||
global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc,
|
||||
uint instanceIndex,
|
||||
uint rootNodeByteOffset,
|
||||
uint instanceMask)
|
||||
{
|
||||
global uint4* InstanceLeaf_4DWparts = (global uint4*) (leaf);
|
||||
|
||||
struct AffineSpace3f obj2world = AffineSpace3f_load_row_major(instDesc->Transform);
|
||||
|
||||
qword accStructPtr = (qword)instDesc->AccelerationStructure;
|
||||
uint4 p1_DW0_3 = (uint4)(
|
||||
(uint)accStructPtr,
|
||||
(uint)(accStructPtr >> (uint64_t)32),
|
||||
GRL_get_instanceID(instDesc),
|
||||
instanceIndex);
|
||||
|
||||
struct AffineSpace3f world2obj = AffineSpace3f_invert(obj2world);
|
||||
|
||||
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 0 /*part1 + 0DW*/, p1_DW0_3);
|
||||
|
||||
uint4 p1_DW4_7 = (uint4)(
|
||||
as_uint(obj2world.l.vx.x),
|
||||
as_uint(obj2world.l.vx.y),
|
||||
as_uint(obj2world.l.vx.z),
|
||||
as_uint(obj2world.l.vy.x));
|
||||
|
||||
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 1 /*part1 + 4DW*/, p1_DW4_7);
|
||||
|
||||
uint4 p1_DW8_11 = (uint4)(
|
||||
as_uint(obj2world.l.vy.y),
|
||||
as_uint(obj2world.l.vy.z),
|
||||
as_uint(obj2world.l.vz.x),
|
||||
as_uint(obj2world.l.vz.y));
|
||||
|
||||
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 2 /*part1 + 8DW*/, p1_DW8_11);
|
||||
|
||||
|
||||
uint4 p1_DW12_15 = (uint4)(
|
||||
as_uint(obj2world.l.vz.z),
|
||||
as_uint(world2obj.p.x),
|
||||
as_uint(world2obj.p.y),
|
||||
as_uint(world2obj.p.z));
|
||||
|
||||
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 3 /*part1 + 12DW*/, p1_DW12_15);
|
||||
|
||||
|
||||
uint hit_group_index = GRL_get_InstanceContributionToHitGroupIndex(instDesc);
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)instDesc->AccelerationStructure;
|
||||
|
||||
uint4 p0_DW0_3;
|
||||
|
||||
encodeDW0_HwInstanceLeafPart0(
|
||||
hit_group_index,
|
||||
instanceMask,
|
||||
&p0_DW0_3);
|
||||
|
||||
encodeDW1_HwInstanceLeafPart0(
|
||||
hit_group_index, // for HW instance leaf, this field is used to offset the hit-group index
|
||||
1, // disable opaque culling; necessary for SW instancing, don't-care for HW instancing
|
||||
0,
|
||||
&p0_DW0_3);
|
||||
|
||||
encodeDW2DW3_HwInstanceLeafPart0(
|
||||
rootNodeByteOffset == NO_NODE_OFFSET ? 0 : ((uint64_t)bvh) + rootNodeByteOffset, // offset NO_NODE_OFFSET is for degenerated instance, put null as root pointer
|
||||
GRL_get_InstanceFlags(instDesc),
|
||||
&p0_DW0_3);
|
||||
|
||||
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 0 /*part0 + 0DW*/, p0_DW0_3);
|
||||
|
||||
uint4 p0_DW4_7 = (uint4)(
|
||||
as_uint(world2obj.l.vx.x),
|
||||
as_uint(world2obj.l.vx.y),
|
||||
as_uint(world2obj.l.vx.z),
|
||||
as_uint(world2obj.l.vy.x));
|
||||
|
||||
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 1 /*part0 + 4DW*/, p0_DW4_7);
|
||||
|
||||
uint4 p0_DW8_11 = (uint4)(
|
||||
as_uint(world2obj.l.vy.y),
|
||||
as_uint(world2obj.l.vy.z),
|
||||
as_uint(world2obj.l.vz.x),
|
||||
as_uint(world2obj.l.vz.y));
|
||||
|
||||
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 2 /*part0 + 8DW*/, p0_DW8_11);
|
||||
|
||||
uint4 p0_DW12_15 = (uint4)(
|
||||
as_uint(world2obj.l.vz.z),
|
||||
as_uint(obj2world.p.x),
|
||||
as_uint(obj2world.p.y),
|
||||
as_uint(obj2world.p.z));
|
||||
|
||||
store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 3 /*part0 + 12DW*/, p0_DW12_15);
|
||||
}
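
// Note (not from the original sources): the stores above fill part1 with the
// object-to-world rotation plus the world-to-object translation, and part0 with the
// world-to-object rotation plus the object-to-world translation, 16 dwords each,
// written through the L1-streaming / L3-write-back store helpers.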
|
||||
|
|
@ -1,581 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
// TODO: AABB_work_group_reduce is super slow, remove !!!
|
||||
|
||||
#pragma cl_intel_subgroups : enable
|
||||
#pragma cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
|
||||
uint intel_sub_group_ballot(bool valid);
|
||||
|
||||
// atom_min
|
||||
float __attribute__((overloadable)) atom_min(volatile __global float *p, float val);
|
||||
float __attribute__((overloadable)) atom_min(volatile __local float *p, float val);
|
||||
float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val);
|
||||
float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val);
|
||||
// atom_max
|
||||
float __attribute__((overloadable)) atom_max(volatile __global float *p, float val);
|
||||
float __attribute__((overloadable)) atom_max(volatile __local float *p, float val);
|
||||
float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val);
|
||||
float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val);
|
||||
// atom_cmpxchg
|
||||
float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val);
|
||||
float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val);
|
||||
float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val);
|
||||
float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val);
|
||||
|
||||
|
||||
|
||||
inline uint subgroup_single_atomic_add(global uint *p, uint val)
|
||||
{
|
||||
const uint subgroupLocalID = get_sub_group_local_id();
|
||||
const int v = subgroupLocalID == 0 ? atomic_add(p, val) : 0;
|
||||
return sub_group_broadcast(v, 0);
|
||||
}
|
||||
|
||||
inline float halfarea(const float3 d)
|
||||
{
|
||||
return fma(d.x, (d.y + d.z), d.y * d.z);
|
||||
}
|
||||
|
||||
inline float area(const float3 d)
|
||||
{
|
||||
return halfarea(d) * 2.0f;
|
||||
}
|
||||
|
||||
inline uint maxDim(const float3 a)
|
||||
{
|
||||
const float3 b = fabs(a);
|
||||
const bool b_x_y = b.x > b.y;
|
||||
const float cur_max = b_x_y ? b.x : b.y;
|
||||
const uint cur_idx = b_x_y ? 0 : 1;
|
||||
const bool b_x_y_z = b.z > cur_max;
|
||||
return b_x_y_z ? 2 : cur_idx;
|
||||
}
|
||||
|
||||
inline uint3 sortByMaxDim(const float3 a)
|
||||
{
|
||||
const uint kz = maxDim(a);
|
||||
const uint _kx = (kz + 1) % 3;
|
||||
const uint _ky = (_kx + 1) % 3;
|
||||
const bool kz_pos = a[kz] >= 0.0f;
|
||||
const uint kx = kz_pos ? _ky : _kx;
|
||||
const uint ky = kz_pos ? _kx : _ky;
|
||||
return (uint3)(kx, ky, kz);
|
||||
}
|
||||
|
||||
inline uint4 sort4_ascending(const uint4 dist)
|
||||
{
|
||||
const uint a0 = dist.s0;
|
||||
const uint a1 = dist.s1;
|
||||
const uint a2 = dist.s2;
|
||||
const uint a3 = dist.s3;
|
||||
const uint b0 = min(a0, a2);
|
||||
const uint b1 = min(a1, a3);
|
||||
const uint b2 = max(a0, a2);
|
||||
const uint b3 = max(a1, a3);
|
||||
const uint c0 = min(b0, b1);
|
||||
const uint c1 = max(b0, b1);
|
||||
const uint c2 = min(b2, b3);
|
||||
const uint c3 = max(b2, b3);
|
||||
const uint d0 = c0;
|
||||
const uint d1 = min(c1, c2);
|
||||
const uint d2 = max(c1, c2);
|
||||
const uint d3 = c3;
|
||||
return (uint4)(d0, d1, d2, d3);
|
||||
}
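
// Worked example (not from the original sources): sort4_ascending((uint4)(7,1,5,3))
//   b = (min(7,5), min(1,3), max(7,5), max(1,3)) = (5, 1, 7, 3)
//   c = (min(5,1), max(5,1), min(7,3), max(7,3)) = (1, 5, 3, 7)
//   d = (1, min(5,3), max(5,3), 7)               = (1, 3, 5, 7)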
|
||||
|
||||
__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6};
|
||||
__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4};
|
||||
__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6};
|
||||
__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0};
|
||||
__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5};
|
||||
__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6};
|
||||
__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6};
|
||||
|
||||
__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1};
|
||||
__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1};
|
||||
__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1};
|
||||
|
||||
__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1};
|
||||
|
||||
inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask)
|
||||
{
|
||||
const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
|
||||
const uint a_min = min(a0, a1);
|
||||
const uint a_max = max(a0, a1);
|
||||
return select(a_max, a_min, selectMask);
|
||||
}
|
||||
|
||||
inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask)
|
||||
{
|
||||
const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
|
||||
const uint a_min = min(a0, a1);
|
||||
const uint a_max = max(a0, a1);
|
||||
return select(a_min, a_max, selectMask);
|
||||
}
|
||||
|
||||
inline uint sort8_descending(const uint aa)
|
||||
{
|
||||
const unsigned int slotID = get_sub_group_local_id() % 8;
|
||||
const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
|
||||
const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
|
||||
const uint dd = compare_exchange_descending(cc, shuffleC[slotID], selAA[slotID]);
|
||||
const uint ee = compare_exchange_descending(dd, shuffleD[slotID], selF0[slotID]);
|
||||
const uint ff = compare_exchange_descending(ee, shuffleE[slotID], selCC[slotID]);
|
||||
const uint gg = compare_exchange_descending(ff, shuffleF[slotID], selAA[slotID]);
|
||||
return gg;
|
||||
}
|
||||
|
||||
inline uint sort8_ascending(const uint aa)
|
||||
{
|
||||
const unsigned int slotID = get_sub_group_local_id() % 8;
|
||||
const uint bb = compare_exchange_ascending(aa, shuffleA[slotID], selAA[slotID]);
|
||||
const uint cc = compare_exchange_ascending(bb, shuffleB[slotID], selCC[slotID]);
|
||||
const uint dd = compare_exchange_ascending(cc, shuffleC[slotID], selAA[slotID]);
|
||||
const uint ee = compare_exchange_ascending(dd, shuffleD[slotID], selF0[slotID]);
|
||||
const uint ff = compare_exchange_ascending(ee, shuffleE[slotID], selCC[slotID]);
|
||||
const uint gg = compare_exchange_ascending(ff, shuffleF[slotID], selAA[slotID]);
|
||||
return gg;
|
||||
}
|
||||
|
||||
inline uint sort4_descending(const uint aa)
|
||||
{
|
||||
const unsigned int slotID = get_sub_group_local_id() % 8;
|
||||
const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
|
||||
const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
|
||||
const uint dd = compare_exchange_descending(cc, shuffleG[slotID], selGG[slotID]);
|
||||
return dd;
|
||||
}
|
||||
|
||||
inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
|
||||
{
|
||||
const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
|
||||
const ulong a_min = min(a0, a1);
|
||||
const ulong a_max = max(a0, a1);
|
||||
return select(a_max, a_min, (ulong)selectMask);
|
||||
}
|
||||
|
||||
inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
|
||||
{
|
||||
const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
|
||||
const ulong a_min = min(a0, a1);
|
||||
const ulong a_max = max(a0, a1);
|
||||
return select(a_min, a_max, (ulong)selectMask);
|
||||
}
|
||||
|
||||
inline ulong sort8_ascending_ulong(const ulong aa)
|
||||
{
|
||||
const unsigned int slotID = get_sub_group_local_id() % 8;
|
||||
const ulong bb = compare_exchange_ascending_ulong(aa, shuffleA[slotID], selAA[slotID]);
|
||||
const ulong cc = compare_exchange_ascending_ulong(bb, shuffleB[slotID], selCC[slotID]);
|
||||
const ulong dd = compare_exchange_ascending_ulong(cc, shuffleC[slotID], selAA[slotID]);
|
||||
const ulong ee = compare_exchange_ascending_ulong(dd, shuffleD[slotID], selF0[slotID]);
|
||||
const ulong ff = compare_exchange_ascending_ulong(ee, shuffleE[slotID], selCC[slotID]);
|
||||
const ulong gg = compare_exchange_ascending_ulong(ff, shuffleF[slotID], selAA[slotID]);
|
||||
return gg;
|
||||
}
|
||||
|
||||
inline uint bitInterleave3D(const uint4 in)
|
||||
{
|
||||
uint x = in.x, y = in.y, z = in.z;
|
||||
x = (x | (x << 16)) & 0x030000FF;
|
||||
x = (x | (x << 8)) & 0x0300F00F;
|
||||
x = (x | (x << 4)) & 0x030C30C3;
|
||||
x = (x | (x << 2)) & 0x09249249;
|
||||
|
||||
y = (y | (y << 16)) & 0x030000FF;
|
||||
y = (y | (y << 8)) & 0x0300F00F;
|
||||
y = (y | (y << 4)) & 0x030C30C3;
|
||||
y = (y | (y << 2)) & 0x09249249;
|
||||
|
||||
z = (z | (z << 16)) & 0x030000FF;
|
||||
z = (z | (z << 8)) & 0x0300F00F;
|
||||
z = (z | (z << 4)) & 0x030C30C3;
|
||||
z = (z | (z << 2)) & 0x09249249;
|
||||
|
||||
return x | (y << 1) | (z << 2);
|
||||
}
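
// Worked example (not from the original sources): bit i of x lands at bit 3*i, of y
// at 3*i + 1, of z at 3*i + 2, producing a 30-bit 3D Morton code; for instance
// bitInterleave3D((uint4)(1,0,0,0)) == 0x1, (0,1,0,0) == 0x2, (0,0,1,0) == 0x4 and
// (3,3,3,0) == 0x3F.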
|
||||
|
||||
inline uint bitInterleave4D(const uint4 in)
|
||||
{
|
||||
uint x = in.x, y = in.y, z = in.z, w = in.w;
|
||||
|
||||
x = x & 0x000000ff;
|
||||
x = (x ^ (x << 16)) & 0x00c0003f;
|
||||
x = (x ^ (x << 8)) & 0x00c03807;
|
||||
x = (x ^ (x << 4)) & 0x08530853;
|
||||
x = (x ^ (x << 2)) & 0x09090909;
|
||||
x = (x ^ (x << 1)) & 0x11111111;
|
||||
|
||||
y = y & 0x000000ff;
|
||||
y = (y ^ (y << 16)) & 0x00c0003f;
|
||||
y = (y ^ (y << 8)) & 0x00c03807;
|
||||
y = (y ^ (y << 4)) & 0x08530853;
|
||||
y = (y ^ (y << 2)) & 0x09090909;
|
||||
y = (y ^ (y << 1)) & 0x11111111;
|
||||
|
||||
z = z & 0x000000ff;
|
||||
z = (z ^ (z << 16)) & 0x00c0003f;
|
||||
z = (z ^ (z << 8)) & 0x00c03807;
|
||||
z = (z ^ (z << 4)) & 0x08530853;
|
||||
z = (z ^ (z << 2)) & 0x09090909;
|
||||
z = (z ^ (z << 1)) & 0x11111111;
|
||||
|
||||
w = w & 0x000000ff;
|
||||
w = (w ^ (w << 16)) & 0x00c0003f;
|
||||
w = (w ^ (w << 8)) & 0x00c03807;
|
||||
w = (w ^ (w << 4)) & 0x08530853;
|
||||
w = (w ^ (w << 2)) & 0x09090909;
|
||||
w = (w ^ (w << 1)) & 0x11111111;
|
||||
|
||||
return (x | (y << 1) | (z << 2) | (w << 3));
|
||||
}
|
||||
|
||||
inline ulong ulong_bitInterleave4D(const uint4 in)
|
||||
{
|
||||
ulong x = in.x, y = in.y, z = in.z, w = in.w;
|
||||
|
||||
x = x & 0x0000ffff;
|
||||
x = (x ^ (x << 32)) & 0x0000f800000007ff;
|
||||
x = (x ^ (x << 16)) & 0x0000f80007c0003f;
|
||||
x = (x ^ (x << 8)) & 0x00c0380700c03807;
|
||||
x = (x ^ (x << 4)) & 0x0843084308430843;
|
||||
x = (x ^ (x << 2)) & 0x0909090909090909;
|
||||
x = (x ^ (x << 1)) & 0x1111111111111111;
|
||||
|
||||
y = y & 0x0000ffff;
|
||||
y = (y ^ (y << 32)) & 0x0000f800000007ff;
|
||||
y = (y ^ (y << 16)) & 0x0000f80007c0003f;
|
||||
y = (y ^ (y << 8)) & 0x00c0380700c03807;
|
||||
y = (y ^ (y << 4)) & 0x0843084308430843;
|
||||
y = (y ^ (y << 2)) & 0x0909090909090909;
|
||||
y = (y ^ (y << 1)) & 0x1111111111111111;
|
||||
|
||||
z = z & 0x0000ffff;
|
||||
z = (z ^ (z << 32)) & 0x0000f800000007ff;
|
||||
z = (z ^ (z << 16)) & 0x0000f80007c0003f;
|
||||
z = (z ^ (z << 8)) & 0x00c0380700c03807;
|
||||
z = (z ^ (z << 4)) & 0x0843084308430843;
|
||||
z = (z ^ (z << 2)) & 0x0909090909090909;
|
||||
z = (z ^ (z << 1)) & 0x1111111111111111;
|
||||
|
||||
w = w & 0x0000ffff;
|
||||
w = (w ^ (w << 32)) & 0x0000f800000007ff;
|
||||
w = (w ^ (w << 16)) & 0x0000f80007c0003f;
|
||||
w = (w ^ (w << 8)) & 0x00c0380700c03807;
|
||||
w = (w ^ (w << 4)) & 0x0843084308430843;
|
||||
w = (w ^ (w << 2)) & 0x0909090909090909;
|
||||
w = (w ^ (w << 1)) & 0x1111111111111111;
|
||||
|
||||
return (x | (y << 1) | (z << 2) | (w << 3));
|
||||
}
|
||||
|
||||
inline uint bitCompact(uint x)
|
||||
{
|
||||
x &= 0x09249249;
|
||||
x = (x ^ (x >> 2)) & 0x030c30c3;
|
||||
x = (x ^ (x >> 4)) & 0x0300f00f;
|
||||
x = (x ^ (x >> 8)) & 0xff0000ff;
|
||||
x = (x ^ (x >> 16)) & 0x000003ff;
|
||||
return x;
|
||||
}
|
||||
|
||||
inline uint3 bitCompact3D(const uint in)
|
||||
{
|
||||
const uint x = bitCompact(in >> 0);
const uint y = bitCompact(in >> 1);
const uint z = bitCompact(in >> 2);
|
||||
return (uint3)(x, y, z);
|
||||
}
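
// Note (not from the original sources): bitCompact undoes the per-axis spread done
// by bitInterleave3D, so bitCompact3D(bitInterleave3D(v)) recovers the low 10 bits
// of v.x, v.y and v.z.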
|
||||
|
||||
inline uint convertToPushIndices8(uint ID)
|
||||
{
|
||||
const unsigned int slotID = get_sub_group_local_id();
|
||||
uint index = 0;
|
||||
for (uint i = 0; i < 8; i++)
|
||||
{
|
||||
const uint mask = intel_sub_group_ballot(ID == i);
|
||||
const uint new_index = ctz(mask);
|
||||
index = i == slotID ? new_index : index;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
inline uint convertToPushIndices16(uint ID)
|
||||
{
|
||||
const unsigned int slotID = get_sub_group_local_id();
|
||||
uint index = 0;
|
||||
for (uint i = 0; i < 16; i++)
|
||||
{
|
||||
const uint mask = intel_sub_group_ballot(ID == i);
|
||||
const uint new_index = ctz(mask);
|
||||
index = i == slotID ? new_index : index;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
#define FLOAT_EXPONENT_MASK (0x7F800000) // used to be EXPONENT_MASK
|
||||
#define FLOAT_MANTISSA_MASK (0x007FFFFF) // used to be MANTISSA_MASK
|
||||
#define FLOAT_NEG_ONE_EXP_MASK (0x3F000000)
|
||||
#define FLOAT_BIAS (127)
|
||||
#define FLOAT_MANTISSA_BITS (23)
|
||||
|
||||
inline float3 frexp_vec3(float3 len, int3* exp)
|
||||
{
|
||||
float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK));
|
||||
mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f)));
|
||||
mant = copysign(mant, len);
|
||||
*exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1));
|
||||
return mant;
|
||||
}
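
// Worked example (not from the original sources): for len.x = 8.0f the stored
// mantissa bits are zero, so mant.x becomes as_float(0x3F000000) = 0.5f and
// exp.x becomes 130 - 126 = 4, matching 8.0 = 0.5 * 2^4; the select() replaces any
// exact 1.0f mantissa with 0.5f so results stay in [0.5, 1.0).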
|
||||
|
||||
|
||||
#ifndef uniform
|
||||
#define uniform
|
||||
#endif
|
||||
|
||||
#ifndef varying
|
||||
#define varying
|
||||
#endif
|
||||
|
||||
uint get_sub_group_global_id()
|
||||
{
|
||||
return get_sub_group_id() + get_num_sub_groups() * get_group_id( 0 );
|
||||
}
|
||||
|
||||
// each lane contains the number of 1 bits below the corresponding position in 'mask'
|
||||
uint subgroup_bit_prefix_exclusive(uniform uint mask)
|
||||
{
|
||||
varying ushort lane = get_sub_group_local_id();
|
||||
varying uint lane_mask = (1 << lane) - 1;
|
||||
varying uint m = mask & lane_mask;
|
||||
return popcount(m);
|
||||
}
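
// Worked example (not from the original sources): for mask = 0b1011, lane 0 gets
// popcount(0b1011 & 0b0000) = 0, lane 1 gets popcount(0b1011 & 0b0001) = 1,
// lane 2 gets popcount(0b1011 & 0b0011) = 2 and lane 3 gets
// popcount(0b1011 & 0b0111) = 2.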
|
||||
|
||||
uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx )
|
||||
{
|
||||
varying uint lane_mask = (1 << lane_idx) - 1;
|
||||
varying uint m = mask & lane_mask;
|
||||
return popcount(m);
|
||||
}
|
||||
|
||||
|
||||
uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx)
|
||||
{
|
||||
return (uint3)(sub_group_broadcast(v.x,idx),
|
||||
sub_group_broadcast(v.y,idx),
|
||||
sub_group_broadcast(v.z,idx));
|
||||
}
|
||||
|
||||
float3 sub_group_broadcast_float3(float3 v, uniform ushort idx)
|
||||
{
|
||||
return (float3)(sub_group_broadcast(v.x, idx),
|
||||
sub_group_broadcast(v.y, idx),
|
||||
sub_group_broadcast(v.z, idx));
|
||||
}
|
||||
|
||||
float3 sub_group_reduce_min_float3(float3 v)
|
||||
{
|
||||
return (float3)(sub_group_reduce_min(v.x),
|
||||
sub_group_reduce_min(v.y),
|
||||
sub_group_reduce_min(v.z) );
|
||||
}
|
||||
float3 sub_group_reduce_max_float3(float3 v)
|
||||
{
|
||||
return (float3)(sub_group_reduce_max(v.x),
|
||||
sub_group_reduce_max(v.y),
|
||||
sub_group_reduce_max(v.z));
|
||||
}
|
||||
|
||||
float3 sub_group_shuffle_float3(float3 v, uniform ushort idx)
|
||||
{
|
||||
return (float3)(intel_sub_group_shuffle(v.x, idx),
|
||||
intel_sub_group_shuffle(v.y, idx),
|
||||
intel_sub_group_shuffle(v.z, idx));
|
||||
}
|
||||
uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx)
|
||||
{
|
||||
return (uint3)( intel_sub_group_shuffle(v.x, idx),
|
||||
intel_sub_group_shuffle(v.y, idx),
|
||||
intel_sub_group_shuffle(v.z, idx));
|
||||
}
|
||||
|
||||
|
||||
inline uchar sub_group_reduce_or_N6(uchar val)
|
||||
{
|
||||
val = val | intel_sub_group_shuffle_down(val, val, 4);
|
||||
val = val | intel_sub_group_shuffle_down(val, val, 2);
|
||||
val = val | intel_sub_group_shuffle_down(val, val, 1);
|
||||
return sub_group_broadcast(val, 0);
|
||||
}
|
||||
|
||||
inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val)
|
||||
{
|
||||
uint SIMD8_id = get_sub_group_local_id() / 8;
|
||||
val = val | intel_sub_group_shuffle_down(val, val, 4);
|
||||
val = val | intel_sub_group_shuffle_down(val, val, 2);
|
||||
val = val | intel_sub_group_shuffle_down(val, val, 1);
|
||||
|
||||
return intel_sub_group_shuffle(val, SIMD8_id * 8);
|
||||
}
|
||||
|
||||
|
||||
inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p )
|
||||
{
|
||||
return atomic_fetch_add_explicit( (volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group );
|
||||
}
|
||||
|
||||
inline __attribute__((overloadable)) int atomic_inc_local(local int* p)
|
||||
{
|
||||
return atomic_fetch_add_explicit( (volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
inline __attribute__((overloadable)) uint atomic_dec_local(local uint* p)
|
||||
{
|
||||
return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
inline __attribute__((overloadable)) int atomic_dec_local(local int* p)
|
||||
{
|
||||
return atomic_fetch_sub_explicit((volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n)
|
||||
{
|
||||
return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n )
|
||||
{
|
||||
return atomic_fetch_sub_explicit( (volatile local atomic_int*) p, n, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
inline uint atomic_add_local( local uint* p, uint n )
|
||||
{
|
||||
return atomic_fetch_add_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
inline uint atomic_xor_local(local uint* p, uint n)
|
||||
{
|
||||
return atomic_fetch_xor_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
inline uint atomic_or_local(local uint* p, uint n)
|
||||
{
|
||||
return atomic_fetch_or_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
inline uint atomic_min_local(local uint* p, uint n)
|
||||
{
|
||||
return atomic_fetch_min_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
inline uint atomic_max_local(local uint* p, uint n)
|
||||
{
|
||||
return atomic_fetch_max_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
inline uint atomic_inc_global( global uint* p )
|
||||
{
|
||||
return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
|
||||
}
|
||||
|
||||
inline uint atomic_dec_global(global uint* p)
|
||||
{
|
||||
return atomic_fetch_sub_explicit( (volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
|
||||
}
|
||||
|
||||
inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired)
|
||||
{
|
||||
return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*) p, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device);
|
||||
}
|
||||
|
||||
inline uint atomic_add_global( global uint* p, uint n )
|
||||
{
|
||||
return atomic_fetch_add_explicit( (volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
|
||||
}
|
||||
|
||||
inline uint atomic_sub_global(global uint* p, uint n)
|
||||
{
|
||||
return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
|
||||
}
|
||||
|
||||
inline uint atomic_or_global(global uint* p, uint n)
|
||||
{
|
||||
return atomic_fetch_or_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
|
||||
}
|
||||
|
||||
|
||||
inline uint atomic_inc_global_acquire(global uint* p)
|
||||
{
|
||||
return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_acquire, memory_scope_device);
|
||||
}
|
||||
|
||||
|
||||
inline uint atomic_inc_global_release(global uint* p)
|
||||
{
|
||||
return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
|
||||
}
|
||||
inline uint atomic_dec_global_release(global uint* p)
|
||||
{
|
||||
return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
|
||||
}
|
||||
|
||||
inline uint generic_atomic_add(uint* p, uint val)
|
||||
{
|
||||
if (to_global(p) != NULL)
|
||||
return atomic_add_global(to_global(p), val);
|
||||
if (to_local(p) != NULL)
|
||||
return atomic_add_local(to_local(p), val);
|
||||
return 0;
|
||||
}
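
// Illustration (not from the original sources): to_global()/to_local() route a
// generic-address-space pointer to the matching atomic flavour, e.g.
//   local uint lds_counter;
//   generic_atomic_add(&lds_counter, 1);   // resolves to atomic_add_local
// A pointer into private memory fails both casts and the add is silently dropped.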
|
||||
|
||||
inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n )
|
||||
{
|
||||
n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
|
||||
n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
|
||||
n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
|
||||
return sub_group_broadcast( n, 0 );
|
||||
}
|
||||
|
||||
inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n )
|
||||
{
|
||||
n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
|
||||
n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
|
||||
n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
|
||||
return sub_group_broadcast( n, 0 );
|
||||
}
|
||||
|
||||
inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n)
|
||||
{
|
||||
n = max(n, intel_sub_group_shuffle_down(n, n, 4));
|
||||
n = max(n, intel_sub_group_shuffle_down(n, n, 2));
|
||||
n = max(n, intel_sub_group_shuffle_down(n, n, 1));
|
||||
return intel_sub_group_shuffle(n, (get_sub_group_local_id() / 8) * 8);//sub_group_broadcast(n, 0);
|
||||
}
|
||||
|
||||
inline uint generic_atomic_inc(uint* p)
|
||||
{
|
||||
if (to_global(p) != NULL)
|
||||
return atomic_inc_global(to_global(p));
|
||||
if (to_local(p) != NULL)
|
||||
return atomic_inc(to_local(p));
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// Built-in GRL function which, if called in a kernel body, will force the kernel
|
||||
// to be compiled to the minimum SIMD width supported by the platform
|
||||
void GRL_UseMinimumSIMDWidth();
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
library lsc_intrinsics
|
||||
{
|
||||
default "lsc_intrinsics.cl" ;
|
||||
fallback "lsc_intrinsics_fallback.cl";
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
|
|
@ -1,207 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
// LSC Loads
|
||||
uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset);
|
||||
uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset);
|
||||
uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset);
|
||||
uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset);
|
||||
uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset);
|
||||
uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset);
|
||||
uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset);
|
||||
|
||||
uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset);
|
||||
uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset);
|
||||
uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset);
|
||||
uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset);
|
||||
uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset);
uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset);
uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset);

uint load_uint_L1UC_L3UC(global uint* it, int offset);
uint load_uint_L1UC_L3C(global uint* it, int offset);
uint load_uint_L1C_L3UC(global uint* it, int offset);
uint load_uint_L1C_L3C(global uint* it, int offset);
uint load_uint_L1S_L3UC(global uint* it, int offset);
uint load_uint_L1S_L3C(global uint* it, int offset);
uint load_uint_L1IAR_L3C(global uint* it, int offset);

uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset);
uint2 load_uint2_L1UC_L3C(global uint2* it, int offset);
uint2 load_uint2_L1C_L3UC(global uint2* it, int offset);
uint2 load_uint2_L1C_L3C(global uint2* it, int offset);
uint2 load_uint2_L1S_L3UC(global uint2* it, int offset);
uint2 load_uint2_L1S_L3C(global uint2* it, int offset);
uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset);

uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset);
uint3 load_uint3_L1UC_L3C(global uint3* it, int offset);
uint3 load_uint3_L1C_L3UC(global uint3* it, int offset);
uint3 load_uint3_L1C_L3C(global uint3* it, int offset);
uint3 load_uint3_L1S_L3UC(global uint3* it, int offset);
uint3 load_uint3_L1S_L3C(global uint3* it, int offset);
uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset);

uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset);
uint4 load_uint4_L1UC_L3C(global uint4* it, int offset);
uint4 load_uint4_L1C_L3UC(global uint4* it, int offset);
uint4 load_uint4_L1C_L3C(global uint4* it, int offset);
uint4 load_uint4_L1S_L3UC(global uint4* it, int offset);
uint4 load_uint4_L1S_L3C(global uint4* it, int offset);
uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset);

uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset);
uint8 load_uint8_L1UC_L3C(global uint8* it, int offset);
uint8 load_uint8_L1C_L3UC(global uint8* it, int offset);
uint8 load_uint8_L1C_L3C(global uint8* it, int offset);
uint8 load_uint8_L1S_L3UC(global uint8* it, int offset);
uint8 load_uint8_L1S_L3C(global uint8* it, int offset);
uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset);

ulong load_ulong_L1UC_L3UC(global ulong* it, int offset);
ulong load_ulong_L1UC_L3C(global ulong* it, int offset);
ulong load_ulong_L1C_L3UC(global ulong* it, int offset);
ulong load_ulong_L1C_L3C(global ulong* it, int offset);
ulong load_ulong_L1S_L3UC(global ulong* it, int offset);
ulong load_ulong_L1S_L3C(global ulong* it, int offset);
ulong load_ulong_L1IAR_L3C(global ulong* it, int offset);

ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset);
ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset);
ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset);
ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset);
ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset);
ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset);
ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset);

ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset);
ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset);
ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset);
ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset);
ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset);
ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset);
ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset);

ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset);
ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset);
ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset);
ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset);
ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset);
ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset);
ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset);

ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset);
ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset);
ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset);
ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset);
ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset);
ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset);
ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset);

// LSC Stores
void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value);
void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value);

void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value);
void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value);

void store_uint_L1UC_L3UC(global uint* it, int offset, uint value);
void store_uint_L1UC_L3WB(global uint* it, int offset, uint value);
void store_uint_L1WT_L3UC(global uint* it, int offset, uint value);
void store_uint_L1WT_L3WB(global uint* it, int offset, uint value);
void store_uint_L1S_L3UC(global uint* it, int offset, uint value);
void store_uint_L1S_L3WB(global uint* it, int offset, uint value);
void store_uint_L1WB_L3WB(global uint* it, int offset, uint value);

void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value);
void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value);
void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value);
void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value);
void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value);
void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value);
void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value);

void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value);
void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value);
void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value);
void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value);
void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value);
void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value);
void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value);

void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value);
void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value);
void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value);
void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value);
void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value);
void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value);
void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value);

void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value);
void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value);
void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value);
void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value);
void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value);
void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value);
void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value);

void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value);
void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value);
void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value);
void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value);
void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value);
void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value);
void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value);

void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value);
void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value);

void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value);
void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value);

void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value);
void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value);

void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value);
void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value);

// LSC Fence support
void mem_fence_gpu_default();
void mem_fence_workgroup_default();
void mem_fence_gpu_invalidate();
void mem_fence_gpu_evict();
void mem_fence_evict_to_memory();
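The suffixes in these names encode the requested cache policy per cache level (L1, then L3): UC = uncached, C = cached, S = streaming and IAR = invalidate-after-read on the load side; WB = write-back and WT = write-through on the store side. A minimal usage sketch follows; the kernel and buffers are hypothetical and not part of the removed GRL sources, it only illustrates how a caller picks a variant:

// Hypothetical sketch: stream the input through L1 once, keep it in L3,
// and write results without keeping them resident in L1.
void kernel scale_buffer(global uint* in, global uint* out, uint n)
{
    uint i = get_global_id(0);
    if (i < n)
    {
        uint v = load_uint_L1S_L3C(in, i);
        store_uint_L1UC_L3WB(out, i, 2u * v);
    }
}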
@@ -1,898 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

// LSC Loads
// uchar
uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset)
{
    return (uint)(it[offset]);
}

// ushort
uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset)
{
    return (uint)(it[offset]);
}

uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset)
{
    return (uint)(it[offset]);
}

// uint
uint load_uint_L1UC_L3UC(global uint* it, int offset)
{
    return it[offset];
}

uint load_uint_L1UC_L3C(global uint* it, int offset)
{
    return it[offset];
}

uint load_uint_L1C_L3UC(global uint* it, int offset)
{
    return it[offset];
}

uint load_uint_L1C_L3C(global uint* it, int offset)
{
    return it[offset];
}

uint load_uint_L1S_L3UC(global uint* it, int offset)
{
    return it[offset];
}

uint load_uint_L1S_L3C(global uint* it, int offset)
{
    return it[offset];
}

uint load_uint_L1IAR_L3C(global uint* it, int offset)
{
    return it[offset];
}

// uint2
uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset)
{
    return it[offset];
}

uint2 load_uint2_L1UC_L3C(global uint2* it, int offset)
{
    return it[offset];
}

uint2 load_uint2_L1C_L3UC(global uint2* it, int offset)
{
    return it[offset];
}

uint2 load_uint2_L1C_L3C(global uint2* it, int offset)
{
    return it[offset];
}

uint2 load_uint2_L1S_L3UC(global uint2* it, int offset)
{
    return it[offset];
}

uint2 load_uint2_L1S_L3C(global uint2* it, int offset)
{
    return it[offset];
}

uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset)
{
    return it[offset];
}

// uint3
uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset)
{
    return it[offset];
}

uint3 load_uint3_L1UC_L3C(global uint3* it, int offset)
{
    return it[offset];
}

uint3 load_uint3_L1C_L3UC(global uint3* it, int offset)
{
    return it[offset];
}

uint3 load_uint3_L1C_L3C(global uint3* it, int offset)
{
    return it[offset];
}

uint3 load_uint3_L1S_L3UC(global uint3* it, int offset)
{
    return it[offset];
}

uint3 load_uint3_L1S_L3C(global uint3* it, int offset)
{
    return it[offset];
}

uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset)
{
    return it[offset];
}

// uint4
uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset)
{
    return it[offset];
}

uint4 load_uint4_L1UC_L3C(global uint4* it, int offset)
{
    return it[offset];
}

uint4 load_uint4_L1C_L3UC(global uint4* it, int offset)
{
    return it[offset];
}

uint4 load_uint4_L1C_L3C(global uint4* it, int offset)
{
    return it[offset];
}

uint4 load_uint4_L1S_L3UC(global uint4* it, int offset)
{
    return it[offset];
}

uint4 load_uint4_L1S_L3C(global uint4* it, int offset)
{
    return it[offset];
}

uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset)
{
    return it[offset];
}

// uint8
uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset)
{
    return it[offset];
}

uint8 load_uint8_L1UC_L3C(global uint8* it, int offset)
{
    return it[offset];
}

uint8 load_uint8_L1C_L3UC(global uint8* it, int offset)
{
    return it[offset];
}

uint8 load_uint8_L1C_L3C(global uint8* it, int offset)
{
    return it[offset];
}

uint8 load_uint8_L1S_L3UC(global uint8* it, int offset)
{
    return it[offset];
}

uint8 load_uint8_L1S_L3C(global uint8* it, int offset)
{
    return it[offset];
}

uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset)
{
    return it[offset];
}

// ulong
ulong load_ulong_L1UC_L3UC(global ulong* it, int offset)
{
    return it[offset];
}

ulong load_ulong_L1UC_L3C(global ulong* it, int offset)
{
    return it[offset];
}

ulong load_ulong_L1C_L3UC(global ulong* it, int offset)
{
    return it[offset];
}

ulong load_ulong_L1C_L3C(global ulong* it, int offset)
{
    return it[offset];
}

ulong load_ulong_L1S_L3UC(global ulong* it, int offset)
{
    return it[offset];
}

ulong load_ulong_L1S_L3C(global ulong* it, int offset)
{
    return it[offset];
}

ulong load_ulong_L1IAR_L3C(global ulong* it, int offset)
{
    return it[offset];
}

// ulong2
ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset)
{
    return it[offset];
}

ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset)
{
    return it[offset];
}

ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset)
{
    return it[offset];
}

ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset)
{
    return it[offset];
}

ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset)
{
    return it[offset];
}

ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset)
{
    return it[offset];
}

ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset)
{
    return it[offset];
}

// ulong3
ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset)
{
    return it[offset];
}

ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset)
{
    return it[offset];
}

ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset)
{
    return it[offset];
}

ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset)
{
    return it[offset];
}

ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset)
{
    return it[offset];
}

ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset)
{
    return it[offset];
}

ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset)
{
    return it[offset];
}

// ulong4
ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset)
{
    return it[offset];
}

ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset)
{
    return it[offset];
}

ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset)
{
    return it[offset];
}

ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset)
{
    return it[offset];
}

ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset)
{
    return it[offset];
}

ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset)
{
    return it[offset];
}

ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset)
{
    return it[offset];
}

// ulong8
ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset)
{
    return it[offset];
}

ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset)
{
    return it[offset];
}

ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset)
{
    return it[offset];
}

ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset)
{
    return it[offset];
}

ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset)
{
    return it[offset];
}

ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset)
{
    return it[offset];
}

ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset)
{
    return it[offset];
}

// LSC Stores
// uchar
void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value)
{
    it[offset] = (uchar)(value);
}

void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value)
{
    it[offset] = (uchar)(value);
}

void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value)
{
    it[offset] = (uchar)(value);
}

void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value)
{
    it[offset] = (uchar)(value);
}

void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value)
{
    it[offset] = (uchar)(value);
}

void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value)
{
    it[offset] = (uchar)(value);
}

void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value)
{
    it[offset] = (uchar)(value);
}

// ushort
void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value)
{
    it[offset] = (ushort)(value);
}

void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value)
{
    it[offset] = (ushort)(value);
}

void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value)
{
    it[offset] = (ushort)(value);
}

void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value)
{
    it[offset] = (ushort)(value);
}

void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value)
{
    it[offset] = (ushort)(value);
}

void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value)
{
    it[offset] = (ushort)(value);
}

void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value)
{
    it[offset] = (ushort)(value);
}

// uint
void store_uint_L1UC_L3UC(global uint* it, int offset, uint value)
{
    it[offset] = value;
}

void store_uint_L1UC_L3WB(global uint* it, int offset, uint value)
{
    it[offset] = value;
}

void store_uint_L1WT_L3UC(global uint* it, int offset, uint value)
{
    it[offset] = value;
}

void store_uint_L1WT_L3WB(global uint* it, int offset, uint value)
{
    it[offset] = value;
}

void store_uint_L1S_L3UC(global uint* it, int offset, uint value)
{
    it[offset] = value;
}

void store_uint_L1S_L3WB(global uint* it, int offset, uint value)
{
    it[offset] = value;
}

void store_uint_L1WB_L3WB(global uint* it, int offset, uint value)
{
    it[offset] = value;
}

// uint2
void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value)
{
    it[offset] = value;
}

void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value)
{
    it[offset] = value;
}

void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value)
{
    it[offset] = value;
}

void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value)
{
    it[offset] = value;
}

void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value)
{
    it[offset] = value;
}

void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value)
{
    it[offset] = value;
}

void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value)
{
    it[offset] = value;
}

// uint3
void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value)
{
    it[offset] = value;
}

void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value)
{
    it[offset] = value;
}

void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value)
{
    it[offset] = value;
}

void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value)
{
    it[offset] = value;
}

void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value)
{
    it[offset] = value;
}

void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value)
{
    it[offset] = value;
}

void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value)
{
    it[offset] = value;
}

// uint4
void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value)
{
    it[offset] = value;
}

void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value)
{
    it[offset] = value;
}

void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value)
{
    it[offset] = value;
}

void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value)
{
    it[offset] = value;
}

void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value)
{
    it[offset] = value;
}

void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value)
{
    it[offset] = value;
}

void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value)
{
    it[offset] = value;
}

// uint8
void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value)
{
    it[offset] = value;
}

void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value)
{
    it[offset] = value;
}

void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value)
{
    it[offset] = value;
}

void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value)
{
    it[offset] = value;
}

void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value)
{
    it[offset] = value;
}

void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value)
{
    it[offset] = value;
}

void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value)
{
    it[offset] = value;
}

// ulong
void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value)
{
    it[offset] = value;
}

void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value)
{
    it[offset] = value;
}

void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value)
{
    it[offset] = value;
}

void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value)
{
    it[offset] = value;
}

void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value)
{
    it[offset] = value;
}

void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value)
{
    it[offset] = value;
}

void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value)
{
    it[offset] = value;
}

// ulong2
void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value)
{
    it[offset] = value;
}

void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value)
{
    it[offset] = value;
}

void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value)
{
    it[offset] = value;
}

void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value)
{
    it[offset] = value;
}

void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value)
{
    it[offset] = value;
}

void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value)
{
    it[offset] = value;
}

void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value)
{
    it[offset] = value;
}

// ulong3
void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value)
{
    it[offset] = value;
}

void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value)
{
    it[offset] = value;
}

void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value)
{
    it[offset] = value;
}

void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value)
{
    it[offset] = value;
}

void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value)
{
    it[offset] = value;
}

void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value)
{
    it[offset] = value;
}

void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value)
{
    it[offset] = value;
}

// ulong4
void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value)
{
    it[offset] = value;
}

void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value)
{
    it[offset] = value;
}

void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value)
{
    it[offset] = value;
}

void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value)
{
    it[offset] = value;
}

void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value)
{
    it[offset] = value;
}

void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value)
{
    it[offset] = value;
}

void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value)
{
    it[offset] = value;
}

// ulong8
void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value)
{
    it[offset] = value;
}

void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value)
{
    it[offset] = value;
}

void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value)
{
    it[offset] = value;
}

void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value)
{
    it[offset] = value;
}

void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value)
{
    it[offset] = value;
}

void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value)
{
    it[offset] = value;
}

void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value)
{
    it[offset] = value;
}

// LSC Fence support
void mem_fence_gpu_default()
{
    write_mem_fence(CLK_GLOBAL_MEM_FENCE);
}

void mem_fence_workgroup_default()
{
    write_mem_fence(CLK_GLOBAL_MEM_FENCE);
}

void mem_fence_gpu_invalidate()
{
    read_mem_fence(CLK_GLOBAL_MEM_FENCE);
}

void mem_fence_gpu_evict()
{
    read_mem_fence(CLK_GLOBAL_MEM_FENCE);
}

void mem_fence_evict_to_memory()
{
    mem_fence(CLK_GLOBAL_MEM_FENCE);
}
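Note that in this fallback path every cache-policy variant lowers to a plain global access, and the named fences map onto the generic OpenCL fences, so the suffixes act purely as hints. A short, hypothetical sketch of the intended call pattern (not taken from the removed sources):

// Hypothetical producer step: publish a value, then a ready flag.
void kernel publish_value(global uint* slot, global uint* flag, uint value)
{
    store_uint_L1WT_L3WB(slot, 0, value);
    mem_fence_evict_to_memory();   // here simply mem_fence(CLK_GLOBAL_MEM_FENCE)
    store_uint_L1UC_L3WB(flag, 0, 1u);
}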
@@ -1,161 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "shared.h"

/// Write cache line to global memory
/// Assumes subgroup_size is 16
///
/// @param dst 64 bytes aligned output pointer
/// @param val value to write
GRL_INLINE void CacheLineSubgroupWrite(global char* dst, uint val)
{
    global uint* addrAligned = (global uint*)(global uint16*)dst;
    intel_sub_group_block_write(addrAligned, val);
}

/// Read cache line from global memory
/// Assumes subgroup_size is 16
///
/// @param src 64 bytes aligned input pointer
/// @return uint read from memory
GRL_INLINE uint CacheLineSubgroupRead(const global char* src)
{
    const global uint* addrAligned = (const global uint*)(global uint16*)src;
    return intel_sub_group_block_read(addrAligned);
}

/// Copy cache line
/// Assumes subgroup_size is 16
///
/// @param dst 64 bytes aligned output pointer
/// @param src input pointer
GRL_INLINE void CopyCacheLine(global char* dst, const global char* src)
{
    global const uint* usrc = (global const uint*) (src);

    uint data = intel_sub_group_block_read(usrc);
    CacheLineSubgroupWrite(dst, data);
}

/// Fast memory copy
///
/// @param dst output pointer
/// @param src input pointer
/// @param size number of bytes to copy
/// @param numGroups number of groups that execute this function
GRL_INLINE void CopyMemory(global char* dst, const global char* src, uint size, uint numGroups)
{
    const uint CACHELINE_SIZE = 64;

    uint globalID = get_local_size(0) * get_group_id(0) + get_local_id(0);

    // This part copies one cacheline per physical thread in a single write,
    // starting from dst aligned up to a cacheline. It also copies the remainder.
    {
        uint alignAdd = ((uint)(uint64_t)dst) & (CACHELINE_SIZE - 1);
        alignAdd = (CACHELINE_SIZE - alignAdd) & (CACHELINE_SIZE - 1);

        if (size > alignAdd)
        {
            uint alignedBytesCount = size - alignAdd;
            uint alignedDWsCount = alignedBytesCount >> 2;
            global uint* dstAlignedPart = (global uint*)(dst + alignAdd);
            global uint* srcAlignedPart = (global uint*)(src + alignAdd);

            for (uint id = globalID; id < alignedDWsCount; id += get_local_size(0) * numGroups)
            {
                dstAlignedPart[id] = srcAlignedPart[id];
            }

            if (globalID < alignedBytesCount - (alignedDWsCount << 2))
            {
                global uint8_t* dstByteRem = (global uint8_t*)(dstAlignedPart + alignedDWsCount);
                global uint8_t* srcByteRem = (global uint8_t*)(srcAlignedPart + alignedDWsCount);
                dstByteRem[globalID] = srcByteRem[globalID];
            }
        }
    }

    // Copy the bytes of dst below the address aligned up to a cacheline.
    {
        uint misalignmentBytesSize = (4 - (((uint)dst) & /*bytes in DW*/3)) & 3;
        if (misalignmentBytesSize)
        {
            if (globalID < misalignmentBytesSize)
            {
                dst[globalID] = src[globalID];
            }
            dst += misalignmentBytesSize;
            src += misalignmentBytesSize;
        }

        uint misalignmentDWSize = (CACHELINE_SIZE - (((uint)dst) & (CACHELINE_SIZE - 1))) & (CACHELINE_SIZE - 1);
        if (misalignmentDWSize)
        {
            if (globalID < (misalignmentDWSize >> 2))
            {
                ((global uint*)dst)[globalID] = ((global uint*)src)[globalID];
            }
        }
    }
}

#define CACHELINE_SIZE 64
#define CACHELINE_PER_BLOCK 4
#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;

GRL_INLINE
global const char *getInstanceDataToCopy(global const char *array, global const uint64_t *arrayOfPtrs, const uint byteOffset)
{
    if (array != NULL)
    {
        return array + byteOffset;
    }
    else
    {
        return (global char *)arrayOfPtrs[byteOffset >> 6];
    }
}

// assumed:
// dst is always 64 bytes aligned
// size is always a multiple of 64 bytes (size of InstanceDesc is always 64 bytes)
GRL_INLINE
void copyInstances(global char *dst, global const char *array, global const uint64_t *arrayOfPtrs, const uint64_t size, const uint numGroups)
{
    uint taskId = get_group_id(0);

    uint blockedSize = (size) & (~(BLOCK_SIZE - 1));

    uint cachelinedTailOffset = blockedSize;
    uint cachelinedTailSize = (size - cachelinedTailOffset) & (~(CACHELINE_SIZE - 1));

    uint tailCacheLines = cachelinedTailSize >> 6; // divide by CACHELINE_SIZE
    uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups - 1)));
    if (reversedTaskId < tailCacheLines)
    {
        uint byteOffset = cachelinedTailOffset + (reversedTaskId * CACHELINE_SIZE);
        global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
        CopyCacheLine(dst + byteOffset, src);
    }

    uint numBlocks = blockedSize >> 8;
    while (taskId < numBlocks)
    {
        uint byteOffset = (taskId * BLOCK_SIZE);

        for (uint cl = 0; cl < CACHELINE_PER_BLOCK; cl++)
        {
            global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
            CopyCacheLine(dst + byteOffset, src);
            byteOffset += CACHELINE_SIZE;
        }

        taskId += numGroups;
    }
}
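CopyMemory above is a helper meant to be called from every work item of a dispatch, with the launch's workgroup count passed in explicitly so the strided loop covers the whole buffer. A hypothetical wrapper kernel (not part of the removed sources) would look like this:

// Hypothetical wrapper: every work item participates; the helpers built on
// CacheLineSubgroupWrite assume SIMD16 subgroups.
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel copy_buffer_example(global char* dst, global const char* src, uint size)
{
    CopyMemory(dst, src, size, get_num_groups(0));
}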
@@ -1,367 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "api_interface.h"
#include "common.h"
#include "instance.h"
#include "misc_shared.h"
#include "mem_utils.h"

#define DBG(x)
#define ENABLE_CHECKS 0

#define CACHELINE_SIZE 64
#define CACHELINE_PER_BLOCK 4
#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;

GRL_INLINE
uint32_t getGeomDescPrimitiveCountAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
{
    return (uint32_t)GRL_get_primitive_count(&geomDesc[index]);
}

GRL_INLINE
uint32_t getGeomDescTypeAndFlagsAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
{
    return (uint32_t)GRL_get_Type(&geomDesc[index]) |
           (((uint32_t)GRL_get_Flags(&geomDesc[index])) << 16);
}

GRL_INLINE
uint64_t getGeomDescAsUint64t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
{
    return (uint64_t)getGeomDescPrimitiveCountAsUint32t(geomDesc, index) |
           (((uint64_t)getGeomDescTypeAndFlagsAsUint32t(geomDesc, index)) << 32);
}

// assumed:
// dst is always 64 bytes aligned
GRL_INLINE
void copyGeoMetaData(global char* dst, global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t size, uint numGroups)
{
    uint taskId = get_group_id(0);
    uint localId = get_sub_group_local_id();

    uint cachelinedSize = (size) & (~(CACHELINE_SIZE-1));

    uint reminderOffset = cachelinedSize;
    uint reminderQWSize = (size - reminderOffset) >> 3;

    uint tailCacheLines = cachelinedSize >> 6; // divide by CACHELINE_SIZE
    uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups-1)));
    if (reversedTaskId == tailCacheLines && localId < reminderQWSize)
    {
        uint reminderOffsetQW = reminderOffset >> 3;
        global uint64_t* dstQW = (global uint64_t*)(dst);
        dstQW[localId + reminderOffsetQW] = getGeomDescAsUint64t(geomDesc, localId + reminderOffsetQW);
    }

    uint numCacheLines = cachelinedSize >> 6;
    while (taskId < numCacheLines)
    {
        uint byteOffset = taskId * CACHELINE_SIZE;
        uint geoIdFromOffset = (byteOffset >> 3) + (localId >> 1);

        uint32_t data = 0;
        if (localId & 1)
        {
            data = getGeomDescTypeAndFlagsAsUint32t(geomDesc, geoIdFromOffset);
        }
        else
        {
            data = getGeomDescPrimitiveCountAsUint32t(geomDesc, geoIdFromOffset);
        }
        CacheLineSubgroupWrite(dst + byteOffset, data);

        taskId += numGroups;
    }
}

GRL_INLINE
uint groupCountForInstancesCopySize(uint size)
{
    return (size >> 8) + 3;
}

GRL_INLINE
uint groupCountForGeoMetaDataCopySize(uint size)
{
    return (size >> 6) + 1;
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instances(global char* dest, global char* instancesArray, uint64_t size)
{
    // global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
    copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instances_indirect(global char* dest, global char* instancesArray, global const struct IndirectBuildRangeInfo* const indirect_data)
{
    uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
    instancesArray += indirect_data->primitiveOffset;
    uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
    if (tid == 0)
    {
        struct BVHBase* bvh = (struct BVHBase*)dest;
        bvh->Meta.instanceCount = indirect_data->primitiveCount;
    }
    copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_ptrs(global char* dest, global uint64_t* arrayOfPtrs, uint64_t size)
{
    //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
    copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_ptrs_indirect(global char* dest, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data)
{
    uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
    arrayOfPtrs += indirect_data->primitiveOffset;
    uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
    if (tid == 0)
    {
        struct BVHBase* bvh = (struct BVHBase*)dest;
        bvh->Meta.instanceCount = indirect_data->primitiveCount;
    }
    copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instances_base_ptr(global BVHBase* bvh, global char* instancesArray, uint64_t size)
{
    global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
    copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instances_base_ptr_indirect(global BVHBase* bvh, global char* instancesArray, global struct IndirectBuildRangeInfo const * const indirect_data)
{
    global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
    uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
    instancesArray += indirect_data->primitiveOffset;
    copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_ptrs_base_ptr(global BVHBase* bvh, global uint64_t* arrayOfPtrs, uint64_t size)
{
    global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
    copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_ptrs_base_ptr_indirect(global BVHBase* bvh, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data)
{
    global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
    uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
    arrayOfPtrs += indirect_data->primitiveOffset;
    copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_geo_meta_data(global char* dest, global char* src, uint64_t size)
{
    //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.geoDescsStart);
    global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc = (global GRL_RAYTRACING_GEOMETRY_DESC *)((unsigned long)src);
    copyGeoMetaData(dest, geomDesc, size, groupCountForGeoMetaDataCopySize(size));
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_geo_descs_indirect_build(global char* dest, global char* src, global struct IndirectBuildRangeInfo const * const indirect_data, uint numGeometries)
{
    uint32_t gid = get_local_id(0) + get_group_id(0) * get_local_size(0);
    if (gid < numGeometries) {
        global GRL_RAYTRACING_GEOMETRY_DESC* dstDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(dest);
        global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(src);

        GRL_RAYTRACING_GEOMETRY_DESC geo = srcDesc[gid];

        uint primitiveCount = indirect_data[gid].primitiveCount;
        uint primitiveOffset = indirect_data[gid].primitiveOffset;
        uint firstVertex = indirect_data[gid].firstVertex;
        uint transformOffset = indirect_data[gid].transformOffset;

        if (srcDesc[gid].Type == GEOMETRY_TYPE_TRIANGLES)
        {
            if (geo.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE)
            {
                geo.Desc.Triangles.VertexCount = primitiveCount * 3;
                geo.Desc.Triangles.pVertexBuffer += primitiveOffset
                    + firstVertex * geo.Desc.Triangles.VertexBufferByteStride;
            }
            else
            {
                geo.Desc.Triangles.IndexCount = primitiveCount * 3;
                geo.Desc.Triangles.pIndexBuffer += primitiveOffset;
                geo.Desc.Triangles.pVertexBuffer += firstVertex * geo.Desc.Triangles.VertexBufferByteStride;
            }
            if (geo.Desc.Triangles.pTransformBuffer) {
                geo.Desc.Triangles.pTransformBuffer += transformOffset;
            }
        } else {
            // GEOMETRY_TYPE_PROCEDURAL
            geo.Desc.Procedural.AABBCount = primitiveCount;
            geo.Desc.Procedural.pAABBs_GPUVA += primitiveOffset;
        }

        dstDesc[gid] = geo;
    }
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel batched_init_globals(global struct BatchedInitGlobalsData *data)
{
    uint groupID = get_group_id(0);

    struct BatchedInitGlobalsData entry = data[groupID];

    global struct Globals* globals = (global struct Globals*)entry.p_build_globals;
    global char *bvh_mem = (global char*)entry.p_bvh_buffer;
    uint numPrimitives = entry.numPrimitives;
    uint numGeometries = entry.numGeometries;
    uint numInstances = entry.numInstances;
    uint instance_descs_start = entry.instance_descs_start;
    uint geo_meta_data_start = entry.geo_meta_data_start;
    uint node_data_start = entry.node_data_start;
    uint quad_data_start = entry.leaf_data_start;
    uint instance_data_start = entry.leaf_data_start;
    uint procedural_data_start = entry.procedural_data_start;
    uint back_pointer_start = entry.back_pointer_start;
    uint build_record_start = entry.leaf_data_start;
    uint totalBytes = entry.sizeTotal;
    uint leafPrimType = entry.leafType;
    uint leafSize = entry.leafSize;

    uint root_node_offset = node_data_start;
    struct BVHBase *base = (struct BVHBase *)bvh_mem;

    base->Meta.instanceCount = numInstances;
    base->Meta.geoCount = numGeometries;
    base->Meta.instanceDescsStart = instance_descs_start;
    base->Meta.geoDescsStart = geo_meta_data_start;
    base->Meta.allocationSize = totalBytes;
    // This doesn't work correctly
    //ERROR_INFO initErr = { 0, 0, 0, 0xAAABBAAA };
    //base->Meta.errors = initErr;
    base->Meta.errors.type = 0;
    base->Meta.errors.offset_in_BVH = 0; // in 64B units
    base->Meta.errors.when = 0;
    base->Meta.errors.reserved = 0xAAABBAAA;

    base->nodeDataCur = node_data_start / 64;
    base->quadLeafStart = quad_data_start / 64;
    base->quadLeafCur = quad_data_start / 64;
    base->instanceLeafStart = instance_data_start / 64;
    base->instanceLeafEnd = instance_data_start / 64;
    base->proceduralDataStart = procedural_data_start / 64;
    base->proceduralDataCur = procedural_data_start / 64;
    base->backPointerDataStart = back_pointer_start / 64;
    base->refitTreeletsDataStart = totalBytes / 64;
    base->refitStartPointDataStart = totalBytes / 64;
    base->BVHDataEnd = totalBytes / 64;
    base->refitTreeletCnt = 0;
    base->refitTreeletCnt2 = 0;
    base->rootNodeOffset = root_node_offset;

    base->fatLeafCount = 0;
    base->fatLeafTableStart = entry.fatleaf_table_start / 64;
    base->innerCount = 0;
    base->innerTableStart = entry.innernode_table_start / 64;
    base->quadLeftoversCountNewAtomicUpdate = 0;
    base->quadTableSizeNewAtomicUpdate = 0;
    base->quadIndicesDataStart = entry.quad_indices_data_start / 64;

    if (back_pointer_start != totalBytes)
    {
        BackPointers* back_pointers = BVHBase_GetBackPointers(base);
        uint root_node_idx = root_node_offset - node_data_start;
        global uint *root_node_backpointer = (global uint *)InnerNode_GetBackPointer(back_pointers, root_node_idx);
        *root_node_backpointer = ((uint)-1) << 6;
    }

    AABB3f_init(&base->Meta.bounds);
    AABB_init(&globals->centroidBounds);

    globals->build_record_start = build_record_start;

    globals->numBuildRecords = 0;
    globals->numBuildRecords_extended = 0;
    globals->numPrimitives = numPrimitives;
    globals->numSplittedPrimitives = 0;
    globals->sync = 0;
    globals->probThreshold = 0.0f;
    globals->leafPrimType = leafPrimType;
    globals->leafSize = leafSize;
}


// This is a temporary WA for the mock in DXR
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel copy_mock(global char *dest,
                                                                                     global char *src,
                                                                                     uint32_t size)
{
    uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
    uint32_t globalSize = get_num_groups(0) * get_local_size(0);
    for (uint32_t i = globalId; i < size; i += globalSize)
    {
        dest[i] = src[i];
    }
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(32, 1, 1)))
void kernel mem_set(global char *dest,
                    dword byte,
                    dword size)
{
    uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
    if (globalId < size)
    {
        dest[globalId] = (char)byte;
    }
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(32, 1, 1)))
void kernel mem_set_size_ptr(global char *dest,
                             dword byte,
                             global qword* sizePtr)
{
    uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
    if (globalId < *sizePtr)
    {
        dest[globalId] = (char)byte;
    }
}
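The indirect metakernels below recompute groupCountForInstancesCopySize() with GRL register arithmetic: since each InstanceDesc is 64 bytes, (size >> 8) + 3 with size = 64 * numInstances equals (numInstances >> 2) + 3, which is exactly the shift-and-add sequence applied to REG0. A small cross-check, hypothetical and not taken from the removed sources:

// groups_from_bytes(64 * n) == groups_from_instances(n) for any n.
uint groups_from_bytes(uint sizeInBytes)      { return (sizeInBytes >> 8) + 3; }
uint groups_from_instances(uint numInstances) { return (numInstances >> 2) + 3; }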
@@ -1,278 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

module misc;

kernel_module misc("misc.cl")
{
    kernel opencl_kernel_batched_init_globals < kernelFunction="batched_init_globals" >;
    kernel opencl_kernel_copy_instances < kernelFunction="copy_instances" >;
    kernel opencl_kernel_copy_instances_indirect < kernelFunction="copy_instances_indirect" >;
    kernel opencl_kernel_copy_instance_ptrs < kernelFunction="copy_instance_ptrs" >;
    kernel opencl_kernel_copy_instance_ptrs_indirect < kernelFunction="copy_instance_ptrs_indirect" >;
    kernel opencl_kernel_copy_instances_base_ptr < kernelFunction="copy_instances_base_ptr" >;
    kernel opencl_kernel_copy_instances_base_ptr_indirect < kernelFunction="copy_instances_base_ptr_indirect" >;
    kernel opencl_kernel_copy_instance_ptrs_base_ptr < kernelFunction="copy_instance_ptrs_base_ptr" >;
    kernel opencl_kernel_copy_instance_ptrs_base_ptr_indirect < kernelFunction="copy_instance_ptrs_base_ptr_indirect" >;
    kernel opencl_kernel_copy_geo_meta_data < kernelFunction="copy_geo_meta_data" >;
    kernel opencl_kernel_copy_geo_descs_indirect_build < source="misc.cl", kernelFunction="copy_geo_descs_indirect_build" >;
    kernel opencl_kernel_copy_mock < kernelFunction="copy_mock" >;
    kernel opencl_kernel_memset < kernelFunction="mem_set" >;
    kernel opencl_kernel_memset_size_ptr < kernelFunction="mem_set_size_ptr" >;
}

import struct MKBuilderState "structs.grl";
import struct MKSizeEstimate "structs.grl";


metakernel batched_init_globals(
    qword p_data,
    dword numWgs)
{
    dispatch opencl_kernel_batched_init_globals(numWgs,1,1) args(p_data);
}

metakernel copy_instances(
    qword bvh_buffer,
    qword instanceDescsBuffer,
    qword totalSizeToCopy,
    dword numThreads)
{
    dispatch opencl_kernel_copy_instances (numThreads, 1, 1) args(
        bvh_buffer,
        instanceDescsBuffer,
        totalSizeToCopy);
}

metakernel
copy_instances_indirect( qword bvh_buffer, qword instanceDescsBuffer, qword indirectBuildRangeInfo )
{

    define num_groups REG0;
    define C_2 REG2;
    define C_3 REG3;

    C_2 = 2;
    C_3 = 3;

    // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
    // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
    num_groups = load_dword( indirectBuildRangeInfo );
    num_groups = num_groups >> C_2;
    num_groups = num_groups + C_3;

    DISPATCHDIM_X = num_groups.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect opencl_kernel_copy_instances_indirect args(
        bvh_buffer,
        instanceDescsBuffer,
        indirectBuildRangeInfo);
}

metakernel copy_instance_ptrs(
    qword bvh_buffer,
    qword instanceDescPtrsBuffer,
    qword totalSizeToCopy,
    dword numThreads)
{
    dispatch opencl_kernel_copy_instance_ptrs (numThreads, 1, 1) args(
        bvh_buffer,
        instanceDescPtrsBuffer,
        totalSizeToCopy);
}

metakernel copy_instance_ptrs_indirect(
    qword bvh_buffer,
    qword instanceDescPtrsBuffer,
    qword indirectBuildRangeInfo)
{
    define num_groups REG0;
    define C_2 REG2;
    define C_3 REG3;

    C_2 = 2;
    C_3 = 3;

    // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
    // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
    num_groups = load_dword( indirectBuildRangeInfo );
    num_groups = num_groups >> C_2;
    num_groups = num_groups + C_3;

    DISPATCHDIM_X = num_groups.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect opencl_kernel_copy_instance_ptrs_indirect args(
        bvh_buffer,
        instanceDescPtrsBuffer,
        indirectBuildRangeInfo);
}

metakernel copy_instances_base_ptr(
    qword bvh_buffer,
    qword instanceDescsBuffer,
    qword totalSizeToCopy,
    dword numThreads)
{
    dispatch opencl_kernel_copy_instances_base_ptr (numThreads, 1, 1) args(
        bvh_buffer,
        instanceDescsBuffer,
        totalSizeToCopy);
}

metakernel copy_instances_base_ptr_indirect(
    qword bvh_buffer,
    qword instanceDescsBuffer,
    qword indirectBuildRangeInfo)
{
    define num_groups REG0;
    define C_2 REG2;
    define C_3 REG3;

    C_2 = 2;
    C_3 = 3;

    // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
    // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
    num_groups = load_dword( indirectBuildRangeInfo );
    num_groups = num_groups >> C_2;
    num_groups = num_groups + C_3;

    DISPATCHDIM_X = num_groups.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect opencl_kernel_copy_instances_base_ptr_indirect args(
        bvh_buffer,
        instanceDescsBuffer,
        indirectBuildRangeInfo);
}

metakernel copy_instance_ptrs_base_ptr(
    qword bvh_buffer,
    qword instanceDescPtrsBuffer,
    qword totalSizeToCopy,
    dword numThreads)
{
    dispatch opencl_kernel_copy_instance_ptrs_base_ptr (numThreads, 1, 1) args(
        bvh_buffer,
        instanceDescPtrsBuffer,
        totalSizeToCopy);
}

metakernel copy_instance_ptrs_base_ptr_indirect(
    qword bvh_buffer,
    qword instanceDescPtrsBuffer,
    qword indirectBuildRangeInfo)
{
    define num_groups REG0;
    define C_2 REG2;
    define C_3 REG3;

    C_2 = 2;
    C_3 = 3;

    // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
    // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
    num_groups = load_dword( indirectBuildRangeInfo );
    num_groups = num_groups >> C_2;
    num_groups = num_groups + C_3;

    DISPATCHDIM_X = num_groups.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect opencl_kernel_copy_instance_ptrs_base_ptr_indirect args(
        bvh_buffer,
        instanceDescPtrsBuffer,
        indirectBuildRangeInfo);
}

metakernel copy_geo_descs(
    qword private_dest,
    qword transient_src,
    qword indirectBuildRangeInfo,
    dword numGeometries)
{

    define num_groups (numGeometries + 16 - 1) / 16;
    dispatch opencl_kernel_copy_geo_descs_indirect_build(num_groups, 1, 1) args(
        private_dest,
        transient_src,
        indirectBuildRangeInfo,
        numGeometries);
}

metakernel copy_geo_meta_data(
    qword bvh_buffer,
    qword geomdesc_buffer,
    qword totalSizeToCopy,
    dword numThreads)
{
    dispatch opencl_kernel_copy_geo_meta_data (numThreads, 1, 1) args(
        bvh_buffer,
        geomdesc_buffer,
        totalSizeToCopy);
}


const COPY_MOCK_GROUP_SIZE = 16;

metakernel copy_mock(
    qword dest,
    qword src,
    dword size)
{
    define num_groups (size + COPY_MOCK_GROUP_SIZE - 1) / COPY_MOCK_GROUP_SIZE;
    dispatch opencl_kernel_copy_mock(num_groups, 1, 1) args(
        dest,
        src,
        size);
}

metakernel memset(
    qword dest,
    dword byte,
    dword size)
{
    define num_groups (size + 32 - 1) / 32;
    dispatch opencl_kernel_memset(num_groups, 1, 1) args(
        dest,
        byte,
        size);
}

metakernel memset_size_ptr(
    qword dest,
    dword byte,
    qword sizePtr)
{
    define byteSize REG0;
    define C_32 REG1; C_32 = 32;
    define C_1 REG2; C_1 = 1;
    define C_4 REG3; C_4 = 4;
    define numGroupsRqd REG4;

    byteSize = load_dword(sizePtr);

    numGroupsRqd = byteSize + C_32;
    numGroupsRqd = numGroupsRqd - C_1;
    numGroupsRqd = numGroupsRqd >> C_4;
    numGroupsRqd = numGroupsRqd >> C_1;

    DISPATCHDIM_X = numGroupsRqd.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_kernel_memset_size_ptr args(
|
||||
dest,
|
||||
byte,
|
||||
sizePtr);
|
||||
}
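// Note (illustrative): the two register shifts above together divide by 32, i.e.
// numGroupsRqd = (byteSize + 32 - 1) >> 4 >> 1 = (byteSize + 31) / 32,
// matching the "(size + 32 - 1) / 32" group count used by the direct memset metakernel above.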
|
||||
|
|
@ -1,386 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "input_client_structs.h"
|
||||
#include "common.h"
|
||||
#include "instance.h"
|
||||
|
||||
#define DBG(x)
|
||||
#define ENABLE_CHECKS 0
|
||||
|
||||
/*

This kernel implements an exclusive scan addition operation. The
implementation currently only uses one DSS.

*/
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
parallel_scan_exclusive_add(global uint *input,
|
||||
global uint *output,
|
||||
const uint N)
|
||||
{
|
||||
const uint j = get_local_id(0);
|
||||
const uint J = get_local_size(0);
|
||||
const uint BLOCKSIZE = (N + J - 1) / J;
|
||||
const uint start = min((j + 0) * BLOCKSIZE, N);
|
||||
const uint end = min((j + 1) * BLOCKSIZE, N);
|
||||
|
||||
uint base = 0;
|
||||
for (uint i = start; i < end; i++)
|
||||
base += input[i];
|
||||
|
||||
base = work_group_scan_exclusive_add(base);
|
||||
|
||||
uint accu = 0;
|
||||
for (uint i = start; i < end; i++)
|
||||
{
|
||||
output[i] = base + accu;
|
||||
accu += input[i];
|
||||
}
|
||||
}
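/*
 * Illustrative reference (not part of the original file): the sequential semantics that the
 * work-group kernel above parallelises. Each output element receives the sum of all inputs
 * strictly before it, so output[0] == 0.
 */
void reference_exclusive_scan_add(const uint *input, uint *output, const uint N)
{
    uint running = 0;
    for (uint i = 0; i < N; i++)
    {
        output[i] = running;   /* sum of input[0..i-1] */
        running += input[i];
    }
}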
|
||||
|
||||
/*

This kernel implements an exclusive scan addition operation that can use the entire GPU.

*/
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
parallel_scan_exclusive_add_phase0(global uint *input,
|
||||
global uint *output,
|
||||
global uint *prefix_sums,
|
||||
const uint N)
|
||||
{
|
||||
const uint local_size = get_local_size(0);
|
||||
const uint numTasks = get_num_groups(0);
|
||||
const uint groupID = get_group_id(0);
|
||||
const uint localID = get_local_id(0);
|
||||
const uint global_startID = (groupID + 0) * N / numTasks;
|
||||
const uint global_endID = (groupID + 1) * N / numTasks;
|
||||
|
||||
uint base = 0;
|
||||
for (uint i = global_startID + localID; i < global_endID; i += local_size)
|
||||
base += input[i];
|
||||
|
||||
base = work_group_reduce_add(base);
|
||||
|
||||
if (localID == 0)
|
||||
{
|
||||
prefix_sums[groupID] = base;
|
||||
printf("%d -> %d \n", groupID, base);
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
parallel_scan_exclusive_add_phase1(global uint *input,
|
||||
global uint *output,
|
||||
global uint *prefix_sums,
|
||||
const uint N)
|
||||
{
|
||||
const uint local_size = get_local_size(0);
|
||||
const uint numTasks = get_num_groups(0);
|
||||
const uint groupID = get_group_id(0);
|
||||
const uint localID = get_local_id(0);
|
||||
const uint global_startID = (groupID + 0) * N / numTasks;
|
||||
const uint global_endID = (groupID + 1) * N / numTasks;
|
||||
const uint local_range = global_endID - global_startID;
|
||||
|
||||
uint global_base = 0;
|
||||
for (uint i = 0; i < groupID; i++)
|
||||
global_base += prefix_sums[i];
|
||||
|
||||
const uint j = get_local_id(0);
|
||||
const uint J = get_local_size(0);
|
||||
const uint BLOCKSIZE = (local_range + J - 1) / J;
|
||||
const uint startID = (j + 0) * local_range / J + global_startID;
|
||||
const uint endID = (j + 1) * local_range / J + global_startID;
|
||||
|
||||
uint base = 0;
|
||||
for (uint i = startID; i < endID; i++)
|
||||
base += input[i];
|
||||
|
||||
base = work_group_scan_exclusive_add(base);
|
||||
|
||||
uint accu = 0;
|
||||
for (uint i = startID; i < endID; i++)
|
||||
{
|
||||
output[i] = global_base + base + accu;
|
||||
accu += input[i];
|
||||
}
|
||||
}
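/*
 * Illustrative note (not part of the original file): phase0 reduces each group's slice of the
 * input into prefix_sums[groupID]; phase1 then sums prefix_sums[0..groupID-1] to obtain a global
 * base and performs the local exclusive scan on top of it, so the two dispatches together produce
 * the same result as the single work-group kernel above, for any N.
 */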
|
||||
|
||||
/* ========================================================================= */
|
||||
/* ============================== STATISTICS =============================== */
|
||||
/* ========================================================================= */
|
||||
|
||||
/* ====== STATS config ====== */
|
||||
|
||||
#define ENABLE_STAT_CHECKS 1
|
||||
#define DBG_STATS(x)
|
||||
|
||||
__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
printBVHStatistics(global struct Globals *globals,
|
||||
global char *bvh_mem,
|
||||
global struct StatStackEntry *global_stack0,
|
||||
global struct StatStackEntry *global_stack1,
|
||||
const uint presplit)
|
||||
{
|
||||
const uint globalID = get_global_id(0);
|
||||
const uint localID = get_local_id(0);
|
||||
const uint local_size = get_local_size(0);
|
||||
|
||||
struct BVHBase *base = (struct BVHBase *)bvh_mem;
|
||||
const uint root = base->rootNodeOffset;
|
||||
|
||||
local uint stack_items[2];
|
||||
local uint iterations;
|
||||
|
||||
struct AABB root_aabb = getAABB_QBVHNodeN((global struct QBVHNodeN *)(bvh_mem + root));
|
||||
root_aabb = conservativeAABB(&root_aabb);
|
||||
const float root_area = AABB_halfArea(&root_aabb);
|
||||
|
||||
global struct QBVHNodeN *root_node = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset);
|
||||
|
||||
if (root_node->type != BVH_INTERNAL_NODE)
|
||||
{
|
||||
const uint numChildren = getNumChildren_QBVHNodeN(root_node);
|
||||
const uint current = root;
|
||||
for (uint i = 0; i < numChildren; i++)
|
||||
{
|
||||
struct AABB aabb = extractAABB_QBVHNodeN(root_node, i);
|
||||
const float area = AABB_halfArea(&aabb);
|
||||
|
||||
global_stack0[i].node = current + root_node->offset * 64 + i * sizeof(struct Quad);
|
||||
global_stack0[i].type = root_node->type;
|
||||
global_stack0[i].area = area;
|
||||
global_stack0[i].aabb = aabb;
|
||||
global_stack0[i].depth = 0;
|
||||
}
|
||||
stack_items[0] = numChildren;
|
||||
stack_items[1] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
global_stack0[0].node = root;
|
||||
global_stack0[0].type = root_node->type;
|
||||
global_stack0[0].area = root_area;
|
||||
global_stack0[0].aabb = root_aabb;
|
||||
global_stack0[0].depth = 1;
|
||||
stack_items[0] = 1;
|
||||
stack_items[1] = 0;
|
||||
}
|
||||
|
||||
const uint maxInnerNodeOffset = globals->node_mem_allocator.cur;
|
||||
const uint maxLeafNodeOffset = globals->quad_mem_allocator.cur;
|
||||
|
||||
DBG_STATS(if (localID == 0) printf("diff %d \n", (globals->node_mem_allocator_cur - globals->node_mem_allocator_start) / 64));
|
||||
|
||||
iterations = 0;
|
||||
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
float sah_nodes = 0.0f;
|
||||
float sah_leaves = 0.0f;
|
||||
uint leaves = 0;
|
||||
uint inner_nodes = 0;
|
||||
uint max_depth = 0;
|
||||
uint leaf_items = 0;
|
||||
uint inner_nodes_valid_children = 0;
|
||||
|
||||
while (1)
|
||||
{
|
||||
work_group_barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
const uint buffer_index = (iterations % 2) == 0 ? 0 : 1;
|
||||
global struct StatStackEntry *input_global_stack = buffer_index == 0 ? global_stack0 : global_stack1;
|
||||
global struct StatStackEntry *output_global_stack = buffer_index == 0 ? global_stack1 : global_stack0;
|
||||
|
||||
const uint local_stack_items = stack_items[buffer_index];
|
||||
stack_items[1 - buffer_index] = 0;
|
||||
|
||||
DBG_STATS(if (globalID == 0) printf("iterations %d local_stack_items %d \n", iterations, local_stack_items));
|
||||
|
||||
if (local_stack_items == 0)
|
||||
break;
|
||||
//if (iterations == 5) break;
|
||||
|
||||
work_group_barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
if (globalID == 0)
|
||||
iterations++;
|
||||
|
||||
for (uint sindex = localID; sindex < local_stack_items; sindex += local_size)
|
||||
{
|
||||
|
||||
uint current = input_global_stack[sindex].node;
|
||||
uint type = input_global_stack[sindex].type;
|
||||
float current_area = input_global_stack[sindex].area;
|
||||
struct AABB current_aabb = input_global_stack[sindex].aabb;
|
||||
uint current_depth = input_global_stack[sindex].depth;
|
||||
|
||||
//printf("localID %d sindex %d current %d type %d local_stack_items %d \n",localID,sindex,current,type,local_stack_items);
|
||||
|
||||
max_depth = max(max_depth, current_depth);
|
||||
|
||||
if (type == BVH_QUAD_NODE)
|
||||
{
|
||||
unsigned int prims = 1; //getNumLeafPrims(current);
|
||||
if (prims > BVH_LEAF_N_MAX)
|
||||
printf("too many items in leaf %d \n", prims);
|
||||
unsigned int prims_offset = current; //getLeafOffset(current);
|
||||
//printf("prims_offset %d \n",prims_offset);
|
||||
|
||||
leaf_items += prims;
|
||||
sah_leaves += current_area;
|
||||
leaves++;
|
||||
#if ENABLE_STAT_CHECKS == 1
|
||||
struct AABB leafAABB;
|
||||
AABB_init(&leafAABB);
|
||||
|
||||
global struct Quad *quads = (global struct Quad *)(bvh_mem + prims_offset);
|
||||
//printf("prims_offset %d \n",prims_offset);
|
||||
|
||||
for (uint i = 0; i < prims; i++)
|
||||
{
|
||||
struct AABB quadAABB = getAABB_Quad(&quads[i]);
|
||||
AABB_extend(&leafAABB, &quadAABB);
|
||||
}
|
||||
|
||||
if (!presplit && !AABB_subset(&leafAABB, ¤t_aabb))
|
||||
{
|
||||
printf("leaf error: current %d depth %d \n", current, current_depth);
|
||||
AABB_print(¤t_aabb);
|
||||
printf("leaf bounds: \n");
|
||||
AABB_print(&leafAABB);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if (type == BVH_INTERNAL_NODE)
|
||||
{
|
||||
inner_nodes++;
|
||||
sah_nodes += current_area;
|
||||
global struct QBVHNodeN *nodeN = (global struct QBVHNodeN *)(bvh_mem + current);
|
||||
|
||||
uint children = 0;
|
||||
for (uint i = 0; i < BVH_NODE_N6; i++)
|
||||
{
|
||||
if (nodeN->qbounds.lower_x[i] > nodeN->qbounds.upper_x[i])
|
||||
break;
|
||||
children++;
|
||||
}
|
||||
//printf("children %d \n",children);
|
||||
|
||||
#if ENABLE_STAT_CHECKS == 1
|
||||
if (children > BVH_NODE_N6 || children == 0)
|
||||
{
|
||||
printf("#children not in valid range: %d offset %d localID %d \n", children, current, localID);
|
||||
printQBVHNodeN(nodeN);
|
||||
}
|
||||
|
||||
if (nodeN->offset > globals->totalAllocatedMem || (int)nodeN->offset < 0)
|
||||
{
|
||||
printf("offset error %d \n", nodeN->offset);
|
||||
}
|
||||
#endif
|
||||
|
||||
uint children_offset = atomic_add(&stack_items[1 - buffer_index], children);
|
||||
|
||||
for (uint i = 0; i < children; i++)
|
||||
{
|
||||
inner_nodes_valid_children++;
|
||||
|
||||
struct AABB aabb = extractAABB_QBVHNodeN(nodeN, i);
|
||||
const float area = AABB_halfArea(&aabb);
|
||||
|
||||
aabb = conservativeAABB(&aabb);
|
||||
|
||||
#if 0 // ENABLE_STAT_CHECKS == 1 // FIXME: not clear whether parent child property still holds !!!!
|
||||
|
||||
// if (aabb.lower.x == (float)(INFINITY))
|
||||
// {
|
||||
// printf("aabb inf error %d current %d nodeN %d \n",i, current, children);
|
||||
// break;
|
||||
// }
|
||||
|
||||
|
||||
if (!presplit && !AABB_subset(&aabb,¤t_aabb))
|
||||
{
|
||||
printf("Parent: current %d depth %d children %d \n",current, current_depth, children);
|
||||
AABB_print(¤t_aabb);
|
||||
printf("Child %d: \n",i);
|
||||
AABB_print(&aabb);
|
||||
}
|
||||
#endif
|
||||
|
||||
uint dest_index = children_offset + i;
|
||||
if (nodeN->type == BVH_QUAD_NODE)
|
||||
{
|
||||
output_global_stack[dest_index].node = current + nodeN->offset * 64 + i * sizeof(struct Quad);
|
||||
if (output_global_stack[dest_index].node >= maxLeafNodeOffset)
|
||||
{
|
||||
printf("stack leaf offset error %d %d current %d %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64);
|
||||
}
|
||||
}
|
||||
else if (nodeN->type == BVH_INTERNAL_NODE)
|
||||
{
|
||||
output_global_stack[dest_index].node = (current + nodeN->offset * 64 + i * sizeof(struct QBVHNodeN));
|
||||
if (output_global_stack[dest_index].node >= maxInnerNodeOffset)
|
||||
{
|
||||
printf("stack inner node offset error %d %d current %d %d maxInnerNodeOffset %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64, maxInnerNodeOffset);
|
||||
}
|
||||
}
|
||||
|
||||
output_global_stack[dest_index].type = nodeN->type;
|
||||
output_global_stack[dest_index].area = area;
|
||||
output_global_stack[dest_index].aabb = aabb;
|
||||
output_global_stack[dest_index].depth = current_depth + 1;
|
||||
//printf("global_stack[dest_index].node %d global_stack[dest_index].type %d \n",global_stack[dest_index].node,global_stack[dest_index].type);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sah_nodes = work_group_reduce_add(sah_nodes);
|
||||
sah_leaves = work_group_reduce_add(sah_leaves);
|
||||
leaves = work_group_reduce_add(leaves);
|
||||
inner_nodes = work_group_reduce_add(inner_nodes);
|
||||
max_depth = work_group_reduce_max(max_depth);
|
||||
leaf_items = work_group_reduce_add(leaf_items);
|
||||
inner_nodes_valid_children = work_group_reduce_add(inner_nodes_valid_children);
|
||||
|
||||
if (globalID == 0)
|
||||
{
|
||||
/*
|
||||
sah_nodes *= 1.0f / root_area;
|
||||
sah_leaves *= 1.0f / root_area;
|
||||
float sah = sah_nodes + sah_leaves;
|
||||
|
||||
const uint globalLeafMemAllocatorOffset = globals->quad_mem_allocator.start;
|
||||
const uint totalAllocatedMem = globals->totalAllocatedMem;
|
||||
|
||||
printf("BVH_NODE_N6 %d BVH_LEAF_N_MIN %d BVH_LEAF_N_MAX %d \n",BVH_NODE_N6,BVH_LEAF_N_MIN,BVH_LEAF_N_MAX);
|
||||
float node_util = 100.0f * (float)inner_nodes_valid_children / (inner_nodes * BVH_NODE_N6);
|
||||
float leaf_util = 100.0f * (float)leaf_items / (leaves);
|
||||
printf("allocators: node %d -> %d ; leaf %d -> %d \n",globals->node_mem_allocator_cur,globals->node_mem_allocator_start,globals->leaf_mem_allocator_cur,globals->leaf_mem_allocator_start);
|
||||
printf("inner nodes %d leaves %d sah %f sah_node %f sah_leaves %f max_depth %d leaf_items %d node util %f leaf util %f (%f) \n",inner_nodes,leaves,sah,sah_nodes,sah_leaves,max_depth,leaf_items,node_util,leaf_util,(float)leaf_items / leaves);
|
||||
uint node_mem = globals->node_mem_allocator_cur;
|
||||
uint max_node_mem = globalLeafMemAllocatorOffset;
|
||||
float node_mem_ratio = 100.0f * (float)node_mem / max_node_mem;
|
||||
|
||||
uint leaf_mem = globals->leaf_mem_allocator.cur - globalLeafMemAllocatorOffset;
|
||||
uint max_leaf_mem = totalAllocatedMem - globalLeafMemAllocatorOffset;
|
||||
float leaf_mem_ratio = 100.0f * (float)leaf_mem / max_leaf_mem;
|
||||
|
||||
uint total_mem = node_mem + leaf_mem;
|
||||
float total_mem_ratio = 100.0f * (float)total_mem / totalAllocatedMem;
|
||||
|
||||
printf("used node memory %d (%f) / used leaf memory %d (%f) / total memory used %d (%f) / total memory allocated %d \n",node_mem, node_mem_ratio, leaf_mem, leaf_mem_ratio, total_mem, total_mem_ratio, totalAllocatedMem);
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
|
@ -1,196 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
//
|
||||
// This file contains structure definitions shared by GRL OCL kernels and host code
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GRLGen12.h"
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
GRL_NAMESPACE_BEGIN(RTAS)
|
||||
GRL_NAMESPACE_BEGIN(MISC)
|
||||
|
||||
struct BatchedInitGlobalsData
|
||||
{
|
||||
qword p_build_globals;
|
||||
qword p_bvh_buffer;
|
||||
dword numPrimitives;
|
||||
dword numGeometries;
|
||||
dword numInstances;
|
||||
dword instance_descs_start;
|
||||
dword geo_meta_data_start;
|
||||
dword node_data_start;
|
||||
dword leaf_data_start;
|
||||
dword procedural_data_start;
|
||||
dword back_pointer_start;
|
||||
dword sizeTotal;
|
||||
dword leafType;
|
||||
dword leafSize;
|
||||
dword fatleaf_table_start;
|
||||
dword innernode_table_start;
|
||||
dword quad_indices_data_start;
|
||||
};
|
||||
|
||||
/// Header of the debug buffer
///
/// The header is placed at the beginning of the debug buffer.
/// After the header there is a circular buffer space
typedef struct DebugBufferHeader
{
    /// Offset to the beginning of the buffer (after the header)
    dword headStart;
    /// Offset to free memory in the buffer (used by the GPU)
    dword gpuHead;
    /// Offset to the end of data in the buffer that is ready to read (read on CPU, set on GPU, might be behind gpuHead)
    dword cpuHead;
    /// Flag for buffer overflow
    dword overflow;
    /// Total size of the buffer
    dword totalSize;
    /// Padding needed because otherwise the GPU overwrites the tail with a cacheline flush
    dword pad[11];
    /// Offset to the beginning of data in the buffer
    dword tail;
} DebugBufferHeader;
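// Illustrative host-side sketch (not part of GRL): how many bytes are ready to read could be
// derived from this header, assuming tail and cpuHead are byte offsets into a circular region
// of totalSize bytes. The helper name is made up for this example.
static inline dword DebugBufferHeader_readable_bytes(const DebugBufferHeader* h)
{
    if (h->cpuHead >= h->tail)
        return h->cpuHead - h->tail;                  /* contiguous span [tail, cpuHead) */
    return (h->totalSize - h->tail) + h->cpuHead;     /* data wraps around the buffer end */
}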
|
||||
|
||||
enum InputDumpOperationType
|
||||
{
|
||||
INPUT_DUMP_OP_NOP,
|
||||
INPUT_DUMP_OP_BATCH,
|
||||
INPUT_DUMP_OP_BUILD,
|
||||
INPUT_DUMP_OP_UPDATE,
|
||||
INPUT_DUMP_OP_CLONE,
|
||||
INPUT_DUMP_OP_COMPACT,
|
||||
INPUT_DUMP_OP_SERIALIZE,
|
||||
INPUT_DUMP_OP_DESERIALIZE,
|
||||
INPUT_DUMP_OP_END_BUFFER
|
||||
};
|
||||
|
||||
// each operation starts with the same header structure and looks like this

// some defined struct { <-----------------start
// OpHeader
// .... struct type specific data
// }
// ... auxiliary data of variable length
// <-------------------------------------- end - indicated by endOfData
|
||||
typedef struct OpHeader
|
||||
{
|
||||
dword operationType;
|
||||
dword endOfData; // offset to end of this primitive
|
||||
} OpHeader;
|
||||
|
||||
// header for batch operations
|
||||
typedef struct BatchOpHeader
|
||||
{
|
||||
OpHeader opHeader;
|
||||
} BatchOpHeader;
|
||||
|
||||
// interpretation for operationType INPUT_DUMP_OP_BATCH
|
||||
typedef struct InputBatch
|
||||
{
|
||||
BatchOpHeader header;
|
||||
qword batchId;
|
||||
dword vertexBufferDataSize;
|
||||
dword firstContainedOpOffset;
|
||||
|
||||
// layout of batch is as below, each line is 128B aligned:
|
||||
|
||||
//
|
||||
// InputBatch <-------------------------------- start
|
||||
// optional: batchVertexData
|
||||
// InputBuildDesc/InputCopy <------------------ start + firstContainedOpOffset
|
||||
// optional: extra data of above token
|
||||
// InputBuildDesc/InputCopy
|
||||
// optional: extra data of above token
|
||||
// ...
|
||||
// InputBuildDesc/InputCopy
|
||||
// optional: extra data of above token
|
||||
// <-------------------------------------------- end = start + endOfData
|
||||
} InputBatch;
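// Illustrative sketch (not existing GRL host code) of locating the first token described by the
// layout comment above. It assumes firstContainedOpOffset is a byte offset from the start of the
// InputBatch, as the layout indicates; the helper name is made up for this example.
static const OpHeader* InputBatch_first_op(const InputBatch* batch)
{
    const char* start = (const char*)batch;
    /* first contained InputBuildDesc/InputCopy token, 128B aligned per the layout above */
    return (const OpHeader*)(start + batch->firstContainedOpOffset);
}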
|
||||
|
||||
// for operationType:
|
||||
// INPUT_DUMP_OP_BUILD,
|
||||
// INPUT_DUMP_OP_UPDATE,
|
||||
// followed by auxiliary data of variable length
|
||||
typedef struct InputBuild
|
||||
{
|
||||
OpHeader header;
|
||||
qword srcBvhPtr;
|
||||
qword dstBvhPtr;
|
||||
dword flags;
|
||||
dword numGeos;
|
||||
dword numInstances;
|
||||
dword instArrayOfPtrs;
|
||||
} InputBuild;
|
||||
|
||||
// for operationType:
|
||||
// INPUT_DUMP_OP_CLONE,
|
||||
// INPUT_DUMP_OP_COMPACT,
|
||||
// INPUT_DUMP_OP_SERIALIZE,
|
||||
//
|
||||
// Not for INPUT_DUMP_OP_DESERIALIZE!
|
||||
typedef struct InputCopy
|
||||
{
|
||||
OpHeader header;
|
||||
qword srcBvhPtr;
|
||||
qword dstBvhPtr;
|
||||
} InputCopy;
|
||||
|
||||
// for INPUT_DUMP_OP_DESERIALIZE
|
||||
// decode for debug tools follows this format
|
||||
typedef struct InputDeserialize
|
||||
{
|
||||
OpHeader header;
|
||||
qword dstBvhPtr;
|
||||
} InputDeserialize;
|
||||
|
||||
typedef struct InputBatchPtrs
|
||||
{
|
||||
qword dumpDst;
|
||||
qword globalDumpBuffer;
|
||||
qword nonVertexDataStart;
|
||||
dword vertexBuffersSize;
|
||||
dword totalSize;
|
||||
} InputBatchPtrs;
|
||||
|
||||
enum OutputDumpOperationType
|
||||
{
|
||||
OUTPUT_DUMP_OP_NOP,
|
||||
OUTPUT_DUMP_OP_BATCH,
|
||||
OUTPUT_DUMP_OP_DATA,
|
||||
OUTPUT_DUMP_OP_END_BUFFER
|
||||
};
|
||||
|
||||
// interpretation for operationType OUTPUT_DUMP_OP_BATCH
|
||||
typedef struct OutputBatch {
|
||||
BatchOpHeader header;
|
||||
qword batchId;
|
||||
dword firstContainedOpOffset;
|
||||
} OutputBatch;
|
||||
|
||||
// interpretation for operationType OUTPUT_DUMP_OP_DATA
|
||||
typedef struct OutputData
|
||||
{
|
||||
OpHeader header;
|
||||
qword srcBvhPtr;
|
||||
} OutputData;
|
||||
|
||||
typedef struct OutputBatchPtrs
|
||||
{
|
||||
qword dumpDst;
|
||||
qword dataStart;
|
||||
dword dataSize;
|
||||
dword totalSize;
|
||||
} OutputBatchPtrs;
|
||||
|
||||
GRL_NAMESPACE_END(MISC)
|
||||
GRL_NAMESPACE_END(RTAS)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@ -1,245 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2022 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define MORTON_DEBUG_CHECKS 0
|
||||
#define MORTON_VERBOSE_LOG 0
|
||||
|
||||
GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift )
|
||||
{
|
||||
#if 0 // turned off, because the current hierarchy build requires a full sort
    // Difference between the max iterations needed for LSB sorting and
    // the number of iterations needed for LSB sorting without primIDs.
    // This indicates how many of the first iterations would be skipped in LSB
|
||||
return 8 - (8 - (shift >> 3));
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
typedef struct BuildRecordLocalMortonFlattener
|
||||
{
|
||||
unsigned int leftChild; // global
|
||||
unsigned int rightChild; // global
|
||||
unsigned int rangeStart; // global
|
||||
unsigned int local_parent_index__numItems;
|
||||
} BuildRecordLocalMortonFlattener;
|
||||
|
||||
// TODO: Currently sizeof UPerNodeData is 32, AABB struct allocates more data than needed and can be reduced
|
||||
typedef union UPerNodeData {
|
||||
float4 four_DWs;
|
||||
BuildRecordLocalMortonFlattener buildRecord;
|
||||
MortonFlattenedBoxlessNode boxlessNode;
|
||||
struct AABB box;
|
||||
} UPerNodeData;
|
||||
|
||||
GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn)
|
||||
{
|
||||
return bn.childOffset_type >> 6;
|
||||
}
|
||||
|
||||
GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn)
|
||||
{
|
||||
return bn.childOffset_type & ((1<<6) -1);
|
||||
}
|
||||
|
||||
GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
|
||||
{
|
||||
short lane_used = index % get_sub_group_size();
|
||||
short shift = (index / get_sub_group_size()) * get_sub_group_size();
|
||||
if (lane_used == lane) {
|
||||
*arr |= (val << shift);
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane)
|
||||
{
|
||||
short r = 0;
|
||||
short lane_used = index % get_sub_group_size();
|
||||
short shift = (index / get_sub_group_size()) * get_sub_group_size();
|
||||
r = arr >> shift;
|
||||
r = sub_group_broadcast(r, lane_used);
|
||||
return r;
|
||||
}
|
||||
|
||||
GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst)
|
||||
{
|
||||
if (lane < count)
|
||||
{
|
||||
dst[lane]=(ushort)(arr & 0xFFFF);
|
||||
short hi_idx = lane + get_sub_group_size();
|
||||
if (hi_idx < count) {
|
||||
dst[hi_idx] = (ushort)(arr >> 16);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane)
|
||||
{
|
||||
if (lane < count)
|
||||
{
|
||||
*arr = src[lane];
|
||||
short hi_idx = lane + get_sub_group_size();
|
||||
if (hi_idx < count) {
|
||||
*arr |= ((uint)(src[hi_idx])) << 16u;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane)
|
||||
{
|
||||
short lane_used = index % get_sub_group_size();
|
||||
short shift = (index / get_sub_group_size()) * get_sub_group_size();
|
||||
if (lane_used == lane) {
|
||||
        uint rem_val = (*arr) & (0xFFFF0000 >> shift); //calculate the remaining other half in the uint
|
||||
*arr = (val << shift) | rem_val;
|
||||
}
|
||||
}
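// Layout note (summarising the helpers above, assuming a sub-group size of 16): the "2xSG" array
// stores up to 32 ushort entries across the sub-group by packing two entries per lane into one
// uint. Entry i lives in lane (i % 16); entries 0..15 occupy the low 16 bits and entries 16..31
// the high 16 bits. For example, index 19 maps to lane 3, high half, so
// get_from_2xSG_arr(19, arr, lane) broadcasts (arr >> 16) from lane 3.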
|
||||
|
||||
GRL_INLINE void SUBGROUP_refit_bottom_up_local(
|
||||
uniform struct QBVHNodeN* globalNodeData,
|
||||
uniform struct BackPointers* backPointers,
|
||||
uniform uint treeletRootGlobalIndex,
|
||||
uniform uint globalBaseForInternalNodes,
|
||||
varying ushort lane,
|
||||
uniform local union UPerNodeData* local_nodes,
|
||||
varying uint sg_bu_startpoints,
|
||||
uniform uint sg_bu_startpoints_cnt)
|
||||
{
|
||||
if(sg_bu_startpoints_cnt == 0)
|
||||
return;
|
||||
|
||||
const uint head_lane = 0;
|
||||
uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
|
||||
|
||||
uniform uint prev_loc_index = 0;
|
||||
uniform struct AABB child_aabb; // this carries reduced aabb between loop turns
|
||||
|
||||
uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
|
||||
|
||||
while (curNodeIndex != 0)
|
||||
{
|
||||
uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode);
|
||||
uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
|
||||
varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane;
|
||||
|
||||
uint numChildren = BackPointer_GetNumChildren(backpointer);
|
||||
if (child_loc_idx != prev_loc_index &&
|
||||
lane < numChildren)
|
||||
{
|
||||
child_aabb = local_nodes[child_loc_idx].box;
|
||||
}
|
||||
else if (lane >= numChildren) {
|
||||
AABB_init(&child_aabb);
|
||||
child_aabb.lower.w = as_float(0u);
|
||||
}
|
||||
|
||||
// TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
|
||||
struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
|
||||
reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 );
|
||||
|
||||
uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
|
||||
reduced_bounds.lower.w = as_float((uint)instMask);
|
||||
uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0);
|
||||
local uint* pbox = (local uint*)(local_nodes+ curNodeIndex);
|
||||
if (lane < 8)
|
||||
{
|
||||
pbox[lane] = reduce_bounds_lane;
|
||||
}
|
||||
|
||||
uint global_node_idx = globalBaseForInternalNodes + curNodeIndex;
|
||||
/* get bounds of all children from child nodes directly */
|
||||
struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
|
||||
subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false);
|
||||
child_aabb = reduced_bounds;
|
||||
uint parentIndex = BackPointer_GetParentIndex(backpointer);
|
||||
|
||||
write_mem_fence(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lane == 0)
|
||||
{
|
||||
backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer));
|
||||
uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex;
|
||||
uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3);
|
||||
|
||||
/* set global back pointer */
|
||||
*InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer;
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n",
|
||||
global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx);
|
||||
#endif
|
||||
}
|
||||
|
||||
backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
|
||||
prev_loc_index = curNodeIndex;
|
||||
curNodeIndex = parentIndex;
|
||||
|
||||
/* if all children got refitted, then continue */
|
||||
uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
|
||||
uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
|
||||
if (numChildrenRefitted != numChildrenTotal)
|
||||
{
|
||||
if(sg_bu_startpoints_cnt)
|
||||
{
|
||||
curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
|
||||
backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
|
||||
}
|
||||
else
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// process root of the treelet
|
||||
{
|
||||
|
||||
#if MORTON_DEBUG_CHECKS
|
||||
if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
|
||||
#endif
|
||||
|
||||
uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode);
|
||||
varying uint child_loc_idx = lead_child_loc_offset + 0 + lane;
|
||||
uint numChildren = BackPointer_GetNumChildren(backpointer);
|
||||
|
||||
if (child_loc_idx != prev_loc_index &&
|
||||
lane < numChildren)
|
||||
{
|
||||
child_aabb = local_nodes[child_loc_idx].box;
|
||||
}
|
||||
else if (lane >= numChildren) {
|
||||
AABB_init(&child_aabb);
|
||||
child_aabb.lower.w = as_float(0u);
|
||||
}
|
||||
|
||||
// TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
|
||||
uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
|
||||
uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
|
||||
uint global_node_idx = treeletRootGlobalIndex;
|
||||
uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset;
|
||||
|
||||
/* get bounds of all children from child nodes directly */
|
||||
struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
|
||||
|
||||
subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, instMask, qnode, false);
|
||||
|
||||
/* reset refit counter for next refit */
|
||||
if (lane == 0)
|
||||
{
|
||||
/* set global back pointer */
|
||||
*InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u);
|
||||
|
||||
// TODO: Move AABBs to separate buffer, but for now communicate bottom-tip boxes through qnodes
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
|
||||
curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,400 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2022 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
#include "morton/morton_common.h"
|
||||
|
||||
GRL_INLINE void SUBGROUP_create_node_phase0(
|
||||
uniform global struct Globals* globals,
|
||||
uniform global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
uniform global char* bvh_mem,
|
||||
uniform global uint *global_refit_startpoints,
|
||||
uniform uint rID,
|
||||
uniform local uint* local_numRecords,
|
||||
uniform local uint* local_QNodeOffset,
|
||||
uniform global struct BuildRecordMorton* records,
|
||||
uniform struct BuildRecordMorton current,
|
||||
uniform local uint* local_startpoints_num)
|
||||
{
|
||||
uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
uniform const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
|
||||
uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
|
||||
uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
|
||||
|
||||
varying ushort lane = get_sub_group_local_id();
|
||||
|
||||
/* initialize child array */
|
||||
uniform uint numChildren = 2;
|
||||
varying struct BuildRecordMorton sg_children;
|
||||
sg_children.items = 0;
|
||||
sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;
|
||||
|
||||
if ( lane < numChildren )
|
||||
sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );
|
||||
|
||||
/* fill QBVH6 node with up to 6 children */
|
||||
while ( numChildren < BVH_NODE_N6 )
|
||||
{
|
||||
varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
|
||||
if ( sub_group_all( sg_is_leaf ) )
|
||||
break;
|
||||
|
||||
uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
|
||||
uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
|
||||
uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );
|
||||
|
||||
varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;
|
||||
|
||||
if ( lane == numChildren || lane == bestChild )
|
||||
{
|
||||
sg_children.nodeID = nodeID;
|
||||
sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
|
||||
}
|
||||
|
||||
numChildren++;
|
||||
}
|
||||
|
||||
const uint current_index = current.current_index;
|
||||
struct QBVHNodeN* qnode = nodeData + current_index;
|
||||
SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
|
||||
|
||||
uniform uint global_offset;
|
||||
uniform uint child_node_offset;
|
||||
|
||||
    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
    // used later in the global refit after phase1
|
||||
varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
|
||||
uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);
|
||||
|
||||
if ( lane == 0 )
|
||||
{
|
||||
child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
|
||||
|
||||
        /* create the node, but do not set bounds yet as these get calculated during refit */
|
||||
QBVH6Node_set_type( qnode, BVH_INTERNAL_NODE );
|
||||
QBVH6Node_set_offset( qnode, (global struct QBVHNodeN*)(bvh_mem + child_node_offset) );
|
||||
/* set back pointers */
|
||||
uint backpointer = (current.parent_index << 6) | (numChildren << 3);
|
||||
|
||||
global_offset = atomic_add_local( local_numRecords, numChildren - 1 );
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
|
||||
rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren);
|
||||
#endif
|
||||
|
||||
if(children_roots_num == numChildren)
|
||||
{
|
||||
uint startpoints_offset = atomic_inc_local( local_startpoints_num );
|
||||
global_refit_startpoints[startpoints_offset] = current_index;
|
||||
}
|
||||
else
|
||||
{
|
||||
backpointer += children_roots_num;
|
||||
}
|
||||
|
||||
*InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
|
||||
}
|
||||
|
||||
child_node_offset = sub_group_broadcast( child_node_offset, 0 );
|
||||
global_offset = sub_group_broadcast( global_offset, 0 );
|
||||
|
||||
uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);
|
||||
|
||||
sg_children.current_index = childNodes - nodeData + lane;
|
||||
sg_children.parent_index = current_index;
|
||||
|
||||
if ( lane < numChildren )
|
||||
{
|
||||
uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
|
||||
records[write_position] = sg_children;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void SUBGROUP_create_node_phase0_local_sync(
|
||||
uniform global struct Globals* globals,
|
||||
uniform global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
uniform global char* bvh_mem,
|
||||
uniform uint rID,
|
||||
uniform local uint* local_numRecords,
|
||||
uniform local uint* local_QNodeOffset,
|
||||
uniform global struct BuildRecordMorton* records,
|
||||
uniform struct BuildRecordMorton current,
|
||||
uniform local uint* local_p0_total,
|
||||
uniform global struct MortonFlattenedBoxlessNode *boxless_nodes,
|
||||
uniform uint nodeDataStart)
|
||||
{
|
||||
uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
uniform const uint rootNodeOffset = bvh->rootNodeOffset;
|
||||
uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
|
||||
uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
|
||||
|
||||
varying ushort lane = get_sub_group_local_id();
|
||||
|
||||
/* initialize child array */
|
||||
uniform uint numChildren = 2;
|
||||
varying struct BuildRecordMorton sg_children;
|
||||
sg_children.items = 0;
|
||||
sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;
|
||||
|
||||
if ( lane < numChildren )
|
||||
sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );
|
||||
|
||||
/* fill QBVH6 node with up to 6 children */
|
||||
while ( numChildren < BVH_NODE_N6 )
|
||||
{
|
||||
varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
|
||||
if ( sub_group_all( sg_is_leaf ) )
|
||||
break;
|
||||
|
||||
uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
|
||||
uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
|
||||
uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );
|
||||
|
||||
varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;
|
||||
|
||||
if ( lane == numChildren || lane == bestChild )
|
||||
{
|
||||
sg_children.nodeID = nodeID;
|
||||
sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
|
||||
}
|
||||
|
||||
numChildren++;
|
||||
}
|
||||
|
||||
const uint current_index = current.current_index;
|
||||
uniform uint global_offset;
|
||||
uniform uint child_node_offset;
|
||||
|
||||
    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
    // used later in the global refit after phase1
|
||||
varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
|
||||
uniform uchar rootMask = sub_group_reduce_or_N6(is_children_root << lane);
|
||||
uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);
|
||||
|
||||
if ( lane == 0 )
|
||||
{
|
||||
child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
|
||||
|
||||
/* Do not create qnodes here */
|
||||
uint backpointer = (current.parent_index << 6) | (numChildren << 3);
|
||||
|
||||
global_offset = atomic_add_local( local_numRecords, numChildren - 1 );
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, nodeDataStart: %d\n",
|
||||
rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren, nodeDataStart);
|
||||
#endif
|
||||
|
||||
MortonFlattenedBoxlessNode flattened_node;
|
||||
|
||||
if(children_roots_num != numChildren)
|
||||
backpointer += children_roots_num;
|
||||
|
||||
flattened_node.binary_hierarchy_index = (current_index << 6) | rootMask;
|
||||
|
||||
uint loc_id = atomic_inc_local( local_p0_total );
|
||||
|
||||
flattened_node.childOffset_type = ((((child_node_offset - nodeDataStart * 64) / 64) - current_index) << 6) | BVH_INTERNAL_NODE;
|
||||
flattened_node.backPointer = backpointer;
|
||||
|
||||
        //TODO: change these writes to L1WB or streaming
|
||||
boxless_nodes[loc_id] = flattened_node;
|
||||
|
||||
*InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
|
||||
}
|
||||
|
||||
child_node_offset = sub_group_broadcast( child_node_offset, 0 );
|
||||
global_offset = sub_group_broadcast( global_offset, 0 );
|
||||
|
||||
uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);
|
||||
|
||||
sg_children.current_index = childNodes - nodeData + lane;
|
||||
sg_children.parent_index = current_index;
|
||||
|
||||
if ( lane < numChildren )
|
||||
{
|
||||
uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
|
||||
records[write_position] = sg_children;
|
||||
}
|
||||
}
|
||||
|
||||
/*

In this phase a single large work group performs the construction of
the top of the BVH and creates a build record array.

Two variants of this kernel:
1. Refit with global synchronization - used for a big BVH, where the number of allocated nodes will not fit
   in SLM in phase2. Phase0 creates qnodes in the BVH and provides startpoints for the bottom-up phase
   that is executed after phase1. This refit uses global synchronization and mem_fence_gpu_invalidate,
   which is not efficient.
2. Refit with local synchronization - flattened boxless nodes are passed via global memory, along with
   the number of created nodes. Phase0 does not create qnodes in the BVH; that is done in phase2 during refit.
   In phase2, the flattened boxless nodes are moved to SLM, along with the bounding boxes from phase1.
   Refit is performed with local synchronization only.

*/
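// For reference, the refit in these kernels relies on the back-pointer bit layout that the code
// manipulates inline: bits [2:0] count the children already refitted, bits [5:3] hold the child
// count, and the remaining high bits hold the parent node index. A minimal sketch of that packing
// (helper names are illustrative, not existing GRL functions):
GRL_INLINE uint BackPointer_pack(uint parentIndex, uint numChildren, uint numRefitted)
{
    return (parentIndex << 6) | (numChildren << 3) | numRefitted;
}

GRL_INLINE uint BackPointer_parent(uint bp)      { return bp >> 6; }
GRL_INLINE uint BackPointer_children(uint bp)    { return (bp >> 3) & 0x7; }
GRL_INLINE uint BackPointer_refitted(uint bp)    { return bp & 0x7; }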
|
||||
|
||||
__attribute__((reqd_work_group_size(512, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
|
||||
parallel_build_phase0(global struct Globals *globals,
|
||||
global struct BinaryMortonCodeHierarchy *bnodes,
|
||||
global char *bvh_mem,
|
||||
global uint *global_refit_startpoints)
|
||||
{
|
||||
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
|
||||
global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
|
||||
|
||||
/* a queue of build records in global memory */
|
||||
global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
|
||||
local uint local_numRecords;
|
||||
local uint local_QNodeOffset;
|
||||
local uint local_startpoints_num;
|
||||
|
||||
/* initialize first build record */
|
||||
if (get_local_id(0) == 0)
|
||||
{
|
||||
/* allocate root node */
|
||||
uint root_node_offset = 64*bvh->nodeDataCur;
|
||||
global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);
|
||||
|
||||
//assert(root_node_offset == 0);
|
||||
records[0].nodeID = globals->binary_hierarchy_root;
|
||||
records[0].items = globals->numPrimitives;
|
||||
records[0].current_index = rootNode - nodeData;
|
||||
records[0].parent_index = -1;
|
||||
|
||||
local_numRecords = 1;
|
||||
local_QNodeOffset = root_node_offset + 64;
|
||||
local_startpoints_num = 0;
|
||||
|
||||
mem_fence_workgroup_default();
|
||||
}
|
||||
|
||||
uint num_records = 1;
|
||||
|
||||
/* terminate when all subtrees are under size threshold */
|
||||
while(true)
|
||||
{
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* all work items in the work group pick a subtree to build */
|
||||
for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
|
||||
{
|
||||
/* small subtrees will get built in next phase */
|
||||
if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
|
||||
continue;
|
||||
|
||||
/* create QBVH node */
|
||||
SUBGROUP_create_node_phase0(globals, bnodes, bvh_mem, global_refit_startpoints, ID, &local_numRecords, &local_QNodeOffset,
|
||||
records, records[ID], &local_startpoints_num);
|
||||
}
|
||||
|
||||
work_group_barrier( CLK_LOCAL_MEM_FENCE );
|
||||
mem_fence_workgroup_default();
|
||||
uint old_num_records = num_records;
|
||||
num_records = local_numRecords;
|
||||
if( old_num_records == num_records )
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
/* remember number of build records for next phase */
|
||||
if (get_local_id( 0 ) == 0)
|
||||
{
|
||||
globals->numBuildRecords = local_numRecords;
|
||||
globals->p0_created_num = local_startpoints_num;
|
||||
bvh->nodeDataCur = local_QNodeOffset / 64;
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("PHASE_0: allocated %d nodes. globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((reqd_work_group_size(512, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
|
||||
parallel_build_phase0_local_sync(global struct Globals *globals,
|
||||
global struct BinaryMortonCodeHierarchy *bnodes,
|
||||
global char *bvh_mem,
|
||||
global struct MortonFlattenedBoxlessNode *boxless_nodes)
|
||||
{
|
||||
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
|
||||
global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
|
||||
uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
|
||||
|
||||
/* a queue of build records in global memory */
|
||||
global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
|
||||
local uint local_numRecords;
|
||||
local uint local_QNodeOffset;
|
||||
local uint local_p0_total;
|
||||
|
||||
/* initialize first build record */
|
||||
if (get_local_id(0) == 0)
|
||||
{
|
||||
/* allocate root node */
|
||||
uint root_node_offset = 64*bvh->nodeDataCur;
|
||||
global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);
|
||||
|
||||
//assert(root_node_offset == 0);
|
||||
records[0].nodeID = globals->binary_hierarchy_root;
|
||||
records[0].items = globals->numPrimitives;
|
||||
records[0].current_index = rootNode - nodeData;
|
||||
records[0].parent_index = -1;
|
||||
|
||||
local_numRecords = 1;
|
||||
local_QNodeOffset = root_node_offset + 64;
|
||||
local_p0_total = 0;
|
||||
|
||||
mem_fence_workgroup_default();
|
||||
}
|
||||
|
||||
uint num_records = 1;
|
||||
|
||||
/* terminate when all subtrees are under size threshold */
|
||||
while(true)
|
||||
{
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* all work items in the work group pick a subtree to build */
|
||||
for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
|
||||
{
|
||||
/* small subtrees will get built in next phase */
|
||||
if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
|
||||
continue;
|
||||
|
||||
/* create QBVH node */
|
||||
SUBGROUP_create_node_phase0_local_sync(globals, bnodes, bvh_mem, ID, &local_numRecords, &local_QNodeOffset, records,
|
||||
records[ID], &local_p0_total, boxless_nodes, nodeDataStart);
|
||||
}
|
||||
|
||||
mem_fence_workgroup_default();
|
||||
work_group_barrier( CLK_LOCAL_MEM_FENCE );
|
||||
|
||||
uint old_num_records = num_records;
|
||||
num_records = local_numRecords;
|
||||
if( old_num_records == num_records )
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
/* remember number of build records for next phase */
|
||||
if (get_local_id( 0 ) == 0)
|
||||
{
|
||||
globals->numBuildRecords = local_numRecords;
|
||||
bvh->nodeDataCur = local_QNodeOffset / 64;
|
||||
|
||||
globals->p0_allocated_num = BVHBase_numNodes(bvh);
|
||||
globals->p0_created_num = local_p0_total;
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("PHASE_0_LOCAL_SYNC: allocated %d nodes. globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->global_refit_startpoints);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
|
@ -1,785 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2022 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
#include "morton/morton_common.h"
|
||||
|
||||
// caution: rec.local_parent_index__numItems needs to have its high 16 bits filled in afterwards;
|
||||
BuildRecordLocalMortonFlattener TranslateToLocalRecord(struct BinaryMortonCodeHierarchy srcRec)
|
||||
{
|
||||
BuildRecordLocalMortonFlattener rec;
|
||||
rec.leftChild = srcRec.leftChild;
|
||||
rec.rightChild = srcRec.rightChild;
|
||||
rec.rangeStart = srcRec.range.start;
|
||||
rec.local_parent_index__numItems = (srcRec.range.end - srcRec.range.start) + 1;
|
||||
return rec;
|
||||
}
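// For reference (illustrative helpers, not part of the original GRL sources): throughout phase1
// the local_parent_index__numItems field packs two 16-bit values, the item count in the low half
// and the local parent index in the high half, matching the ">> 16", "<< 16" and "& 0xFFFF"
// accesses below.
GRL_INLINE uint BR_pack_parent_and_items(uint localParentIndex, uint numItems)
{
    return (localParentIndex << 16) | (numItems & 0xFFFF);
}

GRL_INLINE uint BR_num_items(BuildRecordLocalMortonFlattener rec)    { return rec.local_parent_index__numItems & 0xFFFF; }
GRL_INLINE uint BR_local_parent(BuildRecordLocalMortonFlattener rec) { return rec.local_parent_index__numItems >> 16; }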
|
||||
|
||||
GRL_INLINE BuildRecordLocalMortonFlattener MortonFlattenedBoxlessNode_reinterpret_as_BR(MortonFlattenedBoxlessNode boxless)
|
||||
{
|
||||
BuildRecordLocalMortonFlattener rec;
|
||||
rec.leftChild = boxless.binary_hierarchy_index;
|
||||
rec.rightChild = boxless.childOffset_type;
|
||||
rec.rangeStart = boxless.backPointer;
|
||||
rec.local_parent_index__numItems = 0;
|
||||
return rec;
|
||||
}
|
||||
|
||||
GRL_INLINE void SUBGROUP_create_boxless_node_phase1(
|
||||
uniform global struct Globals* globals,
|
||||
uniform global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
uniform global char* bvh_mem,
|
||||
uniform BuildRecordLocalMortonFlattener currentRecord,
|
||||
    uniform uint currQnodeLocalId, //local index for the flattened qnode, don't mix this with nodeIndex that is in the morton build record
|
||||
uniform local uint* local_numRecords,
|
||||
uniform uint tictoc,
|
||||
uniform uint* sg_bu_startpoint_arr,
|
||||
uniform uint* sg_bu_startpoint_cnt,
|
||||
uniform uint parentOfRoot,
|
||||
uniform bool processRoot,
|
||||
uniform UPerNodeData* nodeData)
|
||||
{
|
||||
varying ushort lane = get_sub_group_local_id();
|
||||
|
||||
/* initialize child array */
|
||||
uniform uint numChildren = 2;
|
||||
varying struct BuildRecordLocalMortonFlattener sg_children;
|
||||
sg_children.local_parent_index__numItems = 0;
|
||||
|
||||
uint binary_hierarchy_child_idx = (lane == 0) ? currentRecord.leftChild : currentRecord.rightChild;
|
||||
if (lane >= numChildren) binary_hierarchy_child_idx = 1 << 31;
|
||||
|
||||
sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, binary_hierarchy_child_idx));
|
||||
|
||||
/* fill QBVH6 node with up to 6 children */
|
||||
while (numChildren < BVH_NODE_N6)
|
||||
{
|
||||
        // we don't have to do "local_parent_index__numItems & 0xFFFF" because the local_parent_index part is still 0 at this point
|
||||
uint childNumItems = sg_children.local_parent_index__numItems;
|
||||
varying bool sg_is_leaf = childNumItems <= cfg_minLeafSize;
|
||||
if (sub_group_all(sg_is_leaf)) { break; }
|
||||
|
||||
uniform uint bestItems = sub_group_reduce_max_N6(childNumItems);
|
||||
uniform ushort bestChild = ctz(intel_sub_group_ballot(childNumItems == bestItems));
|
||||
varying uint leftOfBest = sg_children.leftChild; // val important only for (lane == bestChild), not valid for other lanes
|
||||
uniform uint rightOfBest = sub_group_broadcast(sg_children.rightChild, bestChild);
|
||||
|
||||
varying uint nodeID = (lane == bestChild) ? leftOfBest : rightOfBest;
|
||||
|
||||
if (lane == numChildren || lane == bestChild)
|
||||
{
|
||||
sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, nodeID));
|
||||
}
|
||||
|
||||
numChildren++;
|
||||
}
|
||||
|
||||
uniform uint global_offset;
|
||||
uniform uint child_node_index;
|
||||
|
||||
bool isFatleafChild = (sg_children.local_parent_index__numItems <= cfg_minLeafSize) && (lane < numChildren);
|
||||
uint numFatleafChildren = popcount(intel_sub_group_ballot(isFatleafChild));
|
||||
|
||||
if (lane <= numChildren) {
|
||||
uint writeIDX = 0;
|
||||
|
||||
if (lane == numChildren)
|
||||
{
|
||||
            /* create nodes in the local structure, to be used later in the bottom-up pass to create nodes in the actual bvh */
|
||||
MortonFlattenedBoxlessNode flattened_node;
|
||||
uint parentIDX;
|
||||
|
||||
if (processRoot)
|
||||
{
|
||||
*local_numRecords = numChildren + 1;
|
||||
child_node_index = 1;
|
||||
writeIDX = 0;
|
||||
flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
|
||||
flattened_node.childOffset_type = (1 << 6) | BVH_INTERNAL_NODE;
|
||||
parentIDX = parentOfRoot;
|
||||
}
|
||||
else
|
||||
{
|
||||
uint shift = (16 * tictoc);
|
||||
uint mask = 0xFFFF;
|
||||
uint atomicAddVal = numChildren << shift;
|
||||
child_node_index = atomic_add_local(local_numRecords, atomicAddVal);
|
||||
sub_group_barrier(0);
|
||||
writeIDX = currQnodeLocalId;
|
||||
parentIDX = currentRecord.local_parent_index__numItems >> 16;
|
||||
flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
|
||||
sub_group_barrier(0);
|
||||
child_node_index = (child_node_index >> 16) + (child_node_index & mask);
|
||||
flattened_node.childOffset_type = ((child_node_index - currQnodeLocalId) << 6) | BVH_INTERNAL_NODE;
|
||||
}
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("wg %d: SUBGROUP_create_boxless_node_phase1: writeIDX %d, child_node_index %d - %d\n", get_group_id(0), writeIDX, child_node_index, child_node_index + numChildren);
|
||||
#endif
|
||||
flattened_node.backPointer = (parentIDX << 6) | (numChildren << 3) | numFatleafChildren;
|
||||
sg_children = MortonFlattenedBoxlessNode_reinterpret_as_BR(flattened_node);
|
||||
}
|
||||
|
||||
child_node_index = sub_group_broadcast(child_node_index, numChildren);
|
||||
|
||||
if (lane != numChildren)
|
||||
{
|
||||
writeIDX = child_node_index + lane;
|
||||
sg_children.local_parent_index__numItems |= currQnodeLocalId << 16;
|
||||
}
|
||||
|
||||
nodeData[writeIDX].buildRecord = sg_children;
|
||||
}
|
||||
|
||||
if (numFatleafChildren == numChildren) {
|
||||
uint arridx = *sg_bu_startpoint_cnt;
|
||||
// GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
|
||||
set_2xSG_arr_first_write(arridx, sg_bu_startpoint_arr, (ushort)currQnodeLocalId, lane);
|
||||
*sg_bu_startpoint_cnt = arridx + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO_OPT: Consider having phase 0 bucket the build records by number of primitives, and dispatch different variants
|
||||
// of this kernel with different WG sizes. There are many records produced that generate only 1 or 2 subtrees, so 8 SGs is
|
||||
// probably often wasted
|
||||
GRL_INLINE void phase1_process_fatleaf(
|
||||
uint globalBaseForInternalNodes, // for root node this is indexOfRoot
|
||||
uint globalParent , // for root this should be parentOfRoot
|
||||
bool isInstancePrimLeafType, //
|
||||
uint leafPrimType, //
|
||||
uint leafStride, //
|
||||
global struct QBVHNodeN* nodeData, // per group
|
||||
uint nodeDataStart, //
|
||||
struct AABB* primref, //
|
||||
BackPointers* backPointers, //
|
||||
global struct MortonCodePrimitive* mc,//
|
||||
uint nodesToLeafsGap, //
|
||||
local union UPerNodeData* perNodeData,//
|
||||
bool processRoot, //
|
||||
short localNodeId, //
|
||||
BuildRecordLocalMortonFlattener fatleafRecord, // per node
|
||||
uint primID ) //
|
||||
{
|
||||
uint lane = get_sub_group_local_id();
|
||||
uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
|
||||
uniform uint mcID = fatleafRecord.rangeStart;
|
||||
uint pseudolane = lane < numChildren ? lane : 0;
|
||||
varying struct AABB sg_bounds = primref[primID];
|
||||
|
||||
uint local_parent_idx = (fatleafRecord.local_parent_index__numItems >> 16);
|
||||
uint globalNodeId = globalBaseForInternalNodes + localNodeId;
|
||||
uniform global struct QBVHNodeN* qnode = nodeData + globalNodeId;
|
||||
|
||||
uint children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId;
|
||||
|
||||
{
|
||||
/* For all primitives in a fat leaf we store a back
|
||||
* pointer. This way we can modify the fat leaf node at leaf construction time. */
|
||||
uint back_pointer = globalNodeId + nodeDataStart;
|
||||
/* Store back pointer and primID inside morton code array to
|
||||
* be later used by leaf creation. */
|
||||
mc[mcID + pseudolane].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
|
||||
}
|
||||
|
||||
struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&sg_bounds);
|
||||
reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
|
||||
|
||||
uint8_t instMask;
|
||||
if (isInstancePrimLeafType)
|
||||
{
|
||||
instMask = lane < numChildren ? PRIMREF_instanceMask(&sg_bounds) : 0;
|
||||
subgroup_setInstanceQBVHNodeN(children_offset, &sg_bounds, numChildren, qnode, instMask);
|
||||
instMask = sub_group_reduce_or_N6(instMask);
|
||||
}
|
||||
else
|
||||
{
|
||||
instMask = 0xFF;
|
||||
subgroup_setQBVHNodeN_setFields_reduced_bounds(children_offset, leafPrimType, &sg_bounds, numChildren, instMask, qnode, false, reduce_bounds);
|
||||
}
|
||||
|
||||
reduce_bounds.lower.w = as_float((uint)instMask);
|
||||
uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduce_bounds, 0);
|
||||
local uint* boxUint = (local uint*)(perNodeData + localNodeId);
|
||||
if (get_sub_group_size() == 8 || lane < 8)
|
||||
{
|
||||
boxUint[lane] = reduce_bounds_lane;
|
||||
uint globalParentIdx;
|
||||
if (processRoot) {
|
||||
// for root, treeletRootGlobalIndex is index of rootsParent in global space
|
||||
globalParentIdx = globalParent;
|
||||
}
|
||||
else {
|
||||
// for non root, raw_parent_idx is in local space
|
||||
globalParentIdx = (local_parent_idx > 0) ? (globalBaseForInternalNodes + local_parent_idx) : globalParent;
|
||||
}
|
||||
if (lane == 0) {
|
||||
*InnerNode_GetBackPointer(backPointers, globalNodeId) = (globalParentIdx << 6) | (numChildren << 3);
|
||||
}
|
||||
}
|
||||
}
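
/* Editor's sketch (not part of the original file): once a fat leaf has been
 * emitted, phase1 reuses its MortonCodePrimitive slots as (back_pointer, primID)
 * pairs for the later leaf-creation pass: the upper 32 bits of index_code hold
 * the node back-pointer, the lower 32 bits the primref id. The hypothetical
 * helpers below mirror the packing done in phase1_process_fatleaf. */
GRL_INLINE ulong SketchRepackMortonSlot(uint back_pointer, uint primID)
{
    return ((ulong)back_pointer << 32) | (ulong)primID;
}
GRL_INLINE uint SketchMortonSlot_PrimID(ulong index_code)      { return (uint)(index_code & 0xFFFFFFFFul); }
GRL_INLINE uint SketchMortonSlot_BackPointer(ulong index_code) { return (uint)(index_code >> 32); }
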
GRL_INLINE void perform_phase1(global struct Globals* globals,
|
||||
global struct MortonCodePrimitive* mc,
|
||||
global struct AABB* primref,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
global char* bvh_mem,
|
||||
local union UPerNodeData* perNodeData,
|
||||
local uint* local_records_head,
|
||||
local uint* local_globalOffsetForNodes,
|
||||
BuildRecordLocalMortonFlattener rootRecord,
|
||||
uint treeletRootGlobalIndex,
|
||||
uint parentOfRootIndex,
|
||||
const uint leafPrimType,
|
||||
bool isInstancePrimLeafType)
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
varying ushort lane = get_sub_group_local_id();
|
||||
|
||||
// array that will keep 2x8 shorts indices
|
||||
varying uint sg_fatleaf_array = 0x0;
|
||||
uniform uint8_t sg_fatleaf_cnt = 0;
|
||||
/* terminate when all subtrees are leaves */
|
||||
|
||||
uint subgroupId = get_sub_group_id();
|
||||
uint ID = subgroupId;
|
||||
|
||||
uint sg_bu_startpoints = 0;
|
||||
uniform uint sg_bu_startpoints_cnt = 0;
|
||||
const uint shift_mask = globals->shift_mask;
|
||||
|
||||
const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
|
||||
global struct QBVHNodeN* nodeData = BVHBase_nodeData(bvh);
|
||||
|
||||
uint* pLeafStart = (!isInstancePrimLeafType) ? &bvh->quadLeafStart : &bvh->instanceLeafStart;
|
||||
uint leafStart = *pLeafStart;
|
||||
uint leafStride = (!isInstancePrimLeafType) ? 1 : (sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode));
|
||||
uint nodesToLeafsGap = leafStart - nodeDataStart;
|
||||
|
||||
if (ID == 0)
|
||||
{
|
||||
BuildRecordLocalMortonFlattener current = rootRecord;
|
||||
|
||||
if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
|
||||
{
|
||||
*local_records_head = 1;
|
||||
#if MORTON_DEBUG_CHECKS
|
||||
if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
|
||||
#endif
|
||||
BuildRecordLocalMortonFlattener fatleafRecord = current;
|
||||
uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
|
||||
uint pseudolane = lane < numChildren ? lane : 0;
|
||||
uniform const uint mcID = fatleafRecord.rangeStart;
|
||||
varying uint primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
|
||||
|
||||
phase1_process_fatleaf(
|
||||
treeletRootGlobalIndex, parentOfRootIndex, isInstancePrimLeafType, leafPrimType, leafStride,
|
||||
nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
|
||||
true, 0, fatleafRecord, primID);
|
||||
}
|
||||
else
|
||||
{
|
||||
#if MORTON_VERBOSE_LOG
|
||||
if (get_local_id(0) == 0) { printf("wg %d perform_phase1: starting collapsing subtree with root at node %d \n", get_group_id(0), rootIndex); }
|
||||
#endif
|
||||
//printf("local_records_head = %d\n", *local_records_head);
|
||||
SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, 0, &sg_bu_startpoints, &sg_bu_startpoints_cnt, parentOfRootIndex, true, perNodeData);
|
||||
*local_globalOffsetForNodes = treeletRootGlobalIndex;
|
||||
}
|
||||
|
||||
ID += get_num_sub_groups();
|
||||
}
|
||||
|
||||
uniform uint priv_records_tail = 1;
|
||||
|
||||
/* wait for all work items to have updated local_records array */
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uniform uint priv_records_head = *local_records_head & 0xFFFF;
|
||||
treeletRootGlobalIndex = *local_globalOffsetForNodes; // propagated from subgroup 1
|
||||
uniform uint priv_records_tail_prev = priv_records_tail;
|
||||
uniform uint other_records_head = priv_records_head;
|
||||
|
||||
uint ticToc = 1;
|
||||
|
||||
if (priv_records_head == priv_records_tail)
|
||||
{
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
do
|
||||
{
|
||||
for (; ID < priv_records_head; ID += get_num_sub_groups())
|
||||
{
|
||||
BuildRecordLocalMortonFlattener current = (perNodeData[ID].buildRecord);
|
||||
|
||||
if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
|
||||
{
|
||||
set_2xSG_arr_first_write(sg_fatleaf_cnt++, &sg_fatleaf_array, ID, lane);
|
||||
#if MORTON_VERBOSE_LOG
|
||||
if (lane == 0)printf("wg %d, sg %d, perform_phase1: node ID %d is fatleaf \n", get_group_id(0), get_sub_group_id(), ID);
|
||||
#endif
|
||||
#if MORTON_DEBUG_CHECKS
|
||||
if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, ticToc, &sg_bu_startpoints, &sg_bu_startpoints_cnt, 0, 0, perNodeData);
|
||||
}
|
||||
}
|
||||
|
||||
priv_records_tail = priv_records_head;
|
||||
/* wait for all work items to have updated local_records array */
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
{
|
||||
uint records_as_in_mem = *local_records_head;
|
||||
priv_records_head = (records_as_in_mem >> (16 * ticToc)) & 0xFFFF;
|
||||
uint other_records_head_temp = priv_records_head;
|
||||
priv_records_head += other_records_head;
|
||||
other_records_head = other_records_head_temp;
|
||||
ticToc = ticToc ^ 1;
|
||||
#if MORTON_VERBOSE_LOG
|
||||
if(get_local_id(0) == 0)printf("wg %d, perform_phase1: priv_records_tail %d, priv_records_head %d, records_as_in_mem %x\n", get_group_id(0), get_sub_group_id(), priv_records_tail, priv_records_head, records_as_in_mem);
|
||||
#endif
|
||||
}
|
||||
} while (priv_records_tail != priv_records_head); // get out of the loop if the tail reached the head
|
||||
}
|
||||
|
||||
bool atomicNodeAllocation = treeletRootGlobalIndex > 0;
|
||||
bool atomicNodeAllocationProduce = (get_sub_group_id() + lane == 0) && atomicNodeAllocation;
|
||||
uint singleTreeletBumpBVHnodeCnt = (!atomicNodeAllocation && (get_sub_group_id() + lane == 0)) ? nodeDataStart + priv_records_tail : 0;
|
||||
|
||||
uniform uint globalBaseForInternalNodes = 0;
|
||||
|
||||
// We distinguish the multi-treelet case from the single-treelet case here by looking at our treeletRootGlobalIndex:
// if the treelet's root is the whole tree's root (treeletRootGlobalIndex == 0) then we are the only treelet, so
// there is no need to synchronize the node allocations of multiple treelets with atomics.
|
||||
if (atomicNodeAllocationProduce)
|
||||
{
|
||||
*local_globalOffsetForNodes = allocate_inner_nodes(bvh, priv_records_tail - 1);
|
||||
}
|
||||
|
||||
// Because the root is allocated elsewhere, the first node placed in global memory is the node with local index 1.
// Mapping local to global:
//   local space                       global space
//   [0] - treelet root                [treeletRootGlobalIndex]
//        ... possibly very long distance ...
//   [1] - first non-root              [globalBaseForInternalNodes + 1] - this index is returned by the atomic allocator above
//   [2] - second non-root             [globalBaseForInternalNodes + 2]
//   ...
//   [numToAllocate] - last node       [globalBaseForInternalNodes + numToAllocate]
|
||||
if (atomicNodeAllocation)
|
||||
{
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
globalBaseForInternalNodes = *local_globalOffsetForNodes -(nodeDataStart+1);
|
||||
}
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
if (get_local_id(0) == 0) { printf("wg %d perform_phase1: globalBaseForInternalNodes %d, num local nodes %d\n", get_group_id(0), globalBaseForInternalNodes, priv_records_tail - 1); }
|
||||
#endif
|
||||
|
||||
if (sg_fatleaf_cnt)
|
||||
{
|
||||
short localNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane);
|
||||
//if (localNodeId >= MORTON_BUILDER_SUBTREE_THRESHOLD * 2) continue;
|
||||
//if(local_startpoints_cnt > 1) return;
|
||||
BuildRecordLocalMortonFlattener fatleafRecord = perNodeData[localNodeId].buildRecord;
|
||||
|
||||
varying uint primID;
|
||||
{
|
||||
uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
|
||||
uint pseudolane = lane < numChildren ? lane : 0;
|
||||
uniform const uint mcID = fatleafRecord.rangeStart;
|
||||
primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
|
||||
}
|
||||
|
||||
// process fatleafs, and store their boxes to SLM
|
||||
// also put startpoints for bottom up
|
||||
//uint fatleaf_cnt = *local_startpoints_cnt;
|
||||
while (sg_fatleaf_cnt-- > 1)
|
||||
{
|
||||
short nextLocalNodeId = get_from_2xSG_arr(sg_fatleaf_cnt-1, sg_fatleaf_array, lane);
|
||||
BuildRecordLocalMortonFlattener nextfatleafRecord = perNodeData[nextLocalNodeId].buildRecord;
|
||||
varying uint nextPrimId;
|
||||
|
||||
{
|
||||
uint numChildren = (nextfatleafRecord.local_parent_index__numItems & 0xFFFF);
|
||||
uint pseudolane = lane < numChildren ? lane : 0;
|
||||
uniform const uint mcID = nextfatleafRecord.rangeStart;
|
||||
nextPrimId = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
|
||||
}
|
||||
|
||||
phase1_process_fatleaf(
|
||||
globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
|
||||
nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
|
||||
false, localNodeId, fatleafRecord, primID);
|
||||
|
||||
fatleafRecord = nextfatleafRecord;
|
||||
localNodeId = nextLocalNodeId;
|
||||
primID = nextPrimId;
|
||||
}
|
||||
|
||||
phase1_process_fatleaf(
|
||||
globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
|
||||
nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
|
||||
false, localNodeId, fatleafRecord, primID);
|
||||
}
|
||||
|
||||
#if 0
|
||||
// put collected bottom-up startpoints to the wg shared array, to later distribute the work evenly across the groups.
|
||||
{
|
||||
ushort myStartpointWriteSite = 0;
|
||||
|
||||
if (lane == 0)
|
||||
{
|
||||
myStartpointWriteSite = atomic_add_local((local uint*)local_startpoints_cnt, (ushort)sg_bu_startpoints_cnt);
|
||||
}
|
||||
myStartpointWriteSite = sub_group_broadcast(myStartpointWriteSite, 0);
|
||||
|
||||
unpack_from_2xSG_arr(sg_bu_startpoints_cnt, sg_bu_startpoints, lane, local_startpoints_arr + myStartpointWriteSite);
|
||||
}
|
||||
#endif
|
||||
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// distribute bottom-up startpoints
|
||||
#if 0
|
||||
{
|
||||
short sp_count_to_divide = (*local_startpoints_cnt);
|
||||
|
||||
//calculate the chunk for each sg.
|
||||
sg_bu_startpoints_cnt = sp_count_to_divide / get_num_sub_groups();
|
||||
uint sg_bu_startpoints_cnt_reminder = sp_count_to_divide % get_num_sub_groups();
|
||||
|
||||
uint myReadSite = get_sub_group_id() * sg_bu_startpoints_cnt;
|
||||
if (get_sub_group_id() < sg_bu_startpoints_cnt_reminder) {
|
||||
// from the remainder elements: if the sg idx is < sg_bu_startpoints_cnt_reminder then this sg gets one extra idx,
// and all sgs before it also have one extra
|
||||
myReadSite += get_sub_group_id();
|
||||
sg_bu_startpoints_cnt++;
|
||||
}
|
||||
else
|
||||
{
|
||||
// all remainder elements are consumed by the previous sgs
|
||||
myReadSite += sg_bu_startpoints_cnt_reminder;
|
||||
}
|
||||
|
||||
pack_from_2xSG_arr(local_startpoints_arr + myReadSite, sg_bu_startpoints_cnt, &sg_bu_startpoints, lane);
|
||||
}
|
||||
#endif
|
||||
|
||||
SUBGROUP_refit_bottom_up_local(nodeData, backPointers, treeletRootGlobalIndex, globalBaseForInternalNodes, lane, perNodeData, sg_bu_startpoints, sg_bu_startpoints_cnt);
|
||||
|
||||
if (singleTreeletBumpBVHnodeCnt)
|
||||
{
|
||||
bvh->nodeDataCur = singleTreeletBumpBVHnodeCnt;
|
||||
}
|
||||
}
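
/* Editor's sketch (not part of the original file): perform_phase1 ping-pongs
 * between two 16-bit producer counters packed into the single local uint
 * local_records_head. Even iterations bump the low half, odd iterations the
 * high half, so one atomic both reserves slots for the newly created records
 * and tells the next iteration how many records the previous one produced.
 * SketchTicTocReserve is a hypothetical helper showing only that pattern. */
GRL_INLINE uint SketchTicTocReserve(local uint* packedCounters, uint count, uint ticToc)
{
    /* reserve 'count' record slots in the half selected by ticToc (0 or 1) */
    uint old = atomic_add_local(packedCounters, count << (16 * ticToc));
    /* the absolute base slot is the sum of both halves before this add */
    return (old >> 16) + (old & 0xFFFF);
}
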
GRL_INLINE void update_empty_blas(global struct BVHBase* bvh, uint leafPrimType)
|
||||
{
|
||||
if (get_sub_group_id() == 0 )
|
||||
{
|
||||
global struct QBVHNodeN* qnode = BVHBase_nodeData(bvh);
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
|
||||
|
||||
//set required fields to mark that blas is empty
|
||||
uint k = (get_sub_group_local_id() < BVH_NODE_N6) ? get_sub_group_local_id() : 0;
|
||||
qnode->type = leafPrimType;
|
||||
qnode->instMask = 0;
|
||||
qnode->qbounds.lower_x[k] = 0x80;
|
||||
qnode->qbounds.upper_x[k] = 0;
|
||||
|
||||
*InnerNode_GetBackPointer(backPointers, 0) = (((uint)-1) << 6);
|
||||
}
|
||||
}
|
||||
|
||||
/*

POSTSORT PHASE1:
Two kernels here, selected by MORTON_BUILDER_SUBTREE_THRESHOLD.
1. parallel_build_phase1_Indirect_SG          - record[0] is set to the subtree tip
2. parallel_build_phase1_Indirect_global_root - record[0] is set to the bvh root (no phase2 needed afterwards)

*/
|
||||
|
||||
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
parallel_build_phase1_Indirect_SG( global struct Globals* globals,
|
||||
global struct MortonCodePrimitive* mc,
|
||||
global struct AABB* primref,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
global char* bvh_mem)
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
const uint leafPrimType = globals->leafPrimType;
|
||||
|
||||
//special case for empty blas
|
||||
if(globals->numPrimitives == 0)
|
||||
{
|
||||
bvh->nodeDataCur = BVH_ROOT_NODE_OFFSET / 64 + 1;
|
||||
update_empty_blas(bvh, leafPrimType);
|
||||
return;
|
||||
}
|
||||
|
||||
local union UPerNodeData perNodeData[(MORTON_BUILDER_SUBTREE_THRESHOLD * 2) -1];
|
||||
local uint local_records_head;
|
||||
// Two separate SLM variables for local_globalOffsetForNodes to remove one of the barriers
|
||||
local uint local_globalOffsetForNodes, local_globalOffsetForNodes2;
|
||||
|
||||
uint rootIndex = 0;
|
||||
uint parentOfRoot = 0;
|
||||
BuildRecordLocalMortonFlattener rootBuildRecord;
|
||||
|
||||
/* add start build record to local stack */
|
||||
if (get_sub_group_id() == 0 )
|
||||
{
|
||||
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64 * bvh->quadLeafStart);
|
||||
uint recordID = get_group_id(0);
|
||||
struct BuildRecordMorton mortonGlobalRecord = records[recordID];
|
||||
|
||||
rootBuildRecord = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, mortonGlobalRecord.nodeID));
|
||||
|
||||
parentOfRoot = mortonGlobalRecord.parent_index;
|
||||
rootIndex = mortonGlobalRecord.current_index;
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("P1_STARTPOINTS: current_index: %d, buildRecord.numItems: %d, buildRecord.binary_hierarchy_index: %d, buildRecord.local_parent_index: %d\n",
|
||||
local_globalOffsetForNodes, buildRecord.numItems, buildRecord.binary_hierarchy_index, buildRecord.local_parent_index);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (leafPrimType == NODE_TYPE_INSTANCE)
|
||||
{
|
||||
perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
|
||||
&local_records_head, &local_globalOffsetForNodes,
|
||||
rootBuildRecord, rootIndex, parentOfRoot, NODE_TYPE_INSTANCE, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
|
||||
&local_records_head, &local_globalOffsetForNodes,
|
||||
rootBuildRecord, rootIndex, parentOfRoot, leafPrimType, false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
parallel_build_phase1_Indirect_global_root( global struct Globals* globals,
|
||||
global struct MortonCodePrimitive* mc,
|
||||
global struct AABB* primref,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
global char* bvh_mem)
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
const uint leafPrimType = globals->leafPrimType;
|
||||
const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
|
||||
|
||||
bvh->nodeDataCur = nodeDataStart + 1;
|
||||
|
||||
//special case for empty blas
|
||||
if(globals->numPrimitives == 0)
|
||||
{
|
||||
update_empty_blas(bvh, leafPrimType);
|
||||
return;
|
||||
}
|
||||
|
||||
local union UPerNodeData perNodeData[MORTON_BUILDER_SUBTREE_THRESHOLD * 2 - 1];
|
||||
local uint local_records_head;
|
||||
local uint local_globalOffsetForNodes;
|
||||
|
||||
BuildRecordLocalMortonFlattener rootBuildRecord;
|
||||
|
||||
if (get_sub_group_id() == 0 )
|
||||
{
|
||||
struct BinaryMortonCodeHierarchy binaryNode = BinaryMortonCodeHierarchy_getEntry(bnodes, globals->binary_hierarchy_root);
|
||||
|
||||
rootBuildRecord = TranslateToLocalRecord(binaryNode);
|
||||
|
||||
local_globalOffsetForNodes = 0;
|
||||
}
|
||||
|
||||
if (leafPrimType == NODE_TYPE_INSTANCE)
|
||||
{
|
||||
perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
|
||||
&local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, NODE_TYPE_INSTANCE, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
|
||||
&local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, leafPrimType, false);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
GRL_INLINE void
|
||||
DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
|
||||
global struct MortonCodePrimitive* mc,
|
||||
global struct AABB* primref,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
global char* bvh_mem,
|
||||
uint startID, uint endID,
|
||||
local uint* local_numRecords,
|
||||
local uint* local_numRecordsOld,
|
||||
local struct BuildRecordMorton* local_records
|
||||
)
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);
|
||||
|
||||
/* iterate over all subtrees this workgroup should build */
|
||||
for ( uint recordID = startID; recordID < endID; recordID++ )
|
||||
{
|
||||
/* add start build record to local stack */
|
||||
if ( get_local_id( 0 ) == 0 )
|
||||
{
|
||||
local_records[0] = records[recordID];
|
||||
*local_numRecords = 1;
|
||||
*local_numRecordsOld = 0;
|
||||
}
|
||||
work_group_barrier( CLK_LOCAL_MEM_FENCE );
|
||||
|
||||
/* terminate when all subtrees are leaves */
|
||||
while ( *local_numRecords != *local_numRecordsOld )
|
||||
{
|
||||
/* remember the old number of build records to detect later
|
||||
* whether we are done */
|
||||
if ( get_local_id( 0 ) == 0 )
|
||||
{
|
||||
*local_numRecordsOld = *local_numRecords;
|
||||
}
|
||||
work_group_barrier( CLK_LOCAL_MEM_FENCE );
|
||||
|
||||
/* all work items in the sub group pick a subtree to build */
|
||||
for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
|
||||
{
|
||||
/* ignore small subtrees */
|
||||
if ( local_records[ID].items <= BVH_NODE_N6 )
|
||||
continue;
|
||||
|
||||
/* create QBVH node */
|
||||
create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
|
||||
}
|
||||
|
||||
/* wait for all work items to have updated local_records array */
|
||||
work_group_barrier( CLK_LOCAL_MEM_FENCE );
|
||||
}
|
||||
|
||||
const uint shift_mask = globals->shift_mask;
|
||||
const uint leafPrimType = globals->leafPrimType;
|
||||
const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
|
||||
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
|
||||
|
||||
/* create all fat leaf nodes and initiate refit */
|
||||
for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
|
||||
{
|
||||
struct BuildRecordMorton current = local_records[ID];
|
||||
const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );
|
||||
|
||||
global struct QBVHNodeN* qnode = nodeData + current.current_index;
|
||||
|
||||
/* get bounds of all children of the fat leaf node */
|
||||
struct AABB bounds[BVH_NODE_N6];
|
||||
for ( uint i = 0; i < current.items; i++ )
|
||||
{
|
||||
/* get primID and bounds of primitive */
|
||||
const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
|
||||
bounds[i] = primref[primID];
|
||||
|
||||
/* For all primitives in a fat leaf we store a back
|
||||
* pointer. This way we can modify the fat leaf node at leaf construction time. */
|
||||
const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;
|
||||
|
||||
/* Store back pointer and primID inside morton code array to
|
||||
* be later used by leaf creation. */
|
||||
mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
|
||||
}
|
||||
|
||||
/* update fat leaf node */
|
||||
QBVHNodeN_setType( qnode, leafPrimType );
|
||||
global void* offset;
|
||||
if ( leafPrimType != BVH_INSTANCE_NODE )
|
||||
{
|
||||
offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
|
||||
QBVHNodeN_setChildIncr1( qnode );
|
||||
}
|
||||
else
|
||||
{
|
||||
offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
|
||||
QBVHNodeN_setChildIncr2( qnode );
|
||||
}
|
||||
QBVH6Node_set_offset( qnode, offset );
|
||||
QBVHNodeN_setBounds( qnode, bounds, current.items );
|
||||
|
||||
/* set back pointers for fat leaf nodes */
|
||||
*InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);
|
||||
|
||||
/* bottom up refit */
|
||||
refit_bottom_up( qnode, bvh, bounds, current.items );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*

This phase takes the build records calculated in phase0 as input and
finishes the BVH construction for all these subtrees.

*/
|
||||
__attribute__((reqd_work_group_size(8, 1, 1)))
|
||||
old_parallel_build_phase1(global struct Globals *globals,
|
||||
global struct MortonCodePrimitive *mc,
|
||||
global struct AABB *primref,
|
||||
global struct BinaryMortonCodeHierarchy *bnodes,
|
||||
global char *bvh_mem)
|
||||
{
|
||||
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
|
||||
global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
|
||||
|
||||
/* a queue of build records */
|
||||
local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
|
||||
local uint local_numRecords;
|
||||
local uint local_numRecordsOld;
|
||||
|
||||
/* construct range of build records that each sub group will process */
|
||||
const uint numRecords = globals->numBuildRecords;
|
||||
const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
|
||||
const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);
|
||||
|
||||
DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
|
||||
|
||||
}
|
||||
|
||||
__attribute__( (reqd_work_group_size( 8, 1, 1 )) )
|
||||
old_parallel_build_phase1_Indirect( global struct Globals* globals,
|
||||
global struct MortonCodePrimitive* mc,
|
||||
global struct AABB* primref,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
global char* bvh_mem )
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);
|
||||
|
||||
/* a queue of build records */
|
||||
local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
|
||||
local uint local_numRecords;
|
||||
local uint local_numRecordsOld;
|
||||
|
||||
/* construct range of build records that each sub group will process */
|
||||
const uint numRecords = globals->numBuildRecords;
|
||||
uint startID = get_group_id( 0 );
|
||||
uint endID = startID + 1;
|
||||
|
||||
DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
|
||||
|
||||
}
|
||||
#endif

@@ -1,314 +0,0 @@
//
|
||||
// Copyright (C) 2009-2022 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "bvh_build_refit.h"
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
#include "morton/morton_common.h"
|
||||
|
||||
/*

POSTSORT PHASE2:
Two kernels here, selected by MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD, which is set to a very large value.
1. parallel_build_phase2_refit       - performs refit using global synchronization and mem_fence_gpu_invalidate.
                                       This kernel should be used only for a very big bvh; it is faster than the non-SLM fallback
                                       in parallel_build_phase2_refit_local.
2. parallel_build_phase2_refit_local - should be used for most cases. We usually fit into SLM with the number of
                                       nodes allocated in phase0, but there is also a non-SLM fallback there, as the
                                       decision on which kernel to run is based on the node estimates on the host
                                       side.

*/
|
||||
|
||||
|
||||
GRL_INLINE void refit_bottom_up_global_sync(
|
||||
global char* bvh_mem,
|
||||
global uint* global_refit_startpoints,
|
||||
uniform uint nodeId,
|
||||
uniform ushort lane)
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
|
||||
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
|
||||
|
||||
// Get the node idx that was put here in phase1
|
||||
const uint innerNodeIdx = global_refit_startpoints[nodeId];
|
||||
|
||||
// Get the qnode and backpointer
|
||||
uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx;
|
||||
uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
|
||||
|
||||
varying struct AABB childrenAABB; // one child AABB per lane
|
||||
AABB_init(&childrenAABB);
|
||||
|
||||
uniform uint numChildren = (backPointer >> 3) & 0x7;
|
||||
if(numChildren == 0) return;
|
||||
|
||||
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
|
||||
varying ushort child_idx = (lane < numChildren) ? lane : 0;
|
||||
childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
if(lane == 0)
|
||||
printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx);
|
||||
#endif
|
||||
|
||||
struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
|
||||
reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
|
||||
|
||||
subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane);
|
||||
|
||||
uint children_mask = qnode_child[child_idx].instMask;
|
||||
qnode->instMask = sub_group_reduce_or_N6(children_mask);
|
||||
|
||||
SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 );
|
||||
}
|
||||
|
||||
__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel
|
||||
parallel_build_phase2_refit( global char* bvh_mem,
|
||||
global uint* global_refit_startpoints )
|
||||
{
|
||||
refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0));
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void SUBGROUP_refit_bottom_up_global(
|
||||
uniform global struct QBVHNodeN* globalNodeData,
|
||||
uniform struct BackPointers* backPointers,
|
||||
varying ushort lane,
|
||||
varying uint curNodeIndex)
|
||||
{
|
||||
uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
|
||||
|
||||
const uint head_lane = 0;
|
||||
uniform struct AABB child_aabb; // this carries reduced aabb between loop turns
|
||||
|
||||
while (curNodeIndex != 0)
|
||||
{
|
||||
global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex;
|
||||
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
|
||||
uint numChildren = BackPointer_GetNumChildren(backpointer);
|
||||
|
||||
varying ushort child_idx = (lane < numChildren) ? lane : 0;
|
||||
child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
|
||||
|
||||
struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
|
||||
reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
|
||||
|
||||
/* get bounds of all children from child nodes directly */
|
||||
subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane);
|
||||
|
||||
uchar childrenMask = qnode_child[child_idx].instMask;
|
||||
qnode->instMask = sub_group_reduce_or_N6(childrenMask);
|
||||
|
||||
uint parentIndex = BackPointer_GetParentIndex(backpointer);
|
||||
|
||||
mem_fence_gpu_invalidate();
|
||||
|
||||
if (lane == 0)
|
||||
{
|
||||
backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex));
|
||||
|
||||
uint globalBackpointer = (parentIndex << 6) | (numChildren << 3);
|
||||
|
||||
/* set global back pointer */
|
||||
*InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer;
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n",
|
||||
curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x);
|
||||
#endif
|
||||
}
|
||||
|
||||
backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
|
||||
curNodeIndex = parentIndex;
|
||||
|
||||
/* if all children got refitted, then continue */
|
||||
uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
|
||||
uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
|
||||
|
||||
if (numChildrenRefitted != numChildrenTotal)
|
||||
return;
|
||||
}
|
||||
|
||||
// process root of the treelet
|
||||
{
|
||||
|
||||
#if MORTON_DEBUG_CHECKS
|
||||
if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
|
||||
#endif
|
||||
|
||||
global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData );
|
||||
uint numChildren = BackPointer_GetNumChildren(backpointer);
|
||||
|
||||
varying ushort child_idx = (lane < numChildren) ? lane : 0;
|
||||
child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
|
||||
|
||||
struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
|
||||
reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
|
||||
|
||||
/* get bounds of all children from child nodes directly */
|
||||
subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane);
|
||||
|
||||
uchar childrenMask = qnode_child[child_idx].instMask;
|
||||
globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask);
|
||||
|
||||
/* reset refit counter for next refit */
|
||||
if (lane == 0)
|
||||
{
|
||||
/* set global back pointer */
|
||||
*InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u);
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
|
||||
curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
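
/* Editor's sketch (not part of the original file): during the global bottom-up
 * refit the low 3 bits of a back-pointer double as an arrival counter. Every
 * child subgroup atomically increments its parent's back-pointer, and only the
 * last arriving child (counter equal to the child count in bits [5:3]) keeps
 * walking upward. SketchIsLastArrivingChild is a hypothetical helper that
 * restates the check made above after atomic_inc_global. */
GRL_INLINE bool SketchIsLastArrivingChild(uint backPointerAfterInc)
{
    uint numChildrenRefitted = backPointerAfterInc & 0x7;
    uint numChildrenTotal    = (backPointerAfterInc >> 3) & 0x7;
    return numChildrenRefitted == numChildrenTotal;
}
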
// TODO: Check why 512 wg size has worse performance than 256
|
||||
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
parallel_build_phase2_refit_local( global struct Globals* globals,
|
||||
global char* bvh_mem,
|
||||
global struct MortonFlattenedBoxlessNode *boxless_nodes)
|
||||
{
|
||||
// Number of nodes created in P0, to be refitted in this stage
|
||||
uint p0_created_num = globals->p0_created_num;
|
||||
|
||||
// Return immediately if host executed this kernel but there is nothing to do
|
||||
if(p0_created_num == 0)
|
||||
return;
|
||||
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
|
||||
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
|
||||
varying ushort lane = get_sub_group_local_id();
|
||||
|
||||
// Hardcode SLM to max here as we do not know upfront how much mem will be needed
|
||||
local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */
|
||||
|
||||
// Number of allocated nodes in phase0 (p0_created_num + children)
|
||||
uint p0_allocated_num = globals->p0_allocated_num;
|
||||
|
||||
// array that will keep 2x8 shorts indices
|
||||
varying uint sg_fatleaf_array = 0x0;
|
||||
uniform uint8_t sg_bu_startpoints_cnt = 0;
|
||||
|
||||
// Determine if we can fit into SLM with all the nodes allocated in phase0.
// There are two paths here:
// 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse the local bottom-up,
//    which does the refit and creates qnodes in the bvh.
// 2. If we do not fit into SLM, first create qnodes in the bvh and perform the bottom-up refit with global atomic synchronization.
//    It is not performant to do so; keep it as a guardrail here. On the host side we fall back
//    to the old separated refit path, with wg_size 8 and better EU reuse.
|
||||
if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM)
|
||||
{
|
||||
for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
|
||||
{
|
||||
MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
|
||||
uint current_id = boxless_node.binary_hierarchy_index >> 6;
|
||||
|
||||
// Put the mask for the children that are subtree roots in the binary_hierarchy_index that is unused
|
||||
uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
|
||||
|
||||
if(lane == 0)
|
||||
perNodeData[current_id].boxlessNode = boxless_node;
|
||||
|
||||
// When no children are subtree roots, we are done and skip to the next iteration
|
||||
if(children_root_mask == 0x0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// When all children are subtree roots, put them to sg_fatleaf_array
|
||||
else if(children_root_mask == 0x3F)
|
||||
{
|
||||
set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
|
||||
}
|
||||
|
||||
uniform global struct QBVHNodeN* qnode = nodeData + current_id;
|
||||
|
||||
uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
|
||||
uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
|
||||
varying ushort child_idx = (lane < numChildren) ? lane : 0;
|
||||
|
||||
varying struct AABB childrenAABB; // one child AABB per lane
|
||||
AABB_init(&childrenAABB);
|
||||
|
||||
uint lead_child_global_id = current_id + lead_child_offset;
|
||||
|
||||
uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id;
|
||||
childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
|
||||
|
||||
// Get only AABBs of children that are p1 subtree roots
|
||||
bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx);
|
||||
if(lane_active)
|
||||
{
|
||||
uint child_global_id = lead_child_global_id + child_idx;
|
||||
perNodeData[child_global_id].box = childrenAABB;
|
||||
perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask);
|
||||
}
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
if(lane == 0)
|
||||
printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset);
|
||||
#endif
|
||||
}
|
||||
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
|
||||
{
|
||||
MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
|
||||
uint current_id = boxless_node.binary_hierarchy_index >> 6;
|
||||
|
||||
// Put the mask for the children that are subtree roots in the binary_hierarchy_index that is unused
|
||||
uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
|
||||
uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
|
||||
|
||||
uniform global struct QBVHNodeN* qnode = nodeData + current_id;
|
||||
uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node);
|
||||
uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
|
||||
|
||||
SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
|
||||
if(lane == 0)
|
||||
{
|
||||
QBVH6Node_set_type( qnode, nodeType );
|
||||
qnode->offset = lead_child_offset;
|
||||
}
|
||||
|
||||
// When no children are subtree roots, we are done and skip to the next iteration
|
||||
if(children_root_mask == 0x0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// When all children are subtree roots, put them to sg_fatleaf_array
|
||||
else if(children_root_mask == 0x3F)
|
||||
{
|
||||
set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
|
||||
}
|
||||
|
||||
#if MORTON_VERBOSE_LOG
|
||||
if(lane == 0)
|
||||
printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, child_idx: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, boxless_node.backPointer >> 6, reduce_bounds.lower.x, qnode->offset);
|
||||
#endif
|
||||
}
|
||||
|
||||
while (sg_bu_startpoints_cnt > 0)
|
||||
{
|
||||
uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane);
|
||||
|
||||
SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex);
|
||||
}
|
||||
}
|
||||
}
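
/* Editor's sketch (not part of the original file): in phase2 the otherwise
 * unused binary_hierarchy_index field of MortonFlattenedBoxlessNode is
 * repurposed, as the kernel above shows: bits [31:6] carry the flattened node
 * id and bits [5:0] a mask of which of its (up to 6) children are phase1
 * subtree roots. The hypothetical helpers below just name that split. */
GRL_INLINE uint  SketchBoxlessNode_Id(MortonFlattenedBoxlessNode n)            { return n.binary_hierarchy_index >> 6; }
GRL_INLINE uchar SketchBoxlessNode_RootChildMask(MortonFlattenedBoxlessNode n) { return (uchar)(n.binary_hierarchy_index & 0x3F); }
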

@@ -1,521 +0,0 @@
//
|
||||
// Copyright (C) 2009-2022 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
#include "morton/morton_common.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
/*

This kernel constructs a binary hierarchy in a bottom-up fashion from
the morton codes.

*/
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int Delta(global struct MortonCodePrimitive* mc, const uint64_t key0, const uint i1 )
|
||||
{
|
||||
const uint64_t key1 = mc[i1].index_code;
|
||||
return clz(key0 ^ key1);
|
||||
}
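
/* Editor's note (added clarification, not in the original file): Delta()
 * measures how spatially close two morton-coded primitives are by counting the
 * identical leading bits of their 64-bit keys. Two keys that agree on their top
 * k bits and first differ on the next bit give Delta == k, so a larger value
 * means a longer common morton prefix and a tighter common bounding region. */
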
int sign( int d )
|
||||
{
|
||||
return (d > 0) ? 1 : -1;
|
||||
}
|
||||
|
||||
__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
|
||||
__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) )
|
||||
void kernel build_bottom_up_indirect( global struct Globals* globals,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
global struct MortonCodePrimitive* mc )
|
||||
{
|
||||
/* construct range of primitives that each work group will process */
|
||||
const uint numPrimitives = globals->numPrimitives;
|
||||
|
||||
uint i = get_group_id( 0 ) * get_local_size(0) + get_local_id( 0 );
|
||||
|
||||
if (i == 0)
|
||||
{
|
||||
globals->binary_hierarchy_root = 0;
|
||||
if (numPrimitives == 1)
|
||||
{
|
||||
// special kludge for 1-prim tree. Make sure the one leaf node is initialized
|
||||
bnodes[i].range.start = 0;
|
||||
bnodes[i].range.end = 0;
|
||||
bnodes[i].leftChild = -1;
|
||||
bnodes[i].rightChild = -1;
|
||||
}
|
||||
|
||||
// store pointer to the binary hierarchy in the globals struct.
// This will be used by the later build phases.
globals->binary_hierarchy_buffer = (gpuva_t) bnodes;
|
||||
}
|
||||
|
||||
uint num_inner_nodes = numPrimitives-1;
|
||||
if ( i < num_inner_nodes )
|
||||
{
|
||||
//
|
||||
// direction is 1 if this morton code is the node's first key, -1 if it's the last
|
||||
// By construction every internal node is either the start or the end of a given key range
|
||||
// direction should be towards the neighbor with the most bits in common
|
||||
|
||||
uint64_t ki = mc[i].index_code;
|
||||
|
||||
int direction, delta_min;
|
||||
uint lmax;
|
||||
if( i == 0 )
|
||||
{
|
||||
direction = 1;
|
||||
delta_min = -1;
|
||||
lmax = numPrimitives;
|
||||
}
|
||||
else
|
||||
{
|
||||
direction = sign( Delta( mc, ki, i + 1 ) - Delta( mc, ki, i - 1 ) );
|
||||
delta_min = Delta( mc, ki, i - direction );
|
||||
|
||||
// find upper bound for length of this node's key range
|
||||
lmax = 8;
|
||||
while ( (i+lmax*direction) < numPrimitives && Delta( mc, ki, i+lmax*direction ) > delta_min)
|
||||
lmax = lmax * 2;
|
||||
}
|
||||
|
||||
// clamp max length so that the binary searches are fully in-bounds
|
||||
uint maxLen = (direction>0) ? (numPrimitives - i) : (i+1);
|
||||
lmax = min(lmax, maxLen);
|
||||
|
||||
// find end of range using binary search
|
||||
uint length = 0;
|
||||
uint end = lmax-1;
|
||||
while (length != end)
|
||||
{
|
||||
uint mid = length + ((end-length)/2) + ((end-length)%2);
|
||||
bool bigger = Delta( mc, ki, i+mid*direction) > delta_min;
|
||||
length = bigger ? mid : length;
|
||||
end = bigger ? end : mid-1;
|
||||
}
|
||||
uint j = i + length*direction ;
|
||||
|
||||
// find split position using binary search
|
||||
uint split = 0;
|
||||
end = length-1;
|
||||
int delta_node = Delta(mc, ki, j);
|
||||
while (split != end)
|
||||
{
|
||||
uint mid = split + ((end-split)/2) + ((end-split)%2);
|
||||
bool bigger = Delta( mc, ki, i+mid*direction) > delta_node;
|
||||
split = bigger ? mid : split;
|
||||
end = bigger ? end : mid-1;
|
||||
}
|
||||
split = i + split*direction + min(direction,0);
|
||||
|
||||
uint left = split;
|
||||
uint right = split+1;
|
||||
|
||||
// mark leaves
|
||||
if( min(i,j) == split )
|
||||
left = left | (1<<31);
|
||||
if( max(i,j) == split+1 )
|
||||
right = right | (1<<31);
|
||||
|
||||
bnodes[i].range.start = min(i,j);
|
||||
bnodes[i].range.end = max(i,j);
|
||||
bnodes[i].leftChild = left;
|
||||
bnodes[i].rightChild = right;
|
||||
}
|
||||
}
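
/* Editor's sketch (not part of the original file): build_bottom_up_indirect
 * marks a child reference as a leaf by setting bit 31; the remaining bits are
 * the child index (a morton-code position for a leaf, a binary-node index for
 * an inner node). A consumer might decode it roughly as the hypothetical
 * helpers below do. */
GRL_INLINE bool SketchBinaryChild_IsLeaf(uint child) { return (child & (1u << 31)) != 0; }
GRL_INLINE uint SketchBinaryChild_Index(uint child)  { return child & ~(1u << 31); }
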
#if 0
|
||||
__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
|
||||
__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) )
|
||||
void kernel build_bottom_up_indirect( global struct Globals* globals,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
global struct MortonCodePrimitive* mc )
|
||||
{
|
||||
/* construct range of primitives that each work group will process */
|
||||
const uint numPrimitives = globals->numPrimitives;
|
||||
|
||||
// RangeFactor determines the distance between adjacent nodeIds in a work group.
// The aim of this distribution of nodes within a work group, for rangeFactor > 1,
// is to make sure that half of the work groups are dropped off entirely
// at the bottom layer of the graph. This way the EUs can be reused faster.
// The factor needs to be smaller than MAX_HW_SIMD_WIDTH.
|
||||
const uint rangeFactor = 2;
|
||||
|
||||
const uint numGroups = ((numPrimitives + MAX_HW_SIMD_WIDTH - 1) / MAX_HW_SIMD_WIDTH);
|
||||
const uint globalId = get_group_id( 0 ) * MAX_HW_SIMD_WIDTH + get_local_id( 0 );
|
||||
const uint numPrimitivesAlignedToWGSize = MAX_HW_SIMD_WIDTH * numGroups;
|
||||
const uint groupsRange = numPrimitivesAlignedToWGSize / rangeFactor;
|
||||
|
||||
/* iterate over all primitives the work group should process */
|
||||
const uint i = (globalId * rangeFactor) % numPrimitivesAlignedToWGSize + globalId / groupsRange;
|
||||
|
||||
if ( i < numPrimitives )
|
||||
{
|
||||
uint node = i | ((uint)1 << 31);
|
||||
uint start = i;
|
||||
uint end = i;
|
||||
|
||||
/* bottom up */
|
||||
while ( true )
|
||||
{
|
||||
/* goto parent node and link parent node to current node */
|
||||
node = updateParent( bnodes, mc, node, start, end, numPrimitives - 1 );
|
||||
|
||||
/* do not continue if we reached this node the first time */
|
||||
if ( node == -1 )
|
||||
break;
|
||||
|
||||
mem_fence_gpu_invalidate();
|
||||
|
||||
/* update range */
|
||||
start = bnodes[node].range.start;
|
||||
end = bnodes[node].range.end;
|
||||
|
||||
/* stop when we reached the root node */
|
||||
if ( start == 0 && end == numPrimitives - 1 )
|
||||
{
|
||||
globals->binary_hierarchy_root = node;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
||||
This function builds one QBVH6 node by opening the provided binary
|
||||
BVH nodes until the QBVH node is full.
|
||||
|
||||
*/
|
||||
|
||||
GRL_INLINE void create_node(global struct Globals *globals,
|
||||
global struct BinaryMortonCodeHierarchy *bnodes,
|
||||
global char *bvh_mem,
|
||||
uint rID,
|
||||
local uint *local_numRecords,
|
||||
local uint *local_QNodeOffset,
|
||||
struct BuildRecordMorton *records,
|
||||
struct BuildRecordMorton *current)
|
||||
{
|
||||
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
|
||||
const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
|
||||
global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
|
||||
BackPointers *backPointers = BVHBase_GetBackPointers(bvh);
|
||||
|
||||
/* initialize child array */
|
||||
uint numChildren = 2;
|
||||
struct BuildRecordMorton children[BVH_NODE_N6];
|
||||
children[0].nodeID = bnodes[current->nodeID].leftChild;
|
||||
children[0].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[0].nodeID);
|
||||
children[1].nodeID = bnodes[current->nodeID].rightChild;
|
||||
children[1].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[1].nodeID);
|
||||
|
||||
/* fill QBVH6 node with up to 6 children */
|
||||
while (numChildren < BVH_NODE_N6)
|
||||
{
|
||||
/*! find best child to split */
|
||||
uint bestItems = 0;
|
||||
int bestChild = -1;
|
||||
for (int i = 0; i < numChildren; i++)
|
||||
{
|
||||
const uint items = children[i].items;
|
||||
|
||||
/* ignore leaves as they cannot get split */
|
||||
if (items <= cfg_minLeafSize)
|
||||
continue;
|
||||
|
||||
/* find child with largest number of items */
|
||||
if (items > bestItems)
|
||||
{
|
||||
bestItems = items;
|
||||
bestChild = i;
|
||||
}
|
||||
}
|
||||
if (bestChild == -1)
|
||||
break;
|
||||
|
||||
/* perform best found split */
|
||||
const uint bestNodeID = children[bestChild].nodeID;
|
||||
struct BuildRecordMorton *lrecord = &children[bestChild];
|
||||
struct BuildRecordMorton *rrecord = &children[numChildren];
|
||||
lrecord->nodeID = bnodes[bestNodeID].leftChild;
|
||||
lrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, lrecord->nodeID);
|
||||
rrecord->nodeID = bnodes[bestNodeID].rightChild;
|
||||
rrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, rrecord->nodeID);
|
||||
numChildren++;
|
||||
}
|
||||
|
||||
/* allocate memory for all children */
|
||||
const uint child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
|
||||
global struct QBVHNodeN *childNodes = (global struct QBVHNodeN *)(bvh_mem + child_node_offset);
|
||||
|
||||
/* create node, but to not set bounds yet as these get calculated during refit */
|
||||
const uint current_index = current->current_index;
|
||||
struct QBVHNodeN *qnode = nodeData + current_index;
|
||||
QBVH6Node_set_type(qnode, BVH_INTERNAL_NODE);
|
||||
QBVHNodeN_setChildIncr1(qnode);
|
||||
QBVH6Node_set_offset(qnode, childNodes);
|
||||
|
||||
/* set back pointers */
|
||||
*InnerNode_GetBackPointer(backPointers, current_index) = (current->parent_index << 6) | (numChildren << 3);
|
||||
|
||||
/* update parent pointer of build records of all children */
|
||||
for (uint ID = 0; ID < numChildren; ID++)
|
||||
{
|
||||
children[ID].current_index = childNodes - nodeData + ID;
|
||||
children[ID].parent_index = current_index;
|
||||
}
|
||||
|
||||
/* write out child build records */
|
||||
const uint global_offset = atomic_add_local(local_numRecords, numChildren - 1);
|
||||
records[rID] = children[0];
|
||||
|
||||
for (uint i = 1; i < numChildren; i++)
|
||||
records[global_offset + i - 1] = children[i];
|
||||
|
||||
mem_fence_workgroup_default();
|
||||
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* This function calculates the similarity between two morton
|
||||
* codes. It essentially counts how many bits of the morton codes are
|
||||
* equal starting at the top. The more bits are equal, the similar the
|
||||
* codes, and the closer the primitives are located spatially. */
|
||||
|
||||
GRL_INLINE uint64_t delta(global struct MortonCodePrimitive *mc,
|
||||
const uint id)
|
||||
{
|
||||
const uint64_t key0 = mc[id + 0].index_code;
|
||||
const uint64_t key1 = mc[id + 1].index_code;
|
||||
return clz(key0 ^ key1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* This function checks for a range [left,right] of morton codes, if
|
||||
* it is spatially closer to the left or to the right nodes. */
|
||||
|
||||
GRL_INLINE bool merge_to_right(global struct MortonCodePrimitive *mc,
|
||||
const uint left,
|
||||
const uint right,
|
||||
const uint last)
|
||||
{
|
||||
/* merge to right if we are at the left end of the array */
|
||||
if (left == 0)
|
||||
return true;
|
||||
|
||||
/* merge to left if we are at the right end of the array */
|
||||
if (right == last)
|
||||
return false;
|
||||
|
||||
/* otherwise merge to the side where the morton code sequence has
|
||||
* the largest number of equal bits from the top */
|
||||
return delta(mc, right) > delta(mc, left - 1);
|
||||
}
|
||||
|
||||
GRL_INLINE uint updateParent(global struct BinaryMortonCodeHierarchy *bnodes,
|
||||
global struct MortonCodePrimitive *mc,
|
||||
const uint nodeID,
|
||||
const uint left,
|
||||
const uint right,
|
||||
const uint last)
|
||||
{
|
||||
uint parent;
|
||||
|
||||
/* check if we should merge this node to the left or right */
|
||||
if (merge_to_right(mc, left, right, last))
|
||||
{
|
||||
parent = right;
|
||||
bnodes[parent].leftChild = nodeID;
|
||||
bnodes[parent].range.start = left;
|
||||
}
|
||||
else
|
||||
{
|
||||
parent = left - 1;
|
||||
bnodes[parent].rightChild = nodeID;
|
||||
bnodes[parent].range.end = right;
|
||||
}
|
||||
|
||||
mem_fence_gpu_default();
|
||||
|
||||
/* stop ascending the tree if we reached this node the first time */
|
||||
const bool first = atomic_inc_global((global uint *)&bnodes[parent].flag) == 0;
|
||||
return first ? -1 : parent;
|
||||
}
|
||||
|
||||
GRL_INLINE void
|
||||
DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
|
||||
global struct MortonCodePrimitive* mc,
|
||||
global struct AABB* primref,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
global char* bvh_mem,
|
||||
uint startID, uint endID,
|
||||
local uint* local_numRecords,
|
||||
local uint* local_numRecordsOld,
|
||||
local struct BuildRecordMorton* local_records
|
||||
)
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);
|
||||
|
||||
/* iterate over all subtrees this workgroup should build */
|
||||
for ( uint recordID = startID; recordID < endID; recordID++ )
|
||||
{
|
||||
/* add start build record to local stack */
|
||||
if ( get_local_id( 0 ) == 0 )
|
||||
{
|
||||
local_records[0] = records[recordID];
|
||||
*local_numRecords = 1;
|
||||
*local_numRecordsOld = 0;
|
||||
}
|
||||
work_group_barrier( CLK_LOCAL_MEM_FENCE );
|
||||
|
||||
/* terminate when all subtrees are leaves */
|
||||
while ( *local_numRecords != *local_numRecordsOld )
|
||||
{
|
||||
/* remember the old number of build records to detect later
|
||||
* whether we are done */
|
||||
if ( get_local_id( 0 ) == 0 )
|
||||
{
|
||||
*local_numRecordsOld = *local_numRecords;
|
||||
}
|
||||
work_group_barrier( CLK_LOCAL_MEM_FENCE );
|
||||
|
||||
/* all work items in the sub group pick a subtree to build */
|
||||
for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
|
||||
{
|
||||
/* ignore small subtrees */
|
||||
if ( local_records[ID].items <= BVH_NODE_N6 )
|
||||
continue;
|
||||
|
||||
/* create QBVH node */
|
||||
create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
|
||||
}
|
||||
|
||||
/* wait for all work items to have updated local_records array */
|
||||
work_group_barrier( CLK_LOCAL_MEM_FENCE );
|
||||
}
|
||||
|
||||
const uint shift_mask = globals->shift_mask;
|
||||
const uint leafPrimType = globals->leafPrimType;
|
||||
const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
|
||||
BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
|
||||
global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
|
||||
|
||||
/* create all fat leaf nodes and initiate refit */
|
||||
for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
|
||||
{
|
||||
struct BuildRecordMorton current = local_records[ID];
|
||||
const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );
|
||||
|
||||
global struct QBVHNodeN* qnode = nodeData + current.current_index;
|
||||
|
||||
/* get bounds of all children of the fat leaf node */
|
||||
struct AABB bounds[BVH_NODE_N6];
|
||||
for ( uint i = 0; i < current.items; i++ )
|
||||
{
|
||||
/* get primID and bounds of primitive */
|
||||
const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
|
||||
bounds[i] = primref[primID];
|
||||
|
||||
/* For all primitives in a fat leaf we store a back
|
||||
* pointer. This way we can modify the fat leaf node at leaf construction time. */
|
||||
const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;
|
||||
|
||||
/* Store back pointer and primID inside morton code array to
|
||||
* be later used by leaf creation. */
|
||||
mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
|
||||
}
|
||||
|
||||
/* update fat leaf node */
|
||||
QBVHNodeN_setType( qnode, leafPrimType );
|
||||
global void* offset;
|
||||
if ( leafPrimType != BVH_INSTANCE_NODE )
|
||||
{
|
||||
offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
|
||||
QBVHNodeN_setChildIncr1( qnode );
|
||||
}
|
||||
else
|
||||
{
|
||||
offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
|
||||
QBVHNodeN_setChildIncr2( qnode );
|
||||
}
|
||||
QBVH6Node_set_offset( qnode, offset );
|
||||
QBVHNodeN_setBounds( qnode, bounds, current.items );
|
||||
|
||||
/* set back pointers for fat leaf nodes */
|
||||
*InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);
|
||||
|
||||
/* bottom up refit */
|
||||
refit_bottom_up( qnode, bvh, bounds, current.items );
|
||||
}
|
||||
}
|
||||
}
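
// Illustrative sketch (assumption, not taken from the original file) of the 64-bit packing used
// above: the fat-leaf back pointer is stored in the upper 32 bits of the morton code slot and the
// primID in the lower 32 bits, so leaf construction can later recover both. The helper names are
// hypothetical and only exist to make the packing explicit.
GRL_INLINE ulong example_pack_backpointer_primID( uint back_pointer, uint primID )
{
    return ((ulong)back_pointer << 32) | (ulong)primID;    // same layout as mc[].index_code above
}

GRL_INLINE uint example_unpack_backpointer( ulong index_code )
{
    return (uint)(index_code >> 32);
}

GRL_INLINE uint example_unpack_primID( ulong index_code )
{
    return (uint)(index_code & 0xFFFFFFFFul);
}
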
|
||||
|
||||
/*

  This phase takes the build records calculated in phase0 as input and
  finishes the BVH construction for all these subtrees.

 */
|
||||
__attribute__((reqd_work_group_size(8, 1, 1)))
void kernel
old_parallel_build_phase1(global struct Globals *globals,
|
||||
global struct MortonCodePrimitive *mc,
|
||||
global struct AABB *primref,
|
||||
global struct BinaryMortonCodeHierarchy *bnodes,
|
||||
global char *bvh_mem)
|
||||
{
|
||||
global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
|
||||
global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
|
||||
|
||||
/* a queue of build records */
|
||||
local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
|
||||
local uint local_numRecords;
|
||||
local uint local_numRecordsOld;
|
||||
|
||||
/* construct range of build records that each sub group will process */
|
||||
const uint numRecords = globals->numBuildRecords;
|
||||
const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
|
||||
const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);
|
||||
|
||||
DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
|
||||
|
||||
}
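
// Worked example (illustration only): with the linear split above, workgroup g covers records
// [ g*numRecords/numGroups, (g+1)*numRecords/numGroups ). For numRecords = 10 and
// get_num_groups(0) = 4 this gives the ranges [0,2), [2,5), [5,7), [7,10), so every record is
// processed exactly once and group sizes differ by at most one.
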
|
||||
|
||||
__attribute__( (reqd_work_group_size( 8, 1, 1 )) )
void kernel
old_parallel_build_phase1_Indirect( global struct Globals* globals,
|
||||
global struct MortonCodePrimitive* mc,
|
||||
global struct AABB* primref,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes,
|
||||
global char* bvh_mem )
|
||||
{
|
||||
global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
|
||||
global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);
|
||||
|
||||
/* a queue of build records */
|
||||
local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
|
||||
local uint local_numRecords;
|
||||
local uint local_numRecordsOld;
|
||||
|
||||
/* construct range of build records that each sub group will process */
|
||||
const uint numRecords = globals->numBuildRecords;
|
||||
uint startID = get_group_id( 0 );
|
||||
uint endID = startID + 1;
|
||||
|
||||
DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
|
||||
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2022 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "morton/morton_common.h"
|
||||
|
||||
GRL_INLINE uint get_morton_shift( uint numPrimitives )
|
||||
{
|
||||
return 32 - clz( numPrimitives );
|
||||
}
|
||||
|
||||
GRL_INLINE uint get_morton_shift_mask( uint numPrimitives )
|
||||
{
|
||||
uint shift = get_morton_shift( numPrimitives );
|
||||
uint mask = (uint)((ulong)1 << shift);
|
||||
return mask - 1; // separated due to problems in DX
|
||||
}
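
// Illustrative example (assumption: numbers only, helper names are hypothetical): with
// numPrimitives = 1000, clz(1000) = 22, so get_morton_shift() returns 10 and
// get_morton_shift_mask() returns 0x3FF. The builder then packs (code << shift) | primID into a
// 64-bit key and recovers the primitive with index_code & shift_mask, e.g.:
GRL_INLINE ulong example_pack_morton_key( ulong code, uint primID, uint shift )
{
    return (code << shift) | (ulong)primID;      // morton code in the high bits, primID in the low 'shift' bits
}

GRL_INLINE uint example_unpack_primID_from_key( ulong index_code, uint shift_mask )
{
    return (uint)(index_code & shift_mask);      // same extraction the phase1/leaf kernels perform
}
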
|
||||
|
||||
__attribute__((reqd_work_group_size(1, 1, 1))) void kernel init( global struct Globals *globals )
|
||||
{
|
||||
/* variable shift for putting morton code + index to 64 bit */
|
||||
const uint shift = 32 - clz(globals->numPrimitives);
|
||||
globals->shift = shift;
|
||||
globals->shift_mask = (uint)(((ulong)1 << shift));
|
||||
globals->shift_mask -= 1; // separated due to problems in DX
|
||||
globals->binary_hierarchy_root = 0;
|
||||
globals->morton_sort_in_flight = 0;
|
||||
globals->sort_iterations = get_morton_sort_lsb_req_iterations(shift);
|
||||
}
|
||||
|
||||
/*

  This kernel creates a morton code array containing a morton code and
  an index into the primref array.

  The code uses the maximal number of bits for the morton code, such
  that the morton code and index can still both get stored in 64 bits.

  The algorithm first maps the centroids of the primitives and their
  bounding box diagonal into a 4D grid, and then interleaves all 4
  grid coordinates to construct the morton code.

 */
|
||||
|
||||
__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
|
||||
__attribute__( (intel_reqd_sub_group_size( 16 )) ) void kernel
|
||||
create_morton_codes_indirect( global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global struct AABB* primref,
|
||||
global struct MortonCodePrimitive* morton_codes,
|
||||
global struct MortonCodePrimitive* morton_codes_tmp,
|
||||
uint use_new_morton_sort)
|
||||
{
|
||||
/* construct range of morton codes each work group should create */
|
||||
const uint numPrimitives = globals->numPrimitives;
|
||||
const uint startID = get_group_id( 0 ) * get_local_size( 0 );
|
||||
const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives);
|
||||
|
||||
/* get lower and upper bounds of geometry and length of scene diagonal */
|
||||
const float3 lower = globals->centroidBounds.lower.xyz;
|
||||
const float3 upper = globals->centroidBounds.upper.xyz;
|
||||
const float diag = length( AABB3f_size( &bvh->Meta.bounds ).xyz );
|
||||
|
||||
/* calculates the 4D grid */
|
||||
const uint shift = get_morton_shift( numPrimitives );
|
||||
const uint grid_size = 1 << (64 - shift) / 4;
|
||||
const float4 grid_base = (float4)(lower, 0.0f);
|
||||
const float4 grid_extend = (float4)(upper - lower, diag);
|
||||
const float4 grid_scale = select( (grid_size * 0.99f) / grid_extend, 0.0f, grid_extend == 0.0f ); // FIXME: 0.99f!!!!!
|
||||
|
||||
const uint req_iterations = get_morton_sort_lsb_req_iterations(shift);
|
||||
|
||||
/* each work group iterates over its range of morton codes to create */
|
||||
uint primID = startID + get_local_id( 0 );
|
||||
if( primID < endID )
|
||||
{
|
||||
/* calculate position inside 4D grid */
|
||||
float4 centroid2 = AABB_centroid2( &primref[primID] );
|
||||
centroid2.w = length( AABB_size( &primref[primID] ).xyz );
|
||||
const uint4 gridpos = convert_uint4_rtz( (centroid2 - grid_base) * grid_scale );
|
||||
|
||||
/* calculate and store morton code */
|
||||
const ulong code = ulong_bitInterleave4D( gridpos );
|
||||
const ulong index_code = ((ulong)code << shift) | (ulong)primID;
|
||||
|
||||
// The morton codes must be in the morton_codes buffer after the LSB sort finishes.
// If an odd number of sorting iterations is needed, some sorting iterations have to be skipped,
// so for an odd iteration count start with the morton_codes_tmp buffer.
|
||||
if(req_iterations & 1 && !use_new_morton_sort)
|
||||
morton_codes_tmp[primID].index_code = index_code;
|
||||
else
|
||||
morton_codes[primID].index_code = index_code;
|
||||
}
|
||||
}
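
// Sketch of 4D bit interleaving, added for illustration only. The kernel above uses the existing
// ulong_bitInterleave4D() helper; the loop below shows the general idea (bit i of each of the four
// grid coordinates lands at position 4*i + component) and is not claimed to be bit-exact with it.
GRL_INLINE ulong example_bitInterleave4D( uint4 v, uint bits_per_component )
{
    ulong code = 0;
    for ( uint i = 0; i < bits_per_component; i++ )
    {
        code |= (ulong)((v.x >> i) & 1) << (4 * i + 0);
        code |= (ulong)((v.y >> i) & 1) << (4 * i + 1);
        code |= (ulong)((v.z >> i) & 1) << (4 * i + 2);
        code |= (ulong)((v.w >> i) & 1) << (4 * i + 3);
    }
    return code;
}
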
|
||||
|
||||
/*
|
||||
|
||||
Initialization of the binary morton code hierarchy.
|
||||
|
||||
*/
|
||||
|
||||
__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) void kernel init_bottom_up_indirect( global struct Globals* globals,
|
||||
global struct BinaryMortonCodeHierarchy* bnodes )
|
||||
{
|
||||
/* construct range each work group will process */
|
||||
const uint numPrimitives = globals->numPrimitives;
|
||||
const uint startID = get_group_id( 0 ) * get_local_size(0);
|
||||
const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives);
|
||||
|
||||
/* each workgroup iterates over its range to initialize the binary BVH */
|
||||
uint i = startID + get_local_id( 0 );
|
||||
if( i < endID )
|
||||
BinaryMortonCodeHierarchy_init( &bnodes[i], 0, numPrimitives - 1 );
|
||||
}
|
||||
|
|
@ -1,335 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module morton_builder;
|
||||
|
||||
kernel_module morton_kernels ("morton/pre_sort.cl")
|
||||
{
|
||||
kernel opencl_build_kernel_init < kernelFunction="init" >;
|
||||
kernel opencl_build_morton_kernel_create_morton_codes_indirect < kernelFunction="create_morton_codes_indirect" >;
|
||||
kernel opencl_build_morton_kernel_init_bottom_up_indirect < kernelFunction="init_bottom_up_indirect" >;
|
||||
}
|
||||
|
||||
kernel_module morton_kernels ("morton/post_sort.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel opencl_build_morton_kernel_build_bottom_up_indirect < kernelFunction="build_bottom_up_indirect" >;
|
||||
}
|
||||
|
||||
kernel_module morton_kernels ("morton/phase0.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel opencl_build_morton_kernel_parallel_build_phase0 < kernelFunction="parallel_build_phase0" >;
|
||||
kernel opencl_build_morton_kernel_parallel_build_phase0_local_sync < kernelFunction="parallel_build_phase0_local_sync" >;
|
||||
}
|
||||
|
||||
kernel_module morton_kernels ("morton/phase1.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel opencl_build_morton_kernel_parallel_build_phase1_Indirect < kernelFunction="parallel_build_phase1_Indirect_SG" >;
|
||||
kernel opencl_build_morton_kernel_parallel_build_phase1_root < kernelFunction="parallel_build_phase1_Indirect_global_root" >;
|
||||
}
|
||||
|
||||
kernel_module morton_kernels ("morton/phase2.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel opencl_build_morton_kernel_parallel_build_phase2_refit < kernelFunction="parallel_build_phase2_refit" >;
|
||||
kernel opencl_build_morton_kernel_parallel_build_phase2_refit_local < kernelFunction="parallel_build_phase2_refit_local" >;
|
||||
}
|
||||
|
||||
import struct MKBuilderState "structs.grl";
|
||||
|
||||
/*
|
||||
metakernel begin(
|
||||
MKBuilderState state,
|
||||
qword morton_code_buffer,
|
||||
dword primLeafType,
|
||||
dword numHwThreads)
|
||||
{
|
||||
dispatch opencl_build_kernel_init(1, 1, 1) args(
|
||||
state.build_globals
|
||||
);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
|
||||
dispatch opencl_build_morton_kernel_create_morton_codes(numHwThreads, 1, 1) args(
|
||||
state.build_globals,
|
||||
state.bvh_buffer,
|
||||
state.build_primref_buffer,
|
||||
morton_code_buffer);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
}
|
||||
|
||||
metakernel build_bottom_up(
|
||||
MKBuilderState state,
|
||||
qword buildrecords_bottom_up,
|
||||
qword morton_code_buffer,
|
||||
dword numHwThreads)
|
||||
{
|
||||
dispatch opencl_build_morton_kernel_init_bottom_up(numHwThreads, 1, 1) args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
dispatch opencl_build_morton_kernel_build_bottom_up(numHwThreads, 1, 1) args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up,
|
||||
morton_code_buffer);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
}
|
||||
|
||||
|
||||
metakernel parallel_build(
|
||||
MKBuilderState state,
|
||||
qword buildrecords_bottom_up,
|
||||
qword morton_code_buffer,
|
||||
dword numHwThreads)
|
||||
{
|
||||
dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up,
|
||||
state.bvh_buffer);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
dispatch opencl_build_morton_kernel_parallel_build_phase1(numHwThreads, 1, 1) args(
|
||||
state.build_globals,
|
||||
morton_code_buffer,
|
||||
state.build_primref_buffer,
|
||||
buildrecords_bottom_up,
|
||||
state.bvh_buffer);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
metakernel NewMorton_pre_sort(
|
||||
qword num_primrefs_counter,
|
||||
MKBuilderState state,
|
||||
qword morton_code_buffer,
|
||||
qword morton_code_buffer_tmp,
|
||||
qword buildrecords_bottom_up,
|
||||
dword use_new_morton_sort)
|
||||
{
|
||||
|
||||
|
||||
{
|
||||
REG1 = 15;
|
||||
REG2 = 4;
|
||||
REG0 = load_dword( num_primrefs_counter );
|
||||
|
||||
REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
|
||||
REG1 = ~REG1;
|
||||
REG0 = REG0 & REG1;
|
||||
REG0 = REG0 >> REG2;
|
||||
}
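
    /*
       The register block above computes ceil(num_primrefs / 16), i.e. the number of 16-wide
       workgroups to dispatch: REG0 = ((count + 15) & ~15) >> 4. For example, 100 primrefs give
       (100 + 15) & ~15 = 112, and 112 >> 4 = 7 groups.
    */
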
|
||||
|
||||
dispatch opencl_build_kernel_init(1, 1, 1) args( state.build_globals );
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
/*
|
||||
// new bottom-up kernel does not need this
|
||||
dispatch_indirect opencl_build_morton_kernel_init_bottom_up_indirect args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up);
|
||||
*/
|
||||
dispatch_indirect opencl_build_morton_kernel_create_morton_codes_indirect args(
|
||||
state.build_globals,
|
||||
state.bvh_buffer,
|
||||
state.build_primref_buffer,
|
||||
morton_code_buffer,
|
||||
morton_code_buffer_tmp,
|
||||
use_new_morton_sort);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
metakernel NewMorton_post_sort(
|
||||
qword num_primrefs_counter,
|
||||
qword num_buildrecords_counter,
|
||||
MKBuilderState state,
|
||||
qword buildrecords_bottom_up,
|
||||
qword morton_code_buffer )
|
||||
{
|
||||
|
||||
{
|
||||
REG1 = 15;
|
||||
REG2 = 4;
|
||||
REG0 = load_dword( num_primrefs_counter );
|
||||
|
||||
REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
|
||||
REG1 = ~REG1;
|
||||
REG0 = REG0 & REG1;
|
||||
REG0 = REG0 >> REG2;
|
||||
}
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up,
|
||||
morton_code_buffer);
|
||||
|
||||
|
||||
/*
|
||||
dispatch opencl_build_morton_kernel_build_bottom_up(16, 1, 1) args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up,
|
||||
morton_code_buffer);
|
||||
*/
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up,
|
||||
state.bvh_buffer);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
DISPATCHDIM_X = load_dword( num_buildrecords_counter );
|
||||
|
||||
dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args(
|
||||
state.build_globals,
|
||||
morton_code_buffer,
|
||||
state.build_primref_buffer,
|
||||
buildrecords_bottom_up,
|
||||
state.bvh_buffer);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
}
|
||||
|
||||
metakernel NewMorton_bottom_up(
|
||||
qword num_primrefs_counter,
|
||||
MKBuilderState state,
|
||||
qword buildrecords_bottom_up,
|
||||
qword morton_code_buffer )
|
||||
{
|
||||
|
||||
{
|
||||
REG1 = 15;
|
||||
REG2 = 4;
|
||||
REG0 = load_dword( num_primrefs_counter );
|
||||
|
||||
REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
|
||||
REG1 = ~REG1;
|
||||
REG0 = REG0 & REG1;
|
||||
REG0 = REG0 >> REG2;
|
||||
}
|
||||
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up,
|
||||
morton_code_buffer);
|
||||
}
|
||||
|
||||
|
||||
metakernel NewMorton_phase0(
|
||||
MKBuilderState state,
|
||||
qword buildrecords_bottom_up,
|
||||
qword morton_p0_refit_startpoints)
|
||||
{
|
||||
|
||||
dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up,
|
||||
state.bvh_buffer,
|
||||
morton_p0_refit_startpoints);
|
||||
}
|
||||
|
||||
metakernel NewMorton_phase0_local_sync(
|
||||
MKBuilderState state,
|
||||
qword buildrecords_bottom_up,
|
||||
qword p0_boxless_nodes)
|
||||
{
|
||||
|
||||
dispatch opencl_build_morton_kernel_parallel_build_phase0_local_sync(1, 1, 1) args(
|
||||
state.build_globals,
|
||||
buildrecords_bottom_up,
|
||||
state.bvh_buffer,
|
||||
p0_boxless_nodes);
|
||||
}
|
||||
|
||||
|
||||
metakernel NewMorton_phase1(
|
||||
qword num_buildrecords_counter,
|
||||
MKBuilderState state,
|
||||
qword buildrecords_bottom_up,
|
||||
qword morton_code_buffer)
|
||||
{
|
||||
|
||||
DISPATCHDIM_X = load_dword( num_buildrecords_counter );
|
||||
|
||||
dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args(
|
||||
state.build_globals,
|
||||
morton_code_buffer,
|
||||
state.build_primref_buffer,
|
||||
buildrecords_bottom_up,
|
||||
state.bvh_buffer);
|
||||
}
|
||||
|
||||
metakernel NewMorton_phase1_root(
|
||||
qword num_buildrecords_counter,
|
||||
MKBuilderState state,
|
||||
qword buildrecords_bottom_up,
|
||||
qword morton_code_buffer)
|
||||
{
|
||||
dispatch opencl_build_morton_kernel_parallel_build_phase1_root(1, 1, 1) args(
|
||||
state.build_globals,
|
||||
morton_code_buffer,
|
||||
state.build_primref_buffer,
|
||||
buildrecords_bottom_up,
|
||||
state.bvh_buffer);
|
||||
}
|
||||
|
||||
metakernel NewMorton_phase2(
|
||||
qword num_leaves_counter,
|
||||
MKBuilderState state,
|
||||
qword bottom_node_ids )
|
||||
{
|
||||
|
||||
DISPATCHDIM_X = load_dword( num_leaves_counter );
|
||||
|
||||
dispatch_indirect opencl_build_morton_kernel_parallel_build_phase2_refit args(
|
||||
state.bvh_buffer,
|
||||
bottom_node_ids);
|
||||
}
|
||||
|
||||
metakernel NewMorton_phase2_local(
|
||||
MKBuilderState state,
|
||||
qword p0_boxless_nodes)
|
||||
{
|
||||
|
||||
dispatch opencl_build_morton_kernel_parallel_build_phase2_refit_local(1, 1, 1) args(
|
||||
state.build_globals,
|
||||
state.bvh_buffer,
|
||||
p0_boxless_nodes);
|
||||
}
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
// just inlines the kernels that are there in the header
|
||||
#include "morton_msb_radix_bitonic_sort.h"
|
||||
|
|
@ -1,924 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include "morton_msb_radix_bitonic_sort_shared.h"
|
||||
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Configuration switches
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define DEBUG 0
|
||||
#define MERGE_BLS_WITHIN_SG 0
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#if DEBUG
|
||||
#define DEBUG_CODE(A) A
|
||||
#else
|
||||
#define DEBUG_CODE(A)
|
||||
#endif
|
||||
|
||||
#define BOTTOM_LEVEL_SORT_WG_SIZE 512
|
||||
|
||||
// this kernel is only put into the metakernel for debugging, to print that execution reached this point
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void kernel debug_print_kernel(uint variable)
|
||||
{
|
||||
if(get_local_id(0) == 0)
|
||||
printf("I'm here! %d\n", variable);
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void kernel check_bls_sort(global struct Globals* globals, global ulong* input)
|
||||
{
|
||||
uint prims_num = globals->numPrimitives;
|
||||
|
||||
printf("in check_bls_sort kernel. Values count: %d\n", prims_num);
|
||||
|
||||
ulong left = input[0];
|
||||
ulong right;
|
||||
for (int i = 0; i < prims_num - 1; i++)
|
||||
{
|
||||
right = input[i + 1];
|
||||
printf("sorted val: %llu\n", left);
|
||||
if (left > right)
|
||||
{
|
||||
printf("element %d is bigger than %d: %llu > %llu\n", i, i+1, left, right);
|
||||
}
|
||||
left = right;
|
||||
}
|
||||
}
|
||||
|
||||
inline uint wg_scan_inclusive_add_opt(local uint* tmp, uint val, uint SG_SIZE, uint WG_SIZE)
|
||||
{
|
||||
const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE;
|
||||
const uint sg_local_id = get_local_id(0) % SG_SIZE;
|
||||
const uint NUM_HW_THREADS_IN_WG = WG_SIZE / SG_SIZE;
|
||||
|
||||
uint acc = sub_group_scan_inclusive_add(val);
|
||||
if (NUM_HW_THREADS_IN_WG == 1)
|
||||
{
|
||||
return acc;
|
||||
}
|
||||
tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc, SG_SIZE - 1);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint loaded_val = sg_local_id < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0;
|
||||
uint wgs_acc = sub_group_scan_exclusive_add(loaded_val);
|
||||
uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id);
|
||||
// for > 256 workitems in SIMD16 the per-HW-thread partial sums won't fit in a single 16-wide subgroup, so we need additional iterations
// the same applies for > 64 workitems in SIMD8
|
||||
uint num_iterations = (NUM_HW_THREADS_IN_WG + SG_SIZE - 1) / SG_SIZE;
|
||||
for (int i = 1; i < num_iterations; i++)
|
||||
{
|
||||
// the "exclusive" scan leaves out the last loaded value, so add tmp[] back in
|
||||
uint prev_max_sum = sub_group_broadcast(wgs_acc, SG_SIZE - 1) + tmp[(i * SG_SIZE) - 1];
|
||||
loaded_val = (sg_local_id + i * SG_SIZE) < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0;
|
||||
wgs_acc = sub_group_scan_exclusive_add(loaded_val);
|
||||
wgs_acc += prev_max_sum;
|
||||
uint new_acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id % SG_SIZE);
|
||||
if (hw_thread_in_wg_id >= i * SG_SIZE)
|
||||
acc_for_this_hw_thread = new_acc_for_this_hw_thread;
|
||||
}
|
||||
return acc + acc_for_this_hw_thread;
|
||||
}
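
// Reference semantics (illustration only): the function above is a work-group-wide inclusive
// prefix sum built from sub-group scans. Its intended result is equivalent to this scalar sketch
// over the per-workitem values:
//
//   for (uint i = 0; i < WG_SIZE; i++)
//       result[i] = (i == 0) ? val[i] : result[i - 1] + val[i];
//
// e.g. the values {1, 0, 2, 1} scan to {1, 1, 3, 4}.
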
|
||||
|
||||
struct MSBDispatchArgs
|
||||
{
|
||||
global struct MSBRadixContext* context;
|
||||
uint num_of_wgs; // this is the number of workgroups that was dispatched for this context
|
||||
ulong* wg_key_start; // this is where keys to process start for current workgroup
|
||||
ulong* wg_key_end;
|
||||
uint shift_bit;
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
struct MSBDispatchArgs get_msb_dispatch_args(global struct VContextScheduler* scheduler)
|
||||
{
|
||||
global struct MSBDispatchQueue* queue = &scheduler->msb_queue;
|
||||
|
||||
uint group = get_group_id(0);
|
||||
struct MSBDispatchRecord record;
|
||||
|
||||
// TODO_OPT: Load this entire prefix array into SLM instead of searching..
|
||||
// Or use sub-group ops
|
||||
uint i = 0;
|
||||
while (i < queue->num_records)
|
||||
{
|
||||
uint n = queue->records[i].wgs_to_dispatch;
|
||||
|
||||
if (group < n)
|
||||
{
|
||||
record = queue->records[i];
|
||||
break;
|
||||
}
|
||||
|
||||
group -= n;
|
||||
i++;
|
||||
}
|
||||
|
||||
uint context_id = i;
|
||||
global struct MSBRadixContext* context = &scheduler->contexts[context_id];
|
||||
|
||||
// moving to ulongs to avoid uint overflow
|
||||
ulong group_id_in_dispatch = group;
|
||||
ulong start_offset = context->start_offset;
|
||||
ulong num_keys = context->num_keys;
|
||||
ulong wgs_to_dispatch = record.wgs_to_dispatch;
|
||||
|
||||
struct MSBDispatchArgs args;
|
||||
args.context = context;
|
||||
args.num_of_wgs = record.wgs_to_dispatch;
|
||||
args.wg_key_start = context->keys_in + start_offset + (group_id_in_dispatch * num_keys / wgs_to_dispatch);
|
||||
args.wg_key_end = context->keys_in + start_offset + ((group_id_in_dispatch+1) * num_keys / wgs_to_dispatch);
|
||||
args.shift_bit = MSB_SHIFT_BYTE_START_OFFSET - context->iteration * MSB_BITS_PER_ITERATION;
|
||||
return args;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void BLSDispatchQueue_push(global struct BLSDispatchQueue* queue, struct BLSDispatchRecord* record)
|
||||
{
|
||||
uint new_idx = atomic_inc_global(&queue->num_records);
|
||||
queue->records[new_idx] = *record;
|
||||
DEBUG_CODE(printf("adding bls of size: %d\n", record->count));
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void DO_CountSort(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output)
|
||||
{
|
||||
uint tid = get_local_id(0);
|
||||
|
||||
global ulong* in = ((global ulong*)(dispatchRecord.keys_in)) + dispatchRecord.start_offset;
|
||||
|
||||
ulong a = tid < dispatchRecord.count ? in[tid] : ULONG_MAX;
|
||||
|
||||
SLM_shared[tid] = a;
|
||||
|
||||
uint counter = 0;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
ulong curr = SLM_shared[get_sub_group_local_id()];
|
||||
|
||||
for (uint i = 16; i < dispatchRecord.count; i += 16)
|
||||
{
|
||||
ulong next = SLM_shared[i + get_sub_group_local_id()];
|
||||
|
||||
for (uint j = 0; j < 16; j++)
|
||||
{
|
||||
// some older drivers have a bug when shuffling ulong, so we shuffle 2x uint instead
|
||||
uint2 curr_as_uint2 = as_uint2(curr);
|
||||
uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j));
|
||||
ulong c = as_ulong(sg_curr_as_uint2);
|
||||
if (c < a)
|
||||
counter++;
|
||||
}
|
||||
|
||||
curr = next;
|
||||
}
|
||||
|
||||
|
||||
// last iter
|
||||
for (uint j = 0; j < 16; j++)
|
||||
{
|
||||
// some older drivers have a bug when shuffling ulong, so we shuffle 2x uint instead
|
||||
uint2 curr_as_uint2 = as_uint2(curr);
|
||||
uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j));
|
||||
ulong c = as_ulong(sg_curr_as_uint2);
|
||||
if (c < a)
|
||||
counter++;
|
||||
}
|
||||
|
||||
// save elements to its sorted positions
|
||||
if (tid < dispatchRecord.count)
|
||||
output[dispatchRecord.start_offset + counter] = a;
|
||||
}
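
// Sketch of the rank idea used by DO_CountSort above, assuming unique keys (the morton keys are
// unique because they embed the primID): each key's output slot is the number of keys smaller than
// it. A scalar reference version, with a hypothetical name, for illustration only:
GRL_INLINE void example_rank_sort( const ulong* in, ulong* out, uint count )
{
    for ( uint i = 0; i < count; i++ )
    {
        uint rank = 0;
        for ( uint j = 0; j < count; j++ )
            if ( in[j] < in[i] )
                rank++;
        out[rank] = in[i];       // unique keys guarantee the ranks do not collide
    }
}
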
|
||||
|
||||
void DO_Bitonic(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
uint elements_to_sort = BOTTOM_LEVEL_SORT_THRESHOLD;
|
||||
while ((elements_to_sort >> 1) >= dispatchRecord.count && elements_to_sort >> 1 >= BOTTOM_LEVEL_SORT_WG_SIZE)
|
||||
{
|
||||
elements_to_sort >>= 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
|
||||
{
|
||||
uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
|
||||
|
||||
if (tid >= dispatchRecord.count)
|
||||
SLM_shared[tid] = ULONG_MAX;
|
||||
else
|
||||
SLM_shared[tid] = ((global ulong*)(dispatchRecord.keys_in))[dispatchRecord.start_offset + tid];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint k_iterations = elements_to_sort;
|
||||
while(k_iterations >> 1 >= dispatchRecord.count && k_iterations != 0)
|
||||
{
|
||||
k_iterations >>= 1;
|
||||
}
|
||||
|
||||
for (unsigned int k = 2; k <= k_iterations; k *= 2)
|
||||
{
|
||||
for (unsigned int j = k / 2; j > 0; j /= 2)
|
||||
{
|
||||
// this loop is needed when we can't create a big enough workgroup, so each workitem has to process multiple elements
|
||||
for (uint i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
|
||||
{
|
||||
uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
|
||||
unsigned int ixj = tid ^ j;
|
||||
if (ixj > tid)
|
||||
{
|
||||
if ((tid & k) == 0)
|
||||
{
|
||||
if (SLM_shared[tid] > SLM_shared[ixj])
|
||||
{
|
||||
ulong tmp = SLM_shared[tid];
|
||||
SLM_shared[tid] = SLM_shared[ixj];
|
||||
SLM_shared[ixj] = tmp;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (SLM_shared[tid] < SLM_shared[ixj])
|
||||
{
|
||||
ulong tmp = SLM_shared[tid];
|
||||
SLM_shared[tid] = SLM_shared[ixj];
|
||||
SLM_shared[ixj] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
|
||||
{
|
||||
uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
|
||||
|
||||
if (tid < dispatchRecord.count)
|
||||
output[dispatchRecord.start_offset + tid] = SLM_shared[tid];
|
||||
}
|
||||
}
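
// Note (interpretation of the code above, not from the original comments): in DO_Bitonic, k is the
// size of the bitonic sequence being built and j is the compare-exchange distance. Each workitem
// pairs with its partner at index (tid ^ j) and the (tid & k) test selects the sort direction,
// which is the standard in-place bitonic network; the outer i-loop only exists so a workgroup
// smaller than elements_to_sort can still cover every element.
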
|
||||
|
||||
|
||||
|
||||
|
||||
void DO_Create_Separate_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
|
||||
uint start = context->start[lid];
|
||||
uint count = context->count[lid];
|
||||
uint start_offset = context->start_offset + start;
|
||||
|
||||
struct BLSDispatchRecord record;
|
||||
record.start_offset = start_offset;
|
||||
record.count = count;
|
||||
record.keys_in = context->keys_out;
|
||||
|
||||
if (count == 0) // we don't have elements so don't do anything
|
||||
{
|
||||
}
|
||||
else if (count == 1) // single element so just write it out
|
||||
{
|
||||
input[start_offset] = ((global ulong*)record.keys_in)[start_offset];
|
||||
}
|
||||
else if (count <= BOTTOM_LEVEL_SORT_THRESHOLD)
|
||||
{
|
||||
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// We try to merge small BLSes into a larger one within the sub_group
|
||||
void DO_Create_SG_Merged_BLS_Work_Parallel(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
uint sid = get_sub_group_local_id();
|
||||
|
||||
uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
|
||||
|
||||
uint start = context->start[lid];
|
||||
uint count = context->count[lid];
|
||||
uint ctx_start_offset = context->start_offset;
|
||||
|
||||
if (sid == 0 || create_msb_work) // these SIMD lanes are the beginning of a merged BLS
|
||||
{
|
||||
struct BLSDispatchRecord record;
|
||||
if (create_msb_work)
|
||||
{
|
||||
record.start_offset = ctx_start_offset + start + count;
|
||||
record.count = 0;
|
||||
}
|
||||
else // SIMD lane 0 case
|
||||
{
|
||||
record.start_offset = ctx_start_offset + start;
|
||||
record.count = count;
|
||||
}
|
||||
|
||||
record.keys_in = context->keys_out;
|
||||
|
||||
uint loop_idx = 1;
|
||||
while (sid + loop_idx < 16) // loop over subgroup
|
||||
{
|
||||
uint _create_msb_work = intel_sub_group_shuffle_down(create_msb_work, 0u, loop_idx);
|
||||
uint _count = intel_sub_group_shuffle_down(count, 0u, loop_idx);
|
||||
uint _start = intel_sub_group_shuffle_down(start, 0u, loop_idx);
|
||||
|
||||
if (_create_msb_work) // found the next MSB work, so the range of merges ends
|
||||
break;
|
||||
|
||||
// need to push record since nothing more will fit
|
||||
if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD)
|
||||
{
|
||||
if (record.count == 1)
|
||||
{
|
||||
input[record.start_offset] = record.keys_in[record.start_offset];
|
||||
}
|
||||
else if (record.count > 1)
|
||||
{
|
||||
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
|
||||
}
|
||||
record.start_offset = ctx_start_offset + _start;
|
||||
record.count = _count;
|
||||
}
|
||||
else
|
||||
{
|
||||
record.count += _count;
|
||||
}
|
||||
loop_idx++;
|
||||
}
|
||||
// if we have any elements left, then schedule them
|
||||
if (record.count == 1) // only one element, so just write it out
|
||||
{
|
||||
input[record.start_offset] = record.keys_in[record.start_offset];
|
||||
}
|
||||
else if (record.count > 1)
|
||||
{
|
||||
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
|
||||
}
|
||||
}
|
||||
}
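
// Summary of the merging policy above (restating the code, for clarity): consecutive small buckets
// of the subgroup are accumulated into one BLS record until either BOTTOM_LEVEL_SORT_MERGING_THRESHOLD
// would be exceeded or a bucket large enough to spawn more MSB work is reached; single-element
// records are written straight to the output instead of being queued.
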
|
||||
|
||||
|
||||
|
||||
|
||||
// We try to merge small BLSes into a larger one within the sub_group
|
||||
void DO_Create_SG_Merged_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
uint sid = get_sub_group_local_id();
|
||||
|
||||
uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
|
||||
|
||||
uint start = context->start[lid];
|
||||
uint count = context->count[lid];
|
||||
uint ctx_start_offset = context->start_offset;
|
||||
|
||||
if (sid == 0)
|
||||
{
|
||||
struct BLSDispatchRecord record;
|
||||
record.start_offset = ctx_start_offset + start;
|
||||
record.count = 0;
|
||||
record.keys_in = context->keys_out;
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
uint _create_msb_work = sub_group_broadcast(create_msb_work, i);
|
||||
uint _count = sub_group_broadcast(count, i);
|
||||
uint _start = sub_group_broadcast(start, i);
|
||||
if (_create_msb_work)
|
||||
{
|
||||
if (record.count == 1) // only one element, so just write it out
|
||||
{
|
||||
input[record.start_offset] = record.keys_in[record.start_offset];
|
||||
}
|
||||
else if (record.count > 1)
|
||||
{
|
||||
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
|
||||
}
|
||||
record.start_offset = ctx_start_offset + _start + _count;
|
||||
record.count = 0;
|
||||
continue;
|
||||
}
|
||||
// need to push record since nothing more will fit
|
||||
if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD)
|
||||
{
|
||||
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
|
||||
record.start_offset = ctx_start_offset + _start;
|
||||
record.count = _count;
|
||||
}
|
||||
else
|
||||
{
|
||||
record.count += _count;
|
||||
}
|
||||
}
|
||||
// if we have any elements left, then schedule them
|
||||
if (record.count == 1) // only one element, so just write it out
|
||||
{
|
||||
input[record.start_offset] = record.keys_in[record.start_offset];
|
||||
}
|
||||
else if (record.count > 1)
|
||||
{
|
||||
BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void DO_Create_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input, local uint* slm_for_wg_scan, uint sg_size, uint wg_size)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
|
||||
uint iteration = context->iteration + 1;
|
||||
uint start = context->start[lid];
|
||||
uint count = context->count[lid];
|
||||
uint start_offset = context->start_offset + start;
|
||||
|
||||
uint create_msb_work = count > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
|
||||
|
||||
#if MERGE_BLS_WITHIN_SG
|
||||
DO_Create_SG_Merged_BLS_Work_Parallel(scheduler, context, input);
|
||||
#else
|
||||
DO_Create_Separate_BLS_Work(scheduler, context, input);
|
||||
#endif
|
||||
|
||||
uint new_entry_id = wg_scan_inclusive_add_opt(slm_for_wg_scan, create_msb_work, sg_size, wg_size);//work_group_scan_inclusive_add(create_msb_work);
|
||||
uint stack_begin_entry;
|
||||
// the last workitem in the wg holds the total number of new entries
|
||||
if (lid == (MSB_RADIX_NUM_BINS - 1))
|
||||
{
|
||||
stack_begin_entry = atomic_add_global(&scheduler->msb_stack.num_entries, new_entry_id);
|
||||
}
|
||||
stack_begin_entry = work_group_broadcast(stack_begin_entry, (MSB_RADIX_NUM_BINS - 1));
|
||||
new_entry_id += stack_begin_entry - 1;
|
||||
|
||||
|
||||
if (create_msb_work)
|
||||
{
|
||||
scheduler->msb_stack.entries[new_entry_id].start_offset = start_offset;
|
||||
scheduler->msb_stack.entries[new_entry_id].count = count;
|
||||
scheduler->msb_stack.entries[new_entry_id].iteration = iteration;
|
||||
}
|
||||
|
||||
if (lid == 0) {
|
||||
DEBUG_CODE(printf("num of new bls: %d\n", scheduler->next_bls_queue->num_records));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
struct BatchedBLSDispatchEntry
|
||||
{
|
||||
/////////////////////////////////////////////////////////////
|
||||
// State data used for communication with command streamer
|
||||
// NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
|
||||
/////////////////////////////////////////////////////////////
|
||||
qword p_data_buffer;
|
||||
qword num_elements; // number of elements in p_data_buffer
|
||||
};
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel sort_morton_codes_batched_BLS_dispatch(global struct BatchedBLSDispatchEntry* bls_dispatches)
|
||||
{
|
||||
uint dispatch_id = get_group_id(0);
|
||||
uint lid = get_local_id(0);
|
||||
|
||||
local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
|
||||
|
||||
struct BatchedBLSDispatchEntry dispatchArgs = bls_dispatches[dispatch_id];
|
||||
struct BLSDispatchRecord dispatchRecord;
|
||||
dispatchRecord.start_offset = 0;
|
||||
dispatchRecord.count = dispatchArgs.num_elements;
|
||||
dispatchRecord.keys_in = (ulong*)dispatchArgs.p_data_buffer;
|
||||
|
||||
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", dispatchRecord.count));
|
||||
|
||||
if(dispatchRecord.count > 1)
|
||||
DO_Bitonic(dispatchRecord, SLM_shared, (global ulong*)dispatchRecord.keys_in);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel sort_morton_codes_bottom_level_single_wg(global struct Globals* globals, global ulong* input, global ulong* output)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
|
||||
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", globals->numPrimitives));
|
||||
|
||||
local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
|
||||
|
||||
struct BLSDispatchRecord dispatchRecord;
|
||||
dispatchRecord.start_offset = 0;
|
||||
dispatchRecord.count = globals->numPrimitives;
|
||||
dispatchRecord.keys_in = (ulong*)input;
|
||||
|
||||
//TODO: count or bitonic here?
|
||||
//DO_Bitonic(dispatchRecord, SLM_shared, output);
|
||||
DO_CountSort(dispatchRecord, SLM_shared, output);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// This kernel initializes first context to start up the whole execution
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel sort_morton_codes_msb_begin(
|
||||
global struct Globals* globals,
|
||||
global struct VContextScheduler* scheduler,
|
||||
global ulong* buf0,
|
||||
global ulong* buf1)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
uint gid = get_group_id(0);
|
||||
|
||||
DEBUG_CODE(if (lid == 0)printf("running sort_morton_codes_msb_begin\n"));
|
||||
|
||||
scheduler->contexts[gid].count[lid] = 0;
|
||||
|
||||
if (gid == 0 && lid == 0)
|
||||
{
|
||||
global struct MSBRadixContext* context = &scheduler->contexts[lid];
|
||||
const uint num_prims = globals->numPrimitives;
|
||||
|
||||
scheduler->bls_queue0.num_records = 0;
|
||||
scheduler->bls_queue1.num_records = 0;
|
||||
|
||||
scheduler->curr_bls_queue = &scheduler->bls_queue1;
|
||||
scheduler->next_bls_queue = &scheduler->bls_queue0;
|
||||
|
||||
context->start_offset = 0;
|
||||
context->num_wgs_in_flight = 0;
|
||||
context->num_keys = num_prims;
|
||||
context->iteration = 0;
|
||||
context->keys_in = buf0;
|
||||
context->keys_out = buf1;
|
||||
|
||||
uint msb_wgs_to_dispatch = (num_prims + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD;
|
||||
scheduler->msb_queue.records[0].wgs_to_dispatch = msb_wgs_to_dispatch;
|
||||
|
||||
scheduler->num_wgs_msb = msb_wgs_to_dispatch;
|
||||
scheduler->num_wgs_bls = 0;
|
||||
scheduler->msb_stack.num_entries = 0;
|
||||
scheduler->msb_queue.num_records = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(MSB_RADIX_NUM_VCONTEXTS, 1, 1)))
|
||||
kernel void
|
||||
scheduler(global struct VContextScheduler* scheduler, global ulong* buf0, global ulong* buf1)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
|
||||
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler\n"));
|
||||
|
||||
uint context_idx = lid;
|
||||
|
||||
const uint num_of_stack_entries = scheduler->msb_stack.num_entries;
|
||||
|
||||
uint msb_wgs_to_dispatch = 0;
|
||||
if (lid < num_of_stack_entries)
|
||||
{
|
||||
struct MSBStackEntry entry = scheduler->msb_stack.entries[(num_of_stack_entries-1) - lid];
|
||||
global struct MSBRadixContext* context = &scheduler->contexts[lid];
|
||||
context->start_offset = entry.start_offset;
|
||||
context->num_wgs_in_flight = 0;
|
||||
context->num_keys = entry.count;
|
||||
context->iteration = entry.iteration;
|
||||
context->keys_in = entry.iteration % 2 == 0 ? buf0 : buf1;
|
||||
context->keys_out = entry.iteration % 2 == 0 ? buf1 : buf0;
|
||||
|
||||
msb_wgs_to_dispatch = (entry.count + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD;
|
||||
scheduler->msb_queue.records[lid].wgs_to_dispatch = msb_wgs_to_dispatch;
|
||||
}
|
||||
|
||||
msb_wgs_to_dispatch = work_group_reduce_add(msb_wgs_to_dispatch); // TODO: if the compiler implementation is slow, consider writing it manually
|
||||
|
||||
if (lid == 0)
|
||||
{
|
||||
// swap queue for next iteration
|
||||
struct BLSDispatchQueue* tmp = scheduler->curr_bls_queue;
|
||||
scheduler->curr_bls_queue = scheduler->next_bls_queue;
|
||||
scheduler->next_bls_queue = tmp;
|
||||
|
||||
scheduler->next_bls_queue->num_records = 0;
|
||||
|
||||
scheduler->num_wgs_bls = scheduler->curr_bls_queue->num_records;
|
||||
scheduler->num_wgs_msb = msb_wgs_to_dispatch;
|
||||
|
||||
if (num_of_stack_entries < MSB_RADIX_NUM_VCONTEXTS)
|
||||
{
|
||||
scheduler->msb_queue.num_records = num_of_stack_entries;
|
||||
scheduler->msb_stack.num_entries = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
scheduler->msb_queue.num_records = MSB_RADIX_NUM_VCONTEXTS;
|
||||
scheduler->msb_stack.num_entries -= MSB_RADIX_NUM_VCONTEXTS;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler finished, to spawn %d MSB wgs in %d contexts and %d BLS wgs, MSB records on stack %d\n",
|
||||
scheduler->num_wgs_msb, scheduler->msb_queue.num_records, scheduler->num_wgs_bls, scheduler->msb_stack.num_entries));
|
||||
}
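
// Note (restating the scheduling above): each scheduler round pops up to MSB_RADIX_NUM_VCONTEXTS
// entries from the top of msb_stack into the per-context slots, sums the workgroups they need into
// num_wgs_msb, and swaps the two BLS queues so the BLS work recorded last round (num_wgs_bls) is
// dispatched while new BLS records accumulate in the other queue.
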
|
||||
|
||||
|
||||
|
||||
|
||||
// this is the lowest sub-task, which should end return sorted codes
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel sort_morton_codes_bottom_level( global struct VContextScheduler* scheduler, global ulong* output)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
|
||||
DEBUG_CODE(if (get_group_id(0) == 0 && lid == 0) printf("running sort_morton_codes_bottom_level\n"));
|
||||
|
||||
local struct BLSDispatchRecord l_dispatchRecord;
|
||||
if (lid == 0)
|
||||
{
|
||||
uint record_idx = get_group_id(0);
|
||||
l_dispatchRecord = scheduler->curr_bls_queue->records[record_idx];
|
||||
//l_dispatchRecord = BLSDispatchQueue_pop((global struct BLSDispatchQueue*)scheduler->curr_bls_queue);
|
||||
atomic_dec_global(&scheduler->num_wgs_bls);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
struct BLSDispatchRecord dispatchRecord = l_dispatchRecord;
|
||||
|
||||
local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
|
||||
|
||||
// right now only the count sort path below is used (the bitonic call is commented out)
// TODO: maybe implement something else
|
||||
if (1)
|
||||
{
|
||||
//DO_Bitonic(dispatchRecord, SLM_shared, output);
|
||||
DO_CountSort(dispatchRecord, SLM_shared, output);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#define MSB_COUNT_WG_SIZE MSB_RADIX_NUM_BINS
|
||||
#define MSB_COUNT_SG_SIZE 16
|
||||
|
||||
// count how many elements per buckets we have
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MSB_COUNT_WG_SIZE, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MSB_COUNT_SG_SIZE)))
|
||||
void kernel sort_morton_codes_msb_count_items( global struct VContextScheduler* scheduler)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
uint lsz = MSB_RADIX_NUM_BINS;
|
||||
|
||||
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_count_items\n"));
|
||||
|
||||
local uint bucket_count[MSB_RADIX_NUM_BINS];
|
||||
local uint finish_count;
|
||||
bucket_count[lid] = 0;
|
||||
if (lid == 0)
|
||||
{
|
||||
finish_count = 0;
|
||||
}
|
||||
|
||||
struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler);
|
||||
|
||||
global struct MSBRadixContext* context = dispatchArgs.context;
|
||||
|
||||
global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid;
|
||||
global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end;
|
||||
uint shift_bit = dispatchArgs.shift_bit;
|
||||
uchar shift_byte = shift_bit / 8; // which byte of the 8-byte key holds the current radix digit
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
global uchar* ks = (global uchar*)key_start;
|
||||
ks += shift_byte;
|
||||
global uchar* ke = (global uchar*)key_end;
|
||||
ke += shift_byte;
|
||||
|
||||
// double buffering on value loading
|
||||
if (ks < ke)
|
||||
{
|
||||
uchar bucket_id = *ks;
|
||||
ks += lsz * sizeof(ulong);
|
||||
|
||||
for (global uchar* k = ks; k < ke; k += lsz * sizeof(ulong))
|
||||
{
|
||||
uchar next_bucket_id = *k;
|
||||
atomic_inc_local(&bucket_count[bucket_id]);
|
||||
bucket_id = next_bucket_id;
|
||||
}
|
||||
|
||||
atomic_inc_local(&bucket_count[bucket_id]);
|
||||
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//update global counters for context
|
||||
uint count = bucket_count[lid];
|
||||
if (count > 0)
|
||||
atomic_add_global(&context->count[lid], bucket_count[lid]);
|
||||
|
||||
mem_fence_gpu_invalidate();
|
||||
work_group_barrier(0);
|
||||
|
||||
bool final_wg = true;
|
||||
// count WGs which have reached the end
|
||||
if (dispatchArgs.num_of_wgs > 1)
|
||||
{
|
||||
if (lid == 0)
|
||||
finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
final_wg = finish_count == dispatchArgs.num_of_wgs;
|
||||
}
|
||||
|
||||
local uint partial_dispatches[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE];
|
||||
// if this is last wg for current dispatch, update context
|
||||
if (final_wg)
|
||||
{
|
||||
// code below does work_group_scan_exclusive_add(context->count[lid]);
|
||||
{
|
||||
uint lane_val = context->count[lid];
|
||||
uint sg_result = sub_group_scan_inclusive_add(lane_val);
|
||||
|
||||
partial_dispatches[get_sub_group_id()] = sub_group_broadcast(sg_result, MSB_COUNT_SG_SIZE - 1);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint slm_result = sub_group_scan_exclusive_add(partial_dispatches[get_sub_group_local_id()]);
|
||||
slm_result = sub_group_broadcast(slm_result, get_sub_group_id());
|
||||
uint result = slm_result + sg_result - lane_val;
|
||||
context->start[lid] = result;//work_group_scan_exclusive_add(context->count[lid]);
|
||||
}
|
||||
|
||||
context->count[lid] = 0;
|
||||
if(lid == 0)
|
||||
context->num_wgs_in_flight = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// sort elements into appropriate buckets
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
void kernel sort_morton_codes_msb_bin_items(
|
||||
global struct VContextScheduler* scheduler, global ulong* input)
|
||||
{
|
||||
uint lid = get_local_id(0);
|
||||
uint lsz = get_local_size(0);
|
||||
|
||||
DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_bin_items\n"));
|
||||
|
||||
local uint finish_count;
|
||||
if (lid == 0)
|
||||
{
|
||||
finish_count = 0;
|
||||
}
|
||||
|
||||
struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler);
|
||||
global struct MSBRadixContext* context = dispatchArgs.context;
|
||||
|
||||
global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid;
|
||||
global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end;
|
||||
uint shift_bit = dispatchArgs.shift_bit;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
global ulong* sorted_keys = (global ulong*)context->keys_out + context->start_offset;
|
||||
|
||||
#if MSB_RADIX_NUM_BINS == MSB_WG_SORT_ELEMENTS_THRESHOLD // special case meaning that we process exactly 1 element per workitem
|
||||
// here we'll do local counting, then move to global
|
||||
|
||||
local uint slm_counters[MSB_RADIX_NUM_BINS];
|
||||
slm_counters[lid] = 0;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint place_in_slm_bucket;
|
||||
uint bucket_id;
|
||||
ulong val;
|
||||
|
||||
bool active_lane = key_start < key_end;
|
||||
|
||||
if (active_lane)
|
||||
{
|
||||
val = *key_start;
|
||||
|
||||
bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
|
||||
place_in_slm_bucket = atomic_inc_local(&slm_counters[bucket_id]);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// override slm_counters with global counters - we don't need to override counters with 0 elements since we won't use them anyway
|
||||
if (slm_counters[lid])
|
||||
slm_counters[lid] = atomic_add_global(&context->count[lid], slm_counters[lid]);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint id_in_bucket = slm_counters[bucket_id] + place_in_slm_bucket;//atomic_inc_global(&context->count[bucket_id]);
|
||||
|
||||
if (active_lane)
|
||||
sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
|
||||
#else
|
||||
// double buffering on value loading
|
||||
if (key_start < key_end)
|
||||
{
|
||||
ulong val = *key_start;
|
||||
key_start += lsz;
|
||||
|
||||
for (global ulong* k = key_start; k < key_end; k += lsz)
|
||||
{
|
||||
ulong next_val = *k;
|
||||
uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
|
||||
uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]);
|
||||
|
||||
//printf("dec: %llu, val: %llX bucket_id: %X", *k, *k, bucket_id);
|
||||
sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
|
||||
|
||||
val = next_val;
|
||||
}
|
||||
|
||||
uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
|
||||
uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]);
|
||||
|
||||
sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
|
||||
}
|
||||
#endif
|
||||
|
||||
// make sure all groups' "counters" and "starts" are visible to the final workgroup
|
||||
mem_fence_gpu_invalidate();
|
||||
work_group_barrier(0);
|
||||
|
||||
bool final_wg = true;
|
||||
// count WGs which have reached the end
|
||||
if (dispatchArgs.num_of_wgs > 1)
|
||||
{
|
||||
if (lid == 0)
|
||||
finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
final_wg = finish_count == dispatchArgs.num_of_wgs;
|
||||
}
|
||||
|
||||
local uint slm_for_wg_funcs[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE];
|
||||
// if this is last wg for current dispatch, then prepare sub-tasks
|
||||
if (final_wg)
|
||||
{
|
||||
DO_Create_Work(scheduler, context, input, slm_for_wg_funcs, 16, MSB_RADIX_NUM_BINS);
|
||||
|
||||
// clear context's counters for future execution
|
||||
context->count[lid] = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,135 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
//
|
||||
// This file contains structure definitions shared by GRL OCL kernels and host code
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GRLGen12.h"
|
||||
|
||||
// NOTE:
// MSB (Most Significant Byte) - the part of the sort that performs MSB radix passes, which can spawn additional work
// BLS (Bottom Level Sort)     - the last part of sorting a particular range (currently bitonic), which cannot spawn additional work
//
|
||||
|
||||
#define MSB_RADIX_NUM_BINS 256
|
||||
#define MSB_BITS_PER_ITERATION 8 // how many bits are sorted per iteration
|
||||
#define MSB_SHIFT_BYTE_START_OFFSET 56 // start offset for byte shifting, first iteration will start from here
|
||||
|
||||
#define MSB_RADIX_NUM_VCONTEXTS 8 // NOTE: mkulikow: maybe expand/shrink? More means more MSB processed in parallel but more memory used
|
||||
|
||||
#define MSB_STACK_ENTRIES_NUM (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS * 7) // first level doesn't get spawned, so 7 iterations must fit here,
|
||||
// since at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS we need 7 of these
|
||||
|
||||
#define MSB_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS) // one per context
|
||||
|
||||
#define BLS_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS) // each context can spawn MSB_RADIX_NUM_BINS,
|
||||
// so at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS
|
||||
|
||||
#define MSB_WG_SORT_ELEMENTS_THRESHOLD 256 // This tells us how many elements at max we can process in a single workgroup.
|
||||
// If a single MSB entry needs more, then it will spawn more WGs
|
||||
// after updating this also needs to update msb_radix_bitonic_sort.grl's computation of initial workgroups num
|
||||
|
||||
#define BOTTOM_LEVEL_SORT_THRESHOLD 512 // TODO: is 4096 the best value? On SKL it gives the best performance
// Right now we use 256 workitems in SIMD16, which gives us 16 hw threads; assuming 2KB per thread, we have 32KB of SLM to play with.
// Since we use ulong (8 bytes) we can store 4096 elements.
// This also tells us that if the number of elements to sort is less than this, we don't need to allocate the scheduler.
// Needs to be kept in sync with the GRL constant BOTTOM_LEVEL_SORT_THRESHOLD

#define BOTTOM_LEVEL_SORT_MERGING_THRESHOLD 512 // This is the amount up to which we'll merge small BLSes produced by MSB into a single bigger BLS
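
// Worked example (numbers only, for illustration): with MSB_WG_SORT_ELEMENTS_THRESHOLD = 256, an
// initial key count of 1,000,000 makes sort_morton_codes_msb_begin() dispatch
// ceil(1000000 / 256) = 3907 workgroups for the first MSB pass, and every bucket that still holds
// more than BOTTOM_LEVEL_SORT_THRESHOLD keys after a pass is pushed onto msb_stack for another
// pass on the next 8-bit digit.
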
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
|
||||
|
||||
|
||||
|
||||
GRL_NAMESPACE_BEGIN(RTAS)
|
||||
GRL_NAMESPACE_BEGIN(MORTON_MSB_RADIX_BITONIC_SORT)
|
||||
|
||||
struct MSBStackEntry
|
||||
{
|
||||
uint start_offset;
|
||||
uint count;
|
||||
uint iteration;
|
||||
};
|
||||
|
||||
struct MSBStack
|
||||
{
|
||||
dword num_entries;
|
||||
struct MSBStackEntry entries[MSB_STACK_ENTRIES_NUM];
|
||||
};
|
||||
|
||||
struct MSBRadixContext
|
||||
{
|
||||
uint start[MSB_RADIX_NUM_BINS];
|
||||
uint count[MSB_RADIX_NUM_BINS];
|
||||
uint num_wgs_in_flight; // this is used to identify which msb wg is last
|
||||
uint num_keys; // number of keys to process
|
||||
uint iteration;
|
||||
ulong* keys_in;
|
||||
ulong* keys_out;
|
||||
|
||||
uint start_offset; //offset from the beginning of the buffer
|
||||
};
|
||||
|
||||
struct MSBDispatchRecord
|
||||
{
|
||||
uint wgs_to_dispatch; // amount of workgroups to dispatch for this current record
|
||||
};
|
||||
|
||||
struct MSBDispatchQueue
|
||||
{
|
||||
dword num_records;
|
||||
struct MSBDispatchRecord records[MSB_RADIX_NUM_VCONTEXTS]; // each context have its own record
|
||||
};
|
||||
|
||||
// BLS(Bottom Level Sort) - last stage of sorting which will not spawn any new tasks
|
||||
struct BLSDispatchRecord
|
||||
{
|
||||
uint start_offset; // offset from the beginning of the buffer
|
||||
uint count;
|
||||
ulong* keys_in; // we don't need keys_out since we will write always to the same output buffer
|
||||
};
|
||||
|
||||
struct BLSDispatchQueue
|
||||
{
|
||||
dword num_records;
|
||||
struct BLSDispatchRecord records[BLS_DISPATCH_QUEUE_NUM_RECORDS];
|
||||
};
|
||||
|
||||
struct VContextScheduler
|
||||
{
|
||||
/////////////////////////////////////////////////////////////
|
||||
// State data used for communication with command streamer
|
||||
// NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword num_wgs_msb; // number of MSB workgroups being processed by current iteration
|
||||
dword num_wgs_bls; // number of BLS workgroups being processed by current iteration
|
||||
|
||||
dword scheduler_postsync;
|
||||
dword _pad1;
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
struct MSBDispatchQueue msb_queue;
|
||||
struct BLSDispatchQueue bls_queue0;
|
||||
struct BLSDispatchQueue bls_queue1;
|
||||
|
||||
struct BLSDispatchQueue* curr_bls_queue;
|
||||
struct BLSDispatchQueue* next_bls_queue;
|
||||
|
||||
struct MSBStack msb_stack;
|
||||
|
||||
struct MSBRadixContext contexts[MSB_RADIX_NUM_VCONTEXTS];
|
||||
};
|
||||
|
||||
GRL_NAMESPACE_END(MORTON_MSB_RADIX_BITONIC_SORT)
|
||||
GRL_NAMESPACE_END(RTAS)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
// just inlines the kernels that are there in the header
|
||||
#include "morton_radix_sort.h"
|
||||
|
|
@ -1,855 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include "libs/lsc_intrinsics.h"
|
||||
|
||||
/* ============================================================================= */
|
||||
/* ============================== LSB RADIX SORT =============================== */
|
||||
/* ============================================================================= */
|
||||
|
||||
#define RADIX_BINS 256
|
||||
#define SCATTER_WG_SIZE 512
|
||||
#define MORTON_LSB_SORT_NO_SHIFT_THRESHOLD 0xFFFFFFFF // effectively turned off, because the current hierarchy build requires a full sort
|
||||
|
||||
uint2 get_thread_range( uint numItems, uint numGroups, uint taskID )
|
||||
{
|
||||
uint items_per_group = (numItems / numGroups);
|
||||
uint remainder = numItems - (items_per_group * numGroups);
|
||||
uint startID = taskID * items_per_group + min(taskID, remainder);
|
||||
uint endID = startID + items_per_group + ((taskID < remainder) ? 1 : 0);
|
||||
|
||||
return (uint2)(startID,endID);
|
||||
}
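// Hedged host-side sketch (plain C, illustration only): the same remainder-aware split as
// get_thread_range() above, plus a check that the per-task ranges tile [0, numItems) exactly.
#include <assert.h>

static void split_range_sketch(unsigned numItems, unsigned numGroups, unsigned taskID,
                               unsigned* startID, unsigned* endID)
{
    unsigned per_group = numItems / numGroups;
    unsigned remainder = numItems - per_group * numGroups;
    *startID = taskID * per_group + (taskID < remainder ? taskID : remainder);
    *endID   = *startID + per_group + (taskID < remainder ? 1 : 0);
}

static void check_split_sketch(unsigned numItems, unsigned numGroups)
{
    unsigned expected_start = 0;
    for (unsigned t = 0; t < numGroups; t++) {
        unsigned s, e;
        split_range_sketch(numItems, numGroups, t, &s, &e);
        assert(s == expected_start);    // ranges are contiguous ...
        expected_start = e;
    }
    assert(expected_start == numItems); // ... and cover every item exactly once
}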
|
||||
|
||||
GRL_INLINE void sort_morton_codes_bin_items_taskID_func(global struct Globals* globals,
|
||||
global uint* global_histogram,
|
||||
global uchar* input,
|
||||
local uint* histogram,
|
||||
uint iteration,
|
||||
uint numGroups,
|
||||
uint numItems,
|
||||
bool shift_primID,
|
||||
uint taskID,
|
||||
uint startID,
|
||||
uint endID)
|
||||
{
|
||||
const uint shift = globals->shift;
|
||||
|
||||
for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
|
||||
histogram[i] = 0;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (shift_primID)
|
||||
{
|
||||
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
|
||||
{
|
||||
// Read the input as ulong so it can be shifted; the bits representing primID are then
// not taken into account during sorting, which results in fewer sort iterations for
// cases where the morton shift is bigger than 8 bits
|
||||
ulong* ptr_ul = (ulong*)&input[8 * i];
|
||||
ulong code = *ptr_ul;
|
||||
uchar* ptr = (uchar*)&code;
|
||||
code >>= shift;
|
||||
|
||||
uchar bin = ptr[iteration];
|
||||
atomic_inc_local(&histogram[bin]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
|
||||
{
|
||||
uchar bin = input[8 * i + iteration];
|
||||
atomic_inc_local(&histogram[bin]);
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
|
||||
global_histogram[RADIX_BINS * taskID + i] = histogram[i];
|
||||
}
|
||||
|
||||
GRL_INLINE void sort_morton_codes_bin_items_func(global struct Globals* globals,
|
||||
global uint* global_histogram,
|
||||
global uint* wg_flags,
|
||||
global uchar* input,
|
||||
local uint* histogram,
|
||||
uint iteration,
|
||||
uint numGroups,
|
||||
uint numItems,
|
||||
bool shift_primID,
|
||||
bool update_wg_flags)
|
||||
{
|
||||
if (shift_primID)
|
||||
{
|
||||
// This check is present in the other LSB sort functions as well; its purpose is
// to skip the first n iterations, where n is the difference between the maximum
// number of iterations and the number actually needed to sort without primIDs
|
||||
const uint req_iterations = globals->sort_iterations;
|
||||
if (iteration < req_iterations)
|
||||
return;
|
||||
|
||||
// iteration needs to be adjusted to reflect the skipped cycles
|
||||
iteration -= req_iterations;
|
||||
}
|
||||
|
||||
const uint taskID = get_group_id(0);
|
||||
|
||||
if (taskID == 0 && update_wg_flags)
|
||||
{
|
||||
for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
|
||||
wg_flags[i] = 0;
|
||||
}
|
||||
|
||||
uint2 ids = get_thread_range(numItems, numGroups, taskID);
|
||||
uint startID = ids.x;
|
||||
uint endID = ids.y;
|
||||
|
||||
sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, shift_primID,
|
||||
taskID, startID, endID);
|
||||
}
|
||||
|
||||
__attribute__((reqd_work_group_size(512, 1, 1)))
|
||||
void kernel
|
||||
sort_morton_codes_bin_items(
|
||||
global struct Globals* globals,
|
||||
global uint* global_histogram,
|
||||
global uint* wg_flags,
|
||||
global uchar* input,
|
||||
uint iteration,
|
||||
uint numGroups,
|
||||
uint update_wg_flags
|
||||
)
|
||||
{
|
||||
local uint histogram[RADIX_BINS];
|
||||
const uint numItems = globals->numPrimitives;
|
||||
if(numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
|
||||
sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, false, update_wg_flags);
|
||||
else
|
||||
sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, true, update_wg_flags);
|
||||
}
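// The kernel above is the counting phase of a classic byte-wise LSB radix sort; reduce_bins and
// scatter_items below provide the other two phases. A hedged, single-threaded host sketch of one
// full pass (plain C, illustration only; the real kernels split the work across work groups and
// keep the histogram in SLM):
#include <stdint.h>
#include <string.h>

#define SKETCH_RADIX_BINS 256

static void lsb_radix_pass_sketch(const uint64_t* in, uint64_t* out, size_t n, unsigned byte_index)
{
    size_t hist[SKETCH_RADIX_BINS];
    memset(hist, 0, sizeof(hist));

    // Phase 1: histogram of the selected byte (sort_morton_codes_bin_items).
    for (size_t i = 0; i < n; i++)
        hist[(in[i] >> (8 * byte_index)) & 0xff]++;

    // Phase 2: exclusive prefix sum over the bins (sort_morton_codes_reduce_bins).
    size_t sum = 0;
    for (unsigned b = 0; b < SKETCH_RADIX_BINS; b++) {
        size_t c = hist[b];
        hist[b] = sum;
        sum += c;
    }

    // Phase 3: stable scatter into the output buffer (sort_morton_codes_scatter_items).
    for (size_t i = 0; i < n; i++)
        out[hist[(in[i] >> (8 * byte_index)) & 0xff]++] = in[i];
}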
|
||||
|
||||
|
||||
GRL_INLINE void sort_morton_codes_reduce_bins_func(global struct Globals* globals,
|
||||
global uint* global_histogram,
|
||||
local uint* partials,
|
||||
uint numTasks,
|
||||
uint iteration,
|
||||
bool shift_primID)
|
||||
{
|
||||
const uint localID = get_local_id(0);
|
||||
|
||||
if (shift_primID)
|
||||
{
|
||||
const uint req_iterations = globals->sort_iterations;
|
||||
if (iteration < req_iterations)
|
||||
return;
|
||||
}
|
||||
|
||||
uint t = 0;
|
||||
for (uint j = 0; j < numTasks; j++)
|
||||
{
|
||||
const uint count = load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + localID], 0);
|
||||
store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + localID], 0, t);
|
||||
t += count;
|
||||
}
|
||||
|
||||
// each lane now contains the number of elements in the corresponding bin
|
||||
// prefix sum this for use in the subsequent scattering pass.
|
||||
uint global_count = t;
|
||||
|
||||
partials[get_sub_group_id()] = sub_group_reduce_add(global_count);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint lane = get_sub_group_local_id();
|
||||
uint p = partials[lane];
|
||||
p = (lane < get_sub_group_id()) ? p : 0;
|
||||
|
||||
global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
|
||||
|
||||
store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numTasks + localID], 0, global_count);
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(256, 1, 1)))
|
||||
void kernel
|
||||
sort_morton_codes_reduce_bins(global struct Globals* globals,
|
||||
uint numTasks,
|
||||
global uint* global_histogram,
|
||||
uint iteration)
|
||||
{
|
||||
local uint partials[RADIX_BINS];
|
||||
const uint numItems = globals->numPrimitives;
|
||||
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
|
||||
sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, false);
|
||||
else
|
||||
sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, true);
|
||||
}
|
||||
|
||||
|
||||
#if 1
|
||||
GRL_INLINE void sort_morton_codes_scatter_items_func(
|
||||
global struct Globals* globals,
|
||||
global uint* global_histogram,
|
||||
global ulong* input,
|
||||
global ulong* output,
|
||||
local uint* local_offset,
|
||||
local uint* flags,
|
||||
uint iteration,
|
||||
uint numGroups,
|
||||
uint numItems,
|
||||
bool shift_primID,
|
||||
bool update_morton_sort_in_flight)
|
||||
{
|
||||
const uint gID = get_local_id(0) + get_group_id(0) * get_local_size(0);
|
||||
|
||||
const uint global_shift = globals->shift;
|
||||
const uint localID = get_local_id(0);
|
||||
const uint taskID = get_group_id(0);
|
||||
|
||||
if (gID == 0 && update_morton_sort_in_flight)
|
||||
globals->morton_sort_in_flight = 0;
|
||||
|
||||
uint2 ids = get_thread_range(numItems, numGroups, taskID);
|
||||
uint startID = ids.x;
|
||||
uint endID = ids.y;
|
||||
|
||||
if (shift_primID)
|
||||
{
|
||||
const uint req_iterations = globals->sort_iterations;
|
||||
if (iteration < req_iterations)
|
||||
return;
|
||||
|
||||
iteration -= req_iterations;
|
||||
}
|
||||
|
||||
const uint shift = 8 * iteration;
|
||||
|
||||
// load the global bin counts, and add each bin's global prefix
|
||||
// to the local prefix
|
||||
{
|
||||
uint global_prefix = 0, local_prefix = 0;
|
||||
if (localID < RADIX_BINS)
|
||||
{
|
||||
local_prefix = global_histogram[RADIX_BINS * taskID + localID];
|
||||
global_prefix = global_histogram[RADIX_BINS * numGroups + localID];
|
||||
local_offset[localID] = global_prefix + local_prefix;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
|
||||
// move elements in WG-sized chunks. The elements need to be moved sequentially (can't use atomics)
|
||||
// because relative order has to be preserved for LSB radix sort to work
|
||||
|
||||
// For each bin, a bit vector indicating which elements are in the bin
|
||||
for (uint block_base = startID; block_base < endID; block_base += get_local_size(0))
|
||||
{
|
||||
// initialize bit vectors
|
||||
for (uint i = 4 * localID; i < RADIX_BINS * SCATTER_WG_SIZE / 32; i += 4 * get_local_size(0))
|
||||
{
|
||||
flags[i + 0] = 0;
|
||||
flags[i + 1] = 0;
|
||||
flags[i + 2] = 0;
|
||||
flags[i + 3] = 0;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// read sort key, determine which bin it goes into, scatter into the bit vector
|
||||
// and pre-load the local offset
|
||||
uint ID = localID + block_base;
|
||||
ulong key = 0;
|
||||
uint bin_offset = 0;
|
||||
uint bin = 0;
|
||||
uint bin_word = localID / 32;
|
||||
uint bin_bit = 1 << (localID % 32);
|
||||
|
||||
if (ID < endID)
|
||||
{
|
||||
key = input[ID];
|
||||
|
||||
if (shift_primID)
|
||||
bin = ((key >> global_shift) >> shift) & (RADIX_BINS - 1);
|
||||
else
|
||||
bin = (key >> shift) & (RADIX_BINS - 1);
|
||||
|
||||
atomic_add_local(&flags[(SCATTER_WG_SIZE / 32) * bin + bin_word], bin_bit);
|
||||
bin_offset = local_offset[bin];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (ID < endID)
|
||||
{
|
||||
// each key reads the bit-vectors for its bin,
|
||||
// - Computes local prefix sum to determine its output location
|
||||
// - Computes number of items added to its bin (last thread adjusts bin position)
|
||||
uint prefix = 0;
|
||||
uint count = 0;
|
||||
for (uint i = 0; i < (SCATTER_WG_SIZE / 32); i++)
|
||||
{
|
||||
uint bits = flags[(SCATTER_WG_SIZE / 32) * bin + i];
|
||||
uint bc = popcount(bits);
|
||||
uint pc = popcount(bits & (bin_bit - 1));
|
||||
prefix += (i < bin_word) ? bc : 0;
|
||||
prefix += (i == bin_word) ? pc : 0;
|
||||
|
||||
count += bc;
|
||||
}
|
||||
|
||||
// store the key in its proper place..
|
||||
output[prefix + bin_offset] = key;
|
||||
|
||||
// last item for each bin adjusts local offset for next outer loop iteration
|
||||
if (prefix == count - 1)
|
||||
local_offset[bin] += count;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
}
|
||||
|
||||
/* uint local_offset[RADIX_BINS]; */
|
||||
/* uint offset_global = 0; */
|
||||
/* for (int i=0;i<RADIX_BINS;i++) */
|
||||
/* { */
|
||||
/* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
|
||||
/* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
|
||||
/* local_offset[i] = offset_global + offset_local; */
|
||||
/* offset_global += count_global; */
|
||||
/* } */
|
||||
|
||||
/* for (uint ID=startID;ID<endID;ID++) */
|
||||
/* { */
|
||||
/* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
|
||||
/* const uint offset = local_offset[bin]; */
|
||||
/* output[offset] = input[ID]; */
|
||||
/* local_offset[bin]++; */
|
||||
/* } */
|
||||
}
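// Hedged sketch of the popcount ranking used above: each item publishes one bit in its bin's
// bit vector, then ranks itself by counting the set bits that precede its own lane. Plain C
// over one bin's bit vector (16 x 32 bits for a 512-wide work group), using the GCC/Clang
// __builtin_popcount; illustration only.
#include <stdint.h>

static unsigned rank_in_bin_sketch(const uint32_t bin_bits[16], unsigned lane /* 0..511 */,
                                   unsigned* items_in_bin)
{
    unsigned word   = lane / 32;
    uint32_t bit    = 1u << (lane % 32);
    unsigned prefix = 0, count = 0;

    for (unsigned w = 0; w < 16; w++) {
        uint32_t bits = bin_bits[w];
        unsigned bc = (unsigned)__builtin_popcount(bits);              // items in this word
        unsigned pc = (unsigned)__builtin_popcount(bits & (bit - 1));  // items before our lane
        if (w < word)  prefix += bc;   // whole earlier words
        if (w == word) prefix += pc;   // earlier lanes in our own word
        count += bc;
    }
    *items_in_bin = count;  // the kernel's "last item" test is prefix == count - 1
    return prefix;          // output slot = prefix + the bin's running offset
}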
|
||||
|
||||
#else
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
sort_morton_codes_scatter_items(
|
||||
global struct Globals* globals,
|
||||
uint shift,
|
||||
global uint* global_histogram,
|
||||
global char* input0,
|
||||
global char* input1,
|
||||
unsigned int input0_offset,
|
||||
unsigned int input1_offset,
|
||||
uint iteration)
|
||||
{
|
||||
const uint numItems = globals->numPrimitives;
|
||||
const uint local_size = get_local_size(0);
|
||||
const uint taskID = get_group_id(0);
|
||||
const uint numTasks = get_num_groups(0);
|
||||
const uint localID = get_local_id(0);
|
||||
const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
|
||||
const uint subgroupLocalID = get_sub_group_local_id();
|
||||
const uint subgroup_size = get_sub_group_size();
|
||||
|
||||
const uint startID = (taskID + 0) * numItems / numTasks;
|
||||
const uint endID = (taskID + 1) * numItems / numTasks;
|
||||
|
||||
global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
|
||||
global ulong* output = (global ulong*)((iteration % 2) == 0 ? input1 + input1_offset : input0 + input0_offset);
|
||||
|
||||
local uint local_offset[RADIX_BINS];
|
||||
uint off = 0;
|
||||
for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
|
||||
{
|
||||
const uint count = global_histogram[RADIX_BINS * numTasks + i];
|
||||
const uint offset_task = global_histogram[RADIX_BINS * taskID + i];
|
||||
const uint sum = sub_group_reduce_add(count);
|
||||
const uint prefix_sum = sub_group_scan_exclusive_add(count);
|
||||
local_offset[i] = off + offset_task + prefix_sum;
|
||||
off += sum;
|
||||
}
|
||||
|
||||
for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
|
||||
{
|
||||
const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1);
|
||||
const uint offset = atomic_add_local(&local_offset[bin], 1);
|
||||
output[offset] = input[ID];
|
||||
}
|
||||
|
||||
/* uint local_offset[RADIX_BINS]; */
|
||||
/* uint offset_global = 0; */
|
||||
/* for (int i=0;i<RADIX_BINS;i++) */
|
||||
/* { */
|
||||
/* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
|
||||
/* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
|
||||
/* local_offset[i] = offset_global + offset_local; */
|
||||
/* offset_global += count_global; */
|
||||
/* } */
|
||||
|
||||
/* for (uint ID=startID;ID<endID;ID++) */
|
||||
/* { */
|
||||
/* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
|
||||
/* const uint offset = local_offset[bin]; */
|
||||
/* output[offset] = input[ID]; */
|
||||
/* local_offset[bin]++; */
|
||||
/* } */
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(SCATTER_WG_SIZE, 1, 1)))
|
||||
void kernel
|
||||
sort_morton_codes_scatter_items(
|
||||
global struct Globals *globals,
|
||||
global uint *global_histogram,
|
||||
global ulong *input,
|
||||
global ulong *output,
|
||||
uint iteration,
|
||||
uint numGroups,
|
||||
uint update_morton_sort_in_flight)
|
||||
{
|
||||
local uint local_offset[RADIX_BINS];
|
||||
local uint flags[RADIX_BINS*SCATTER_WG_SIZE/32];
|
||||
const uint numItems = globals->numPrimitives;
|
||||
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
|
||||
sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset,
|
||||
flags, iteration, numGroups, numItems, false, update_morton_sort_in_flight);
|
||||
else
|
||||
sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset,
|
||||
flags, iteration, numGroups, numItems, true, update_morton_sort_in_flight);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
sort_morton_codes_scatter_items(
|
||||
global struct Globals *globals,
|
||||
uint shift,
|
||||
global uint *global_histogram,
|
||||
global char *input0,
|
||||
global char *input1,
|
||||
unsigned int input0_offset,
|
||||
unsigned int input1_offset,
|
||||
uint iteration)
|
||||
{
|
||||
const uint numItems = globals->numPrimitives;
|
||||
const uint local_size = get_local_size(0);
|
||||
const uint taskID = get_group_id(0);
|
||||
const uint numTasks = get_num_groups(0);
|
||||
const uint localID = get_local_id(0);
|
||||
const uint globalID = get_local_id(0) + get_group_id(0)*get_local_size(0);
|
||||
const uint subgroupLocalID = get_sub_group_local_id();
|
||||
const uint subgroup_size = get_sub_group_size();
|
||||
|
||||
const uint startID = (taskID + 0) * numItems / numTasks;
|
||||
const uint endID = (taskID + 1) * numItems / numTasks;
|
||||
|
||||
global ulong *input = (global ulong *)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
|
||||
global ulong *output = (global ulong *)((iteration % 2) == 0 ? input1 + input1_offset : input0 + input0_offset);
|
||||
|
||||
local uint local_offset[RADIX_BINS];
|
||||
uint off = 0;
|
||||
for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
|
||||
{
|
||||
const uint count = global_histogram[RADIX_BINS * numTasks + i];
|
||||
const uint offset_task = global_histogram[RADIX_BINS * taskID + i];
|
||||
const uint sum = sub_group_reduce_add(count);
|
||||
const uint prefix_sum = sub_group_scan_exclusive_add(count);
|
||||
local_offset[i] = off + offset_task + prefix_sum;
|
||||
off += sum;
|
||||
}
|
||||
|
||||
for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
|
||||
{
|
||||
const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1);
|
||||
const uint offset = atomic_add_local(&local_offset[bin], 1);
|
||||
output[offset] = input[ID];
|
||||
}
|
||||
|
||||
/* uint local_offset[RADIX_BINS]; */
|
||||
/* uint offset_global = 0; */
|
||||
/* for (int i=0;i<RADIX_BINS;i++) */
|
||||
/* { */
|
||||
/* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
|
||||
/* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
|
||||
/* local_offset[i] = offset_global + offset_local; */
|
||||
/* offset_global += count_global; */
|
||||
/* } */
|
||||
|
||||
/* for (uint ID=startID;ID<endID;ID++) */
|
||||
/* { */
|
||||
/* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
|
||||
/* const uint offset = local_offset[bin]; */
|
||||
/* output[offset] = input[ID]; */
|
||||
/* local_offset[bin]++; */
|
||||
/* } */
|
||||
}
|
||||
#endif
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(512, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
|
||||
void kernel
|
||||
sort_morton_codes_merged(
|
||||
global struct Globals* globals,
|
||||
global uint* global_histogram,
|
||||
global uchar* input,
|
||||
uint iteration,
|
||||
uint numGroups
|
||||
)
|
||||
{
|
||||
const uint numItems = globals->numPrimitives;
|
||||
const uint taskID = get_group_id(0);
|
||||
const uint loc_id = get_local_id(0);
|
||||
const uint lane = get_sub_group_local_id();
|
||||
|
||||
uint2 ids = get_thread_range(numItems, numGroups, taskID);
|
||||
uint startID = ids.x;
|
||||
uint endID = ids.y;
|
||||
|
||||
local uint histogram[RADIX_BINS];
|
||||
local uint hist_tmp[RADIX_BINS];
|
||||
|
||||
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
|
||||
{
|
||||
sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, false,
|
||||
taskID, startID, endID);
|
||||
}
|
||||
else
|
||||
{
|
||||
const uint req_iterations = globals->sort_iterations;
|
||||
if (iteration < req_iterations)
|
||||
return;
|
||||
|
||||
iteration -= req_iterations;
|
||||
|
||||
sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, true,
|
||||
taskID, startID, endID);
|
||||
}
|
||||
|
||||
uint last_group = 0;
|
||||
if (loc_id == 0)
|
||||
last_group = atomic_inc_global(&globals->morton_sort_in_flight);
|
||||
|
||||
write_mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
last_group = work_group_broadcast(last_group, 0);
|
||||
|
||||
bool isLastGroup = (loc_id < RADIX_BINS) && (last_group == numGroups - 1);
|
||||
|
||||
uint global_count = 0;
|
||||
|
||||
if (isLastGroup)
|
||||
{
|
||||
for (uint j = 0; j < numGroups; j++)
|
||||
{
|
||||
const uint count = (j == taskID) ? histogram[loc_id] : load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + loc_id], 0);
|
||||
store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + loc_id], 0, global_count);
|
||||
global_count += count;
|
||||
}
|
||||
|
||||
hist_tmp[get_sub_group_id()] = (get_sub_group_id() < MAX_HW_SIMD_WIDTH) ? sub_group_reduce_add(global_count) : 0;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (isLastGroup)
|
||||
{
|
||||
uint p = hist_tmp[lane];
|
||||
p = (lane < get_sub_group_id()) ? p : 0;
|
||||
|
||||
global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
|
||||
|
||||
store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numGroups + loc_id], 0, global_count);
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16))) void kernel
|
||||
sort_morton_codes_bin_items(
|
||||
global struct Globals* globals,
|
||||
uint shift,
|
||||
global uint* global_histogram,
|
||||
global char* input0,
|
||||
global char* input1,
|
||||
unsigned int input0_offset,
|
||||
unsigned int input1_offset,
|
||||
uint iteration)
|
||||
{
|
||||
const uint numItems = globals->numPrimitives;
|
||||
const uint local_size = get_local_size(0);
|
||||
const uint taskID = get_group_id(0);
|
||||
const uint numTasks = get_num_groups(0);
|
||||
const uint localID = get_local_id(0);
|
||||
const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
|
||||
const uint subgroupLocalID = get_sub_group_local_id();
|
||||
const uint subgroup_size = get_sub_group_size();
|
||||
|
||||
const uint startID = (taskID + 0) * numItems / numTasks;
|
||||
const uint endID = (taskID + 1) * numItems / numTasks;
|
||||
|
||||
global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
|
||||
|
||||
#if 1
|
||||
local uint histogram[RADIX_BINS];
|
||||
for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
|
||||
histogram[i] = 0;
|
||||
|
||||
for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
|
||||
{
|
||||
const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1);
|
||||
atomic_add(&histogram[bin], 1);
|
||||
}
|
||||
|
||||
for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
|
||||
global_histogram[RADIX_BINS * taskID + i] = histogram[i];
|
||||
|
||||
#else
|
||||
uint histogram[RADIX_BINS];
|
||||
for (int i = 0; i < RADIX_BINS; i++)
|
||||
histogram[i] = 0;
|
||||
|
||||
for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
|
||||
{
|
||||
const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1);
|
||||
histogram[bin]++;
|
||||
}
|
||||
|
||||
for (uint i = 0; i < RADIX_BINS; i++)
|
||||
{
|
||||
const uint reduced_counter = sub_group_reduce_add(histogram[i]);
|
||||
global_histogram[RADIX_BINS * taskID + i] = reduced_counter;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define WG_SIZE_WIDE 256
|
||||
#define SG_SIZE_SCAN 16
|
||||
|
||||
// Fast implementation of work_group_scan_exclusive_add using SLM, for WG size 256 and SG size 16
|
||||
GRL_INLINE uint work_group_scan_exclusive_add_opt(local uint* tmp, uint val)
|
||||
{
|
||||
const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE_SCAN;
|
||||
const uint sg_local_id = get_local_id(0) % SG_SIZE_SCAN;
|
||||
const uint NUM_HW_THREADS_IN_WG = WG_SIZE_WIDE / SG_SIZE_SCAN;
|
||||
|
||||
uint acc = sub_group_scan_exclusive_add(val);
|
||||
uint acc2 = acc + val;
|
||||
|
||||
tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc2, SG_SIZE_SCAN - 1);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint loaded_val = tmp[sg_local_id];
|
||||
uint wgs_acc = sub_group_scan_exclusive_add(loaded_val);
|
||||
uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id);
|
||||
return acc + acc_for_this_hw_thread;
|
||||
}
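// Hedged host-side sketch of the same two-level scheme: an exclusive scan inside each 16-lane
// "subgroup", plus an exclusive scan of the per-subgroup totals that is added back. Plain C over
// a 256-element array, illustration only (the kernel does the second level with one subgroup
// reading the per-subgroup totals from SLM).
#define SKETCH_WG 256
#define SKETCH_SG 16

static void wg_exclusive_scan_sketch(const unsigned in[SKETCH_WG], unsigned out[SKETCH_WG])
{
    unsigned sg_total[SKETCH_WG / SKETCH_SG];

    // Level 1: exclusive scan within each subgroup, remembering its total.
    for (unsigned sg = 0; sg < SKETCH_WG / SKETCH_SG; sg++) {
        unsigned acc = 0;
        for (unsigned lane = 0; lane < SKETCH_SG; lane++) {
            out[sg * SKETCH_SG + lane] = acc;
            acc += in[sg * SKETCH_SG + lane];
        }
        sg_total[sg] = acc;
    }

    // Level 2: exclusive scan of the subgroup totals, added to every lane of that subgroup.
    unsigned base = 0;
    for (unsigned sg = 0; sg < SKETCH_WG / SKETCH_SG; sg++) {
        for (unsigned lane = 0; lane < SKETCH_SG; lane++)
            out[sg * SKETCH_SG + lane] += base;
        base += sg_total[sg];
    }
}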
|
||||
|
||||
// Wide reduce algorithm is divided into 2 kernels:
|
||||
// 1. First, partial exclusive add scans are computed within each work group using SLM.
//    Then the last work group for each histogram bin performs an exclusive add scan over that
//    bin's per-block partial sums, using the separate histogram_partials buffer.
//    The last work group is determined using global atomics on the wg_flags buffer.
// 2. The second kernel globally adds the values from histogram_partials back into the histogram
//    buffer that holds the partial sums.
//    Then the last work group performs one more work-group scan-and-add so that the histogram
//    buffer values are adjusted with the global ones.
|
||||
GRL_INLINE void sort_morton_codes_reduce_bins_wide_partial_sum_func(
|
||||
global struct Globals* globals,
|
||||
global uint* global_histogram,
|
||||
global uint* global_histogram_partials,
|
||||
global uint* wg_flags,
|
||||
local uint* exclusive_scan_tmp,
|
||||
uint numTasks,
|
||||
uint numGroups,
|
||||
uint iteration,
|
||||
bool shift_primID)
|
||||
{
|
||||
if (shift_primID)
|
||||
{
|
||||
const uint req_iterations = globals->sort_iterations;
|
||||
if (iteration < req_iterations)
|
||||
return;
|
||||
|
||||
iteration -= req_iterations;
|
||||
}
|
||||
|
||||
const uint groupID = get_group_id(0) % RADIX_BINS;
|
||||
const uint scanGroupID = get_group_id(0) / RADIX_BINS;
|
||||
uint localID = get_local_id(0);
|
||||
uint globalID = localID + (scanGroupID * WG_SIZE_WIDE);
|
||||
const uint lastGroup = (numGroups / WG_SIZE_WIDE);
|
||||
const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1;
|
||||
|
||||
uint temp = 0;
|
||||
uint last_count = 0;
|
||||
if (globalID < numTasks)
|
||||
{
|
||||
temp = global_histogram[RADIX_BINS * globalID + groupID];
|
||||
|
||||
// Store the last value of the work group; it is either the last element of the histogram or the last item in the work group
|
||||
if (globalID == endID)
|
||||
last_count = temp;
|
||||
}
|
||||
|
||||
uint val = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp);
|
||||
|
||||
if (globalID <= numTasks)
|
||||
{
|
||||
global_histogram[RADIX_BINS * globalID + groupID] = val;
|
||||
|
||||
// Store the block sum value to separate buffer
|
||||
if (globalID == endID)
|
||||
global_histogram_partials[scanGroupID * WG_SIZE_WIDE + groupID] = val + last_count;
|
||||
}
|
||||
|
||||
// Make sure that global_histogram_partials is updated in all work groups
|
||||
write_mem_fence(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(0);
|
||||
|
||||
// Now, wait for the last group for each histogram bin, so we know that
|
||||
// all work groups already updated the global_histogram_partials buffer
|
||||
uint last_group = 0;
|
||||
if (localID == 0)
|
||||
last_group = atomic_inc_global(&wg_flags[groupID]);
|
||||
|
||||
last_group = work_group_broadcast(last_group, 0);
|
||||
bool isLastGroup = (last_group == lastGroup - 1);
|
||||
|
||||
// Each of the last groups computes the scan exclusive add for each partial sum we have
|
||||
if (isLastGroup)
|
||||
{
|
||||
uint temp1 = 0;
|
||||
if (localID < lastGroup)
|
||||
temp1 = global_histogram_partials[localID * WG_SIZE_WIDE + groupID];
|
||||
|
||||
uint val2 = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp1);
|
||||
|
||||
if (localID < lastGroup)
|
||||
global_histogram_partials[localID * WG_SIZE_WIDE + groupID] = val2;
|
||||
}
|
||||
}
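// Hedged single-threaded sketch (plain C, illustration only) of the decomposition the two wide
// kernels implement: scan each WG_SIZE_WIDE block locally, scan the per-block totals (the role
// of global_histogram_partials), then add each block's base back in a second pass.
#include <stdlib.h>

static void wide_exclusive_scan_sketch(unsigned* data, unsigned n, unsigned block /* e.g. 256 */)
{
    unsigned num_blocks = (n + block - 1) / block;
    unsigned* block_total = (unsigned*)calloc(num_blocks, sizeof(*block_total));

    // Kernel 1: per-block exclusive scan, keeping one total per block.
    for (unsigned b = 0; b < num_blocks; b++) {
        unsigned acc = 0;
        for (unsigned i = b * block; i < n && i < (b + 1) * block; i++) {
            unsigned v = data[i];
            data[i] = acc;
            acc += v;
        }
        block_total[b] = acc;
    }

    // Exclusive scan of the block totals (done by the "last" work group on the GPU).
    unsigned base = 0;
    for (unsigned b = 0; b < num_blocks; b++) {
        unsigned t = block_total[b];
        block_total[b] = base;
        base += t;
    }

    // Kernel 2: add each block's base to its elements (block 0 adds nothing).
    for (unsigned b = 1; b < num_blocks; b++)
        for (unsigned i = b * block; i < n && i < (b + 1) * block; i++)
            data[i] += block_total[b];

    free(block_total);
}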
|
||||
|
||||
GRL_INLINE void sort_morton_codes_reduce_bins_wide_add_reduce_func(
|
||||
global struct Globals* globals,
|
||||
global uint* global_histogram,
|
||||
global uint* global_histogram_partials,
|
||||
local uint* partials,
|
||||
uint numTasks,
|
||||
uint numGroups,
|
||||
uint iteration,
|
||||
bool shift_primID)
|
||||
{
|
||||
if (shift_primID)
|
||||
{
|
||||
const uint req_iterations = globals->sort_iterations;
|
||||
if (iteration < req_iterations)
|
||||
return;
|
||||
|
||||
iteration -= req_iterations;
|
||||
}
|
||||
|
||||
const uint groupID = get_group_id(0) % RADIX_BINS;
|
||||
const uint scanGroupID = get_group_id(0) / RADIX_BINS;
|
||||
const uint lastGroup = (numGroups / WG_SIZE_WIDE);
|
||||
uint localID = get_local_id(0);
|
||||
uint globalID = localID + (scanGroupID * WG_SIZE_WIDE);
|
||||
const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1;
|
||||
|
||||
// Add the global sums to the partials; skip the first scanGroupID, since the first added
// value is 0 in the case of exclusive add scans
|
||||
if (scanGroupID > 0 && globalID <= numTasks)
|
||||
{
|
||||
uint add_val = global_histogram_partials[scanGroupID * RADIX_BINS + groupID];
|
||||
atomic_add_global(&global_histogram[globalID * RADIX_BINS + groupID], add_val);
|
||||
}
|
||||
|
||||
// Wait for the last group
|
||||
uint last_group = 0;
|
||||
if (localID == 0)
|
||||
last_group = atomic_inc_global(&globals->morton_sort_in_flight);
|
||||
|
||||
last_group = work_group_broadcast(last_group, 0);
|
||||
bool isLastGroup = (last_group == numGroups - 1);
|
||||
|
||||
// Do the exclusive scan within all bins with global data now
|
||||
if (isLastGroup)
|
||||
{
|
||||
mem_fence_gpu_invalidate();
|
||||
|
||||
uint global_count = global_histogram[numTasks * RADIX_BINS + localID];
|
||||
|
||||
partials[get_sub_group_id()] = sub_group_reduce_add(global_count);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint lane = get_sub_group_local_id();
|
||||
uint p = partials[lane];
|
||||
p = (lane < get_sub_group_id()) ? p : 0;
|
||||
|
||||
global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
|
||||
|
||||
store_uint_L1WB_L3WB(&global_histogram[numTasks * RADIX_BINS + localID], 0, global_count);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN)))
|
||||
void kernel
|
||||
sort_morton_codes_reduce_bins_wide_partial_sum(
|
||||
global struct Globals* globals,
|
||||
uint numTasks,
|
||||
uint numGroups,
|
||||
global uint* global_histogram,
|
||||
global uint* global_histogram_partials,
|
||||
global uint* wg_flags,
|
||||
uint iteration)
|
||||
{
|
||||
local uint exclusive_scan_tmp[WG_SIZE_WIDE / SG_SIZE_SCAN];
|
||||
|
||||
const uint numItems = globals->numPrimitives;
|
||||
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
|
||||
sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, false);
|
||||
else
|
||||
sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, true);
|
||||
}
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN)))
|
||||
void kernel
|
||||
sort_morton_codes_reduce_bins_wide_add_reduce(
|
||||
global struct Globals* globals,
|
||||
uint numTasks,
|
||||
uint numGroups,
|
||||
global uint* global_histogram,
|
||||
global uint* global_histogram_partials,
|
||||
uint iteration)
|
||||
{
|
||||
local uint partials[RADIX_BINS];
|
||||
|
||||
const uint numItems = globals->numPrimitives;
|
||||
if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
|
||||
sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, false);
|
||||
else
|
||||
sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, true);
|
||||
}
|
||||
|
|
@ -1,297 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module msb_radix_bitonic_sort;
|
||||
|
||||
kernel_module msb_radix_sort ("morton_msb_radix_bitonic_sort.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel opencl_debug_print < kernelFunction="debug_print_kernel">;
|
||||
kernel opencl_check_bls < kernelFunction="check_bls_sort">;
|
||||
|
||||
kernel opencl_bottom_level_sort_single_wg < kernelFunction="sort_morton_codes_bottom_level_single_wg">;
|
||||
|
||||
kernel opencl_build_morton_kernel_sort_msb_init < kernelFunction="sort_morton_codes_msb_begin">;
|
||||
|
||||
kernel opencl_build_morton_kernel_sort_msb_scheduler < kernelFunction="scheduler">;
|
||||
|
||||
kernel opencl_build_morton_kernel_sort_bottom_level < kernelFunction="sort_morton_codes_bottom_level">;
|
||||
|
||||
kernel opencl_build_morton_kernel_sort_msb_count_items < kernelFunction="sort_morton_codes_msb_count_items">;
|
||||
kernel opencl_build_morton_kernel_sort_msb_bin_items < kernelFunction="sort_morton_codes_msb_bin_items">;
|
||||
|
||||
kernel opencl_build_morton_kernel_sort_batched_bls_dispatch < kernelFunction="sort_morton_codes_batched_BLS_dispatch">;
|
||||
}
|
||||
|
||||
|
||||
const MSB_RADIX_NUM_VCONTEXTS = 8;
|
||||
const BOTTOM_LEVEL_SORT_THRESHOLD = 512;
|
||||
|
||||
struct MSBRadixScheduler
|
||||
{
|
||||
dword num_wgs_msb;
|
||||
dword num_wgs_bls;
|
||||
|
||||
dword scheduler_postsync;
|
||||
dword _pad1;
|
||||
};
|
||||
|
||||
struct MSBRadixArgs
|
||||
{
|
||||
qword p_scheduler;
|
||||
qword p_num_primitives;
|
||||
};
|
||||
struct BatchedBLSDispatchEntry
|
||||
{
|
||||
qword p_data_buffer;
|
||||
qword num_elements; // number of elements in p_data_buffer
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
metakernel add_bls_dispatch_init(qword p_storage)
|
||||
{
|
||||
define REG_numWgs REG14;
|
||||
define REG_p_storage REG15;
|
||||
|
||||
REG_numWgs = 0;
|
||||
REG_p_storage = p_storage;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// basically this code does:
|
||||
// bls_args_for_dispatches[dispatchID] = { bls_new_pointer, numPrimitives };
|
||||
// dispatchId++;
|
||||
//
|
||||
metakernel add_bls_dispatch(
|
||||
qword p_data,
|
||||
qword p_num_primitives
|
||||
)
|
||||
{
|
||||
define C_1 REG0;
|
||||
define C_8 REG1;
|
||||
|
||||
define C_MIN_PRIMREFS REG2;
|
||||
|
||||
define REG_p_data REG3;
|
||||
define REG_num_prims REG4;
|
||||
define REG_no_dispatch REG5;
|
||||
|
||||
define REG_numWgs REG14;
|
||||
define REG_p_storage REG15;
|
||||
|
||||
C_MIN_PRIMREFS = 2;
|
||||
|
||||
REG_num_prims = 0;
|
||||
REG_num_prims.lo = load_dword(p_num_primitives);
|
||||
|
||||
REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
|
||||
|
||||
goto l_finish if(REG_no_dispatch.lo);
|
||||
|
||||
C_1 = 1;
|
||||
C_8 = 8;
|
||||
|
||||
// pseudocode: BatchedBLSDispatchEntry.p_data_buffer = p_data
|
||||
REG_p_data = p_data;
|
||||
store_qword( REG_p_storage, REG_p_data ); // store the data pointer
|
||||
|
||||
REG_p_storage = REG_p_storage + C_8; // point to next member in BatchedBLSDispatchEntry struct
|
||||
|
||||
// pseudocode: BatchedBLSDispatchEntry.num_elements = *p_num_primitives
|
||||
store_qword( REG_p_storage, REG_num_prims );
|
||||
|
||||
REG_p_storage = REG_p_storage + C_8; // point to next BatchedBLSDispatchEntry instance
|
||||
|
||||
REG_numWgs = REG_numWgs + C_1;
|
||||
|
||||
l_finish:
|
||||
|
||||
}
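// Hedged C restatement of the pseudocode above, with REG_p_storage walking an array of
// BatchedBLSDispatchEntry and REG_numWgs counting the queued dispatches (illustration only; the
// metakernel does this with command-streamer registers and qword stores).
#include <stdint.h>

struct sketch_bls_entry {
    uint64_t p_data_buffer;  // qword p_data_buffer
    uint64_t num_elements;   // qword num_elements
};

static void add_bls_dispatch_sketch(struct sketch_bls_entry* storage, uint32_t* num_wgs,
                                    uint64_t p_data, uint64_t num_primitives)
{
    if (num_primitives < 2)  // C_MIN_PRIMREFS: nothing worth sorting
        return;
    storage[*num_wgs].p_data_buffer = p_data;
    storage[*num_wgs].num_elements  = num_primitives;
    (*num_wgs)++;            // batched_bls_dispatch later uses this count as DISPATCHDIM_X
}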
|
||||
|
||||
|
||||
|
||||
|
||||
metakernel batched_bls_dispatch(
|
||||
qword private_mem
|
||||
)
|
||||
{
|
||||
define REG_numWgs REG14;
|
||||
|
||||
DISPATCHDIM_X = REG_numWgs;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect opencl_build_morton_kernel_sort_batched_bls_dispatch args(private_mem);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
metakernel sort_bottom_level(
|
||||
qword build_globals,
|
||||
qword input,
|
||||
qword p_num_primitives)
|
||||
{
|
||||
define REG_num_prims REG0;
|
||||
define C_MIN_PRIMREFS REG1;
|
||||
define REG_no_dispatch REG2;
|
||||
|
||||
REG_num_prims = load_dword( p_num_primitives );
|
||||
|
||||
C_MIN_PRIMREFS = 2;
|
||||
|
||||
REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
|
||||
|
||||
goto l_finish if(REG_no_dispatch.lo);
|
||||
|
||||
dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
|
||||
|
||||
l_finish:
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
metakernel sort(
|
||||
qword build_globals,
|
||||
qword input,
|
||||
qword tmp,
|
||||
MSBRadixArgs sort_args)
|
||||
{
|
||||
define REG_num_prims REG0;
|
||||
{
|
||||
define C_MIN_PRIMREFS REG1;
|
||||
define C_MAX_PRIMREFS REG2;
|
||||
define REG_no_dispatch REG3;
|
||||
define REG_dispatch_single_wg REG4;
|
||||
|
||||
REG_num_prims = load_dword( sort_args.p_num_primitives );
|
||||
C_MIN_PRIMREFS = 2;
|
||||
C_MAX_PRIMREFS = BOTTOM_LEVEL_SORT_THRESHOLD;
|
||||
|
||||
REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
|
||||
REG_dispatch_single_wg = REG_num_prims < C_MAX_PRIMREFS;
|
||||
|
||||
goto l_sort_finish if(REG_no_dispatch.lo);
|
||||
goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
|
||||
goto l_full_sort;
|
||||
}
|
||||
|
||||
l_dispatch_single_wg:
|
||||
|
||||
{
|
||||
dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
|
||||
goto l_sort_finish;
|
||||
}
|
||||
|
||||
l_full_sort:
|
||||
|
||||
define p_scheduler sort_args.p_scheduler;
|
||||
define p_scheduler_postsync (sort_args.p_scheduler + offsetof(MSBRadixScheduler.scheduler_postsync) );
|
||||
define p_num_wgs_bls (sort_args.p_scheduler + offsetof(MSBRadixScheduler.num_wgs_bls) );
|
||||
|
||||
define REG_scheduler_postsync REG3;
|
||||
REG_scheduler_postsync = p_scheduler_postsync;
|
||||
|
||||
define C_0 REG4;
|
||||
define C_8 REG5;
|
||||
define C_255 REG6;
|
||||
C_0 = 0;
|
||||
C_8 = 8;
|
||||
C_255 = 255;
|
||||
|
||||
store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
|
||||
|
||||
REG_num_prims = REG_num_prims + C_255;
|
||||
REG_num_prims = REG_num_prims >> C_8;
|
||||
|
||||
DISPATCHDIM_X = REG_num_prims.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
control( cs_store_fence ); // commit the semaphore write
|
||||
|
||||
// initialize the whole execution
|
||||
dispatch opencl_build_morton_kernel_sort_msb_init (MSB_RADIX_NUM_VCONTEXTS, 1, 1) args(build_globals, sort_args.p_scheduler, input, tmp)
|
||||
postsync store_dword( p_scheduler_postsync, 1 );
|
||||
|
||||
// wait on count_items kernel
|
||||
semaphore_wait while( *p_scheduler_postsync != 1 );
|
||||
|
||||
dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
|
||||
postsync store_dword( p_scheduler_postsync, 2 );
|
||||
|
||||
// wait on count_items kernel
|
||||
semaphore_wait while( *p_scheduler_postsync != 2 );
|
||||
|
||||
dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
|
||||
postsync store_dword( p_scheduler_postsync, 0 );
|
||||
|
||||
define C_MASK_HI REG4;
|
||||
C_MASK_HI = 0x00000000ffffffff;
|
||||
|
||||
l_build_loop:
|
||||
{
|
||||
semaphore_wait while( *p_scheduler_postsync != 0 );
|
||||
{
|
||||
dispatch opencl_build_morton_kernel_sort_msb_scheduler(1,1,1) args( sort_args.p_scheduler, input, tmp )
|
||||
postsync store_dword( p_scheduler_postsync, 1 );
|
||||
|
||||
// wait on scheduler kernel
|
||||
semaphore_wait while( *p_scheduler_postsync != 1 );
|
||||
}
|
||||
|
||||
// load and process the scheduler results
|
||||
define REG_wg_counts REG0;
|
||||
define REG_num_msb_wgs REG0.lo;
|
||||
define REG_num_bls_wgs REG0.hi;
|
||||
define REG_p_scheduler REG1;
|
||||
define REG_no_msb_wgs REG2;
|
||||
{
|
||||
REG_p_scheduler = p_scheduler;
|
||||
REG_wg_counts = load_qword( REG_p_scheduler );
|
||||
|
||||
REG_no_msb_wgs = REG_wg_counts & C_MASK_HI;
|
||||
REG_no_msb_wgs = REG_no_msb_wgs == 0;
|
||||
}
|
||||
|
||||
// dispatch new bls WGs
|
||||
DISPATCHDIM_X = REG_num_bls_wgs;
|
||||
dispatch_indirect opencl_build_morton_kernel_sort_bottom_level args( p_scheduler, input );
|
||||
|
||||
// jump out if there are no msb WGs
|
||||
goto l_sort_finish if (REG_no_msb_wgs);
|
||||
|
||||
DISPATCHDIM_X = REG_num_msb_wgs;
|
||||
dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
|
||||
postsync store_dword( p_scheduler_postsync, 2 );
|
||||
|
||||
// wait on count_items kernel
|
||||
semaphore_wait while( *p_scheduler_postsync != 2 );
|
||||
|
||||
dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
|
||||
postsync store_dword( p_scheduler_postsync, 0 );
|
||||
|
||||
// wait till all BLS finished launching
|
||||
semaphore_wait while( *p_num_wgs_bls != 0 );
|
||||
|
||||
goto l_build_loop;
|
||||
}
|
||||
|
||||
l_sort_finish:
|
||||
|
||||
}
|
||||
|
|
@ -1,665 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module new_sah_builder;
|
||||
|
||||
kernel_module bfs_kernels ("bvh_build_BFS.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial < kernelFunction="BFS_pass1_initial" > ;
|
||||
kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed < kernelFunction="BFS_pass1_indexed" > ;
|
||||
kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial < kernelFunction="BFS_pass2_initial" > ;
|
||||
kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed < kernelFunction="BFS_pass2_indexed" > ;
|
||||
|
||||
kernel opencl_build_kernel_BinnedSAH_DFS < kernelFunction="DFS" >;
|
||||
// kernel opencl_build_kernel_BinnedSAH_BuildQNodes < kernelFunction="build_qnodes" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff < kernelFunction="build_qnodes_pc_kickoff" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify < kernelFunction="build_qnodes_pc_amplify" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_begin < kernelFunction = "begin" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_scheduler < kernelFunction = "scheduler" >;
|
||||
|
||||
kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch < kernelFunction="BFS_pass1_initial_batchable" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch < kernelFunction="BFS_pass1_indexed_batchable" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch < kernelFunction="BFS_pass2_initial_batchable" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch < kernelFunction="BFS_pass2_indexed_batchable" >;
|
||||
|
||||
kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_begin_batched < kernelFunction="begin_batchable" >;
|
||||
|
||||
kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched < kernelFunction="build_qnodes_init_scheduler_batched" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched < kernelFunction="build_qnodes_begin_batchable" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_qnode_scheduler < kernelFunction="build_qnodes_scheduler" >;
|
||||
kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch < kernelFunction="build_qnodes_pc_amplify_batched" >;
|
||||
|
||||
kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >;
|
||||
|
||||
}
|
||||
|
||||
kernel opencl_build_kernel_DFS_single_wg < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" >
|
||||
kernel opencl_build_kernel_DFS_trivial < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial" >
|
||||
kernel opencl_build_kernel_DFS_single_wg_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" >
|
||||
kernel opencl_build_kernel_DFS_trivial_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable" >
|
||||
|
||||
kernel single_pass_binsah < source="bvh_build_DFS.cl", kernelFunction="DFS" >
|
||||
|
||||
|
||||
const DFS_MIN_PRIMREFS = 6;
|
||||
const DFS_MAX_PRIMREFS = 256;
|
||||
const BFS_WG_SIZE_SHIFT = 9;
|
||||
|
||||
|
||||
|
||||
struct Scheduler
|
||||
{
|
||||
dword num_bfs_wgs;
|
||||
dword num_dfs_wgs;
|
||||
|
||||
dword scheduler_postsync;
|
||||
dword _pad1;
|
||||
|
||||
dword num_trivial_builds;
|
||||
dword num_single_builds;
|
||||
|
||||
dword batched_build_wg_count;
|
||||
dword batched_build_loop_mask;
|
||||
|
||||
};
|
||||
|
||||
|
||||
struct SAHBuildArgs
|
||||
{
|
||||
qword p_num_primitives;
|
||||
qword p_qnode_child_buffer;
|
||||
qword p_scheduler;
|
||||
qword p_sah_globals;
|
||||
qword p_globals;
|
||||
qword p_primref_buffer;
|
||||
qword p_primref_index_buffers;
|
||||
qword p_bvh_base;
|
||||
qword p_bvh2;
|
||||
qword p_root_buffer_counters;
|
||||
dword sah_build_flags;
|
||||
dword leaf_size;
|
||||
dword leaf_type;
|
||||
dword max_internal_nodes;
|
||||
};
|
||||
|
||||
|
||||
metakernel single_pass_binsah(
|
||||
qword build_globals,
|
||||
qword bvh_buffer,
|
||||
qword build_primref_buffer,
|
||||
qword build_primref_index_buffers,
|
||||
dword alloc_backpointers )
|
||||
{
|
||||
|
||||
dispatch single_pass_binsah(1, 1, 1) args(
|
||||
build_globals,
|
||||
bvh_buffer,
|
||||
build_primref_buffer,
|
||||
build_primref_index_buffers,
|
||||
alloc_backpointers
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
metakernel new_sah_build( SAHBuildArgs build_args )
|
||||
{
|
||||
define REG_num_prims REG0;
|
||||
|
||||
{
|
||||
define C_MIN_PRIMREFS REG1;
|
||||
define C_MAX_PRIMREFS REG2;
|
||||
define REG_dispatch_trivial REG3;
|
||||
define REG_dispatch_single_wg REG4;
|
||||
|
||||
REG_num_prims = load_dword( build_args.p_num_primitives );
|
||||
C_MIN_PRIMREFS = DFS_MIN_PRIMREFS;
|
||||
C_MAX_PRIMREFS = DFS_MAX_PRIMREFS;
|
||||
|
||||
REG_dispatch_trivial = REG_num_prims <= C_MIN_PRIMREFS;
|
||||
REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS;
|
||||
|
||||
goto l_dispatch_trivial if(REG_dispatch_trivial.lo);
|
||||
goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
|
||||
goto l_full_build;
|
||||
}
|
||||
|
||||
l_dispatch_trivial:
|
||||
{
|
||||
dispatch opencl_build_kernel_DFS_trivial (1,1,1)
|
||||
args( build_args.p_globals,
|
||||
build_args.p_bvh_base,
|
||||
build_args.p_primref_buffer,
|
||||
build_args.p_primref_index_buffers,
|
||||
build_args.sah_build_flags
|
||||
);
|
||||
|
||||
control( wait_idle );
|
||||
goto l_done;
|
||||
}
|
||||
|
||||
l_dispatch_single_wg:
|
||||
{
|
||||
dispatch opencl_build_kernel_DFS_single_wg (1,1,1)
|
||||
args( build_args.p_globals,
|
||||
build_args.p_bvh_base,
|
||||
build_args.p_primref_buffer,
|
||||
build_args.p_primref_index_buffers,
|
||||
build_args.sah_build_flags
|
||||
);
|
||||
|
||||
control( wait_idle );
|
||||
goto l_done;
|
||||
}
|
||||
|
||||
|
||||
l_full_build:
|
||||
|
||||
|
||||
{
|
||||
define p_scheduler build_args.p_scheduler;
|
||||
define p_num_dfs_wgs build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs);
|
||||
define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
|
||||
define C_0 REG1;
|
||||
define C_8 REG2;
|
||||
C_8 = 8;
|
||||
C_0 = 0;
|
||||
|
||||
|
||||
//
|
||||
// Init pass
|
||||
//
|
||||
store_dword( p_scheduler_postsync, C_0.lo );
|
||||
|
||||
// compute number of BFS WGs from prim-count
|
||||
// NOTE: This code uses a hardcoded WG size of 512 for BFS.
// If the BFS WG size ever changes, this code needs to be updated as well.
// This is necessary because the DG2 shifter only supports power-of-two shift amounts,
// so the divide by 512 below is done as a shift by 8 followed by a shift by 1
// (a small worked example follows this block).
|
||||
{
|
||||
define REG_scheduler_postsync REG3;
|
||||
define C_511 REG4;
|
||||
define C_1 REG5;
|
||||
|
||||
REG_scheduler_postsync = p_scheduler_postsync;
|
||||
C_511 = 511;
|
||||
C_1 = 1;
|
||||
|
||||
store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
|
||||
|
||||
REG_num_prims = REG_num_prims + C_511;
|
||||
REG_num_prims = REG_num_prims >> C_8;
|
||||
REG_num_prims = REG_num_prims >> C_1;
|
||||
|
||||
DISPATCHDIM_X = REG_num_prims.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
control( cs_store_fence ); // commit the semaphore write
|
||||
|
||||
// launch scheduler init kernel
|
||||
dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1)
|
||||
args(
|
||||
build_args.p_scheduler,
|
||||
build_args.leaf_size,
|
||||
build_args.leaf_type,
|
||||
build_args.p_primref_index_buffers,
|
||||
build_args.p_primref_buffer,
|
||||
build_args.p_bvh2,
|
||||
build_args.p_bvh_base,
|
||||
build_args.p_globals,
|
||||
build_args.p_sah_globals,
|
||||
build_args.p_qnode_child_buffer,
|
||||
build_args.sah_build_flags
|
||||
)
|
||||
postsync store_dword( p_scheduler_postsync, 1 );
|
||||
|
||||
// wait on init kernel
|
||||
semaphore_wait while( *p_scheduler_postsync != 1 );
|
||||
|
||||
// launch BFS1 pass1
|
||||
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial
|
||||
args( build_args.p_scheduler,
|
||||
build_args.p_sah_globals)
|
||||
postsync store_dword( p_scheduler_postsync, 0 );
|
||||
|
||||
// wait on BFS pass1
|
||||
semaphore_wait while( *p_scheduler_postsync != 0 );
|
||||
|
||||
// launch BFS pass2
|
||||
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial
|
||||
args( build_args.p_scheduler,
|
||||
build_args.p_sah_globals )
|
||||
postsync store_dword( p_scheduler_postsync, 1 );
|
||||
}
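// A small worked example (plain C) of the shift sequence above: ceil(num_prims / 512) is
// (num_prims + 511) >> 9, but since 9 is not a power of two the shift is split into >> 8
// followed by >> 1, both of which the shifter supports.
#include <assert.h>

static unsigned bfs_wg_count_sketch(unsigned num_prims)
{
    unsigned wgs = (num_prims + 511) >> 8;
    wgs >>= 1;                                   // total shift of 9 == divide by 512
    assert(wgs == (num_prims + 511) / 512);      // matches the plain ceil-divide
    return wgs;
}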
|
||||
|
||||
// after BFS pass 2 we drop into a scheduling loop
|
||||
|
||||
l_build_loop:
|
||||
{
|
||||
semaphore_wait while( *p_scheduler_postsync != 1 );
|
||||
|
||||
{
|
||||
dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
|
||||
args( build_args.p_scheduler, build_args.p_sah_globals )
|
||||
postsync store_dword( p_scheduler_postsync, 0 );
|
||||
|
||||
// wait on the scheduler
|
||||
semaphore_wait while( *p_scheduler_postsync != 0 );
|
||||
}
|
||||
|
||||
// load and process the scheduler results
|
||||
define REG_wg_counts REG0;
|
||||
define REG_num_bfs_wgs REG0.lo;
|
||||
define REG_num_dfs_wgs REG0.hi;
|
||||
define REG_loop_break REG1;
|
||||
define REG_p_scheduler REG2;
|
||||
{
|
||||
REG_p_scheduler = p_scheduler;
|
||||
REG_wg_counts = load_qword( REG_p_scheduler );
|
||||
|
||||
define C_MASK_LO REG3 ;
|
||||
C_MASK_LO = 0xffffffff;
|
||||
|
||||
REG_loop_break = REG_wg_counts & C_MASK_LO;
|
||||
REG_loop_break = REG_loop_break == 0;
|
||||
}
|
||||
|
||||
// dispatch new DFS WGs
|
||||
DISPATCHDIM_X = REG_num_dfs_wgs;
|
||||
dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
|
||||
args( p_scheduler,
|
||||
build_args.p_sah_globals );
|
||||
|
||||
// jump out if there are no bfs WGs
|
||||
goto l_build_qnodes if (REG_loop_break);
|
||||
|
||||
// dispatch new BFS1 WGs
|
||||
DISPATCHDIM_X = REG_num_bfs_wgs;
|
||||
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed
|
||||
args( p_scheduler,
|
||||
build_args.p_sah_globals )
|
||||
postsync store_dword( p_scheduler_postsync, 2 );
|
||||
|
||||
semaphore_wait while( *p_scheduler_postsync != 2 );
|
||||
|
||||
// dispatch new BFS2 WGs
|
||||
dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed
|
||||
args( p_scheduler,
|
||||
build_args.p_sah_globals )
|
||||
postsync store_dword( p_scheduler_postsync, 1 );
|
||||
|
||||
//goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore
|
||||
|
||||
// wait until all upcoming DFS WGs have finished launching
|
||||
// so that the scheduler can refill the launch array
|
||||
// TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
|
||||
semaphore_wait while( *p_num_dfs_wgs != 0 );
|
||||
|
||||
|
||||
goto l_build_loop;
|
||||
}
|
||||
}
|
||||
|
||||
l_build_qnodes:
|
||||
|
||||
control( wait_idle );
|
||||
|
||||
// P/C qnode build
|
||||
|
||||
dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1)
|
||||
args( build_args.p_sah_globals,
|
||||
build_args.p_qnode_child_buffer,
|
||||
build_args.sah_build_flags );
|
||||
|
||||
{
|
||||
define p_pc_counters ( build_args.p_root_buffer_counters );
|
||||
|
||||
define REG_addr REG0;
|
||||
define REG_produced REG1;
|
||||
define REG_consumed REG2;
|
||||
define REG_have_work REG3;
|
||||
define REG_wg_count REG4;
|
||||
define C_8 REG5;
|
||||
define C_16 REG6;
|
||||
define C_1 REG7;
|
||||
C_1 = 1;
|
||||
C_8 = 8;
|
||||
C_16 = 16;
|
||||
REG_addr = build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address
|
||||
|
||||
REG_consumed = 0;
|
||||
|
||||
l_qnode_loop:
|
||||
|
||||
control( wait_idle ); // wait for previous pass
|
||||
|
||||
// load counters and compute number of wgs to respawn
|
||||
REG_produced = load_qword( REG_addr ); REG_addr = REG_addr + C_8;
|
||||
REG_wg_count = REG_produced - REG_consumed;
|
||||
REG_have_work = REG_wg_count > 0;
|
||||
|
||||
goto l_done if not(REG_have_work.lo);
|
||||
|
||||
// save REG_consumed as a starting position in p_qnode_child_buffer
|
||||
store_qword(REG_addr, REG_consumed); REG_addr = REG_addr + C_8;
|
||||
|
||||
// save REG_produced as ending position in p_qnode_child_buffer
|
||||
store_qword(REG_addr, REG_produced); REG_addr = REG_addr - C_16;
|
||||
|
||||
REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration
|
||||
|
||||
// calculate amount of workgroups to schedule
|
||||
REG_wg_count = REG_wg_count + C_1;
|
||||
REG_wg_count = REG_wg_count >> C_1;
|
||||
|
||||
DISPATCHDIM_X = REG_wg_count.lo;
|
||||
|
||||
control( cs_store_fence ); // commit the stores
|
||||
|
||||
dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify
|
||||
args( build_args.p_sah_globals,
|
||||
build_args.p_qnode_child_buffer,
|
||||
build_args.sah_build_flags);
|
||||
|
||||
goto l_qnode_loop;
|
||||
}
|
||||
|
||||
l_done:
|
||||
}
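// Hedged host-side restatement (plain C, names invented for illustration) of the producer/
// consumer loop that finishes new_sah_build above: each pass consumes whatever the previous
// amplify pass appended to p_qnode_child_buffer, launching one work group per two produced
// entries ((count + 1) >> 1), until a pass produces nothing new.
#include <stdint.h>

static void qnode_pc_drain_sketch(uint64_t (*read_produced)(void),
                                  void (*dispatch_amplify)(uint64_t begin, uint64_t end,
                                                           uint64_t num_wgs))
{
    uint64_t consumed = 0;
    for (;;) {
        uint64_t produced = read_produced();   // counter written by the previous pass
        uint64_t count    = produced - consumed;
        if (count == 0)
            return;                            // nothing left to amplify -> l_done
        uint64_t begin = consumed;             // start position in p_qnode_child_buffer
        uint64_t end   = produced;             // end position in p_qnode_child_buffer
        consumed += count;
        dispatch_amplify(begin, end, (count + 1) >> 1);
        // the metakernel issues control(wait_idle) before re-reading the counter
    }
}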
|
||||
struct SAHBuildArgsBatchable
|
||||
{
|
||||
qword p_globals_ptrs;
|
||||
qword p_scheduler;
|
||||
qword p_buffers_info;
|
||||
qword p_sah_globals;
|
||||
|
||||
dword num_max_qnode_global_root_buffer_entries;
|
||||
dword num_builds;
|
||||
|
||||
};
|
||||
|
||||
|
||||
metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args )
|
||||
{
|
||||
define p_scheduler build_args.p_scheduler;
|
||||
define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
|
||||
define p_num_dfs_wgs (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs));
|
||||
|
||||
// initialize scheduler semaphore
|
||||
REG0.lo = 0;
|
||||
store_dword( p_scheduler_postsync, REG0.lo );
|
||||
|
||||
|
||||
// dispatch categorization pass
|
||||
dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1)
|
||||
args(
|
||||
build_args.p_scheduler,
|
||||
build_args.p_globals_ptrs,
|
||||
build_args.p_buffers_info,
|
||||
build_args.p_sah_globals,
|
||||
build_args.num_builds
|
||||
)
|
||||
postsync store_dword( p_scheduler_postsync, 1 );
|
||||
|
||||
// wait on the categorization pass
|
||||
semaphore_wait while( *p_scheduler_postsync != 1 );
|
||||
|
||||
|
||||
// dispatch the trivial and single-WG passes
|
||||
{
|
||||
REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) );
|
||||
DISPATCHDIM_X = REG0.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
// dispatch trivial builds
|
||||
|
||||
dispatch_indirect opencl_build_kernel_DFS_trivial_batch
|
||||
args( build_args.p_sah_globals );
|
||||
|
||||
control( wait_idle );
|
||||
|
||||
// dispatch single-wg builds
|
||||
|
||||
DISPATCHDIM_X = REG0.hi;
|
||||
dispatch_indirect opencl_build_kernel_DFS_single_wg_batch
|
||||
args( build_args.p_sah_globals, build_args.p_scheduler );
|
||||
}
|
||||
|
||||
// compute the number of builds not covered by the trivial passes
|
||||
// skip the builder loop if all builds are satisfied by trivial passes
|
||||
{
|
||||
REG1 = REG0.lo;
|
||||
REG2 = REG0.hi;
|
||||
REG3 = build_args.num_builds;
|
||||
REG5 = REG2 + REG1;
|
||||
REG5 = REG3 - REG5;
|
||||
REG4 = REG5 == 0 ;
|
||||
|
||||
goto l_done if (REG4.lo);
|
||||
}
|
||||
|
||||
// REG5 (number of non-trivial builds) will be used to launch build_qnodes kernel after the build loop
|
||||
define REG_num_nontrivial REG5;
|
||||
|
||||
    l_build_outer_loop:
    {
        // configure the scheduler to initiate a new block of builds
        dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1)
            args( build_args.p_scheduler, build_args.p_sah_globals )
            postsync store_dword( p_scheduler_postsync, 0 );

        // wait on init kernel
        semaphore_wait while( *p_scheduler_postsync != 0 );

        // read results produced by scheduler init kernel
        // lo == BFS wg count. hi == all ones if we need to loop again
        //
        REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
        REG4 = load_qword( REG0 );

        // launch BFS1 pass1
        DISPATCHDIM_X = REG4.lo;
        dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch
            args( build_args.p_scheduler,
                  build_args.p_sah_globals)
            postsync store_dword( p_scheduler_postsync, 1 );

        // wait on BFS pass1
        semaphore_wait while( *p_scheduler_postsync != 1 );

        // launch BFS pass2
        dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch
            args( build_args.p_scheduler,
                  build_args.p_sah_globals )
            postsync store_dword( p_scheduler_postsync, 0 );

        l_build_loop:
        {
            semaphore_wait while( *p_scheduler_postsync != 0 );

            {
                dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
                    args( build_args.p_scheduler, build_args.p_sah_globals )
                    postsync store_dword( p_scheduler_postsync, 1 );

                // wait on the scheduler
                semaphore_wait while( *p_scheduler_postsync != 1 );
            }

            // load and process the scheduler results
            define REG_wg_counts   REG0;
            define REG_num_bfs_wgs REG0.lo;
            define REG_num_dfs_wgs REG0.hi;
            define REG_loop_break  REG1;
            define REG_p_scheduler REG2;
            {
                REG_p_scheduler = p_scheduler;
                REG_wg_counts = load_qword( REG_p_scheduler );

                define C_MASK_LO REG3 ;
                C_MASK_LO = 0xffffffff;

                REG_loop_break = REG_wg_counts & C_MASK_LO;
                REG_loop_break = REG_loop_break == 0;
            }

            // dispatch new DFS WGs
            DISPATCHDIM_X = REG_num_dfs_wgs;
            dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
                args( p_scheduler,
                      build_args.p_sah_globals );

            // jump out if there are no bfs WGs
            goto l_continue_outer_loop if (REG_loop_break);

            // dispatch new BFS1 WGs
            DISPATCHDIM_X = REG_num_bfs_wgs;
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch
                args( p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 2 );

            semaphore_wait while( *p_scheduler_postsync != 2 );

            // dispatch new BFS2 WGs
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch
                args( p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 0 );

            //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore

            // wait until all upcoming DFS WGs have finished launching
            // so that the scheduler can refill the launch array
            // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
            semaphore_wait while( *p_num_dfs_wgs != 0 );

            goto l_build_loop;
        }

        l_continue_outer_loop:

        goto l_build_outer_loop if(REG4.hi);

    }
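
    // Note (illustrative, not from the original source): the passes above rely on each
    // dispatch writing a marker value via "postsync store_dword( p_scheduler_postsync, N )"
    // once it completes, while "semaphore_wait while( *p_scheduler_postsync != N )" holds the
    // command streamer until that marker appears. This serializes scheduler/BFS/DFS passes
    // against each other without issuing a full wait_idle between every dispatch.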

    ////////
    //
    // Qnode build phase
    //
    ////////

    // Wait for all outstanding DFS dispatches to complete, then build the QNodes
    control( wait_idle );

    define REG_wg_counts           REG1;
    define REG_p_scheduler         REG2;
    define REG_have_work           REG3;
    define REG_GRB_NUM_MAX_ENTRIES REG4;

    // init scheduler for qnode phase
    dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1)
        args( build_args.p_scheduler,
              build_args.num_builds,
              build_args.num_max_qnode_global_root_buffer_entries);

    REG_p_scheduler = p_scheduler;

    control( wait_idle );

    REG_wg_counts = load_qword( REG_p_scheduler );

    DISPATCHDIM_X = REG_wg_counts.lo;

    // configure the scheduler to initiate a new block of builds
    dispatch_indirect opencl_build_kernel_BinnedSAH_qnode_begin_batched
        args( build_args.p_scheduler,
              build_args.p_sah_globals);

    // read results produced by init scheduler kernel
    // lo == num of builds processed. hi == num of maximum global root buffer entries
    //
    REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
    REG5 = load_qword( REG0 );

    REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi;
    REG_GRB_NUM_MAX_ENTRIES.hi = 0;

    l_qnode_loop:
    {
        control( wait_idle ); // wait for previous pass

        dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1) args( build_args.p_scheduler );

        control( wait_idle );

        REG_wg_counts = load_qword( REG_p_scheduler );
        REG_have_work = REG_wg_counts > 0;

        goto l_done if not(REG_have_work.lo);

        DISPATCHDIM_X = REG_wg_counts.lo;

        dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch
            args( build_args.p_sah_globals,
                  build_args.p_scheduler );

        control( wait_idle );

        REG_wg_counts = load_qword( REG_p_scheduler ); // reload values
        REG_wg_counts.lo = REG_wg_counts.hi;
        REG_wg_counts.hi = 0;

        REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES;

        goto l_qnode_loop if not(REG_have_work.lo);

        DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled

        dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched
            args( build_args.p_sah_globals,
                  build_args.p_scheduler );

        goto l_qnode_loop;
    }

    ////////
    //
    // Old implementation - TODO: maybe add switch between two implementations?
    //
    ////////
    // Wait for all outstanding DFS dispatches to complete, then build the QNodes
    //DISPATCHDIM_X = REG5.lo;

    //dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes
    //    args( build_args.p_sah_globals, build_args.p_scheduler );

    l_done:

    control( wait_idle );

}

@@ -1,49 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

module postbuild_info; // In postbuild we assume output data structure to be DXR compatible

kernel compacted_size  < source="bvh_postbuild_info.cl", kernelFunction="compacted_size" >
kernel current_size    < source="bvh_postbuild_info.cl", kernelFunction="current_size" >
kernel serialized_size < source="bvh_postbuild_info.cl", kernelFunction="serialized_size" >
kernel decoded_size    < source="bvh_postbuild_info.cl", kernelFunction="decoded_size" >

metakernel compacted_size(
    qword bvh,
    qword postbuildInfo)
{
    dispatch compacted_size(1,1,1) args(
        bvh,
        postbuildInfo);
}

metakernel current_size(
    qword bvh,
    qword postbuildInfo)
{
    dispatch current_size(1,1,1) args(
        bvh,
        postbuildInfo);
}

metakernel serialized_size(
    qword bvh,
    qword postbuildInfo)
{
    dispatch serialized_size(1,1,1) args(
        bvh,
        postbuildInfo);
}

metakernel decoded_size(
    qword bvh,
    qword postbuildInfo)
{
    dispatch decoded_size(1,1,1) args(
        bvh,
        postbuildInfo);
}

@@ -1,62 +0,0 @@
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

module presplit;

kernel_module presplit_kernels ("bvh_build_presplit.cl")
{
    links lsc_intrinsics;

    kernel opencl_kernel_compute_num_presplits < kernelFunction="compute_num_presplits" >;
    kernel opencl_kernel_priority_sum          < kernelFunction="priority_sum" >;
    kernel opencl_kernel_perform_presplits     < kernelFunction="perform_presplits" >;
}

import struct MKBuilderState "structs.grl";
import struct MKSizeEstimate "structs.grl";


metakernel compute_num_presplits(
    MKBuilderState state,
    qword presplit_buffer,
    dword numHwThreads )
{
    dispatch opencl_kernel_compute_num_presplits ( numHwThreads, 1, 1 ) args(
        state.build_globals,
        state.bvh_buffer,
        state.build_primref_buffer,
        presplit_buffer,
        state.geomDesc_buffer );
}


metakernel priority_sum(
    MKBuilderState state,
    MKSizeEstimate estimate,
    qword presplit_buffer )
{
    dispatch opencl_kernel_priority_sum ( 1, 1, 1 ) args(
        state.build_globals,
        presplit_buffer,
        estimate.numPrimitivesToSplit / 2 );
}

metakernel perform_presplits(
    MKBuilderState state,
    MKSizeEstimate estimate,
    qword presplit_buffer,
    dword numHwThreads )
{
    dispatch opencl_kernel_perform_presplits ( numHwThreads, 1, 1 ) args(
        state.build_globals,
        state.bvh_buffer,
        state.build_primref_buffer,
        presplit_buffer,
        state.bvh_buffer,
        state.geomDesc_buffer,
        estimate.numPrimitivesToSplit / 2 );
}

@@ -1,933 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GRLGen12.h"
|
||||
|
||||
#include "shared.h"
|
||||
#include "quad.h"
|
||||
|
||||
/* ====== GENERAL BVH config ====== */
|
||||
|
||||
#define BVH_NODE_N6 6
|
||||
#define BVH_NODE_N 8
|
||||
#define BVH_NODE_N_LOG 3
|
||||
|
||||
#define SAH_LOG_BLOCK_SHIFT 2
|
||||
#define BVH_LEAF_N_MIN BVH_NODE_N6
|
||||
#define BVH_LEAF_N_MAX BVH_NODE_N6
|
||||
|
||||
#define BVH_NODE_DEFAULT_MASK 0xff
|
||||
#define BVH_NODE_DEGENERATED_MASK 0x00
|
||||
|
||||
/* ====== QUANTIZATION config ====== */
|
||||
|
||||
#define QUANT_BITS 8
|
||||
#define QUANT_MIN 0
|
||||
#define QUANT_MAX 255
|
||||
#define QUANT_MAX_MANT (255.0f / 256.0f)
|
||||
|
||||
#define NO_NODE_OFFSET 0
|
||||
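/*
 * Illustrative sketch (not part of the original header): scalar form of the
 * 8-bit dequantization used by extractAABB_QBVHNodeN() further below, which
 * reconstructs a child bound as node_origin + quantized_byte * 2^(exp - 8).
 * The helper name is hypothetical; the real code applies bitShiftLdexp4() to
 * whole float4 vectors instead of one component at a time.
 */
GRL_INLINE float dequantize_lower_component(float node_lower, char node_exp, uchar q)
{
    /* child lower bound = node origin + q * 2^(exp - 8) */
    return node_lower + ldexp((float)q, (int)node_exp - 8);
}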
|
||||
/* ======================================================================= */
|
||||
/* ============================== BVH BASE =============================== */
|
||||
/* ======================================================================= */
|
||||
|
||||
GRL_INLINE void setBVHBaseBounds(struct BVHBase *base, struct AABB *aabb)
|
||||
{
|
||||
base->Meta.bounds.lower[0] = aabb->lower.x;
|
||||
base->Meta.bounds.lower[1] = aabb->lower.y;
|
||||
base->Meta.bounds.lower[2] = aabb->lower.z;
|
||||
|
||||
base->Meta.bounds.upper[0] = aabb->upper.x;
|
||||
base->Meta.bounds.upper[1] = aabb->upper.y;
|
||||
base->Meta.bounds.upper[2] = aabb->upper.z;
|
||||
}
|
||||
|
||||
GRL_INLINE global struct QBVHNodeN *BVHBase_nodeData(struct BVHBase *bvh)
|
||||
{
|
||||
return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET);
|
||||
}
|
||||
|
||||
GRL_INLINE global struct QBVHNodeN *BVHBase_rootNode(struct BVHBase *bvh)
|
||||
{
|
||||
return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET);
|
||||
}
|
||||
|
||||
GRL_INLINE global struct Quad *BVHBase_quadLeaves(struct BVHBase *bvh)
|
||||
{
|
||||
return (global struct Quad *)((void *)bvh + 64 * (ulong)bvh->quadLeafStart);
|
||||
}
|
||||
|
||||
GRL_INLINE uint64_t BVHBase_numNodes(struct BVHBase *bvh)
|
||||
{
|
||||
return bvh->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64;
|
||||
}
|
||||
|
||||
GRL_INLINE uint64_t BVHBase_numQuads(struct BVHBase *bvh)
|
||||
{
|
||||
return bvh->quadLeafCur - bvh->quadLeafStart;
|
||||
}
|
||||
|
||||
GRL_INLINE uint64_t BVHBase_numProcedurals(struct BVHBase *bvh)
|
||||
{
|
||||
return bvh->proceduralDataCur - bvh->proceduralDataStart;
|
||||
}
|
||||
|
||||
GRL_INLINE uint64_t BVHBase_numInstances(struct BVHBase *bvh)
|
||||
{
|
||||
return bvh->instanceLeafEnd - bvh->instanceLeafStart;
|
||||
}
|
||||
|
||||
/* =================================================================== */
|
||||
/* ============================== QBVH =============================== */
|
||||
/* =================================================================== */
|
||||
|
||||
__constant const float ulp = FLT_EPSILON;
|
||||
|
||||
GRL_INLINE struct AABB conservativeAABB(struct AABB *aabb)
|
||||
{
|
||||
struct AABB box;
|
||||
const float4 v4 = max(fabs(aabb->lower), fabs(aabb->upper));
|
||||
const float v = ulp * max(v4.x, max(v4.y, v4.z));
|
||||
box.lower = aabb->lower - (float4)v;
|
||||
box.upper = aabb->upper + (float4)v;
|
||||
return box;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB3f conservativeAABB3f(struct AABB3f* aabb3d)
|
||||
{
|
||||
struct AABB aabb4d = AABBfromAABB3f(*aabb3d);
|
||||
struct AABB box = conservativeAABB(&aabb4d);
|
||||
return AABB3fFromAABB(box);
|
||||
}
|
||||
|
||||
struct QBVH_AABB
|
||||
{
|
||||
uchar lower_x[BVH_NODE_N6];
|
||||
uchar upper_x[BVH_NODE_N6];
|
||||
uchar lower_y[BVH_NODE_N6];
|
||||
uchar upper_y[BVH_NODE_N6];
|
||||
uchar lower_z[BVH_NODE_N6];
|
||||
uchar upper_z[BVH_NODE_N6];
|
||||
};
|
||||
|
||||
struct QBVHNodeN
|
||||
{
|
||||
float lower[3];
|
||||
int offset;
|
||||
// 16 bytes
|
||||
uchar type;
|
||||
uchar pad;
|
||||
// 18 bytes
|
||||
char exp[3];
|
||||
uchar instMask;
|
||||
// 22 bytes
|
||||
uchar childData[6];
|
||||
// 28 bytes
|
||||
struct QBVH_AABB qbounds; // + 36 bytes
|
||||
// 64 bytes
|
||||
};
|
||||
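/* Note (illustrative, not from the original header): the running byte-offset
 * comments above add up to a 64-byte QBVHNodeN, which is why node offsets in
 * this file are handled in 64-byte units (see extractQBVHNodeN_offset() below
 * shifting the stored offset left by 6). */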
|
||||
GRL_INLINE uint QBVHNodeN_blockIncr(struct QBVHNodeN *This, uint childID)
|
||||
{
|
||||
return This->childData[childID] & 0x3;
|
||||
}
|
||||
|
||||
GRL_INLINE uint QBVHNodeN_startPrim(struct QBVHNodeN *This, uint childID)
|
||||
{
|
||||
return (This->childData[childID] >> 2) & 0xF;
|
||||
}
|
||||
|
||||
GRL_INLINE void initQBVHNodeN(struct QBVHNodeN *qnode)
|
||||
{
|
||||
uint *ptr = (uint *)qnode;
|
||||
for (uint i = 0; i < 16; i++)
|
||||
ptr[i] = 0;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB extractAABB_QBVHNodeN(struct QBVHNodeN *qnode, uint i)
|
||||
{
|
||||
struct AABB aabb;
|
||||
const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f);
|
||||
const int4 lower_i = (int4)(qnode->qbounds.lower_x[i], qnode->qbounds.lower_y[i], qnode->qbounds.lower_z[i], 0);
|
||||
const int4 upper_i = (int4)(qnode->qbounds.upper_x[i], qnode->qbounds.upper_y[i], qnode->qbounds.upper_z[i], 0);
|
||||
const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f);
|
||||
aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
|
||||
aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
|
||||
return aabb;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB getAABB_QBVHNodeN(struct QBVHNodeN *qnode)
|
||||
{
|
||||
struct AABB aabb;
|
||||
#if 0
|
||||
AABB_init(&aabb);
|
||||
for (uint i = 0; i < BVH_NODE_N6; i++)
|
||||
{
|
||||
struct AABB v = extractAABB_QBVHNodeN(qnode, i);
|
||||
AABB_extend(&aabb, &v);
|
||||
}
|
||||
#else
|
||||
uint lower_x = qnode->qbounds.lower_x[0];
|
||||
uint lower_y = qnode->qbounds.lower_y[0];
|
||||
uint lower_z = qnode->qbounds.lower_z[0];
|
||||
|
||||
uint upper_x = qnode->qbounds.upper_x[0];
|
||||
uint upper_y = qnode->qbounds.upper_y[0];
|
||||
uint upper_z = qnode->qbounds.upper_z[0];
|
||||
|
||||
for (uint i = 1; i < BVH_NODE_N6; i++)
|
||||
{
|
||||
uint lx = qnode->qbounds.lower_x[i];
|
||||
uint ly = qnode->qbounds.lower_y[i];
|
||||
uint lz = qnode->qbounds.lower_z[i];
|
||||
|
||||
uint ux = qnode->qbounds.upper_x[i];
|
||||
uint uy = qnode->qbounds.upper_y[i];
|
||||
uint uz = qnode->qbounds.upper_z[i];
|
||||
|
||||
bool valid = lx <= ux;
|
||||
if (valid)
|
||||
{
|
||||
lower_x = min(lower_x, lx);
|
||||
lower_y = min(lower_y, ly);
|
||||
lower_z = min(lower_z, lz);
|
||||
|
||||
upper_x = max(upper_x, ux);
|
||||
upper_y = max(upper_y, uy);
|
||||
upper_z = max(upper_z, uz);
|
||||
}
|
||||
}
|
||||
|
||||
const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f);
|
||||
const int4 lower_i = (int4)(lower_x, lower_y, lower_z, 0);
|
||||
const int4 upper_i = (int4)(upper_x, upper_y, upper_z, 0);
|
||||
const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f);
|
||||
aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
|
||||
aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
|
||||
#endif
|
||||
return aabb;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB3f InternalNode_getAABB3f(struct InternalNode* node)
|
||||
{
|
||||
return AABB3fFromAABB(getAABB_QBVHNodeN((struct QBVHNodeN*)node));
|
||||
}
|
||||
|
||||
GRL_INLINE uint getNumChildren_QBVHNodeN(struct QBVHNodeN *qnode)
|
||||
{
|
||||
uint children = 0;
|
||||
for (uint i = 0; i < BVH_NODE_N6; i++)
|
||||
{
|
||||
uint lx = qnode->qbounds.lower_x[i];
|
||||
uint ux = qnode->qbounds.upper_x[i];
|
||||
bool valid = lx <= ux;
|
||||
if (valid)
|
||||
children++;
|
||||
}
|
||||
return children;
|
||||
}
|
||||
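/* Note (illustrative, not from the original header): empty child slots are
 * written with lower_x == 0x80 and upper_x == 0 (see the defaults in the
 * setFields helpers and QBVHNodeN_setBounds below), so the "lower <= upper"
 * comparison above doubles as the child-validity test. */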
|
||||
GRL_INLINE long extractQBVHNodeN_offset(struct QBVHNodeN *qnode)
|
||||
{
|
||||
return ((long)qnode->offset) << 6;
|
||||
}
|
||||
|
||||
GRL_INLINE void *QBVHNodeN_childrenPointer(struct QBVHNodeN *qnode)
|
||||
{
|
||||
const int offset = qnode->offset;
|
||||
return (void *)(qnode + offset);
|
||||
}
|
||||
|
||||
GRL_INLINE void subgroup_setQBVHNodeN_setFields_reduced_bounds(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, struct AABB reduced_aabb)
|
||||
{
|
||||
const uint subgroupLocalID = get_sub_group_local_id();
|
||||
const uint k = subgroupLocalID;
|
||||
const float up = 1.0f + ulp;
|
||||
const float down = 1.0f - ulp;
|
||||
|
||||
struct AABB aabb = reduced_aabb; // needs to execute with full subgroup width
|
||||
aabb = AABB_sub_group_broadcast(&aabb, 0);
|
||||
|
||||
if (subgroupLocalID < BVH_NODE_N6)
|
||||
{
|
||||
struct AABB conservative_aabb = conservativeAABB(&aabb);
|
||||
const float3 len = AABB_size(&conservative_aabb).xyz * up;
|
||||
int3 exp;
|
||||
const float3 mant = frexp_vec3(len, &exp);
|
||||
const float3 org = conservative_aabb.lower.xyz;
|
||||
|
||||
exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
|
||||
|
||||
qbvh_node->offset = offset;
|
||||
qbvh_node->type = type;
|
||||
|
||||
qbvh_node->lower[0] = org.x;
|
||||
qbvh_node->lower[1] = org.y;
|
||||
qbvh_node->lower[2] = org.z;
|
||||
|
||||
qbvh_node->exp[0] = exp.x;
|
||||
qbvh_node->exp[1] = exp.y;
|
||||
qbvh_node->exp[2] = exp.z;
|
||||
|
||||
qbvh_node->instMask = mask;
|
||||
|
||||
uchar3 lower_uchar = (uchar3)(0x80);
|
||||
uchar3 upper_uchar = (uchar3)(0);
|
||||
|
||||
if (subgroupLocalID < numChildren)
|
||||
{
|
||||
struct AABB child_aabb = conservativeAABB(input_aabb);
|
||||
|
||||
float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
|
||||
lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
|
||||
float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
|
||||
upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
|
||||
|
||||
lower_uchar = convert_uchar3_rtn(lower);
|
||||
upper_uchar = convert_uchar3_rtp(upper);
|
||||
|
||||
if (degenerated)
|
||||
{
|
||||
lower_uchar = upper_uchar = 0;
|
||||
}
|
||||
}
|
||||
|
||||
qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
|
||||
qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
|
||||
qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
|
||||
qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
|
||||
qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
|
||||
qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
|
||||
|
||||
qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1;
|
||||
|
||||
#if ENABLE_CONVERSION_CHECKS == 1
|
||||
|
||||
if (!(exp.x >= -128 && exp.x <= 127))
|
||||
printf("exp_x error \n");
|
||||
if (!(exp.y >= -128 && exp.y <= 127))
|
||||
printf("exp_y error \n");
|
||||
if (!(exp.z >= -128 && exp.z <= 127))
|
||||
printf("exp_z error \n");
|
||||
|
||||
struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
|
||||
if (!AABB_subset(&child_aabb, &child_qaabb))
|
||||
{
|
||||
uint3 lower_i = convert_uint3(lower_uchar);
|
||||
uint3 upper_i = convert_uint3(upper_uchar);
|
||||
|
||||
printf("\n ERROR %d\n", k);
|
||||
printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
|
||||
printf("%i uncompressed \n", k);
|
||||
AABB_print(&child_aabb);
|
||||
printf("%i compressed \n", k);
|
||||
AABB_print(&child_qaabb);
|
||||
|
||||
printf("%i uncompressed (as int) \n", k);
|
||||
AABB_printasInt(&child_aabb);
|
||||
printf("%i compressed (as int) \n", k);
|
||||
AABB_printasInt(&child_qaabb);
|
||||
|
||||
int4 e0 = child_aabb.lower < child_qaabb.lower;
|
||||
int4 e1 = child_aabb.upper > child_qaabb.upper;
|
||||
printf("e0 %d e1 %d \n", e0, e1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void subgroup_setQBVHNodeN_setFields(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated)
|
||||
{
|
||||
struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb);
|
||||
subgroup_setQBVHNodeN_setFields_reduced_bounds(offset, type, input_aabb, numChildren, mask, qbvh_node, degenerated, aabb);
|
||||
}
|
||||
|
||||
GRL_INLINE void subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, bool active_lane)
|
||||
{
|
||||
const uint lane = get_sub_group_local_id() % 8;
|
||||
const uint node_in_sg = get_sub_group_local_id() / 8;
|
||||
const uint k = lane;
|
||||
const float up = 1.0f + ulp;
|
||||
const float down = 1.0f - ulp;
|
||||
|
||||
struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); // needs to execute with full subgroup width
|
||||
aabb = AABB_sub_group_shuffle(&aabb, node_in_sg * 8);
|
||||
|
||||
if (lane < BVH_NODE_N6 && active_lane)
|
||||
{
|
||||
struct AABB conservative_aabb = conservativeAABB(&aabb);
|
||||
const float3 len = AABB_size(&conservative_aabb).xyz * up;
|
||||
int3 exp;
|
||||
const float3 mant = frexp_vec3(len, &exp);
|
||||
const float3 org = conservative_aabb.lower.xyz;
|
||||
|
||||
exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
|
||||
|
||||
qbvh_node->offset = offset;
|
||||
qbvh_node->type = type;
|
||||
|
||||
qbvh_node->lower[0] = org.x;
|
||||
qbvh_node->lower[1] = org.y;
|
||||
qbvh_node->lower[2] = org.z;
|
||||
|
||||
qbvh_node->exp[0] = exp.x;
|
||||
qbvh_node->exp[1] = exp.y;
|
||||
qbvh_node->exp[2] = exp.z;
|
||||
|
||||
qbvh_node->instMask = mask;
|
||||
|
||||
uchar3 lower_uchar = (uchar3)(0x80);
|
||||
uchar3 upper_uchar = (uchar3)(0);
|
||||
|
||||
if (lane < numChildren)
|
||||
{
|
||||
struct AABB child_aabb = conservativeAABB(input_aabb);
|
||||
|
||||
float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
|
||||
lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
|
||||
float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
|
||||
upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
|
||||
|
||||
lower_uchar = convert_uchar3_rtn(lower);
|
||||
upper_uchar = convert_uchar3_rtp(upper);
|
||||
|
||||
if (degenerated)
|
||||
{
|
||||
lower_uchar = upper_uchar = 0;
|
||||
}
|
||||
}
|
||||
|
||||
qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
|
||||
qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
|
||||
qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
|
||||
qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
|
||||
qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
|
||||
qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
|
||||
|
||||
qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1;
|
||||
|
||||
#if ENABLE_CONVERSION_CHECKS == 1
|
||||
|
||||
if (!(exp.x >= -128 && exp.x <= 127))
|
||||
printf("exp_x error \n");
|
||||
if (!(exp.y >= -128 && exp.y <= 127))
|
||||
printf("exp_y error \n");
|
||||
if (!(exp.z >= -128 && exp.z <= 127))
|
||||
printf("exp_z error \n");
|
||||
|
||||
struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
|
||||
if (!AABB_subset(&child_aabb, &child_qaabb))
|
||||
{
|
||||
uint3 lower_i = convert_uint3(lower_uchar);
|
||||
uint3 upper_i = convert_uint3(upper_uchar);
|
||||
|
||||
printf("\n ERROR %d\n", k);
|
||||
printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
|
||||
printf("%i uncompressed \n", k);
|
||||
AABB_print(&child_aabb);
|
||||
printf("%i compressed \n", k);
|
||||
AABB_print(&child_qaabb);
|
||||
|
||||
printf("%i uncompressed (as int) \n", k);
|
||||
AABB_printasInt(&child_aabb);
|
||||
printf("%i compressed (as int) \n", k);
|
||||
AABB_printasInt(&child_qaabb);
|
||||
|
||||
int4 e0 = child_aabb.lower < child_qaabb.lower;
|
||||
int4 e1 = child_aabb.upper > child_qaabb.upper;
|
||||
printf("e0 %d e1 %d \n", e0, e1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void subgroup_setInstanceQBVHNodeN(const int offset, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node, const uint instMask)
|
||||
{
|
||||
const uint subgroupLocalID = get_sub_group_local_id();
|
||||
|
||||
// For a degenerated (or inactive) instance, ignore its box in the exp/origin calculation and make its box a point at the node origin.
// If it becomes non-degenerated on update, the tree topology will be equivalent to what it would have been had we accounted for this degenerated node here.
bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
|
||||
|
||||
struct AABB aabb;
|
||||
AABB_init(&aabb);
|
||||
|
||||
// if every child is degenerated (or inactive) instance, we need to init aabb with origin point
|
||||
uchar commonMask = sub_group_reduce_or_N6(instMask);
|
||||
if (subgroupLocalID < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK))
|
||||
aabb = *input_aabb;
|
||||
|
||||
subgroup_setQBVHNodeN_setFields(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated);
|
||||
}
|
||||
|
||||
|
||||
// return true if is degenerated
|
||||
GRL_INLINE bool subgroup_setInstanceBox_2xSIMD8_in_SIMD16(struct AABB* input_aabb, const uint numChildren, uchar* mask, const uint instMask, bool active_lane)
|
||||
{
|
||||
const uint lane = get_sub_group_local_id() % 8;
|
||||
|
||||
// For a degenerated (or inactive) instance, ignore its box in the exp/origin calculation and make its box a point at the node origin.
// If it becomes non-degenerated on update, the tree topology will be equivalent to what it would have been had we accounted for this degenerated node here.
bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
|
||||
|
||||
// if every child is degenerated (or inactive) instance, we need to init aabb with origin point
|
||||
uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask);
|
||||
if (active_lane)
|
||||
*mask = commonMask;
|
||||
|
||||
if (active_lane && (degenerated && commonMask != BVH_NODE_DEGENERATED_MASK))
|
||||
AABB_init(input_aabb);
|
||||
|
||||
return active_lane ? degenerated : false;
|
||||
}
|
||||
|
||||
GRL_INLINE void subgroup_setInstanceQBVHNodeN_x2(const int offset, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, const uint instMask, bool active_lane)
|
||||
{
|
||||
const uint lane = get_sub_group_local_id() % 8;
|
||||
|
||||
// For a degenerated (or inactive) instance, ignore its box in the exp/origin calculation and make its box a point at the node origin.
// If it becomes non-degenerated on update, the tree topology will be equivalent to what it would have been had we accounted for this degenerated node here.
bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
|
||||
|
||||
struct AABB aabb;
|
||||
AABB_init(&aabb);
|
||||
|
||||
// if every child is degenerated (or inactive) instance, we need to init aabb with origin point
|
||||
uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask);
|
||||
if (lane < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK))
|
||||
aabb = *input_aabb;
|
||||
|
||||
subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated, active_lane);
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void subgroup_setQBVHNodeN(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, uint mask)
|
||||
{
|
||||
const uint subgroupLocalID = get_sub_group_local_id();
|
||||
|
||||
struct AABB aabb;
|
||||
AABB_init(&aabb);
|
||||
|
||||
if (subgroupLocalID < numChildren)
|
||||
aabb = *input_aabb;
|
||||
|
||||
subgroup_setQBVHNodeN_setFields(offset, type, &aabb, numChildren, mask, qbvh_node, false);
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void subgroup_setQBVHNodeN_x2(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, bool active_lane)
|
||||
{
|
||||
const uint lane = get_sub_group_local_id() % 8;
|
||||
|
||||
struct AABB aabb;
|
||||
AABB_init(&aabb);
|
||||
|
||||
if (lane < numChildren)
|
||||
aabb = *input_aabb;
|
||||
|
||||
subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, type, &aabb, numChildren, BVH_NODE_DEFAULT_MASK, qbvh_node, false, active_lane);
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void subgroup_QBVHNodeN_setBounds( uniform struct QBVHNodeN* qbvh_node,
|
||||
uniform struct AABB reduced_bounds,
|
||||
varying struct AABB input_aabb,
|
||||
uniform uint numChildren,
|
||||
varying ushort lane )
|
||||
{
|
||||
const float up = 1.0f + ulp;
|
||||
const float down = 1.0f - ulp;
|
||||
|
||||
int3 exp;
|
||||
|
||||
struct AABB conservative_aabb = conservativeAABB( &reduced_bounds);
|
||||
const float3 len = AABB_size( &conservative_aabb ).xyz * up;
|
||||
const float3 mant = frexp_vec3( len, &exp );
|
||||
const float3 org = conservative_aabb.lower.xyz;
|
||||
|
||||
exp += (mant > ( float3 )QUANT_MAX_MANT ? (int3)1 : (int3)0);
|
||||
|
||||
qbvh_node->lower[0] = org.x;
|
||||
qbvh_node->lower[1] = org.y;
|
||||
qbvh_node->lower[2] = org.z;
|
||||
|
||||
qbvh_node->exp[0] = exp.x;
|
||||
qbvh_node->exp[1] = exp.y;
|
||||
qbvh_node->exp[2] = exp.z;
|
||||
|
||||
qbvh_node->instMask = 0xff;
|
||||
|
||||
uchar3 lower_uchar = 0x80;
|
||||
uchar3 upper_uchar = 0;
|
||||
|
||||
if ( lane < BVH_NODE_N6 )
|
||||
{
|
||||
ushort k = lane;
|
||||
if( lane < numChildren )
|
||||
{
|
||||
struct AABB child_aabb = conservativeAABB( &input_aabb ); // conservative ???
|
||||
|
||||
float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) );
|
||||
lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) );
|
||||
float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) );
|
||||
upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) );
|
||||
|
||||
lower_uchar = convert_uchar3_rtn( lower );
|
||||
upper_uchar = convert_uchar3_rtp( upper );
|
||||
}
|
||||
|
||||
qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
|
||||
qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
|
||||
qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
|
||||
qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
|
||||
qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
|
||||
qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
GRL_INLINE void QBVHNodeN_setBounds(struct QBVHNodeN *qbvh_node, struct AABB *input_aabb, const uint numChildren)
|
||||
{
|
||||
const float up = 1.0f + ulp;
|
||||
const float down = 1.0f - ulp;
|
||||
|
||||
int3 exp;
|
||||
struct AABB aabb;
|
||||
AABB_init(&aabb);
|
||||
for (uint i = 0; i < numChildren; i++)
|
||||
AABB_extend(&aabb, &input_aabb[i]);
|
||||
|
||||
struct AABB conservative_aabb = conservativeAABB(&aabb);
|
||||
const float3 len = AABB_size(&conservative_aabb).xyz * up;
|
||||
const float3 mant = frexp_vec3(len, &exp);
|
||||
const float3 org = conservative_aabb.lower.xyz;
|
||||
|
||||
exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
|
||||
|
||||
qbvh_node->lower[0] = org.x;
|
||||
qbvh_node->lower[1] = org.y;
|
||||
qbvh_node->lower[2] = org.z;
|
||||
|
||||
qbvh_node->exp[0] = exp.x;
|
||||
qbvh_node->exp[1] = exp.y;
|
||||
qbvh_node->exp[2] = exp.z;
|
||||
|
||||
qbvh_node->instMask = 0xff;
|
||||
|
||||
for (uint k = 0; k < numChildren; k++)
|
||||
{
|
||||
struct AABB child_aabb = conservativeAABB(&input_aabb[k]); // conservative ???
|
||||
|
||||
float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
|
||||
lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
|
||||
float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
|
||||
upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
|
||||
|
||||
uchar3 lower_uchar = convert_uchar3_rtn(lower);
|
||||
uchar3 upper_uchar = convert_uchar3_rtp(upper);
|
||||
|
||||
qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
|
||||
qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
|
||||
qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
|
||||
qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
|
||||
qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
|
||||
qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
|
||||
|
||||
#if ENABLE_CONVERSION_CHECKS == 1
|
||||
if (!(exp.x >= -128 && exp.x <= 127))
|
||||
printf("exp_x error \n");
|
||||
if (!(exp.y >= -128 && exp.y <= 127))
|
||||
printf("exp_y error \n");
|
||||
if (!(exp.z >= -128 && exp.z <= 127))
|
||||
printf("exp_z error \n");
|
||||
|
||||
struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
|
||||
if (!AABB_subset(&child_aabb, &child_qaabb))
|
||||
{
|
||||
uint3 lower_i = convert_uint3(lower_uchar);
|
||||
uint3 upper_i = convert_uint3(upper_uchar);
|
||||
|
||||
printf("\n ERROR %d\n", k);
|
||||
printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
|
||||
printf("%i uncompressed \n", k);
|
||||
AABB_print(&child_aabb);
|
||||
printf("%i compressed \n", k);
|
||||
AABB_print(&child_qaabb);
|
||||
|
||||
printf("%i uncompressed (as int) \n", k);
|
||||
AABB_printasInt(&child_aabb);
|
||||
printf("%i compressed (as int) \n", k);
|
||||
AABB_printasInt(&child_qaabb);
|
||||
|
||||
int4 e0 = child_aabb.lower < child_qaabb.lower;
|
||||
int4 e1 = child_aabb.upper > child_qaabb.upper;
|
||||
printf("e0 %d e1 %d \n", e0, e1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (uint k = numChildren; k < BVH_NODE_N6; k++)
|
||||
{
|
||||
qbvh_node->qbounds.lower_x[k] = 0x80;
|
||||
qbvh_node->qbounds.lower_y[k] = 0x80;
|
||||
qbvh_node->qbounds.lower_z[k] = 0x80;
|
||||
qbvh_node->qbounds.upper_x[k] = 0;
|
||||
qbvh_node->qbounds.upper_y[k] = 0;
|
||||
qbvh_node->qbounds.upper_z[k] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void QBVHNodeN_setChildren(struct QBVHNodeN *qbvh_node, const int offset, const uint numChildren)
|
||||
{
|
||||
qbvh_node->offset = offset;
|
||||
for (uint k = 0; k < BVH_NODE_N6; k++)
|
||||
qbvh_node->childData[k] = 1;
|
||||
}
|
||||
|
||||
GRL_INLINE void QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node)
|
||||
{
|
||||
for (uint k = 0; k < BVH_NODE_N6; k++)
|
||||
qbvh_node->childData[k] = 1;
|
||||
}
|
||||
|
||||
GRL_INLINE void SUBGROUP_QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node)
|
||||
{
|
||||
if( get_sub_group_local_id() < BVH_NODE_N6 )
|
||||
qbvh_node->childData[get_sub_group_local_id()] = 1;
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void QBVHNodeN_setChildIncr2(struct QBVHNodeN *qbvh_node)
|
||||
{
|
||||
for (uint k = 0; k < BVH_NODE_N6; k++)
|
||||
qbvh_node->childData[k] = 2;
|
||||
}
|
||||
|
||||
GRL_INLINE void QBVHNodeN_setType(struct QBVHNodeN *qbvh_node, const uint type)
|
||||
{
|
||||
qbvh_node->type = type;
|
||||
}
|
||||
|
||||
GRL_INLINE void setQBVHNodeN(const int offset, const uint type, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node)
|
||||
{
|
||||
QBVHNodeN_setType(qbvh_node, type);
|
||||
QBVHNodeN_setChildren(qbvh_node, offset, numChildren);
|
||||
QBVHNodeN_setBounds(qbvh_node, input_aabb, numChildren);
|
||||
}
|
||||
|
||||
GRL_INLINE void printQBVHNodeN(struct QBVHNodeN *qnode)
|
||||
{
|
||||
printf(" offset %d type %d \n", qnode->offset, (int)qnode->type);
|
||||
printf(" lower %f %f %f \n", qnode->lower[0], qnode->lower[1], qnode->lower[2]);
|
||||
printf(" exp %d %d %d \n", (int)qnode->exp[0], (int)qnode->exp[1], (int)qnode->exp[2]);
|
||||
printf(" instMask %d \n", qnode->instMask);
|
||||
|
||||
struct AABB aabb0 = extractAABB_QBVHNodeN(qnode, 0);
|
||||
struct AABB aabb1 = extractAABB_QBVHNodeN(qnode, 1);
|
||||
struct AABB aabb2 = extractAABB_QBVHNodeN(qnode, 2);
|
||||
struct AABB aabb3 = extractAABB_QBVHNodeN(qnode, 3);
|
||||
struct AABB aabb4 = extractAABB_QBVHNodeN(qnode, 4);
|
||||
struct AABB aabb5 = extractAABB_QBVHNodeN(qnode, 5);
|
||||
|
||||
printf(" lower_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_x[0], qnode->qbounds.lower_x[1], qnode->qbounds.lower_x[2], qnode->qbounds.lower_x[3], qnode->qbounds.lower_x[4], qnode->qbounds.lower_x[5], aabb0.lower.x, aabb1.lower.x, aabb2.lower.x, aabb3.lower.x, aabb4.lower.x, aabb5.lower.x);
|
||||
printf(" upper_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_x[0], qnode->qbounds.upper_x[1], qnode->qbounds.upper_x[2], qnode->qbounds.upper_x[3], qnode->qbounds.upper_x[4], qnode->qbounds.upper_x[5], aabb0.upper.x, aabb1.upper.x, aabb2.upper.x, aabb3.upper.x, aabb4.upper.x, aabb5.upper.x);
|
||||
|
||||
printf(" lower_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_y[0], qnode->qbounds.lower_y[1], qnode->qbounds.lower_y[2], qnode->qbounds.lower_y[3], qnode->qbounds.lower_y[4], qnode->qbounds.lower_y[5], aabb0.lower.y, aabb1.lower.y, aabb2.lower.y, aabb3.lower.y, aabb4.lower.y, aabb5.lower.y);
|
||||
printf(" upper_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_y[0], qnode->qbounds.upper_y[1], qnode->qbounds.upper_y[2], qnode->qbounds.upper_y[3], qnode->qbounds.upper_y[4], qnode->qbounds.upper_y[5], aabb0.upper.y, aabb1.upper.y, aabb2.upper.y, aabb3.upper.y, aabb4.upper.y, aabb5.upper.y);
|
||||
|
||||
printf(" lower_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_z[0], qnode->qbounds.lower_z[1], qnode->qbounds.lower_z[2], qnode->qbounds.lower_z[3], qnode->qbounds.lower_z[4], qnode->qbounds.lower_z[5], aabb0.lower.z, aabb1.lower.z, aabb2.lower.z, aabb3.lower.z, aabb4.lower.z, aabb5.lower.z);
|
||||
printf(" upper_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_z[0], qnode->qbounds.upper_z[1], qnode->qbounds.upper_z[2], qnode->qbounds.upper_z[3], qnode->qbounds.upper_z[4], qnode->qbounds.upper_z[5], aabb0.upper.z, aabb1.upper.z, aabb2.upper.z, aabb3.upper.z, aabb4.upper.z, aabb5.upper.z);
|
||||
}
|
||||
|
||||
GRL_INLINE int encodeOffset(global char *bvh_mem, global void *parent, int global_child_offset)
|
||||
{
|
||||
long global_parent_offset = (long)parent - (long)bvh_mem;
|
||||
global_parent_offset = global_parent_offset & (~(64 - 1)); // FIXME: (sw) this should not be necessary?
|
||||
int relative_offset = global_child_offset - global_parent_offset; // FIXME: this limits BVH size to 4GB
|
||||
//if ((int)relative_offset <= 0) printf("relative offset <= 0 %d global_child_offset %d global_parent_offset %d \n", relative_offset,global_child_offset,global_parent_offset);
|
||||
return relative_offset;
|
||||
}
|
||||
|
||||
GRL_INLINE void QBVH6Node_set_offset(struct QBVHNodeN *qnode, void *children)
|
||||
{
|
||||
int ofs = (struct QBVHNodeN *)children - qnode;
|
||||
qnode->offset = ofs;
|
||||
}
|
||||
|
||||
GRL_INLINE void QBVH6Node_set_type(struct QBVHNodeN *qnode, uint type)
|
||||
{
|
||||
qnode->type = type;
|
||||
}
|
||||
|
||||
GRL_INLINE uint sortBVHChildrenIDs(uint input)
|
||||
{
|
||||
#if BVH_NODE_N == 8
|
||||
return sort8_descending(input);
|
||||
#else
|
||||
return sort4_descending(input);
|
||||
#endif
|
||||
}
|
||||
|
||||
enum XFM_BOX_OPTION {
|
||||
XFM_BOX_NO_CLIP = 0,
|
||||
XFM_BOX_NOT_REFINED_CLIPPED = 1, //<<use clipbox, for not refined, compute bbox from children, transform after extending to one box
|
||||
XFM_BOX_NOT_REFINED_TAKE_CLIPBOX = 2 //<<use clipbox, for not refined, just transform clipbox, don't take children boxes into account
|
||||
};
|
||||
|
||||
#define DEB_PRINTFS 0
|
||||
#ifndef FINE_TRANSFORM_NODE_BOX
|
||||
#define FINE_TRANSFORM_NODE_BOX 0
|
||||
#endif
|
||||
|
||||
GRL_INLINE struct AABB3f GRL_OVERLOADABLE compute_xfm_bbox(const float* xfm, InternalNode* pnode, enum XFM_BOX_OPTION clipOpt, const AABB3f* clipBox, float matrixTransformOverhead)
|
||||
{
|
||||
AABB3f childrenbox;
|
||||
#if FINE_TRANSFORM_NODE_BOX
|
||||
struct AffineSpace3f axfm = AffineSpace3f_load_row_major(xfm);
|
||||
bool computeFine = matrixTransformOverhead < 0.6f;
|
||||
computeFine = sub_group_any(computeFine);
|
||||
if (computeFine)
|
||||
{
|
||||
bool clip = clipOpt != XFM_BOX_NO_CLIP;
|
||||
InternalNode node = *pnode;
|
||||
|
||||
#if DEB_PRINTFS
|
||||
if (InternalNode_IsChildValid(&node, 5) && !InternalNode_IsChildValid(&node, 4))
|
||||
printf("child 5 valid && child 4 invalid\n");
|
||||
if (InternalNode_IsChildValid(&node, 4) && !InternalNode_IsChildValid(&node, 3))
|
||||
printf("child 4 valid && child 3 invalid\n");
|
||||
if (InternalNode_IsChildValid(&node, 3) && !InternalNode_IsChildValid(&node, 2))
|
||||
printf("child 3 valid && child 2 invalid\n");
|
||||
if (InternalNode_IsChildValid(&node, 2) && !InternalNode_IsChildValid(&node, 1))
|
||||
printf("child 2 valid && child 1 invalid\n");
|
||||
if (InternalNode_IsChildValid(&node, 1) && !InternalNode_IsChildValid(&node, 0))
|
||||
printf("child 1 valid && child 0 invalid\n");
|
||||
#endif
|
||||
|
||||
#if DEB_PRINTFS
|
||||
printf("F");
|
||||
#endif
|
||||
AABB3f child_bounds0 = InternalNode_GetChildAABB(&node, 0);
|
||||
AABB3f child_bounds1 = InternalNode_GetChildAABB(&node, 1);
|
||||
AABB3f child_bounds2 = InternalNode_GetChildAABB(&node, 2);
|
||||
AABB3f child_bounds3 = InternalNode_GetChildAABB(&node, 3);
|
||||
AABB3f child_bounds4 = InternalNode_GetChildAABB(&node, 4);
|
||||
AABB3f child_bounds5 = InternalNode_GetChildAABB(&node, 5);
|
||||
|
||||
// we bravely assume we will have at least 2 children here.
|
||||
if(!InternalNode_IsChildValid(&node, 2)) child_bounds2 = child_bounds0;
|
||||
if(!InternalNode_IsChildValid(&node, 3)) child_bounds3 = child_bounds0;
|
||||
if(!InternalNode_IsChildValid(&node, 4)) child_bounds4 = child_bounds0;
|
||||
if(!InternalNode_IsChildValid(&node, 5)) child_bounds5 = child_bounds0;
|
||||
|
||||
if (clip)
|
||||
{
|
||||
AABB3f_trim_upper(&child_bounds0, clipBox->upper);
|
||||
AABB3f_trim_upper(&child_bounds1, clipBox->upper);
|
||||
AABB3f_trim_upper(&child_bounds2, clipBox->upper);
|
||||
AABB3f_trim_upper(&child_bounds3, clipBox->upper);
|
||||
AABB3f_trim_upper(&child_bounds4, clipBox->upper);
|
||||
AABB3f_trim_upper(&child_bounds5, clipBox->upper);
|
||||
}
|
||||
|
||||
child_bounds0 = transform_aabb(child_bounds0, xfm);
|
||||
child_bounds1 = transform_aabb(child_bounds1, xfm);
|
||||
child_bounds2 = transform_aabb(child_bounds2, xfm);
|
||||
child_bounds3 = transform_aabb(child_bounds3, xfm);
|
||||
child_bounds4 = transform_aabb(child_bounds4, xfm);
|
||||
child_bounds5 = transform_aabb(child_bounds5, xfm);
|
||||
|
||||
AABB3f_extend(&child_bounds0, &child_bounds1);
|
||||
AABB3f_extend(&child_bounds2, &child_bounds3);
|
||||
AABB3f_extend(&child_bounds4, &child_bounds5);
|
||||
AABB3f_extend(&child_bounds0, &child_bounds2);
|
||||
AABB3f_extend(&child_bounds0, &child_bounds4);
|
||||
|
||||
return child_bounds0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if DEB_PRINTFS
|
||||
printf("0");
|
||||
#endif
|
||||
|
||||
struct AABB3f child_bounds;
|
||||
|
||||
if (clipOpt != XFM_BOX_NOT_REFINED_TAKE_CLIPBOX)
|
||||
{
|
||||
// XFM_BOX_NOT_REFINED_CLIPPED || XFM_BOX_NO_CLIP
|
||||
child_bounds = InternalNode_getAABB3f(pnode);
|
||||
if (clipOpt != XFM_BOX_NO_CLIP)
|
||||
{
|
||||
AABB3f_intersect(&child_bounds, *clipBox);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//XFM_BOX_NOT_REFINED_TAKE_CLIPBOX
|
||||
child_bounds = *clipBox;
|
||||
}
|
||||
|
||||
child_bounds = transform_aabb(child_bounds, xfm);
|
||||
//child_bounds = conservativeAABB3f(&child_bounds);
|
||||
return child_bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE AABB3f GRL_OVERLOADABLE compute_xfm_bbox(struct AffineSpace3f xfm, InternalNode* pnode, bool clip, AABB3f* clipBox, float matOverhead)
|
||||
{
|
||||
float transform[12];
|
||||
load_row_major_from_AffineSpace3f(xfm, transform);
|
||||
return compute_xfm_bbox(transform, pnode, clip, clipBox, matOverhead);
|
||||
}
|
||||
|
||||
GRL_INLINE uint64_t compute_refit_structs_compacted_size(BVHBase* base)
|
||||
{
|
||||
uint dataSize = 0;
|
||||
|
||||
if (BVHBase_HasBackPointers(base))
|
||||
{
|
||||
const uint fatleafEntrySize = (base->fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63;
|
||||
const uint innerEntrySize = (base->innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63;
|
||||
|
||||
// New atomic update
|
||||
if(base->quadIndicesDataStart > base->backPointerDataStart)
|
||||
{
|
||||
uint numQuads = BVHBase_GetNumQuads(base);
|
||||
|
||||
const uint quadTableMainBufferSize = (numQuads + 255) & ~255;
|
||||
const uint quadLeftoversSize = (base->quadLeftoversCountNewAtomicUpdate + 255) & ~255;
|
||||
const uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63);
|
||||
|
||||
const uint quadIndicesDataSize = (numQuads * sizeof(QuadDataIndices) + 63) & ~63;
|
||||
|
||||
dataSize += quadTableEntriesSize + quadIndicesDataSize;
|
||||
}
|
||||
|
||||
dataSize +=
|
||||
((BVHBase_GetNumInternalNodes(base) * sizeof(uint) + 63) & ~63)
|
||||
+ fatleafEntrySize + innerEntrySize;
|
||||
}
|
||||
|
||||
return (uint64_t)dataSize;
|
||||
}
|
||||
|
||||
GRL_INLINE uint64_t compute_compacted_size(BVHBase* base)
|
||||
{
|
||||
uint64_t size = sizeof(BVHBase);
|
||||
size += BVHBase_GetNumHWInstanceLeaves(base) * sizeof(HwInstanceLeaf);
|
||||
size += BVHBase_GetNumProcedurals(base) * sizeof(ProceduralLeaf);
|
||||
size += BVHBase_GetNumQuads(base) * sizeof(QuadLeaf);
|
||||
size += compute_refit_structs_compacted_size(base);
|
||||
size += BVHBase_GetNumInternalNodes(base) * sizeof(InternalNode);
|
||||
size += sizeof(InstanceDesc) * base->Meta.instanceCount;
|
||||
size += (sizeof(GeoMetaData) * base->Meta.geoCount + 63) & ~63; // align to 64
|
||||
size = (size + 63) & ~63;
|
||||
|
||||
return size;
|
||||
}
|
||||
|
|
@@ -1,127 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared.h"
|
||||
#include "intrinsics.h"
|
||||
#include "AABB.h"
|
||||
#include "AABB3f.h"
|
||||
|
||||
// JDB TODO: Use corresponding GRL structures!!!
|
||||
|
||||
struct Quad
|
||||
{
|
||||
unsigned int shaderIndex; // note: also mask
|
||||
unsigned int geomIndex; // note: also geom flags in upper 2 bits
|
||||
unsigned int primIndex0;
|
||||
unsigned int primIndex1Delta;
|
||||
float v[4][3];
|
||||
};
|
||||
|
||||
GRL_INLINE unsigned int Quad_getGeomIndex(global struct Quad *quad)
|
||||
{
|
||||
return quad->geomIndex;
|
||||
}
|
||||
|
||||
GRL_INLINE unsigned int Quad_getPrimIndex0(global struct Quad *quad)
|
||||
{
|
||||
return quad->primIndex0;
|
||||
}
|
||||
|
||||
GRL_INLINE unsigned int Quad_getPrimIndex1(global struct Quad *quad)
|
||||
{
|
||||
return quad->primIndex0 + (quad->primIndex1Delta & 0xFFFF);
|
||||
}
|
||||
|
||||
GRL_INLINE float3 load_float3(float *p)
|
||||
{
|
||||
return (float3)(p[0], p[1], p[2]);
|
||||
}
|
||||
|
||||
GRL_INLINE float3 load_perm_float3(float *p, const uint3 perm)
|
||||
{
|
||||
return (float3)(p[perm.x], p[perm.y], p[perm.z]);
|
||||
}
|
||||
|
||||
GRL_INLINE float2 load_perm_float2(float *p, const uint2 perm)
|
||||
{
|
||||
return (float2)(p[perm.x], p[perm.y]);
|
||||
}
|
||||
|
||||
GRL_INLINE float load_perm_float(float *p, const uint perm)
|
||||
{
|
||||
return p[perm];
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB getAABB_Quad(struct Quad *q)
|
||||
{
|
||||
struct AABB aabb;
|
||||
const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3])));
|
||||
const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3])));
|
||||
aabb.lower = (float4)(lower, 0.0f);
|
||||
aabb.upper = (float4)(upper, 0.0f);
|
||||
return aabb;
|
||||
}
|
||||
|
||||
GRL_INLINE void Quad_ExtendAABB(struct Quad* q, struct AABB* box)
|
||||
{
|
||||
struct AABB aabb;
|
||||
const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3])));
|
||||
const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3])));
|
||||
aabb.lower = (float4)(lower, 0.0f);
|
||||
aabb.upper = (float4)(upper, 0.0f);
|
||||
AABB_extend(box, &aabb);
|
||||
}
|
||||
|
||||
GRL_INLINE float4 getCentroid2_Quad(struct Quad *q)
|
||||
{
|
||||
struct AABB aabb = getAABB_Quad(q);
|
||||
return aabb.lower + aabb.upper;
|
||||
}
|
||||
|
||||
GRL_INLINE void setQuad(struct Quad *quad, const float4 v0, const float4 v1, const float4 v2, const float4 v3,
|
||||
const uchar j0, const uchar j1, const uchar j2,
|
||||
const uint geomID, const uint primID0, const uint primID1, const uint geomMask, const uint geomFlags )
|
||||
{
|
||||
quad->v[0][0] = v0.x;
|
||||
quad->v[0][1] = v0.y;
|
||||
quad->v[0][2] = v0.z;
|
||||
quad->v[1][0] = v1.x;
|
||||
quad->v[1][1] = v1.y;
|
||||
quad->v[1][2] = v1.z;
|
||||
quad->v[2][0] = v2.x;
|
||||
quad->v[2][1] = v2.y;
|
||||
quad->v[2][2] = v2.z;
|
||||
quad->v[3][0] = v3.x;
|
||||
quad->v[3][1] = v3.y;
|
||||
quad->v[3][2] = v3.z;
|
||||
|
||||
quad->shaderIndex = (geomMask << 24) | geomID;
|
||||
quad->geomIndex = geomID | (geomFlags << 30);
|
||||
quad->primIndex0 = primID0;
|
||||
const uint delta = primID1 - primID0;
|
||||
const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4));
|
||||
quad->primIndex1Delta = delta | (j << 16) | (1 << 22); // single prim in leaf
|
||||
|
||||
}
|
||||
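/* Note (illustrative, not from the original header): primIndex1Delta packs
 *   bits  0..15 : primID1 - primID0 (recovered by Quad_getPrimIndex1() above)
 *   bits 16..21 : the three 2-bit vertex indices j0/j1/j2
 *   bit  22     : the "single prim in leaf" flag set above
 */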
|
||||
GRL_INLINE void setQuadVertices(struct Quad *quad, const float3 v0, const float3 v1, const float3 v2, const float3 v3)
|
||||
{
|
||||
quad->v[0][0] = v0.x;
|
||||
quad->v[0][1] = v0.y;
|
||||
quad->v[0][2] = v0.z;
|
||||
quad->v[1][0] = v1.x;
|
||||
quad->v[1][1] = v1.y;
|
||||
quad->v[1][2] = v1.z;
|
||||
quad->v[2][0] = v2.x;
|
||||
quad->v[2][1] = v2.y;
|
||||
quad->v[2][2] = v2.z;
|
||||
quad->v[3][0] = v3.x;
|
||||
quad->v[3][1] = v3.y;
|
||||
quad->v[3][2] = v3.z;
|
||||
}
|
||||
|
|
@@ -1,163 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module radix_sort;
|
||||
|
||||
kernel_module radix_kernels ("morton_radix_sort.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
kernel opencl_build_morton_kernel_sort_bin_items < kernelFunction="sort_morton_codes_bin_items">;
|
||||
kernel opencl_build_morton_kernel_sort_reduce_bins < kernelFunction="sort_morton_codes_reduce_bins">;
|
||||
kernel opencl_build_morton_kernel_sort_scatter_items < kernelFunction="sort_morton_codes_scatter_items">;
|
||||
|
||||
kernel opencl_build_morton_codes_sort_merged < kernelFunction="sort_morton_codes_merged">;
|
||||
|
||||
kernel opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum < kernelFunction="sort_morton_codes_reduce_bins_wide_partial_sum">;
|
||||
kernel opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce < kernelFunction="sort_morton_codes_reduce_bins_wide_add_reduce">;
|
||||
}
|
||||
|
||||
metakernel sort(
|
||||
qword build_globals,
|
||||
dword shift,
|
||||
qword global_histogram,
|
||||
qword input0,
|
||||
qword input1,
|
||||
dword input0_offset,
|
||||
dword input1_offset,
|
||||
dword iteration,
|
||||
dword threads)
|
||||
{
|
||||
dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args(
|
||||
build_globals,
|
||||
shift,
|
||||
global_histogram,
|
||||
input0,
|
||||
input1,
|
||||
input0_offset,
|
||||
input1_offset,
|
||||
iteration);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args(
|
||||
threads,
|
||||
global_histogram);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
dispatch opencl_build_morton_kernel_sort_scatter_items (threads, 1, 1) args(
|
||||
build_globals,
|
||||
shift,
|
||||
global_histogram,
|
||||
input0,
|
||||
input1,
|
||||
input0_offset,
|
||||
input1_offset,
|
||||
iteration);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
}
|
||||
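// Note (illustrative, not from the original source): sort() above performs a single
// radix pass (bin -> reduce -> scatter) for the digit selected by 'shift'; a full sort
// of the morton codes would invoke it once per digit, e.g. with shift = 0, 8, 16, ...
// if the digits are assumed to be 8 bits wide (the digit width is an assumption here).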
|
||||
metakernel sort_bin_items(
|
||||
qword build_globals,
|
||||
qword global_histogram,
|
||||
qword wg_flags,
|
||||
qword input0,
|
||||
dword iteration,
|
||||
dword threads,
|
||||
dword update_wg_flags
|
||||
)
|
||||
{
|
||||
dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args(
|
||||
build_globals,
|
||||
global_histogram,
|
||||
wg_flags,
|
||||
input0,
|
||||
iteration,
|
||||
threads,
|
||||
update_wg_flags
|
||||
);
|
||||
}
|
||||
|
||||
metakernel sort_reduce_bins(
|
||||
qword build_globals,
|
||||
qword global_histogram,
|
||||
dword threads,
|
||||
dword iteration)
|
||||
{
|
||||
dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args(
|
||||
build_globals,
|
||||
threads,
|
||||
global_histogram,
|
||||
iteration);
|
||||
}
|
||||
|
||||
metakernel sort_scatter_items(
|
||||
qword build_globals,
|
||||
qword global_histogram,
|
||||
qword input0,
|
||||
qword input1,
|
||||
dword iteration,
|
||||
dword threads,
|
||||
dword update_morton_sort_in_flight )
|
||||
{
|
||||
dispatch opencl_build_morton_kernel_sort_scatter_items( threads, 1, 1 ) args(
|
||||
build_globals,
|
||||
global_histogram,
|
||||
input0,
|
||||
input1,
|
||||
iteration,
|
||||
threads,
|
||||
update_morton_sort_in_flight
|
||||
);
|
||||
}
|
||||
|
||||
metakernel sort_bin_items_merged(
|
||||
qword build_globals,
|
||||
qword global_histogram,
|
||||
qword input0,
|
||||
dword iteration,
|
||||
dword threads)
|
||||
{
|
||||
dispatch opencl_build_morton_codes_sort_merged (threads, 1, 1) args(
|
||||
build_globals,
|
||||
global_histogram,
|
||||
input0,
|
||||
iteration,
|
||||
threads
|
||||
);
|
||||
}
|
||||
|
||||
metakernel sort_reduce_bins_wide(
|
||||
qword build_globals,
|
||||
qword global_histogram,
|
||||
qword global_histogram_tmp,
|
||||
qword wg_flags,
|
||||
dword threads,
|
||||
dword threads_groups,
|
||||
dword iteration)
|
||||
{
|
||||
dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum(threads_groups, 1, 1) args(
|
||||
build_globals,
|
||||
threads,
|
||||
threads_groups,
|
||||
global_histogram,
|
||||
global_histogram_tmp,
|
||||
wg_flags,
|
||||
iteration);
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce(threads_groups, 1, 1) args(
|
||||
build_globals,
|
||||
threads,
|
||||
threads_groups,
|
||||
global_histogram,
|
||||
global_histogram_tmp,
|
||||
iteration);
|
||||
}
|
||||
|
|
@@ -1,167 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module rebraid;
|
||||
|
||||
kernel init_scratch < source="bvh_rebraid.cl", kernelFunction="rebraid_init_scratch" >
|
||||
kernel chase_instance_ptrs < source="bvh_rebraid.cl", kernelFunction="rebraid_chase_instance_pointers" >
|
||||
kernel calc_aabb < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances" >
|
||||
kernel calc_aabb_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_indirect" >
|
||||
kernel calc_aabb_ptr < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers" >
|
||||
kernel calc_aabb_ptr_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers_indirect" >
|
||||
kernel count_splits < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits" >
|
||||
kernel count_splits_SG < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG" >
|
||||
kernel count_splits_SG_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG_indirect" >
|
||||
kernel build_primrefs < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs" >
|
||||
kernel build_primrefs_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs_indirect" >
|
||||
|
||||
//kernel ISA_TEST < source="bvh_rebraid.cl", kernelFunction="ISA_TEST" >
|
||||
//kernel DEBUG_PRINT < source="bvh_rebraid.cl", kernelFunction="DEBUG_PRINT" >
|
||||
|
||||
|
||||
const PRIMREF_GROUP_SIZE = 256;
|
||||
|
||||
const COUNT_SPLITS_GROUP_SIZE = 16;
|
||||
|
||||
struct MKRebraidArgs
|
||||
{
|
||||
qword bvh_buffer;
|
||||
qword primref_buffer;
|
||||
qword global_buffer;
|
||||
qword instances_buffer;
|
||||
qword rebraid_scratch;
|
||||
qword flat_instances_buffer;
|
||||
dword num_instances;
|
||||
dword num_extra_primrefs;
|
||||
};
|
||||
|
||||
metakernel rebraid(
|
||||
MKRebraidArgs Args
|
||||
)
|
||||
{
|
||||
dispatch init_scratch(1,1,1) args( Args.rebraid_scratch );
|
||||
dispatch calc_aabb(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer );
|
||||
control( wait_idle );
|
||||
|
||||
//define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE);
|
||||
//dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.num_instances );
|
||||
|
||||
dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch );
|
||||
control( wait_idle );
|
||||
|
||||
define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE);
|
||||
|
||||
dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
|
||||
control( wait_idle );
|
||||
|
||||
//dispatch DEBUG_PRINT(1,1,1) args( Args.global_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
|
||||
}
|
||||
|
||||
metakernel rebraid_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo)
|
||||
{
|
||||
|
||||
dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch);
|
||||
|
||||
define num_groups REG0;
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect calc_aabb_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo);
|
||||
control(wait_idle);
|
||||
|
||||
dispatch_indirect count_splits_SG_indirect
|
||||
args(Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo);
|
||||
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_8 REG2;
|
||||
|
||||
groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1
|
||||
C_8 = 8; // log_2(PRIMREF_GROUP_SIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE;
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
dispatch_indirect build_primrefs_indirect args(
|
||||
Args.global_buffer,
|
||||
Args.bvh_buffer,
|
||||
Args.instances_buffer,
|
||||
Args.rebraid_scratch,
|
||||
Args.primref_buffer,
|
||||
indirectBuildRangeInfo,
|
||||
Args.num_extra_primrefs);
|
||||
control(wait_idle);
|
||||
}
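// Illustrative note (not part of the original file): the add/shift sequence
// above computes ceil(primitiveCount / PRIMREF_GROUP_SIZE) for the
// build_primrefs dispatch without a divide, since PRIMREF_GROUP_SIZE is
// 256 = 1 << 8. Worked example, assuming a primitiveCount of 1000:
//   num_groups = 1000 + 255 = 1255
//   num_groups = 1255 >> 8  = 4        // ceil(1000 / 256) = 4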
|
||||
|
||||
metakernel rebraid_ptrs(
|
||||
MKRebraidArgs Args
|
||||
)
|
||||
{
|
||||
dispatch init_scratch(1,1,1) args( Args.rebraid_scratch );
|
||||
dispatch chase_instance_ptrs( Args.num_instances, 1, 1) args( Args.instances_buffer, Args.flat_instances_buffer );
|
||||
dispatch calc_aabb_ptr(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer );
|
||||
control( wait_idle );
|
||||
|
||||
//define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE);
|
||||
//dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch );
|
||||
|
||||
dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch );
|
||||
control( wait_idle );
|
||||
|
||||
define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE);
|
||||
|
||||
|
||||
dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
|
||||
control( wait_idle );
|
||||
|
||||
}
|
||||
|
||||
metakernel rebraid_ptrs_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo)
|
||||
{
|
||||
dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch);
|
||||
|
||||
define num_groups REG0;
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect chase_instance_ptrs
|
||||
args(Args.instances_buffer, Args.flat_instances_buffer, indirectBuildRangeInfo);
|
||||
dispatch_indirect calc_aabb_ptr_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo);
|
||||
control(wait_idle);
|
||||
|
||||
dispatch_indirect count_splits_SG_indirect
|
||||
args(Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo);
|
||||
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_8 REG2;
|
||||
|
||||
groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1
|
||||
C_8 = 8; // log_2(PRIMREF_GROUP_SIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE;
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
|
||||
control(wait_idle);
|
||||
|
||||
dispatch_indirect build_primrefs_indirect args(
|
||||
Args.global_buffer,
|
||||
Args.bvh_buffer,
|
||||
Args.flat_instances_buffer,
|
||||
Args.rebraid_scratch,
|
||||
Args.primref_buffer,
|
||||
Args.num_extra_primrefs,
|
||||
indirectBuildRangeInfo,
|
||||
Args.num_instances);
|
||||
control(wait_idle);
|
||||
}
|
||||
|
|
@@ -1,182 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "GRLGen12.h"
|
||||
#pragma once
|
||||
|
||||
#define sizeof_Quad 64
|
||||
#define sizeof_Procedural 64
|
||||
#define sizeof_PrimRef 32
|
||||
#define sizeof_PresplitItem 8
|
||||
#define sizeof_HwInstanceLeaf 128
|
||||
#define MORTON_BUILDER_SUBTREE_THRESHOLD 256
|
||||
#define MORTON_BUILDER_P2_ELEMENTS_IN_SLM (16 * 1024 / 32)
|
||||
// Temporarily disable localized phase2 due to issues seen in ELG pre-silicon testing
|
||||
// This implementation would be replaced with bottom_up + bounding box approach without the need for phase2 refit
|
||||
#define MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD /*100000*/ 0
|
||||
|
||||
#define BVH_QUAD_NODE 4
|
||||
#define BVH_INSTANCE_NODE 1
|
||||
#define BVH_INTERNAL_NODE 0
|
||||
#define BVH_PROCEDURAL_NODE 3
|
||||
#define BUILDRECORD_STACK_SIZE 48
|
||||
#define BINS 16
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
GRL_NAMESPACE_BEGIN(RTAS)
|
||||
GRL_NAMESPACE_BEGIN(GPUBVHBuilder)
|
||||
|
||||
struct AABB
|
||||
{
|
||||
float4 lower;
|
||||
float4 upper;
|
||||
};
|
||||
|
||||
typedef struct BlockAllocator
|
||||
{
|
||||
unsigned int start;
|
||||
unsigned int cur;
|
||||
} BlockAllocator;
|
||||
|
||||
struct Globals
|
||||
{
|
||||
struct AABB centroidBounds;
|
||||
|
||||
unsigned int build_record_start;
|
||||
unsigned int numPrimitives;
|
||||
unsigned int leafPrimType;
|
||||
unsigned int leafSize;
|
||||
|
||||
unsigned int numSplittedPrimitives;
|
||||
unsigned int numBuildRecords;
|
||||
|
||||
// spatial split state
|
||||
unsigned int numOriginalPrimitives;
|
||||
float presplitPrioritySum;
|
||||
float probThreshold;
|
||||
|
||||
// binned-sah bfs state
|
||||
unsigned int counter;
|
||||
unsigned int numBuildRecords_extended;
|
||||
|
||||
// sync variable used for global-sync on work groups
|
||||
unsigned int sync;
|
||||
|
||||
|
||||
/* morton code builder state */
|
||||
unsigned int shift; // used by adaptive mc-builder
|
||||
unsigned int shift_mask; // used by adaptive mc-builder
|
||||
unsigned int binary_hierarchy_root;
|
||||
unsigned int p0_allocated_num;
|
||||
unsigned int p0_created_num;
|
||||
unsigned int morton_sort_in_flight;
|
||||
unsigned int sort_iterations;
|
||||
|
||||
gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
|
||||
};
|
||||
|
||||
struct Range
|
||||
{
|
||||
unsigned int start, end;
|
||||
};
|
||||
|
||||
struct Triangle
|
||||
{
|
||||
unsigned int vtx[3];
|
||||
//unsigned int primID;
|
||||
//unsigned int geomID;
|
||||
};
|
||||
|
||||
struct MortonCodePrimitive
|
||||
{
|
||||
uint64_t index_code; // 64bit code + index combo
|
||||
};
|
||||
|
||||
struct BuildRecord
|
||||
{
|
||||
struct AABB centroidBounds;
|
||||
unsigned int start, end;
|
||||
__global void *current;
|
||||
};
|
||||
|
||||
struct BinaryMortonCodeHierarchy
|
||||
{
|
||||
struct Range range;
|
||||
unsigned int leftChild;
|
||||
unsigned int rightChild;
|
||||
// unsigned int flag;
|
||||
};
|
||||
|
||||
typedef struct MortonFlattenedBoxlessNode {
|
||||
uint binary_hierarchy_index; // only needed when type != BVH_INTERNAL_NODE
|
||||
uint childOffset_type; // childOffset : 26, type : 6
|
||||
uint backPointer; // same usage as in bvh
|
||||
} MortonFlattenedBoxlessNode;
|
||||
|
||||
struct StatStackEntry
|
||||
{
|
||||
struct AABB aabb;
|
||||
unsigned int node;
|
||||
unsigned int type;
|
||||
unsigned int depth;
|
||||
float area;
|
||||
};
|
||||
|
||||
struct BuildRecordMorton
|
||||
{
|
||||
unsigned int nodeID;
|
||||
unsigned int items;
|
||||
unsigned int current_index;
|
||||
unsigned int parent_index;
|
||||
};
|
||||
|
||||
struct Split
|
||||
{
|
||||
float sah;
|
||||
int dim;
|
||||
int pos;
|
||||
};
|
||||
|
||||
struct BinMapping
|
||||
{
|
||||
float4 ofs, scale;
|
||||
};
|
||||
|
||||
struct BinInfo
|
||||
{
|
||||
struct AABB3f boundsX[BINS];
|
||||
struct AABB3f boundsY[BINS];
|
||||
struct AABB3f boundsZ[BINS];
|
||||
uint3 counts[BINS];
|
||||
};
|
||||
|
||||
struct BinInfo2
|
||||
{
|
||||
struct AABB3f boundsX[BINS * 2];
|
||||
struct AABB3f boundsY[BINS * 2];
|
||||
struct AABB3f boundsZ[BINS * 2];
|
||||
uint3 counts[BINS * 2];
|
||||
};
|
||||
|
||||
struct GlobalBuildRecord
|
||||
{
|
||||
struct BinInfo2 binInfo;
|
||||
struct BinMapping binMapping;
|
||||
struct Split split;
|
||||
struct Range range;
|
||||
struct AABB leftCentroid;
|
||||
struct AABB rightCentroid;
|
||||
struct AABB leftGeometry;
|
||||
struct AABB rightGeometry;
|
||||
unsigned int atomicCountLeft;
|
||||
unsigned int atomicCountRight;
|
||||
unsigned int buildRecordID;
|
||||
};
|
||||
|
||||
GRL_NAMESPACE_END(GPUBVHBuilder)
|
||||
GRL_NAMESPACE_END(RTAS)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@@ -1,38 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module structs;
|
||||
|
||||
struct MKBuilderState {
|
||||
qword geomDesc_buffer;
|
||||
qword build_primref_buffer;
|
||||
qword build_globals;
|
||||
qword bvh_buffer;
|
||||
dword leaf_type;
|
||||
dword leaf_size;
|
||||
};
|
||||
|
||||
struct MKSizeEstimate {
|
||||
dword numTriangles;
|
||||
dword numProcedurals;
|
||||
dword numPrimitives;
|
||||
dword numMeshes;
|
||||
dword numBuildPrimitives;
|
||||
dword numPrimitivesToSplit;
|
||||
dword instance_descs_start;
|
||||
dword geo_meta_data_start;
|
||||
dword node_data_start;
|
||||
dword leaf_data_start;
|
||||
dword procedural_data_start;
|
||||
dword back_pointer_start;
|
||||
dword sizeTotal;
|
||||
dword updateScratchSizeTotal;
|
||||
dword fatleaf_table_start;
|
||||
dword innernode_table_start;
|
||||
dword max_fatleaves;
|
||||
dword quad_indices_data_start;
|
||||
};
|
||||
|
|
@@ -1,277 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#include "instance.h"
|
||||
#include "api_interface.h"
|
||||
|
||||
#include "bvh_build_primref.h"
|
||||
#include "bvh_build_refit.h"
|
||||
|
||||
/*
|
||||
Create primrefs from array of instance descriptors.
|
||||
*/
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
|
||||
TS_primrefs_from_instances(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
|
||||
uint numInstances,
|
||||
global struct AABB* primrefs,
|
||||
global uchar* pAABBs,
|
||||
global uchar* pIsProcedural,
|
||||
dword aabb_stride,
|
||||
uint allowUpdate
|
||||
)
|
||||
{
|
||||
const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
|
||||
if (instanceIndex < numInstances)
|
||||
{
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
|
||||
|
||||
global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
|
||||
if ( pIsProcedural[instanceIndex] )
|
||||
{
|
||||
procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
|
||||
}
|
||||
|
||||
primrefs_from_instances(
|
||||
globals,
|
||||
bvh,
|
||||
instance,
|
||||
instanceIndex,
|
||||
primrefs,
|
||||
procedural_bb,
|
||||
allowUpdate);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Create primrefs from array of instance descriptors.
|
||||
*/
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
void kernel
|
||||
TS_primrefs_from_instances_indirect(
|
||||
global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
|
||||
uint numInstances,
|
||||
global struct AABB* primrefs,
|
||||
global uchar* pAABBs,
|
||||
global uchar* pIsProcedural,
|
||||
dword aabb_stride,
|
||||
uint allowUpdate,
|
||||
global struct IndirectBuildRangeInfo* indirect_data
|
||||
)
|
||||
{
|
||||
const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
|
||||
if (instanceIndex < indirect_data->primitiveCount)
|
||||
{
|
||||
instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
|
||||
(((global char*)instances) + indirect_data->primitiveOffset);
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
|
||||
|
||||
global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
|
||||
if ( pIsProcedural[instanceIndex] )
|
||||
{
|
||||
procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
|
||||
}
|
||||
|
||||
primrefs_from_instances(
|
||||
globals,
|
||||
bvh,
|
||||
instance,
|
||||
instanceIndex,
|
||||
primrefs,
|
||||
procedural_bb,
|
||||
allowUpdate);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Create primrefs from array of pointers to instance descriptors.
|
||||
*/
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
|
||||
TS_primrefs_from_instances_pointers(global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global void* instances_in,
|
||||
uint numInstances,
|
||||
global struct AABB* primrefs,
|
||||
global uchar* pAABBs,
|
||||
global uchar* pIsProcedural,
|
||||
dword aabb_stride,
|
||||
uint allowUpdate
|
||||
)
|
||||
{
|
||||
global const struct GRL_RAYTRACING_INSTANCE_DESC** instances =
|
||||
(global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in;
|
||||
|
||||
const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
|
||||
if (instanceIndex < numInstances)
|
||||
{
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
|
||||
|
||||
global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
|
||||
if (pIsProcedural[instanceIndex])
|
||||
{
|
||||
procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
|
||||
}
|
||||
|
||||
primrefs_from_instances(
|
||||
globals,
|
||||
bvh,
|
||||
instance,
|
||||
instanceIndex,
|
||||
primrefs,
|
||||
procedural_bb,
|
||||
allowUpdate);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Create primrefs from array of pointers to instance descriptors.
|
||||
*/
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
|
||||
void kernel
|
||||
TS_primrefs_from_instances_pointers_indirect(global struct Globals* globals,
|
||||
global struct BVHBase* bvh,
|
||||
global void* instances_in,
|
||||
global struct AABB* primrefs,
|
||||
global uchar* pAABBs,
|
||||
global uchar* pIsProcedural,
|
||||
dword aabb_stride,
|
||||
uint allowUpdate,
|
||||
global struct IndirectBuildRangeInfo* indirect_data
|
||||
)
|
||||
{
|
||||
const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
|
||||
if (instanceIndex < indirect_data->primitiveCount)
|
||||
{
|
||||
instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset;
|
||||
global const struct GRL_RAYTRACING_INSTANCE_DESC** instances =
|
||||
(global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in;
|
||||
global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
|
||||
|
||||
global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
|
||||
if (pIsProcedural[instanceIndex])
|
||||
{
|
||||
procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
|
||||
}
|
||||
|
||||
primrefs_from_instances(
|
||||
globals,
|
||||
bvh,
|
||||
instance,
|
||||
instanceIndex,
|
||||
primrefs,
|
||||
procedural_bb,
|
||||
allowUpdate);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel
|
||||
TS_update_instance_leaves(global struct BVHBase* bvh,
|
||||
uint64_t dxrInstancesArray,
|
||||
uint64_t dxrInstancesPtr,
|
||||
global struct AABB3f* instance_aabb_scratch,
|
||||
global uchar* aabbs,
|
||||
global uchar* is_procedural,
|
||||
dword aabb_stride
|
||||
)
|
||||
{
|
||||
uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
|
||||
uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
|
||||
if (id >= num_leaves)
|
||||
return;
|
||||
|
||||
struct HwInstanceLeaf* leaves = BVHBase_GetHWInstanceLeaves(bvh);
|
||||
uint idx = HwInstanceLeaf_GetInstanceIndex(&leaves[id]);
|
||||
|
||||
global GRL_RAYTRACING_AABB* procedural_box = 0;
|
||||
if (is_procedural[idx])
|
||||
{
|
||||
procedural_box = (global GRL_RAYTRACING_AABB*)(aabbs + (aabb_stride * idx));
|
||||
}
|
||||
|
||||
DO_update_instance_leaves(
|
||||
bvh,
|
||||
dxrInstancesArray,
|
||||
dxrInstancesPtr,
|
||||
instance_aabb_scratch,
|
||||
id,
|
||||
procedural_box);
|
||||
}
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
void kernel
|
||||
TS_fixup_leaves( global struct BVHBase* bvh,
|
||||
global uchar* primref_index,
|
||||
global PrimRef* primrefs,
|
||||
uint stride )
|
||||
|
||||
{
|
||||
uint num_inners = BVHBase_GetNumInternalNodes(bvh);
|
||||
uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
|
||||
|
||||
// assign 8 lanes to each inner node, 6 of which will do useful work
|
||||
uint node_id = id / 8;
|
||||
uint child_id = id % 8;
|
||||
|
||||
bool node_valid = (node_id < num_inners);
|
||||
|
||||
if (node_valid )
|
||||
{
|
||||
global InternalNode* nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh);
|
||||
global InternalNode* my_node = nodes + node_id;
|
||||
|
||||
if (my_node->nodeType == BVH_INSTANCE_NODE)
|
||||
{
|
||||
bool child_valid = (child_id < 6) && InternalNode_IsChildValid(my_node, child_id);
|
||||
if (child_valid)
|
||||
{
|
||||
global HwInstanceLeaf* leaves = (global HwInstanceLeaf*)InternalNode_GetChildren(my_node);
|
||||
uint leafIndex = (leaves - BVHBase_GetHWInstanceLeaves(bvh)) + child_id;
|
||||
|
||||
const uint primrefID = *(uint*)(primref_index + leafIndex * stride);
|
||||
|
||||
uint type = PRIMREF_isProceduralInstance(&primrefs[primrefID]) ?
|
||||
BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
|
||||
|
||||
InternalNode_SetChildType(my_node, child_id, type);
|
||||
}
|
||||
|
||||
if (child_id == 0)
|
||||
my_node->nodeType = BVH_INTERNAL_NODE;
|
||||
}
|
||||
}
|
||||
}
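/*
 * Illustrative note (not part of the original file): the id -> (node, child)
 * mapping above gives 8 consecutive work items to each internal node; only
 * child slots 0..5 can be valid, so 2 of the 8 lanes never do useful work.
 * Worked example for global id = 21:
 *   node_id  = 21 / 8 = 2
 *   child_id = 21 % 8 = 5
 */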
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
GRL_ANNOTATE_IGC_DO_NOT_SPILL
|
||||
__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel
|
||||
TS_Refit_per_one_startpoint_sg(
|
||||
global struct BVHBase* bvh,
|
||||
global struct AABB3f* instance_leaf_aabbs,
|
||||
global uchar* procedural_instance_enable_buffer )
|
||||
{
|
||||
DO_Refit_per_one_startpoint_sg(bvh, (global GRL_RAYTRACING_GEOMETRY_DESC*) bvh, instance_leaf_aabbs, procedural_instance_enable_buffer );
|
||||
|
||||
}
|
||||
|
|
@@ -1,244 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
module traversal_shader;
|
||||
|
||||
kernel_module morton_kernels ("traversal_shader.cl")
|
||||
{
|
||||
links lsc_intrinsics;
|
||||
|
||||
kernel TS_primrefs_from_instances < kernelFunction = "TS_primrefs_from_instances" >;
|
||||
kernel TS_primrefs_from_instances_indirect < kernelFunction = "TS_primrefs_from_instances_indirect" >;
|
||||
kernel TS_primrefs_from_instances_ptrs < kernelFunction = "TS_primrefs_from_instances_pointers" >;
|
||||
kernel TS_primrefs_from_instances_ptrs_indirect < kernelFunction = "TS_primrefs_from_instances_pointers_indirect" >;
|
||||
kernel TS_update_instance_leaves < kernelFunction = "TS_update_instance_leaves" >;
|
||||
kernel TS_Refit_per_one_startpoint_sg < kernelFunction = "TS_Refit_per_one_startpoint_sg" >;
|
||||
kernel TS_fixup_leaves < kernelFunction = "TS_fixup_leaves" >;
|
||||
}
|
||||
|
||||
struct MKTSBuildArgs
|
||||
{
|
||||
qword build_globals;
|
||||
qword bvh_buffer;
|
||||
qword instance_descs;
|
||||
qword build_primref_buffer;
|
||||
qword aabb_buffer;
|
||||
qword is_procedural_buffer;
|
||||
qword leaf_creation_index_buffer;
|
||||
dword aabb_stride;
|
||||
dword num_instances;
|
||||
dword leaf_creation_index_stride;
|
||||
};
|
||||
|
||||
const BUILD_PRIMREFS_GROUPSIZE = 16;
|
||||
|
||||
|
||||
metakernel TS_build_primrefs( MKTSBuildArgs build_state, dword allowUpdate )
|
||||
{
|
||||
define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE);
|
||||
dispatch TS_primrefs_from_instances(num_groups, 1, 1) args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
build_state.instance_descs,
|
||||
build_state.num_instances,
|
||||
build_state.build_primref_buffer,
|
||||
build_state.aabb_buffer,
|
||||
build_state.is_procedural_buffer,
|
||||
build_state.aabb_stride,
|
||||
allowUpdate
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
metakernel TS_build_primrefs_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate)
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1
|
||||
C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect TS_primrefs_from_instances_indirect args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
build_state.instance_descs,
|
||||
build_state.build_primref_buffer,
|
||||
build_state.aabb_buffer,
|
||||
build_state.is_procedural_buffer,
|
||||
build_state.aabb_stride,
|
||||
allowUpdate,
|
||||
indirectBuildRangeInfo
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
metakernel TS_build_primrefs_array_of_pointers( MKTSBuildArgs build_state, dword allowUpdate )
|
||||
{
|
||||
define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE);
|
||||
dispatch TS_primrefs_from_instances_ptrs(num_groups, 1, 1) args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
build_state.instance_descs,
|
||||
build_state.num_instances,
|
||||
build_state.build_primref_buffer,
|
||||
build_state.aabb_buffer,
|
||||
build_state.is_procedural_buffer,
|
||||
build_state.aabb_stride,
|
||||
allowUpdate
|
||||
);
|
||||
}
|
||||
|
||||
metakernel
|
||||
TS_build_primrefs_array_of_pointers_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate)
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1
|
||||
C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect TS_primrefs_from_instances_ptrs_indirect args(
|
||||
build_state.build_globals,
|
||||
build_state.bvh_buffer,
|
||||
build_state.instance_descs,
|
||||
build_state.build_primref_buffer,
|
||||
build_state.aabb_buffer,
|
||||
build_state.is_procedural_buffer,
|
||||
build_state.aabb_stride,
|
||||
allowUpdate,
|
||||
indirectBuildRangeInfo
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
const UPDATE_INSTANCE_LEAVES_GROUPSIZE = 16;
|
||||
|
||||
struct MKTSUpdateArgs
|
||||
{
|
||||
qword bvh_buffer;
|
||||
qword instance_descs;
|
||||
qword instance_descs_ptrs;
|
||||
qword aabb_buffer;
|
||||
qword is_procedural_buffer;
|
||||
qword refit_scratch;
|
||||
dword aabb_stride;
|
||||
dword num_instances;
|
||||
};
|
||||
|
||||
metakernel TS_update_instance_leaves( MKTSUpdateArgs update_state )
|
||||
{
|
||||
define num_groups((update_state.num_instances + UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1) / UPDATE_INSTANCE_LEAVES_GROUPSIZE);
|
||||
dispatch TS_update_instance_leaves(num_groups, 1, 1) args(
|
||||
update_state.bvh_buffer,
|
||||
update_state.instance_descs,
|
||||
update_state.instance_descs_ptrs,
|
||||
update_state.refit_scratch,
|
||||
update_state.aabb_buffer,
|
||||
update_state.is_procedural_buffer,
|
||||
update_state.aabb_stride
|
||||
);
|
||||
}
|
||||
|
||||
metakernel TS_update_instance_leaves_indirect( MKTSUpdateArgs update_state, qword indirectBuildRangeInfo )
|
||||
{
|
||||
define num_groups REG0;
|
||||
define groupsize_1 REG1; // groupsize - 1
|
||||
define C_4 REG2;
|
||||
|
||||
// init with primitiveCount
|
||||
num_groups = load_dword(indirectBuildRangeInfo);
|
||||
groupsize_1 = 15; // UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1
|
||||
C_4 = 4; // log_2(UPDATE_INSTANCE_LEAVES_GROUPSIZE)
|
||||
|
||||
num_groups = num_groups + groupsize_1;
|
||||
num_groups = num_groups >> C_4; // num_groups / UPDATE_INSTANCE_LEAVES_GROUPSIZE;
|
||||
|
||||
DISPATCHDIM_X = num_groups.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
// need to add indirect offset?
|
||||
dispatch_indirect TS_update_instance_leaves args(
|
||||
update_state.bvh_buffer,
|
||||
update_state.instance_descs,
|
||||
update_state.instance_descs_ptrs,
|
||||
update_state.refit_scratch,
|
||||
update_state.aabb_buffer,
|
||||
update_state.is_procedural_buffer,
|
||||
update_state.aabb_stride
|
||||
);
|
||||
}
|
||||
|
||||
metakernel TS_refit(MKTSUpdateArgs update_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end )
|
||||
{
|
||||
REG0 = bvh_inner_nodes_start_value;
|
||||
REG1.lo = load_dword(bvh_inner_nodes_end);
|
||||
REG1.hi = 0;
|
||||
REG2 = REG1 - REG0;
|
||||
|
||||
DISPATCHDIM_X = REG2.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect TS_Refit_per_one_startpoint_sg
|
||||
args(
|
||||
update_state.bvh_buffer,
|
||||
update_state.refit_scratch,
|
||||
update_state.is_procedural_buffer
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
const FIXUP_LEAVES_NODES_PER_GROUP = 2;
|
||||
|
||||
metakernel TS_fixup_leaves(MKTSBuildArgs build_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end )
|
||||
{
|
||||
define ONE REG3;
|
||||
|
||||
ONE = 1;
|
||||
REG0 = bvh_inner_nodes_start_value;
|
||||
REG1.lo = load_dword(bvh_inner_nodes_end);
|
||||
REG1.hi = 0;
|
||||
REG2 = REG1 - REG0;
|
||||
REG2 = REG2 + ONE;
|
||||
REG2 = REG2 >> ONE;
|
||||
|
||||
DISPATCHDIM_X = REG2.lo;
|
||||
DISPATCHDIM_Y = 1;
|
||||
DISPATCHDIM_Z = 1;
|
||||
|
||||
dispatch_indirect TS_fixup_leaves
|
||||
args(
|
||||
build_state.bvh_buffer,
|
||||
build_state.leaf_creation_index_buffer,
|
||||
build_state.build_primref_buffer,
|
||||
build_state.leaf_creation_index_stride
|
||||
);
|
||||
|
||||
}
|
||||
|
|
@@ -1,226 +0,0 @@
|
|||
COPYRIGHT = """\
|
||||
/*
|
||||
* Copyright 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sub license, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the
|
||||
* next paragraph) shall be included in all copies or substantial portions
|
||||
* of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
|
||||
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
|
||||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from grl_parser import parse_grl_file
|
||||
from mako.template import Template
|
||||
|
||||
TEMPLATE_H = Template(COPYRIGHT + """
|
||||
/* This file is generated from ${filename}; don't edit directly. */
|
||||
|
||||
#ifndef GRL_CL_KERNEL_H
|
||||
#define GRL_CL_KERNEL_H
|
||||
|
||||
#include "genxml/gen_macros.h"
|
||||
#include "compiler/brw_kernel.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum grl_cl_kernel {
|
||||
% for k in kernels:
|
||||
GRL_CL_KERNEL_${k.upper()},
|
||||
% endfor
|
||||
GRL_CL_KERNEL_MAX,
|
||||
};
|
||||
|
||||
const char *genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel);
|
||||
|
||||
const char *genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id);
|
||||
|
||||
void genX(grl_get_cl_kernel)(struct brw_kernel *kernel, enum grl_cl_kernel id);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* GRL_CL_KERNEL_H */
|
||||
""")
|
||||
|
||||
TEMPLATE_C = Template(COPYRIGHT + """
|
||||
/* This file is generated from ${filename}; don't edit directly. */
|
||||
|
||||
#include "grl_cl_kernel.h"
|
||||
|
||||
% for k in kernels:
|
||||
#include "${prefix}_${k}.h"
|
||||
% endfor
|
||||
|
||||
const char *
|
||||
genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel)
|
||||
{
|
||||
switch (kernel) {
|
||||
% for k in kernels:
|
||||
case GRL_CL_KERNEL_${k.upper()}: return "${k}";
|
||||
% endfor
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
const char *
|
||||
genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id)
|
||||
{
|
||||
switch (id) {
|
||||
% for k in kernels:
|
||||
case GRL_CL_KERNEL_${k.upper()}: return ${prefix}_${k}_sha1;
|
||||
% endfor
|
||||
default:
|
||||
unreachable("Invalid GRL kernel enum");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
${prefix}_grl_get_cl_kernel(struct brw_kernel *kernel, enum grl_cl_kernel id)
|
||||
{
|
||||
switch (id) {
|
||||
% for k in kernels:
|
||||
case GRL_CL_KERNEL_${k.upper()}:
|
||||
*kernel = ${prefix}_${k};
|
||||
break;
|
||||
% endfor
|
||||
default:
|
||||
unreachable("Invalid GRL kernel enum");
|
||||
}
|
||||
}
|
||||
""")
|
||||
|
||||
def get_libraries_files(kernel_module):
|
||||
lib_files = []
|
||||
for item in kernel_module[3]:
|
||||
if item[0] != 'library':
|
||||
continue
|
||||
default_file = None
|
||||
fallback_file = None
|
||||
path_directory = None
|
||||
for props in item[2]:
|
||||
if props[0] == 'fallback':
|
||||
fallback_file = props[1]
|
||||
elif props[0] == 'default':
|
||||
default_file = props[1]
|
||||
elif props[0] == 'path':
|
||||
path_directory = props[1]
|
||||
assert path_directory
|
||||
assert default_file or fallback_file
|
||||
if fallback_file:
|
||||
lib_files.append(os.path.join(path_directory, fallback_file))
|
||||
else:
|
||||
lib_files.append(os.path.join(path_directory, default_file))
|
||||
return lib_files
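# Illustrative sketch only (not part of the original script): a hand-built
# kernel-module tuple with hypothetical names, showing which library file
# get_libraries_files() picks when both 'default' and 'fallback' properties
# are present (the fallback wins).
_example_module = (
    'kernel-module', 'example_kernels', 'example.cl',
    [
        ('library', 'lsc_intrinsics',
         [('default', 'lsc_intrinsics.cl'),
          ('fallback', 'lsc_intrinsics_fallback.cl'),
          ('path', 'gpu')]),
        ('kernel', 'do_work', {'kernelFunction': 'do_work'}),
    ],
)
# get_libraries_files(_example_module) -> ['gpu/lsc_intrinsics_fallback.cl']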
|
||||
|
||||
def add_kernels(kernels, cl_file, entrypoint, libs):
|
||||
assert cl_file.endswith('.cl')
|
||||
for lib_file in libs:
|
||||
assert lib_file.endswith('.cl')
|
||||
kernels.append((cl_file, entrypoint, ','.join(libs)))
|
||||
|
||||
def get_kernels(grl_nodes):
|
||||
kernels = []
|
||||
for item in grl_nodes:
|
||||
assert isinstance(item, tuple)
|
||||
if item[0] == 'kernel':
|
||||
ann = item[2]
|
||||
add_kernels(kernels, ann['source'], ann['kernelFunction'], [])
|
||||
elif item[0] == 'kernel-module':
|
||||
cl_file = item[2]
|
||||
libfiles = get_libraries_files(item)
|
||||
for kernel_def in item[3]:
|
||||
if kernel_def[0] == 'kernel':
|
||||
ann = kernel_def[2]
|
||||
add_kernels(kernels, cl_file, ann['kernelFunction'], libfiles)
|
||||
return kernels
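# Illustrative sketch only (not part of the original script): a hand-written
# node list with hypothetical names, showing the tuple shapes get_kernels()
# walks and the (cl_file, entrypoint, libs) triples it produces.
_example_nodes = [
    ('kernel', 'copy_instances',
     {'source': 'bvh_copy.cl', 'kernelFunction': 'copy_instances'}),
    ('kernel-module', 'example_module', 'bvh_example.cl',
     [('kernel', 'do_work', {'kernelFunction': 'do_work'})]),
]
# get_kernels(_example_nodes) would return:
#   [('bvh_copy.cl', 'copy_instances', ''),
#    ('bvh_example.cl', 'do_work', '')]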
|
||||
|
||||
def parse_libraries(filenames):
|
||||
libraries = {}
|
||||
for fname in filenames:
|
||||
lib_package = parse_grl_file(fname, [])
|
||||
for lib in lib_package:
|
||||
assert lib[0] == 'library'
|
||||
# Add the directory of the library so that CL files can be found.
|
||||
lib[2].append(('path', os.path.dirname(fname)))
|
||||
libraries[lib[1]] = lib
|
||||
return libraries
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--out-c', help='Output C file')
|
||||
parser.add_argument('--out-h', help='Output H file')
|
||||
parser.add_argument('--ls-kernels', action='store_const', const=True,
|
||||
help='List all OpenCL kernels')
|
||||
parser.add_argument('--prefix', help='Prefix')
|
||||
parser.add_argument('--library', dest='libraries', action='append',
|
||||
default=[], help='Libraries to include')
|
||||
parser.add_argument('files', type=str, nargs='*', help='GRL files')
|
||||
args = parser.parse_args()
|
||||
|
||||
libraries = parse_libraries(args.libraries)
|
||||
|
||||
kernels = []
|
||||
for fname in args.files:
|
||||
kernels += get_kernels(parse_grl_file(fname, libraries))
|
||||
|
||||
# Make the list of kernels unique and sorted
|
||||
kernels = sorted(list(set(kernels)))
|
||||
|
||||
if args.ls_kernels:
|
||||
for cl_file, entrypoint, libs in kernels:
|
||||
if not os.path.isabs(cl_file):
|
||||
cl_file = os.path.join(os.path.dirname(fname), cl_file)
|
||||
print('{}:{}:{}'.format(cl_file, entrypoint, libs))
|
||||
|
||||
kernel_c_names = []
|
||||
for cl_file, entrypoint, libs in kernels:
|
||||
cl_file = os.path.splitext(cl_file)[0]
|
||||
cl_file_name = cl_file.replace('/', '_')
|
||||
kernel_c_names.append('_'.join([cl_file_name, entrypoint]))
|
||||
|
||||
try:
|
||||
if args.out_h:
|
||||
with open(args.out_h, 'w', encoding='utf-8') as f:
|
||||
f.write(TEMPLATE_H.render(kernels=kernel_c_names,
|
||||
filename=os.path.basename(__file__)))
|
||||
|
||||
if args.out_c:
|
||||
with open(args.out_c, 'w', encoding='utf-8') as f:
|
||||
f.write(TEMPLATE_C.render(kernels=kernel_c_names,
|
||||
prefix=args.prefix,
|
||||
filename=os.path.basename(__file__)))
|
||||
except Exception:
|
||||
# If rendering fails, import mako's error helpers to build a readable
# stack trace, print it, and exit with status 1 when Python is run with
# asserts enabled (__debug__); otherwise just re-raise the exception.
|
||||
if __debug__:
|
||||
import sys
|
||||
from mako import exceptions
|
||||
sys.stderr.write(exceptions.text_error_template().render() + '\n')
|
||||
sys.exit(1)
|
||||
raise
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@@ -1,930 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
COPYRIGHT = """\
|
||||
/*
|
||||
* Copyright 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sub license, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the
|
||||
* next paragraph) shall be included in all copies or substantial portions
|
||||
* of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
|
||||
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
|
||||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
import re
|
||||
import sys
|
||||
|
||||
from grl_parser import parse_grl_file
|
||||
|
||||
class Writer(object):
|
||||
def __init__(self, file):
|
||||
self._file = file
|
||||
self._indent = 0
|
||||
self._new_line = True
|
||||
|
||||
def push_indent(self, levels=4):
|
||||
self._indent += levels
|
||||
|
||||
def pop_indent(self, levels=4):
|
||||
self._indent -= levels
|
||||
|
||||
def write(self, s, *fmt):
|
||||
if self._new_line:
|
||||
s = '\n' + s
|
||||
self._new_line = False
|
||||
if s.endswith('\n'):
|
||||
self._new_line = True
|
||||
s = s[:-1]
|
||||
if fmt:
|
||||
s = s.format(*fmt)
|
||||
self._file.write(s.replace('\n', '\n' + ' ' * self._indent))
|
||||
|
||||
# Internal Representation
|
||||
|
||||
class Value(object):
|
||||
def __init__(self, name=None, zone=None):
|
||||
self.name = name
|
||||
self._zone = zone
|
||||
self.live = False
|
||||
|
||||
@property
|
||||
def zone(self):
|
||||
assert self._zone is not None
|
||||
return self._zone
|
||||
|
||||
def is_reg(self):
|
||||
return False
|
||||
|
||||
def c_val(self):
|
||||
if not self.name:
|
||||
print(self)
|
||||
assert self.name
|
||||
return self.name
|
||||
|
||||
def c_cpu_val(self):
|
||||
assert self.zone == 'cpu'
|
||||
return self.c_val()
|
||||
|
||||
def c_gpu_val(self):
|
||||
if self.zone == 'gpu':
|
||||
return self.c_val()
|
||||
else:
|
||||
return 'mi_imm({})'.format(self.c_cpu_val())
|
||||
|
||||
class Constant(Value):
|
||||
def __init__(self, value):
|
||||
super().__init__(zone='cpu')
|
||||
self.value = value
|
||||
|
||||
def c_val(self):
|
||||
if self.value < 100:
|
||||
return str(self.value)
|
||||
elif self.value < (1 << 32):
|
||||
return '0x{:x}u'.format(self.value)
|
||||
else:
|
||||
return '0x{:x}ull'.format(self.value)
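# Illustrative examples (not part of the original script) of how c_val()
# chooses a literal form by magnitude:
#   Constant(7).c_val()        -> '7'
#   Constant(0x1000).c_val()   -> '0x1000u'
#   Constant(1 << 40).c_val()  -> '0x10000000000ull'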
|
||||
|
||||
class Register(Value):
|
||||
def __init__(self, name):
|
||||
super().__init__(name=name, zone='gpu')
|
||||
|
||||
def is_reg(self):
|
||||
return True
|
||||
|
||||
class FixedGPR(Register):
|
||||
def __init__(self, num):
|
||||
super().__init__('REG{}'.format(num))
|
||||
self.num = num
|
||||
|
||||
def write_c(self, w):
|
||||
w.write('UNUSED struct mi_value {} = mi_reserve_gpr(&b, {});\n',
|
||||
self.name, self.num)
|
||||
|
||||
class GroupSizeRegister(Register):
|
||||
def __init__(self, comp):
|
||||
super().__init__('DISPATCHDIM_' + 'XYZ'[comp])
|
||||
self.comp = comp
|
||||
|
||||
class Member(Value):
|
||||
def __init__(self, value, member):
|
||||
super().__init__(zone=value.zone)
|
||||
self.value = value
|
||||
self.member = member
|
||||
|
||||
def is_reg(self):
|
||||
return self.value.is_reg()
|
||||
|
||||
def c_val(self):
|
||||
c_val = self.value.c_val()
|
||||
if self.zone == 'gpu':
|
||||
assert isinstance(self.value, Register)
|
||||
if self.member == 'hi':
|
||||
return 'mi_value_half({}, true)'.format(c_val)
|
||||
elif self.member == 'lo':
|
||||
return 'mi_value_half({}, false)'.format(c_val)
|
||||
else:
|
||||
assert False, 'Invalid member: {}'.format(self.member)
|
||||
else:
|
||||
return '.'.join([c_val, self.member])
|
||||
|
||||
class OffsetOf(Value):
|
||||
def __init__(self, mk, expr):
|
||||
super().__init__(zone='cpu')
|
||||
assert isinstance(expr, tuple) and expr[0] == 'member'
|
||||
self.type = mk.m.get_type(expr[1])
|
||||
self.field = expr[2]
|
||||
|
||||
def c_val(self):
|
||||
return 'offsetof({}, {})'.format(self.type.c_name, self.field)
|
||||
|
||||
class Scope(object):
|
||||
def __init__(self, m, mk, parent):
|
||||
self.m = m
|
||||
self.mk = mk
|
||||
self.parent = parent
|
||||
self.defs = {}
|
||||
|
||||
def add_def(self, d, name=None):
|
||||
if name is None:
|
||||
name = d.name
|
||||
assert name not in self.defs
|
||||
self.defs[name] = d
|
||||
|
||||
def get_def(self, name):
|
||||
if name in self.defs:
|
||||
return self.defs[name]
|
||||
assert self.parent, 'Unknown definition: "{}"'.format(name)
|
||||
return self.parent.get_def(name)
|
||||
|
||||
class Statement(object):
|
||||
def __init__(self, srcs=[]):
|
||||
assert isinstance(srcs, (list, tuple))
|
||||
self.srcs = list(srcs)
|
||||
|
||||
class SSAStatement(Statement, Value):
|
||||
_count = 0
|
||||
|
||||
def __init__(self, zone, srcs):
|
||||
Statement.__init__(self, srcs)
|
||||
Value.__init__(self, None, zone)
|
||||
self.c_name = '_tmp{}'.format(SSAStatement._count)
|
||||
SSAStatement._count += 1
|
||||
|
||||
def c_val(self):
|
||||
return self.c_name
|
||||
|
||||
def write_c_refs(self, w):
|
||||
assert self.zone == 'gpu'
|
||||
assert self.uses > 0
|
||||
if self.uses > 1:
|
||||
w.write('mi_value_add_refs(&b, {}, {});\n',
|
||||
self.c_name, self.uses - 1)
|
||||
|
||||
class Half(SSAStatement):
|
||||
def __init__(self, value, half):
|
||||
assert half in ('hi', 'lo')
|
||||
super().__init__(None, [value])
|
||||
self.half = half
|
||||
|
||||
@property
|
||||
def zone(self):
|
||||
return self.srcs[0].zone
|
||||
|
||||
def write_c(self, w):
|
||||
assert self.half in ('hi', 'lo')
|
||||
if self.zone == 'cpu':
|
||||
if self.half == 'hi':
|
||||
w.write('uint32_t {} = (uint64_t)({}) >> 32;\n',
|
||||
self.c_name, self.srcs[0].c_cpu_val())
|
||||
else:
|
||||
w.write('uint32_t {} = {};\n',
|
||||
self.c_name, self.srcs[0].c_cpu_val())
|
||||
else:
|
||||
if self.half == 'hi':
|
||||
w.write('struct mi_value {} = mi_value_half({}, true);\n',
|
||||
self.c_name, self.srcs[0].c_gpu_val())
|
||||
else:
|
||||
w.write('struct mi_value {} = mi_value_half({}, false);\n',
|
||||
self.c_name, self.srcs[0].c_gpu_val())
|
||||
self.write_c_refs(w)
|
||||
|
||||
class Expression(SSAStatement):
|
||||
def __init__(self, mk, op, *srcs):
|
||||
super().__init__(None, srcs)
|
||||
self.op = op
|
||||
|
||||
@property
|
||||
def zone(self):
|
||||
zone = 'cpu'
|
||||
for s in self.srcs:
|
||||
if s.zone == 'gpu':
|
||||
zone = 'gpu'
|
||||
return zone
|
||||
|
||||
def write_c(self, w):
|
||||
if self.zone == 'cpu':
|
||||
c_cpu_vals = [s.c_cpu_val() for s in self.srcs]
|
||||
# There is one bitfield that is a uint64_t, but only holds 2 bits.
|
||||
# In practice we won't overflow, but let's help the compiler (and
|
||||
# coverity) out here.
|
||||
if self.op == '<<':
|
||||
w.write(f'assume({c_cpu_vals[0]} < (1 << 8));')
|
||||
w.write('uint64_t {} = ', self.c_name)
|
||||
if len(self.srcs) == 1:
|
||||
w.write('({} {})', self.op, c_cpu_vals[0])
|
||||
elif len(self.srcs) == 2:
|
||||
w.write('({} {} {})', c_cpu_vals[0], self.op, c_cpu_vals[1])
|
||||
else:
|
||||
assert len(self.srcs) == 3 and self.op == '?'
|
||||
w.write('({} ? {} : {})', *c_cpu_vals)
|
||||
w.write(';\n')
|
||||
return
|
||||
|
||||
w.write('struct mi_value {} = ', self.c_name)
|
||||
if self.op == '~':
|
||||
w.write('mi_inot(&b, {});\n', self.srcs[0].c_gpu_val())
|
||||
elif self.op == '+':
|
||||
w.write('mi_iadd(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
|
||||
elif self.op == '-':
|
||||
w.write('mi_isub(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
|
||||
elif self.op == '&':
|
||||
w.write('mi_iand(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
|
||||
elif self.op == '|':
|
||||
w.write('mi_ior(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
|
||||
elif self.op == '<<':
|
||||
if self.srcs[1].zone == 'cpu':
|
||||
w.write('mi_ishl_imm(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val())
|
||||
else:
|
||||
w.write('mi_ishl(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
|
||||
elif self.op == '>>':
|
||||
if self.srcs[1].zone == 'cpu':
|
||||
w.write('mi_ushr_imm(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val())
|
||||
else:
|
||||
w.write('mi_ushr(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
|
||||
elif self.op == '==':
|
||||
w.write('mi_ieq(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
|
||||
elif self.op == '<':
|
||||
w.write('mi_ult(&b, {}, {});\n',
|
||||
self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
|
||||
elif self.op == '>':
|
||||
w.write('mi_ult(&b, {}, {});\n',
|
||||
self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val())
|
||||
elif self.op == '<=':
|
||||
w.write('mi_uge(&b, {}, {});\n',
|
||||
self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val())
|
||||
else:
|
||||
assert False, 'Unknown expression opcode: {}'.format(self.op)
|
||||
self.write_c_refs(w)
|
||||
|
||||
class StoreReg(Statement):
|
||||
def __init__(self, mk, reg, value):
|
||||
super().__init__([mk.load_value(value)])
|
||||
self.reg = mk.parse_value(reg)
|
||||
assert self.reg.is_reg()
|
||||
|
||||
def write_c(self, w):
|
||||
value = self.srcs[0]
|
||||
w.write('mi_store(&b, {}, {});\n',
|
||||
self.reg.c_gpu_val(), value.c_gpu_val())
|
||||
|
||||
class LoadMem(SSAStatement):
|
||||
def __init__(self, mk, bit_size, addr):
|
||||
super().__init__('gpu', [mk.load_value(addr)])
|
||||
self.bit_size = bit_size
|
||||
|
||||
def write_c(self, w):
|
||||
addr = self.srcs[0]
|
||||
w.write('struct mi_value {} = ', self.c_name)
|
||||
if addr.zone == 'cpu':
|
||||
w.write('mi_mem{}(anv_address_from_u64({}));\n',
|
||||
self.bit_size, addr.c_cpu_val())
|
||||
else:
|
||||
assert self.bit_size == 64
|
||||
w.write('mi_load_mem64_offset(&b, anv_address_from_u64(0), {});\n',
|
||||
addr.c_gpu_val())
|
||||
self.write_c_refs(w)
|
||||
|
||||
class StoreMem(Statement):
|
||||
def __init__(self, mk, bit_size, addr, src):
|
||||
super().__init__([mk.load_value(addr), mk.load_value(src)])
|
||||
self.bit_size = bit_size
|
||||
|
||||
def write_c(self, w):
|
||||
addr, data = tuple(self.srcs)
|
||||
if addr.zone == 'cpu':
|
||||
w.write('mi_store(&b, mi_mem{}(anv_address_from_u64({})), {});\n',
|
||||
self.bit_size, addr.c_cpu_val(), data.c_gpu_val())
|
||||
else:
|
||||
assert self.bit_size == 64
|
||||
w.write('mi_store_mem64_offset(&b, anv_address_from_u64(0), {}, {});\n',
|
||||
addr.c_gpu_val(), data.c_gpu_val())
|
||||
|
||||
class GoTo(Statement):
|
||||
def __init__(self, mk, target_id, cond=None, invert=False):
|
||||
cond = [mk.load_value(cond)] if cond is not None else []
|
||||
super().__init__(cond)
|
||||
self.target_id = target_id
|
||||
self.invert = invert
|
||||
self.mk = mk
|
||||
|
||||
def write_c(self, w):
|
||||
# Now that we've parsed the entire metakernel, we can look up the
|
||||
# actual target from the id
|
||||
target = self.mk.get_goto_target(self.target_id)
|
||||
|
||||
if self.srcs:
|
||||
cond = self.srcs[0]
|
||||
if self.invert:
|
||||
w.write('mi_goto_if(&b, mi_inot(&b, {}), &{});\n', cond.c_gpu_val(), target.c_name)
|
||||
else:
|
||||
w.write('mi_goto_if(&b, {}, &{});\n', cond.c_gpu_val(), target.c_name)
|
||||
else:
|
||||
w.write('mi_goto(&b, &{});\n', target.c_name)
|
||||
|
||||
class GoToTarget(Statement):
|
||||
def __init__(self, mk, name):
|
||||
super().__init__()
|
||||
self.name = name
|
||||
self.c_name = '_goto_target_' + name
|
||||
self.goto_tokens = []
|
||||
|
||||
mk = mk.add_goto_target(self)
|
||||
|
||||
def write_decl(self, w):
|
||||
w.write('struct mi_goto_target {} = MI_GOTO_TARGET_INIT;\n',
|
||||
self.c_name)
|
||||
|
||||
def write_c(self, w):
|
||||
w.write('mi_goto_target(&b, &{});\n', self.c_name)
|
||||
|
||||
class Dispatch(Statement):
|
||||
def __init__(self, mk, kernel, group_size, args, postsync):
|
||||
if group_size is None:
|
||||
srcs = [mk.scope.get_def('DISPATCHDIM_{}'.format(d)) for d in 'XYZ']
|
||||
else:
|
||||
srcs = [mk.load_value(s) for s in group_size]
|
||||
srcs += [mk.load_value(a) for a in args]
|
||||
super().__init__(srcs)
|
||||
self.kernel = mk.m.kernels[kernel]
|
||||
self.indirect = group_size is None
|
||||
self.postsync = postsync
|
||||
|
||||
def write_c(self, w):
|
||||
w.write('{\n')
|
||||
w.push_indent()
|
||||
|
||||
group_size = self.srcs[:3]
|
||||
args = self.srcs[3:]
|
||||
if not self.indirect:
|
||||
w.write('const uint32_t _group_size[3] = {{ {}, {}, {} }};\n',
|
||||
*[s.c_cpu_val() for s in group_size])
|
||||
gs = '_group_size'
|
||||
else:
|
||||
gs = 'NULL'
|
||||
|
||||
w.write('const struct anv_kernel_arg _args[] = {\n')
|
||||
w.push_indent()
|
||||
for arg in args:
|
||||
w.write('{{ .u64 = {} }},\n', arg.c_cpu_val())
|
||||
w.pop_indent()
|
||||
w.write('};\n')
|
||||
|
||||
w.write('genX(grl_dispatch)(cmd_buffer, {},\n', self.kernel.c_name)
|
||||
w.write(' {}, ARRAY_SIZE(_args), _args);\n', gs)
|
||||
w.pop_indent()
|
||||
w.write('}\n')
|
||||
|
||||
class SemWait(Statement):
|
||||
def __init__(self, scope, wait):
|
||||
super().__init__()
|
||||
self.wait = wait
|
||||
|
||||
class Control(Statement):
|
||||
def __init__(self, scope, wait):
|
||||
super().__init__()
|
||||
self.wait = wait
|
||||
|
||||
def write_c(self, w):
|
||||
w.write('cmd_buffer->state.pending_pipe_bits |=\n')
|
||||
w.write(' ANV_PIPE_CS_STALL_BIT |\n')
|
||||
w.write(' ANV_PIPE_DATA_CACHE_FLUSH_BIT |\n')
|
||||
w.write(' ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;\n')
|
||||
w.write('genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);\n')
|
||||
|
||||
TYPE_REMAPS = {
|
||||
'dword' : 'uint32_t',
|
||||
'qword' : 'uint64_t',
|
||||
}
|
||||
|
||||
class Module(object):
|
||||
def __init__(self, grl_dir, elems):
|
||||
assert isinstance(elems[0], tuple)
|
||||
assert elems[0][0] == 'module-name'
|
||||
self.grl_dir = grl_dir
|
||||
self.name = elems[0][1]
|
||||
self.kernels = {}
|
||||
self.structs = {}
|
||||
self.constants = []
|
||||
self.metakernels = []
|
||||
self.regs = {}
|
||||
|
||||
scope = Scope(self, None, None)
|
||||
for e in elems[1:]:
|
||||
if e[0] == 'kernel':
|
||||
k = Kernel(self, *e[1:])
|
||||
assert k.name not in self.kernels
|
||||
self.kernels[k.name] = k
|
||||
elif e[0] == 'kernel-module':
|
||||
m = KernelModule(self, *e[1:])
|
||||
for k in m.kernels:
|
||||
assert k.name not in self.kernels
|
||||
self.kernels[k.name] = k
|
||||
elif e[0] == 'struct':
|
||||
s = Struct(self, *e[1:])
|
||||
assert s.name not in self.kernels
|
||||
self.structs[s.name] = s
|
||||
elif e[0] == 'named-constant':
|
||||
c = NamedConstant(*e[1:])
|
||||
scope.add_def(c)
|
||||
self.constants.append(c)
|
||||
elif e[0] == 'meta-kernel':
|
||||
mk = MetaKernel(self, scope, *e[1:])
|
||||
self.metakernels.append(mk)
|
||||
elif e[0] == 'import':
|
||||
assert e[2] == 'struct'
|
||||
self.import_struct(e[1], e[3])
|
||||
else:
|
||||
assert False, 'Invalid module-level token: {}'.format(e[0])
|
||||
|
||||
def import_struct(self, filename, struct_name):
|
||||
elems = parse_grl_file(os.path.join(self.grl_dir, filename), [])
|
||||
assert elems
|
||||
for e in elems[1:]:
|
||||
if e[0] == 'struct' and e[1] == struct_name:
|
||||
s = Struct(self, *e[1:])
|
||||
assert s.name not in self.kernels
|
||||
self.structs[s.name] = s
|
||||
return
|
||||
assert False, "Struct {0} not found in {1}".format(struct_name, filename)
|
||||
|
||||
def get_type(self, name):
|
||||
if name in self.structs:
|
||||
return self.structs[name]
|
||||
return BasicType(TYPE_REMAPS.get(name, name))
|
||||
|
||||
def get_fixed_gpr(self, num):
|
||||
assert isinstance(num, int)
|
||||
if num in self.regs:
|
||||
return self.regs[num]
|
||||
|
||||
reg = FixedGPR(num)
|
||||
self.regs[num] = reg
|
||||
return reg
|
||||
|
||||
def optimize(self):
|
||||
progress = True
|
||||
while progress:
|
||||
progress = False
|
||||
|
||||
# Copy Propagation
|
||||
for mk in self.metakernels:
|
||||
if mk.opt_copy_prop():
|
||||
progress = True
|
||||
|
||||
# Dead Code Elimination
|
||||
for r in self.regs.values():
|
||||
r.live = False
|
||||
for c in self.constants:
|
||||
c.live = False
|
||||
for mk in self.metakernels:
|
||||
mk.opt_dead_code1()
|
||||
for mk in self.metakernels:
|
||||
if mk.opt_dead_code2():
|
||||
progress = True
|
||||
for n in list(self.regs.keys()):
|
||||
if not self.regs[n].live:
|
||||
del self.regs[n]
|
||||
progress = True
|
||||
self.constants = [c for c in self.constants if c.live]
|
||||
|
||||
def compact_regs(self):
|
||||
old_regs = self.regs
|
||||
self.regs = {}
|
||||
for i, reg in enumerate(old_regs.values()):
|
||||
reg.num = i
|
||||
self.regs[i] = reg
|
||||
|
||||
def write_h(self, w):
|
||||
for s in self.structs.values():
|
||||
s.write_h(w)
|
||||
for mk in self.metakernels:
|
||||
mk.write_h(w)
|
||||
|
||||
def write_c(self, w):
|
||||
for c in self.constants:
|
||||
c.write_c(w)
|
||||
for mk in self.metakernels:
|
||||
mk.write_c(w)
|
||||
|
||||
class Kernel(object):
|
||||
def __init__(self, m, name, ann):
|
||||
self.name = name
|
||||
self.source_file = ann['source']
|
||||
self.kernel_name = self.source_file.replace('/', '_')[:-3].upper()
|
||||
self.entrypoint = ann['kernelFunction']
|
||||
|
||||
assert self.source_file.endswith('.cl')
|
||||
self.c_name = '_'.join([
|
||||
'GRL_CL_KERNEL',
|
||||
self.kernel_name,
|
||||
self.entrypoint.upper(),
|
||||
])
|
||||
|
||||
class KernelModule(object):
|
||||
def __init__(self, m, name, source, kernels):
|
||||
self.name = name
|
||||
self.kernels = []
|
||||
self.libraries = []
|
||||
|
||||
for k in kernels:
|
||||
if k[0] == 'kernel':
|
||||
k[2]['source'] = source
|
||||
self.kernels.append(Kernel(m, *k[1:]))
|
||||
elif k[0] == 'library':
|
||||
# Skip this for now.
|
||||
pass
|
||||
|
||||
class BasicType(object):
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.c_name = name
|
||||
|
||||
class Struct(object):
|
||||
def __init__(self, m, name, fields, align):
|
||||
assert align == 0
|
||||
self.name = name
|
||||
self.c_name = 'struct ' + '_'.join(['grl', m.name, self.name])
|
||||
self.fields = [(m.get_type(t), n) for t, n in fields]
|
||||
|
||||
def write_h(self, w):
|
||||
w.write('{} {{\n', self.c_name)
|
||||
w.push_indent()
|
||||
for f in self.fields:
|
||||
w.write('{} {};\n', f[0].c_name, f[1])
|
||||
w.pop_indent()
|
||||
w.write('};\n')
|
||||
|
||||
class NamedConstant(Value):
|
||||
def __init__(self, name, value):
|
||||
super().__init__(name, 'cpu')
|
||||
self.name = name
|
||||
self.value = Constant(value)
|
||||
self.written = False
|
||||
|
||||
def set_module(self, m):
|
||||
pass
|
||||
|
||||
def write_c(self, w):
|
||||
if self.written:
|
||||
return
|
||||
w.write('static const uint64_t {} = {};\n',
|
||||
self.name, self.value.c_val())
|
||||
self.written = True
|
||||
|
||||
class MetaKernelParameter(Value):
|
||||
def __init__(self, mk, type, name):
|
||||
super().__init__(name, 'cpu')
|
||||
self.type = mk.m.get_type(type)
|
||||
|
||||
class MetaKernel(object):
|
||||
def __init__(self, m, m_scope, name, params, ann, statements):
|
||||
self.m = m
|
||||
self.name = name
|
||||
self.c_name = '_'.join(['grl', m.name, self.name])
|
||||
self.goto_targets = {}
|
||||
self.num_tmps = 0
|
||||
|
||||
mk_scope = Scope(m, self, m_scope)
|
||||
|
||||
self.params = [MetaKernelParameter(self, *p) for p in params]
|
||||
for p in self.params:
|
||||
mk_scope.add_def(p)
|
||||
|
||||
mk_scope.add_def(GroupSizeRegister(0), name='DISPATCHDIM_X')
|
||||
mk_scope.add_def(GroupSizeRegister(1), name='DISPATCHDIM_Y')
|
||||
mk_scope.add_def(GroupSizeRegister(2), name='DISPATCHDIM_Z')
|
||||
|
||||
self.statements = []
|
||||
self.parse_stmt(mk_scope, statements)
|
||||
self.scope = None
|
||||
|
||||
def get_tmp(self):
|
||||
tmpN = '_tmp{}'.format(self.num_tmps)
|
||||
self.num_tmps += 1
|
||||
return tmpN
|
||||
|
||||
def add_stmt(self, stmt):
|
||||
self.statements.append(stmt)
|
||||
return stmt
|
||||
|
||||
def parse_value(self, v):
|
||||
if isinstance(v, Value):
|
||||
return v
|
||||
elif isinstance(v, str):
|
||||
if re.match(r'REG\d+', v):
|
||||
return self.m.get_fixed_gpr(int(v[3:]))
|
||||
else:
|
||||
return self.scope.get_def(v)
|
||||
elif isinstance(v, int):
|
||||
return Constant(v)
|
||||
elif isinstance(v, tuple):
|
||||
if v[0] == 'member':
|
||||
return Member(self.parse_value(v[1]), v[2])
|
||||
elif v[0] == 'offsetof':
|
||||
return OffsetOf(self, v[1])
|
||||
else:
|
||||
op = v[0]
|
||||
srcs = [self.parse_value(s) for s in v[1:]]
|
||||
return self.add_stmt(Expression(self, op, *srcs))
|
||||
else:
|
||||
assert False, 'Invalid value: {}'.format(v[0])
|
||||
|
||||
def load_value(self, v):
|
||||
v = self.parse_value(v)
|
||||
if isinstance(v, Member) and v.zone == 'gpu':
|
||||
v = self.add_stmt(Half(v.value, v.member))
|
||||
return v
|
||||
|
||||
def parse_stmt(self, scope, s):
|
||||
self.scope = scope
|
||||
if isinstance(s, list):
|
||||
subscope = Scope(self.m, self, scope)
|
||||
for stmt in s:
|
||||
self.parse_stmt(subscope, stmt)
|
||||
elif s[0] == 'define':
|
||||
scope.add_def(self.parse_value(s[2]), name=s[1])
|
||||
elif s[0] == 'assign':
|
||||
self.add_stmt(StoreReg(self, *s[1:]))
|
||||
elif s[0] == 'dispatch':
|
||||
self.add_stmt(Dispatch(self, *s[1:]))
|
||||
elif s[0] == 'load-dword':
|
||||
v = self.add_stmt(LoadMem(self, 32, s[2]))
|
||||
self.add_stmt(StoreReg(self, s[1], v))
|
||||
elif s[0] == 'load-qword':
|
||||
v = self.add_stmt(LoadMem(self, 64, s[2]))
|
||||
self.add_stmt(StoreReg(self, s[1], v))
|
||||
elif s[0] == 'store-dword':
|
||||
self.add_stmt(StoreMem(self, 32, *s[1:]))
|
||||
elif s[0] == 'store-qword':
|
||||
self.add_stmt(StoreMem(self, 64, *s[1:]))
|
||||
elif s[0] == 'goto':
|
||||
self.add_stmt(GoTo(self, s[1]))
|
||||
elif s[0] == 'goto-if':
|
||||
self.add_stmt(GoTo(self, s[1], s[2]))
|
||||
elif s[0] == 'goto-if-not':
|
||||
self.add_stmt(GoTo(self, s[1], s[2], invert=True))
|
||||
elif s[0] == 'label':
|
||||
self.add_stmt(GoToTarget(self, s[1]))
|
||||
elif s[0] == 'control':
|
||||
self.add_stmt(Control(self, s[1]))
|
||||
elif s[0] == 'sem-wait-while':
|
||||
self.add_stmt(Control(self, s[1]))
|
||||
else:
|
||||
assert False, 'Invalid statement: {}'.format(s[0])
|
||||
|
||||
def add_goto_target(self, t):
|
||||
assert t.name not in self.goto_targets
|
||||
self.goto_targets[t.name] = t
|
||||
|
||||
def get_goto_target(self, name):
|
||||
return self.goto_targets[name]
|
||||
|
||||
def opt_copy_prop(self):
|
||||
progress = False
|
||||
copies = {}
|
||||
for stmt in self.statements:
|
||||
for i in range(len(stmt.srcs)):
|
||||
src = stmt.srcs[i]
|
||||
if isinstance(src, FixedGPR) and src.num in copies:
|
||||
stmt.srcs[i] = copies[src.num]
|
||||
progress = True
|
||||
|
||||
if isinstance(stmt, StoreReg):
|
||||
reg = stmt.reg
|
||||
if isinstance(reg, Member):
|
||||
reg = reg.value
|
||||
|
||||
if isinstance(reg, FixedGPR):
|
||||
copies.pop(reg.num, None)
|
||||
if not stmt.srcs[0].is_reg():
|
||||
copies[reg.num] = stmt.srcs[0]
|
||||
elif isinstance(stmt, (GoTo, GoToTarget)):
|
||||
copies = {}
|
||||
|
||||
return progress
|
||||
|
||||
def opt_dead_code1(self):
|
||||
for stmt in self.statements:
|
||||
# Mark every register which is read as live
|
||||
for src in stmt.srcs:
|
||||
if isinstance(src, Register):
|
||||
src.live = True
|
||||
|
||||
# Initialize every SSA statement to dead
|
||||
if isinstance(stmt, SSAStatement):
|
||||
stmt.live = False
|
||||
|
||||
def opt_dead_code2(self):
|
||||
def yield_live(statements):
|
||||
gprs_read = set(self.m.regs.keys())
|
||||
for stmt in statements:
|
||||
if isinstance(stmt, SSAStatement):
|
||||
if not stmt.live:
|
||||
continue
|
||||
elif isinstance(stmt, StoreReg):
|
||||
reg = stmt.reg
|
||||
if isinstance(reg, Member):
|
||||
reg = reg.value
|
||||
|
||||
if not reg.live:
|
||||
continue
|
||||
|
||||
if isinstance(reg, FixedGPR):
|
||||
if reg.num in gprs_read:
|
||||
gprs_read.remove(reg.num)
|
||||
else:
|
||||
continue
|
||||
elif isinstance(stmt, (GoTo, GoToTarget)):
|
||||
gprs_read = set(self.m.regs.keys())
|
||||
|
||||
for src in stmt.srcs:
|
||||
src.live = True
|
||||
if isinstance(src, FixedGPR):
|
||||
gprs_read.add(src.num)
|
||||
yield stmt
|
||||
|
||||
old_stmt_list = self.statements
|
||||
old_stmt_list.reverse()
|
||||
self.statements = list(yield_live(old_stmt_list))
|
||||
self.statements.reverse()
|
||||
return len(self.statements) != len(old_stmt_list)
|
||||
|
||||
def count_ssa_value_uses(self):
|
||||
for stmt in self.statements:
|
||||
if isinstance(stmt, SSAStatement):
|
||||
stmt.uses = 0
|
||||
|
||||
for src in stmt.srcs:
|
||||
if isinstance(src, SSAStatement):
|
||||
src.uses += 1
|
||||
|
||||
def write_h(self, w):
|
||||
w.write('void\n')
|
||||
w.write('genX({})(\n', self.c_name)
|
||||
w.push_indent()
|
||||
w.write('struct anv_cmd_buffer *cmd_buffer')
|
||||
for p in self.params:
|
||||
w.write(',\n{} {}', p.type.c_name, p.name)
|
||||
w.write(');\n')
|
||||
w.pop_indent()
|
||||
|
||||
def write_c(self, w):
|
||||
w.write('void\n')
|
||||
w.write('genX({})(\n', self.c_name)
|
||||
w.push_indent()
|
||||
w.write('struct anv_cmd_buffer *cmd_buffer')
|
||||
for p in self.params:
|
||||
w.write(',\n{} {}', p.type.c_name, p.name)
|
||||
w.write(')\n')
|
||||
w.pop_indent()
|
||||
w.write('{\n')
|
||||
w.push_indent()
|
||||
|
||||
w.write('struct mi_builder b;\n')
|
||||
w.write('mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);\n')
|
||||
w.write('/* TODO: use anv_mocs? */\n')
w.write('const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);\n')
w.write('mi_builder_set_mocs(&b, mocs);\n')
|
||||
w.write('\n')
|
||||
|
||||
for r in self.m.regs.values():
|
||||
r.write_c(w)
|
||||
w.write('\n')
|
||||
|
||||
for t in self.goto_targets.values():
|
||||
t.write_decl(w)
|
||||
w.write('\n')
|
||||
|
||||
self.count_ssa_value_uses()
|
||||
for s in self.statements:
|
||||
s.write_c(w)
|
||||
|
||||
w.pop_indent()
|
||||
|
||||
w.write('}\n')
|
||||
|
||||
HEADER_PROLOGUE = COPYRIGHT + '''
|
||||
#include "anv_private.h"
|
||||
#include "grl/genX_grl.h"
|
||||
|
||||
#ifndef {0}
|
||||
#define {0}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {{
|
||||
#endif
|
||||
|
||||
'''
|
||||
|
||||
HEADER_EPILOGUE = '''
|
||||
#ifdef __cplusplus
|
||||
}}
|
||||
#endif
|
||||
|
||||
#endif /* {0} */
|
||||
'''
|
||||
|
||||
C_PROLOGUE = COPYRIGHT + '''
|
||||
#include "{0}"
|
||||
|
||||
#include "genxml/gen_macros.h"
|
||||
#include "genxml/genX_pack.h"
|
||||
#include "genxml/genX_rt_pack.h"
|
||||
|
||||
#include "genX_mi_builder.h"
|
||||
|
||||
#define MI_PREDICATE_RESULT mi_reg32(0x2418)
|
||||
#define DISPATCHDIM_X mi_reg32(0x2500)
|
||||
#define DISPATCHDIM_Y mi_reg32(0x2504)
|
||||
#define DISPATCHDIM_Z mi_reg32(0x2508)
|
||||
'''
|
||||
|
||||
def parse_libraries(filenames):
|
||||
libraries = {}
|
||||
for fname in filenames:
|
||||
lib_package = parse_grl_file(fname, [])
|
||||
for lib in lib_package:
|
||||
assert lib[0] == 'library'
|
||||
# Add the directory of the library so that CL files can be found.
|
||||
lib[2].append(('path', os.path.dirname(fname)))
|
||||
libraries[lib[1]] = lib
|
||||
return libraries
|
||||
|
||||
def main():
|
||||
argparser = argparse.ArgumentParser()
|
||||
argparser.add_argument('--out-c', help='Output C file')
|
||||
argparser.add_argument('--out-h', help='Output header file')
|
||||
argparser.add_argument('--library', dest='libraries', action='append',
|
||||
default=[], help='Libraries to include')
|
||||
argparser.add_argument('grl', help="Input file")
|
||||
args = argparser.parse_args()
|
||||
|
||||
grl_dir = os.path.dirname(args.grl)
|
||||
|
||||
libraries = parse_libraries(args.libraries)
|
||||
|
||||
ir = parse_grl_file(args.grl, libraries)
|
||||
|
||||
m = Module(grl_dir, ir)
|
||||
m.optimize()
|
||||
m.compact_regs()
|
||||
|
||||
with open(args.out_h, 'w') as f:
|
||||
guard = os.path.splitext(os.path.basename(args.out_h))[0].upper()
|
||||
w = Writer(f)
|
||||
w.write(HEADER_PROLOGUE, guard)
|
||||
m.write_h(w)
|
||||
w.write(HEADER_EPILOGUE, guard)
|
||||
|
||||
with open(args.out_c, 'w') as f:
|
||||
w = Writer(f)
|
||||
w.write(C_PROLOGUE, os.path.basename(args.out_h))
|
||||
m.write_c(w)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
@ -1,586 +0,0 @@
#!/usr/bin/env python3
|
||||
COPYRIGHT = """\
|
||||
/*
|
||||
* Copyright 2021 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sub license, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the
|
||||
* next paragraph) shall be included in all copies or substantial portions
|
||||
* of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
|
||||
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
|
||||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import ply.lex as lex
|
||||
import ply.yacc as yacc
|
||||
|
||||
# Libraries
|
||||
|
||||
libraries = {}
|
||||
|
||||
# LEXER
|
||||
|
||||
keywords = {
|
||||
'__debugbreak': 'KW_DEBUGBREAK',
|
||||
'alignas': 'KW_ALIGNAS',
|
||||
'args': 'KW_ARGS',
|
||||
'atomic': 'KW_ATOMIC',
|
||||
'atomic_return': 'KW_ATOMIC_RETURN',
|
||||
'const': 'KW_CONST',
|
||||
'control': 'KW_CONTROL',
|
||||
'define': 'KW_DEFINE',
|
||||
'dispatch': 'KW_DISPATCH',
|
||||
'dispatch_indirect': 'KW_DISPATCH_INDIRECT',
|
||||
'goto': 'KW_GOTO',
|
||||
'if': 'KW_IF',
|
||||
'kernel': 'KW_KERNEL',
|
||||
'kernel_module': 'KW_KERNEL_MODULE',
|
||||
'import': 'KW_IMPORT',
|
||||
'library': 'KW_LIBRARY',
|
||||
'links': 'KW_LINKS',
|
||||
'load_dword': 'KW_LOAD_DWORD',
|
||||
'load_qword': 'KW_LOAD_QWORD',
|
||||
'metakernel': 'KW_METAKERNEL',
|
||||
'module': 'KW_MODULE',
|
||||
'not': 'KW_NOT',
|
||||
'offsetof': 'KW_OFFSETOF',
|
||||
'postsync': 'KW_POSTSYNC',
|
||||
'print': 'KW_PRINT',
|
||||
'semaphore_wait': 'KW_SEMAPHORE_WAIT',
|
||||
'shiftof': 'KW_SHIFTOF',
|
||||
'sizeof': 'KW_SIZEOF',
|
||||
'store_dword': 'KW_STORE_DWORD',
|
||||
'store_qword': 'KW_STORE_QWORD',
|
||||
'store_timestamp': 'KW_STORE_TIMESTAMP',
|
||||
'struct': 'KW_STRUCT',
|
||||
'unsigned': 'KW_UNSIGNED',
|
||||
'while': 'KW_WHILE'
|
||||
}
|
||||
|
||||
ops = {
|
||||
'&&': 'OP_LOGICAL_AND',
|
||||
'||': 'OP_LOGICAL_OR',
|
||||
'==': 'OP_EQUALEQUAL',
|
||||
'!=': 'OP_NOTEQUAL',
|
||||
'<=': 'OP_LESSEQUAL',
|
||||
'>=': 'OP_GREATEREQUAL',
|
||||
'<<': 'OP_LSHIFT',
|
||||
'>>': 'OP_RSHIFT'
|
||||
}
|
||||
|
||||
tokens = [
|
||||
'INT_LITERAL',
|
||||
'STRING_LITERAL',
|
||||
'OP',
|
||||
'IDENTIFIER'
|
||||
] + list(keywords.values()) + list(ops.values())
|
||||
|
||||
def t_INT_LITERAL(t):
|
||||
r'(0x[a-fA-F0-9]+|\d+)'
|
||||
if t.value.startswith('0x'):
|
||||
t.value = int(t.value[2:], 16)
|
||||
else:
|
||||
t.value = int(t.value)
|
||||
return t
|
||||
|
||||
def t_OP(t):
|
||||
r'(&&|\|\||==|!=|<=|>=|<<|>>)'
|
||||
t.type = ops.get(t.value)
|
||||
return t
|
||||
|
||||
def t_IDENTIFIER(t):
|
||||
r'[a-zA-Z_][a-zA-Z_0-9]*'
|
||||
t.type = keywords.get(t.value, 'IDENTIFIER')
|
||||
return t
|
||||
|
||||
def t_STRING_LITERAL(t):
|
||||
r'"(\\.|[^"\\])*"'
|
||||
t.value = t.value[1:-1]
|
||||
return t
|
||||
|
||||
literals = "+*/(){};:,=&|!~^.%?-<>[]"
|
||||
|
||||
t_ignore = ' \t'
|
||||
|
||||
def t_newline(t):
|
||||
r'\n+'
|
||||
t.lexer.lineno += len(t.value)
|
||||
|
||||
def t_error(t):
|
||||
print("WUT: {}".format(t.value))
|
||||
t.lexer.skip(1)
|
||||
|
||||
LEXER = lex.lex()
|
||||
|
||||
# PARSER
|
||||
|
||||
precedence = (
|
||||
('right', '?', ':'),
|
||||
('left', 'OP_LOGICAL_OR', 'OP_LOGICAL_AND'),
|
||||
('left', '|'),
|
||||
('left', '^'),
|
||||
('left', '&'),
|
||||
('left', 'OP_EQUALEQUAL', 'OP_NOTEQUAL'),
|
||||
('left', '<', '>', 'OP_LESSEQUAL', 'OP_GREATEREQUAL'),
|
||||
('left', 'OP_LSHIFT', 'OP_RSHIFT'),
|
||||
('left', '+', '-'),
|
||||
('left', '*', '/', '%'),
|
||||
('right', '!', '~'),
|
||||
('left', '[', ']', '.')
|
||||
)
|
||||
|
||||
def p_module(p):
|
||||
'module : element_list'
|
||||
p[0] = p[1]
|
||||
|
||||
def p_element_list(p):
|
||||
'''element_list : element_list element
|
||||
| element'''
|
||||
if len(p) == 2:
|
||||
p[0] = [p[1]]
|
||||
else:
|
||||
p[0] = p[1] + [p[2]]
|
||||
|
||||
def p_element(p):
|
||||
'''element : kernel_definition
|
||||
| kernel_module_definition
|
||||
| library_definition
|
||||
| metakernel_definition
|
||||
| module_name
|
||||
| struct_definition
|
||||
| const_definition
|
||||
| import_definition'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_module_name(p):
|
||||
'module_name : KW_MODULE IDENTIFIER ";"'
|
||||
p[0] = ('module-name', p[2])
|
||||
|
||||
def p_kernel_module_definition(p):
|
||||
'kernel_module_definition : KW_KERNEL_MODULE IDENTIFIER "(" STRING_LITERAL ")" "{" kernel_definition_list "}"'
|
||||
p[0] = ('kernel-module', p[2], p[4], p[7])
|
||||
|
||||
def p_kernel_definition(p):
|
||||
'kernel_definition : KW_KERNEL IDENTIFIER optional_annotation_list'
|
||||
p[0] = ('kernel', p[2], p[3])
|
||||
|
||||
def p_library_definition(p):
|
||||
'library_definition : KW_LIBRARY IDENTIFIER "{" library_definition_list "}"'
|
||||
p[0] = ('library', p[2], p[4])
|
||||
|
||||
def p_library_definition_list(p):
|
||||
'''library_definition_list :
|
||||
| library_definition_list IDENTIFIER STRING_LITERAL ";"'''
|
||||
if len(p) < 3:
|
||||
p[0] = []
|
||||
else:
|
||||
p[0] = p[1]
|
||||
p[0].append((p[2], p[3]))
|
||||
|
||||
def p_import_definition(p):
|
||||
'import_definition : KW_IMPORT KW_STRUCT IDENTIFIER STRING_LITERAL ";"'
|
||||
p[0] = ('import', p[4], 'struct', p[3])
|
||||
|
||||
def p_links_definition(p):
|
||||
'links_definition : KW_LINKS IDENTIFIER'
|
||||
|
||||
# Process a library include like a preprocessor
|
||||
global libraries
|
||||
|
||||
if p[2] not in libraries:
raise Exception("Unable to find library {0}".format(p[2]))
|
||||
p[0] = libraries[p[2]]
|
||||
|
||||
def p_metakernel_definition(p):
|
||||
'metakernel_definition : KW_METAKERNEL IDENTIFIER "(" optional_parameter_list ")" optional_annotation_list scope'
|
||||
p[0] = ('meta-kernel', p[2], p[4], p[6], p[7])
|
||||
|
||||
def p_kernel_definition_list(p):
|
||||
'''kernel_definition_list :
|
||||
| kernel_definition_list kernel_definition ";"
|
||||
| kernel_definition_list links_definition ";"'''
|
||||
if len(p) < 3:
|
||||
p[0] = []
|
||||
else:
|
||||
p[0] = p[1]
|
||||
p[0].append(p[2])
|
||||
|
||||
def p_optional_annotation_list(p):
|
||||
'''optional_annotation_list :
|
||||
| "<" ">"
|
||||
| "<" annotation_list ">"'''
|
||||
if len(p) < 4:
|
||||
p[0] = {}
|
||||
else:
|
||||
p[0] = p[2]
|
||||
|
||||
def p_optional_parameter_list(p):
|
||||
'''optional_parameter_list :
|
||||
| parameter_list'''
|
||||
p[0] = p[1] if len(p) > 1 else []
|
||||
|
||||
def p_annotation_list(p):
|
||||
'''annotation_list : annotation'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_annotation_list_append(p):
|
||||
'''annotation_list : annotation_list "," annotation'''
|
||||
p[0] = {**p[1], **p[3]}
|
||||
|
||||
def p_annotation(p):
|
||||
'''annotation : IDENTIFIER "=" INT_LITERAL
|
||||
| IDENTIFIER "=" IDENTIFIER
|
||||
| IDENTIFIER "=" STRING_LITERAL'''
|
||||
p[0] = {p[1]: p[3]}
|
||||
|
||||
def p_parameter_list(p):
|
||||
'''parameter_list : parameter_definition'''
|
||||
p[0] = [p[1]]
|
||||
|
||||
def p_parameter_list_append(p):
|
||||
'''parameter_list : parameter_list "," parameter_definition'''
|
||||
p[0] = p[1]
|
||||
p[0].append(p[3])
|
||||
|
||||
def p_parameter_definition(p):
|
||||
'parameter_definition : IDENTIFIER IDENTIFIER'
|
||||
p[0] = (p[1], p[2])
|
||||
|
||||
def p_scope(p):
|
||||
'''scope : "{" optional_statement_list "}"'''
|
||||
p[0] = p[2]
|
||||
|
||||
def p_optional_statement_list(p):
|
||||
'''optional_statement_list :
|
||||
| statement_list'''
|
||||
p[0] = p[1] if len(p) > 1 else []
|
||||
|
||||
def p_statement_list(p):
|
||||
'''statement_list : statement'''
|
||||
p[0] = [p[1]]
|
||||
|
||||
def p_statement_list_append(p):
|
||||
'''statement_list : statement_list statement'''
|
||||
p[0] = p[1]
|
||||
p[0].append(p[2])
|
||||
|
||||
def p_statement(p):
|
||||
'''statement : definition_statement ";"
|
||||
| assignment_statement ";"
|
||||
| load_store_statement ";"
|
||||
| dispatch_statement ";"
|
||||
| semaphore_statement ";"
|
||||
| label
|
||||
| goto_statement ";"
|
||||
| scope_statement
|
||||
| atomic_op_statement ";"
|
||||
| control_statement ";"
|
||||
| print_statement ";"
|
||||
| debug_break_statement ";"'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_definition_statement(p):
|
||||
'definition_statement : KW_DEFINE IDENTIFIER value'
|
||||
p[0] = ('define', p[2], p[3])
|
||||
|
||||
def p_assignemt_statement(p):
|
||||
'assignment_statement : value "=" value'
|
||||
p[0] = ('assign', p[1], p[3])
|
||||
|
||||
def p_load_store_statement_load_dword(p):
|
||||
'''load_store_statement : value "=" KW_LOAD_DWORD "(" value ")"'''
|
||||
p[0] = ('load-dword', p[1], p[5])
|
||||
|
||||
def p_load_store_statement_load_qword(p):
|
||||
'''load_store_statement : value "=" KW_LOAD_QWORD "(" value ")"'''
|
||||
p[0] = ('load-qword', p[1], p[5])
|
||||
|
||||
def p_load_store_statement_store_dword(p):
|
||||
'''load_store_statement : KW_STORE_DWORD "(" value "," value ")"'''
|
||||
p[0] = ('store-dword', p[3], p[5])
|
||||
|
||||
def p_load_store_statement_store_qword(p):
|
||||
'''load_store_statement : KW_STORE_QWORD "(" value "," value ")"'''
|
||||
p[0] = ('store-qword', p[3], p[5])
|
||||
|
||||
def p_dispatch_statement(p):
|
||||
'''dispatch_statement : direct_dispatch_statement
|
||||
| indirect_dispatch_statement'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_direct_dispatch_statement(p):
|
||||
'''direct_dispatch_statement : KW_DISPATCH IDENTIFIER "(" value "," value "," value ")" optional_kernel_arg_list optional_postsync'''
|
||||
p[0] = ('dispatch', p[2], (p[4], p[6], p[8]), p[10], p[11])
|
||||
|
||||
def p_indirect_dispatch_statement(p):
|
||||
'''indirect_dispatch_statement : KW_DISPATCH_INDIRECT IDENTIFIER optional_kernel_arg_list optional_postsync'''
|
||||
p[0] = ('dispatch', p[2], None, p[3], p[4])
|
||||
|
||||
def p_optional_kernel_arg_list(p):
|
||||
'''optional_kernel_arg_list :
|
||||
| KW_ARGS "(" value_list ")"'''
|
||||
p[0] = p[3] if len(p) > 1 else []
|
||||
|
||||
def p_value_list(p):
|
||||
'''value_list : value'''
|
||||
p[0] = [p[1]]
|
||||
|
||||
def p_value_list_append(p):
|
||||
'''value_list : value_list "," value'''
|
||||
p[0] = p[1]
|
||||
p[0].append(p[3])
|
||||
|
||||
def p_optional_postsync(p):
|
||||
'''optional_postsync :
|
||||
| postsync_operation'''
|
||||
if len(p) > 1:
|
||||
p[0] = p[1]
|
||||
|
||||
def p_postsync_operation(p):
|
||||
'''postsync_operation : postsync_write_dword
|
||||
| postsync_write_timestamp'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_postsync_write_dword(p):
|
||||
'''postsync_write_dword : KW_POSTSYNC KW_STORE_DWORD "(" value "," value ")"'''
|
||||
p[0] = ('postsync', 'store-dword', p[4], p[6])
|
||||
|
||||
def p_postsync_write_timestamp(p):
|
||||
'''postsync_write_timestamp : KW_POSTSYNC KW_STORE_TIMESTAMP "(" value ")"'''
|
||||
p[0] = ('postsync', 'timestamp', p[4])
|
||||
|
||||
def p_semaphore_statement(p):
|
||||
'''semaphore_statement : KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value "<" value ")"
|
||||
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value ">" value ")"
|
||||
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_LESSEQUAL value ")"
|
||||
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_GREATEREQUAL value ")"
|
||||
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_EQUALEQUAL value ")"
|
||||
| KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_NOTEQUAL value ")"'''
|
||||
p[0] = ('sem-wait-while', p[5], p[6], p[7])
|
||||
|
||||
def p_atomic_op_statement(p):
|
||||
'''atomic_op_statement : KW_ATOMIC IDENTIFIER IDENTIFIER "(" value_list ")"'''
|
||||
p[0] = ('atomic', p[2], p[3], p[5])
|
||||
|
||||
def p_atomic_op_statement_return(p):
|
||||
'''atomic_op_statement : KW_ATOMIC_RETURN IDENTIFIER IDENTIFIER "(" value_list ")"'''
|
||||
p[0] = ('atomic-return', p[2], p[3], p[5])
|
||||
|
||||
def p_label(p):
|
||||
'''label : IDENTIFIER ":"'''
|
||||
p[0] = ('label', p[1])
|
||||
|
||||
def p_goto_statement(p):
|
||||
'''goto_statement : KW_GOTO IDENTIFIER'''
|
||||
p[0] = ('goto', p[2])
|
||||
|
||||
def p_goto_statement_if(p):
|
||||
'''goto_statement : KW_GOTO IDENTIFIER KW_IF "(" value ")"'''
|
||||
p[0] = ('goto-if', p[2], p[5])
|
||||
|
||||
def p_goto_statement_if_not(p):
|
||||
'''goto_statement : KW_GOTO IDENTIFIER KW_IF KW_NOT "(" value ")"'''
|
||||
p[0] = ('goto-if-not', p[2], p[6])
|
||||
|
||||
def p_scope_statement(p):
|
||||
'''scope_statement : scope'''
|
||||
p[0] = (p[1])
|
||||
|
||||
def p_control_statement(p):
|
||||
'''control_statement : KW_CONTROL "(" id_list ")"'''
|
||||
p[0] = ('control', p[3])
|
||||
|
||||
def p_print_statement(p):
|
||||
'''print_statement : KW_PRINT "(" printable_list ")"'''
|
||||
p[0] = ('print', p[3])
|
||||
|
||||
def p_printable_list(p):
|
||||
'''printable_list : printable'''
|
||||
p[0] = [p[1]]
|
||||
|
||||
def p_printable_list_append(p):
|
||||
'''printable_list : printable_list "," printable'''
|
||||
p[0] = p[1]
|
||||
p[0].append(p[3])
|
||||
|
||||
def p_printable_str_lit(p):
|
||||
'''printable : STRING_LITERAL'''
|
||||
p[0] = '"{}"'.format(p[1])
|
||||
|
||||
def p_printable_value(p):
|
||||
'''printable : value'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_printable_str_lit_value(p):
|
||||
'''printable : STRING_LITERAL value'''
|
||||
p[0] = ('"{}"'.format(p[1]), p[2])
|
||||
|
||||
def p_debug_break_statement(p):
|
||||
'''debug_break_statement : KW_DEBUGBREAK'''
|
||||
p[0] = ('debug-break',)
|
||||
|
||||
def p_id_list(p):
|
||||
'''id_list : IDENTIFIER'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_id_list_append(p):
|
||||
'''id_list : id_list "," IDENTIFIER'''
|
||||
p[0] = p[1]
|
||||
p[0].append(p[3])
|
||||
|
||||
def p_value(p):
|
||||
'''value : IDENTIFIER
|
||||
| INT_LITERAL'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_value_braces(p):
|
||||
'''value : "(" value ")"'''
|
||||
p[0] = (p[2])
|
||||
|
||||
def p_value_member(p):
|
||||
'''value : value "." IDENTIFIER'''
|
||||
p[0] = ('member', p[1], p[3])
|
||||
|
||||
def p_value_idx(p):
|
||||
'''value : value "[" value "]"'''
|
||||
p[0] = ('index', p[1], p[3])
|
||||
|
||||
def p_value_binop(p):
|
||||
'''value : value "+" value
|
||||
| value "-" value
|
||||
| value "*" value
|
||||
| value "/" value
|
||||
| value "%" value
|
||||
| value "&" value
|
||||
| value "|" value
|
||||
| value "<" value
|
||||
| value ">" value
|
||||
| value "^" value
|
||||
| value OP_LESSEQUAL value
|
||||
| value OP_GREATEREQUAL value
|
||||
| value OP_EQUALEQUAL value
|
||||
| value OP_NOTEQUAL value
|
||||
| value OP_LOGICAL_AND value
|
||||
| value OP_LOGICAL_OR value
|
||||
| value OP_LSHIFT value
|
||||
| value OP_RSHIFT value'''
|
||||
p[0] = (p[2], p[1], p[3])
|
||||
|
||||
def p_value_uniop(p):
|
||||
'''value : "!" value
|
||||
| "~" value'''
|
||||
p[0] = (p[1], p[2])
|
||||
|
||||
def p_value_cond(p):
|
||||
'''value : value "?" value ":" value'''
|
||||
p[0] = ('?', p[1], p[3], p[5])
|
||||
|
||||
def p_value_funcop(p):
|
||||
'''value : KW_OFFSETOF "(" offset_expression ")"
|
||||
| KW_SHIFTOF "(" IDENTIFIER ")"
|
||||
| KW_SIZEOF "(" IDENTIFIER ")"'''
|
||||
p[0] = (p[1], p[3])
|
||||
|
||||
def p_offset_expression(p):
|
||||
'''offset_expression : IDENTIFIER'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_offset_expression_member(p):
|
||||
'''offset_expression : offset_expression "." IDENTIFIER'''
|
||||
p[0] = ('member', p[1], p[3])
|
||||
|
||||
def p_offset_expression_idx(p):
|
||||
'''offset_expression : offset_expression "[" INT_LITERAL "]"'''
|
||||
p[0] = ('index', p[1], p[3])
|
||||
|
||||
def p_struct_definition(p):
|
||||
'''struct_definition : KW_STRUCT optional_alignment_specifier IDENTIFIER "{" optional_struct_member_list "}" ";"'''
|
||||
p[0] = ('struct', p[3], p[5], p[2])
|
||||
|
||||
def p_optional_alignment_specifier(p):
|
||||
'''optional_alignment_specifier :
|
||||
| KW_ALIGNAS "(" INT_LITERAL ")"'''
|
||||
if len(p) == 1:
|
||||
p[0] = 0
|
||||
else:
|
||||
p[0] = p[3]
|
||||
|
||||
def p_optional_struct_member_list(p):
|
||||
'''optional_struct_member_list :
|
||||
| struct_member_list'''
|
||||
if len(p) == 1:
|
||||
p[0] = {}
|
||||
else:
|
||||
p[0] = p[1]
|
||||
|
||||
def p_struct_member_list(p):
|
||||
'''struct_member_list : struct_member'''
|
||||
p[0] = [p[1]]
|
||||
|
||||
def p_struct_member_list_append(p):
|
||||
'''struct_member_list : struct_member_list struct_member'''
|
||||
p[0] = p[1] + [p[2]]
|
||||
|
||||
def p_struct_member(p):
|
||||
'''struct_member : struct_member_typename IDENTIFIER ";"'''
|
||||
p[0] = (p[1], p[2])
|
||||
|
||||
def p_struct_member_array(p):
|
||||
'''struct_member : struct_member_typename IDENTIFIER "[" INT_LITERAL "]" ";"'''
|
||||
'''struct_member : struct_member_typename IDENTIFIER "[" IDENTIFIER "]" ";"'''
|
||||
p[0] = {p[1]: p[2], 'count': p[4]}
|
||||
|
||||
def p_struct_member_typename(p):
|
||||
'''struct_member_typename : IDENTIFIER'''
|
||||
p[0] = p[1]
|
||||
|
||||
def p_struct_member_typename_unsigned(p):
|
||||
'''struct_member_typename : KW_UNSIGNED IDENTIFIER'''
|
||||
p[0] = ('unsigned', p[2])
|
||||
|
||||
def p_struct_member_typename_struct(p):
|
||||
'''struct_member_typename : KW_STRUCT IDENTIFIER'''
|
||||
p[0] = ('struct', p[2])
|
||||
|
||||
def p_const_definition(p):
|
||||
'''const_definition : KW_CONST IDENTIFIER "=" INT_LITERAL ";"'''
|
||||
p[0] = ('named-constant', p[2], p[4])
|
||||
|
||||
PARSER = yacc.yacc()
|
||||
|
||||
# Shamelessly stolen from some StackOverflow answer
|
||||
def _remove_comments(text):
|
||||
def replacer(match):
|
||||
s = match.group(0)
|
||||
if s.startswith('/'):
|
||||
return " " # note: a space and not an empty string
|
||||
else:
|
||||
return s
|
||||
pattern = re.compile(
|
||||
r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
|
||||
re.DOTALL | re.MULTILINE
|
||||
)
|
||||
return re.sub(pattern, replacer, text)
|
||||
|
||||
def parse_grl_file(grl_fname, libs):
|
||||
global libraries
|
||||
|
||||
libraries = libs
|
||||
with open(grl_fname, 'r') as f:
|
||||
return PARSER.parse(_remove_comments(f.read()))
@ -1,479 +0,0 @@
/*
|
||||
* Copyright © 2022 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* This file contains a redefinition of structures defined in the GRL library.
|
||||
* We need to have those structures defined to allocate & prepare data for
|
||||
* the OpenCL kernels building acceleration structures. Unfortunately because
|
||||
* of C++ & OpenCL assumptions in GRL, it's no possible to just include GRL
|
||||
* header files directly so we have to redefine stuff here.
|
||||
*/
|
||||
|
||||
#ifndef GRL_STRUCTS_H
|
||||
#define GRL_STRUCTS_H
|
||||
|
||||
#include "GRLStructs.h"
|
||||
#include "GRLRTASCommon.h"
|
||||
|
||||
struct MKBuilderState {
|
||||
qword geomDesc_buffer;
|
||||
qword build_primref_buffer;
|
||||
qword build_globals;
|
||||
qword bvh_buffer;
|
||||
dword leaf_type;
|
||||
dword leaf_size;
|
||||
};
|
||||
|
||||
#define PREFIX_MK_STATE(prefix, obj) \
|
||||
(struct prefix##_MKBuilderState) { \
|
||||
.geomDesc_buffer = (obj).geomDesc_buffer, \
|
||||
.build_primref_buffer = (obj).build_primref_buffer, \
|
||||
.build_globals = (obj).build_globals, \
|
||||
.bvh_buffer = (obj).bvh_buffer, \
|
||||
.leaf_type = (obj).leaf_type, \
|
||||
.leaf_size = (obj).leaf_size, \
|
||||
}
|
||||
|
||||
struct MKSizeEstimate {
|
||||
dword numTriangles;
|
||||
dword numProcedurals;
|
||||
dword numPrimitives;
|
||||
dword numMeshes;
|
||||
dword numBuildPrimitives;
|
||||
dword numPrimitivesToSplit;
|
||||
dword instance_descs_start;
|
||||
dword geo_meta_data_start;
|
||||
dword node_data_start;
|
||||
dword leaf_data_start;
|
||||
dword procedural_data_start;
|
||||
dword back_pointer_start;
|
||||
dword sizeTotal;
|
||||
dword updateScratchSizeTotal;
|
||||
dword fatleaf_table_start;
|
||||
dword innernode_table_start;
|
||||
dword max_fatleaves;
|
||||
|
||||
size_t max_instance_leafs;
|
||||
size_t max_inner_nodes;
|
||||
size_t leaf_data_size;
|
||||
size_t min_primitives;
|
||||
size_t max_primitives;
|
||||
};
|
||||
|
||||
#define PREFIX_MK_SIZE(prefix, obj) \
|
||||
(struct prefix##_MKSizeEstimate) { \
|
||||
.numTriangles = (obj).numTriangles, \
|
||||
.numProcedurals = (obj).numProcedurals, \
|
||||
.numPrimitives = (obj).numPrimitives, \
|
||||
.numMeshes = (obj).numMeshes, \
|
||||
.numBuildPrimitives = (obj).numBuildPrimitives, \
|
||||
.numPrimitivesToSplit = (obj).numPrimitivesToSplit, \
|
||||
.instance_descs_start = (obj).instance_descs_start, \
|
||||
.geo_meta_data_start = (obj).geo_meta_data_start, \
|
||||
.node_data_start = (obj).node_data_start, \
|
||||
.leaf_data_start = (obj).leaf_data_start, \
|
||||
.procedural_data_start = (obj).procedural_data_start, \
|
||||
.back_pointer_start = (obj).back_pointer_start, \
|
||||
.sizeTotal = (obj).sizeTotal, \
|
||||
.updateScratchSizeTotal = (obj).updateScratchSizeTotal, \
|
||||
.fatleaf_table_start = (obj).fatleaf_table_start, \
|
||||
.innernode_table_start = (obj).innernode_table_start, \
|
||||
.max_fatleaves = (obj).max_fatleaves, \
|
||||
}
|
||||
|
||||
typedef struct AABB {
|
||||
float lower[4];
|
||||
float upper[4];
|
||||
} AABB;
|
||||
|
||||
struct Globals
|
||||
{
|
||||
struct AABB centroidBounds;
|
||||
|
||||
unsigned int build_record_start;
|
||||
unsigned int numPrimitives;
|
||||
unsigned int leafPrimType;
|
||||
unsigned int leafSize;
|
||||
|
||||
unsigned int numSplittedPrimitives;
|
||||
unsigned int numBuildRecords;
|
||||
|
||||
// spatial split state
|
||||
unsigned int numOriginalPrimitives;
|
||||
float presplitPrioritySum;
|
||||
float probThreshold;
|
||||
|
||||
// binned-sah bfs state
|
||||
unsigned int counter;
|
||||
unsigned int numBuildRecords_extended;
|
||||
|
||||
// sync variable used for global-sync on work groups
|
||||
unsigned int sync;
|
||||
|
||||
|
||||
/* morton code builder state */
|
||||
unsigned int shift; // used by adaptive mc-builder
|
||||
unsigned int shift_mask; // used by adaptive mc-builder
|
||||
unsigned int binary_hierarchy_root;
|
||||
unsigned int p0_allocated_num;
|
||||
unsigned int p0_created_num;
|
||||
unsigned int morton_sort_in_flight;
|
||||
unsigned int sort_iterations;
|
||||
|
||||
gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
|
||||
};
|
||||
|
||||
typedef struct BVHBase
|
||||
{
|
||||
// TODO: Implement the "copy-first-node" trick... duplicate root node here
|
||||
|
||||
uint64_t rootNodeOffset;
|
||||
|
||||
uint32_t reserved;
|
||||
|
||||
uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
|
||||
uint32_t quadLeafStart;
|
||||
uint32_t quadLeafCur;
|
||||
uint32_t proceduralDataStart;
|
||||
uint32_t proceduralDataCur;
|
||||
uint32_t instanceLeafStart;
|
||||
uint32_t instanceLeafEnd;
|
||||
uint32_t backPointerDataStart; //
|
||||
uint32_t refitTreeletsDataStart; // refit structs
|
||||
uint32_t refitStartPointDataStart; //
|
||||
uint32_t BVHDataEnd;
|
||||
|
||||
// number of bottom treelets
|
||||
// if 1, then the bottom treelet is also tip treelet
|
||||
uint32_t refitTreeletCnt;
|
||||
uint32_t refitTreeletCnt2; // always 0, used for atomic updates
|
||||
// data layout:
|
||||
// @backPointerDataStart
|
||||
// 'backpointer' - a dword per inner node.
|
||||
// The bits are used as follows:
|
||||
// 2:0 --> Used as a refit counter during BVH refitting. MBZ
|
||||
// 5:3 --> Number of children
|
||||
// 31:6 --> Index of the parent node in the internal node array
|
||||
// The root node has a parent index of all ones (a decode sketch follows after this struct)
|
||||
// @refitTreeletsDataStart
|
||||
// RefitTreelet[]; the last treelet is the top treelet, all previous ones are bottom treelets
|
||||
// @refitStartPointDataStart
|
||||
// for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space
|
||||
// @backPointerDataEnd
|
||||
|
||||
uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
|
||||
uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
|
||||
uint32_t fatLeafTableStart;
|
||||
uint32_t innerTableStart;
|
||||
|
||||
uint32_t _pad[12];
|
||||
|
||||
struct RTASMetaData Meta;
|
||||
} BVHBase;
|
||||
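/*
 * Hypothetical sketch (not part of GRL): decoding one backpointer dword per
 * the bit layout documented in the BVHBase comment above, i.e. bits 2:0 hold
 * the refit counter, bits 5:3 the number of children, and bits 31:6 the index
 * of the parent node in the internal node array (all ones for the root).
 * Helper names are illustrative only.
 */
static inline uint32_t backpointer_refit_counter(uint32_t bp) { return bp & 0x7; }        /* bits 2:0  */
static inline uint32_t backpointer_num_children(uint32_t bp)  { return (bp >> 3) & 0x7; } /* bits 5:3  */
static inline uint32_t backpointer_parent_index(uint32_t bp)  { return bp >> 6; }         /* bits 31:6 */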
|
||||
|
||||
struct BatchedInitGlobalsData
|
||||
{
|
||||
qword p_build_globals;
|
||||
qword p_bvh_buffer;
|
||||
dword numPrimitives;
|
||||
dword numGeometries;
|
||||
dword numInstances;
|
||||
dword instance_descs_start;
|
||||
dword geo_meta_data_start;
|
||||
dword node_data_start;
|
||||
dword leaf_data_start;
|
||||
dword procedural_data_start;
|
||||
dword back_pointer_start;
|
||||
dword sizeTotal;
|
||||
dword leafType;
|
||||
dword leafSize;
|
||||
dword fatleaf_table_start;
|
||||
dword innernode_table_start;
|
||||
};
|
||||
|
||||
|
||||
#define BFS_NUM_BINS 16
|
||||
#define BFS_NUM_VCONTEXTS 256
|
||||
#define BFS_MAX_DEPTH 32
|
||||
|
||||
#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
|
||||
|
||||
struct BFS_Split
|
||||
{
|
||||
float sah;
|
||||
int dim;
|
||||
int pos;
|
||||
};
|
||||
|
||||
struct BFS_BinInfo
|
||||
{
|
||||
float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
|
||||
// The 6 are lower(xyz) and -upper(xyz)
|
||||
// bins use negated-max so that we can use vectorized mins instead of min/max pairs (see the bin-extend sketch after this struct)
|
||||
uint counts[3 * BFS_NUM_BINS];
|
||||
};
|
||||
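/*
 * Hypothetical sketch (not part of GRL): extending one bin with a primitive's
 * bounds using the negated-max layout described in BFS_BinInfo above. It
 * assumes each bin stores lower(xyz) followed by -upper(xyz), as the comment
 * states; because the upper bounds are stored negated, both halves of the box
 * can be merged with fmin() alone. The function name is illustrative.
 */
static inline void bfs_bin_extend(struct BFS_BinInfo *info, uint axis, uint bin,
                                  const float lower[3], const float upper[3])
{
    float *slot = &info->min_max[(axis * BFS_NUM_BINS + bin) * 6];
    for (uint i = 0; i < 3; i++)
    {
        slot[i] = fmin(slot[i], lower[i]);          /* lower(xyz)  */
        slot[i + 3] = fmin(slot[i + 3], -upper[i]); /* -upper(xyz) */
    }
    info->counts[axis * BFS_NUM_BINS + bin] += 1;
}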
|
||||
struct SAHBuildGlobals
|
||||
{
|
||||
qword p_primref_index_buffers;
|
||||
qword p_primrefs_buffer;
|
||||
qword p_bvh2;
|
||||
qword p_globals; // TODO: deprecate this
|
||||
qword p_bvh_base;
|
||||
gpuva_t p_qnode_root_buffer;
|
||||
|
||||
dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
|
||||
dword num_primrefs;
|
||||
dword leaf_size;
|
||||
dword leaf_type;
|
||||
|
||||
dword root_buffer_num_produced;
|
||||
dword root_buffer_num_produced_hi;
|
||||
dword root_buffer_num_consumed;
|
||||
dword root_buffer_num_consumed_hi;
|
||||
dword root_buffer_num_to_consume;
|
||||
dword root_buffer_num_to_consume_hi;
|
||||
};
|
||||
|
||||
typedef union LRBounds
|
||||
{
|
||||
struct
|
||||
{
|
||||
struct AABB3f left_centroid_bounds;
|
||||
struct AABB3f left_geom_bounds;
|
||||
struct AABB3f right_centroid_bounds;
|
||||
struct AABB3f right_geom_bounds;
|
||||
} boxes;
|
||||
struct
|
||||
{
|
||||
float Array[24];
|
||||
} scalars;
|
||||
} LRBounds;
|
||||
|
||||
|
||||
struct VContext
|
||||
{
|
||||
uint dispatch_primref_begin; // range of primrefs for this task
|
||||
uint dispatch_primref_end;
|
||||
uint bvh2_root; // BVH2 root node for this task
|
||||
uint tree_depth; // depth of this node in the tree
|
||||
uint num_left; // primref counts
|
||||
uint num_right;
|
||||
uint lr_mask; // lower 8b : left mask. upper 8b : right mask
|
||||
uint batch_index;
|
||||
|
||||
// pass1 global working state and output
|
||||
struct BFS_Split split;
|
||||
struct BFS_BinInfo global_bin_info;
|
||||
|
||||
// pass2 global working state and output
|
||||
LRBounds lr_bounds;
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct BFSDispatchRecord
|
||||
{
|
||||
ushort batch_index;
|
||||
ushort context_id;
|
||||
};
|
||||
|
||||
|
||||
struct BFSDispatchQueue
|
||||
{
|
||||
uint num_dispatches;
|
||||
uint wg_count[BFS_NUM_VCONTEXTS];
|
||||
struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
|
||||
};
|
||||
|
||||
struct BFS1SpillStackEntry
|
||||
{
|
||||
uint primref_begin;
|
||||
uint primref_end;
|
||||
uint bvh2_root;
|
||||
ushort tree_depth;
|
||||
ushort batch_index;
|
||||
};
|
||||
|
||||
struct BFS1SpillStack
|
||||
{
|
||||
uint size;
|
||||
struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
|
||||
};
|
||||
|
||||
struct QNodeGlobalRootBufferEntry
|
||||
{
|
||||
uint bvh2_node;
|
||||
uint qnode;
|
||||
uint build_idx;
|
||||
uint _pad;
|
||||
};
|
||||
|
||||
struct QNodeGlobalRootBuffer
|
||||
{
|
||||
uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM (see the sketch after this struct)
|
||||
struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
|
||||
};
|
||||
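/*
 * Hypothetical sketch (not part of GRL): addressing the double-buffered
 * entries array described by the curr_entries_offset comment above. The
 * offset toggles between 0 and QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM so
 * one half can be consumed while the other half is being produced. Helper
 * names are illustrative only.
 */
static inline struct QNodeGlobalRootBufferEntry *
qnode_grb_current_entry(struct QNodeGlobalRootBuffer *grb, uint idx)
{
    return &grb->entries[grb->curr_entries_offset + idx];
}

static inline void qnode_grb_swap_halves(struct QNodeGlobalRootBuffer *grb)
{
    grb->curr_entries_offset =
        QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM - grb->curr_entries_offset;
}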
|
||||
struct DFSDispatchRecord
|
||||
{
|
||||
uint primref_base;
|
||||
uint bvh2_base;
|
||||
uint batch_index;
|
||||
ushort num_primrefs;
|
||||
ushort tree_depth;
|
||||
};
|
||||
|
||||
|
||||
struct DFSDispatchQueue
|
||||
{
|
||||
struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
|
||||
};
|
||||
|
||||
#define VCONTEXT_STATE_EXECUTING 0
|
||||
#define VCONTEXT_STATE_UNALLOCATED 1
|
||||
|
||||
union SchedulerUnion
|
||||
{
|
||||
struct VContextScheduler
|
||||
{
|
||||
/////////////////////////////////////////////////////////////
|
||||
// State data used for communication with command streamer
|
||||
// NOTE: This part must match definition in 'new_sah_builder.grl'
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword num_bfs_wgs;
|
||||
dword num_dfs_wgs;
|
||||
|
||||
dword scheduler_postsync;
|
||||
dword _pad1;
|
||||
|
||||
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
|
||||
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
|
||||
|
||||
dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
|
||||
dword batched_build_loop_mask; // value is 0 if #builds <= #contexts, else 1. The command streamer uses this as a loop condition
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
|
||||
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
|
||||
|
||||
dword vcontext_state[BFS_NUM_VCONTEXTS];
|
||||
|
||||
struct BFSDispatchQueue bfs_queue;
|
||||
struct DFSDispatchQueue dfs_queue;
|
||||
|
||||
struct VContext contexts[BFS_NUM_VCONTEXTS];
|
||||
|
||||
struct BFS1SpillStack bfs2_spill_stack;
|
||||
} vContextScheduler;
|
||||
|
||||
struct QnodeScheduler
|
||||
{
|
||||
dword num_qnode_grb_curr_entries;
|
||||
dword num_qnode_grb_new_entries;
|
||||
|
||||
dword scheduler_postsync;
|
||||
dword _pad1;
|
||||
|
||||
dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
|
||||
dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
|
||||
|
||||
dword batched_builds_to_process;
|
||||
dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
|
||||
dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
|
||||
|
||||
struct QNodeGlobalRootBuffer qnode_global_root_buffer;
|
||||
} qnodeScheduler;
|
||||
};
|
||||
|
||||
|
||||
struct BVH2Node
|
||||
{
|
||||
struct AABB3f box;
|
||||
uint meta_u; // leaf: primref start. inner: offset from node to its first child
|
||||
uint meta_ss;
|
||||
//ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
|
||||
//uchar is_inner; // 1 if inner, 0 if leaf
|
||||
//uchar mask;
|
||||
};
|
||||
|
||||
struct BVH2
|
||||
{
|
||||
uint num_nodes;
|
||||
uint _pad[7]; // align to 32B
|
||||
};
|
||||
|
||||
struct BatchedBLSDispatchEntry
|
||||
{
|
||||
/////////////////////////////////////////////////////////////
|
||||
// State data used for communication with command streamer
|
||||
// NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
|
||||
/////////////////////////////////////////////////////////////
|
||||
qword p_data_buffer;
|
||||
qword num_elements; // number of elements in p_data_buffer
|
||||
};
|
||||
|
||||
struct SAHBuildArgsBatchable
|
||||
{
|
||||
qword p_globals_ptrs;
|
||||
qword p_scheduler;
|
||||
qword p_buffers_info;
|
||||
qword p_sah_globals;
|
||||
|
||||
dword num_max_qnode_global_root_buffer_entries;
|
||||
dword num_builds;
|
||||
};
|
||||
|
||||
#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj) \
|
||||
(struct prefix##_SAHBuildArgsBatchable) { \
|
||||
.p_globals_ptrs = (obj).p_globals_ptrs, \
|
||||
.p_scheduler = (obj).p_scheduler, \
|
||||
.p_buffers_info = (obj).p_buffers_info, \
|
||||
.p_sah_globals = (obj).p_sah_globals, \
|
||||
.num_max_qnode_global_root_buffer_entries = \
|
||||
(obj).num_max_qnode_global_root_buffer_entries, \
|
||||
.num_builds = (obj).num_builds, \
|
||||
}
|
||||
|
||||
|
||||
struct SAHBuildBuffersInfo
|
||||
{
|
||||
gpuva_t p_globals;
|
||||
gpuva_t p_primref_index_buffers;
|
||||
gpuva_t p_primrefs_buffer;
|
||||
gpuva_t p_bvh2;
|
||||
gpuva_t p_bvh_base;
|
||||
gpuva_t p_qnode_root_buffer;
|
||||
dword sah_globals_flags;
|
||||
dword _pad;
|
||||
gpuva_t _pad2;
|
||||
};
|
||||
|
||||
#endif /* GRL_STRUCTS_H */
@ -1,459 +0,0 @@
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GRLRTASCommon.h"
|
||||
|
||||
#include "affinespace.h"
|
||||
|
||||
#ifndef __OPENCL_VERSION__
|
||||
# include "stdio.h" //for printf
|
||||
#endif
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
GRL_NAMESPACE_BEGIN(RTAS)
|
||||
|
||||
GRL_INLINE void AABB3f_init(struct AABB3f *aabb)
|
||||
{
|
||||
aabb->lower[0] = (float)(INFINITY);
|
||||
aabb->lower[1] = (float)(INFINITY);
|
||||
aabb->lower[2] = (float)(INFINITY);
|
||||
|
||||
aabb->upper[0] = -(float)(INFINITY);
|
||||
aabb->upper[1] = -(float)(INFINITY);
|
||||
aabb->upper[2] = -(float)(INFINITY);
|
||||
}
|
||||
|
||||
GRL_INLINE float3 AABB3f_load_lower( const struct AABB3f* aabb )
|
||||
{
|
||||
float3 v = { aabb->lower[0], aabb->lower[1], aabb->lower[2] };
|
||||
return v;
|
||||
}
|
||||
GRL_INLINE float3 AABB3f_load_upper( const struct AABB3f* aabb )
|
||||
{
|
||||
float3 v = { aabb->upper[0], aabb->upper[1], aabb->upper[2] };
|
||||
return v;
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_extend(struct AABB3f *aabb, const struct AABB3f *v)
|
||||
{
|
||||
aabb->lower[0] = fmin(aabb->lower[0], v->lower[0]);
|
||||
aabb->lower[1] = fmin(aabb->lower[1], v->lower[1]);
|
||||
aabb->lower[2] = fmin(aabb->lower[2], v->lower[2]);
|
||||
aabb->upper[0] = fmax(aabb->upper[0], v->upper[0]);
|
||||
aabb->upper[1] = fmax(aabb->upper[1], v->upper[1]);
|
||||
aabb->upper[2] = fmax(aabb->upper[2], v->upper[2]);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_intersect(struct AABB3f* aabb, struct AABB3f inters)
|
||||
{
|
||||
aabb->upper[0] = fmin(inters.upper[0],aabb->upper[0]);
|
||||
aabb->upper[1] = fmin(inters.upper[1],aabb->upper[1]);
|
||||
aabb->upper[2] = fmin(inters.upper[2],aabb->upper[2]);
|
||||
aabb->lower[0] = fmax(inters.lower[0],aabb->lower[0]);
|
||||
aabb->lower[1] = fmax(inters.lower[1],aabb->lower[1]);
|
||||
aabb->lower[2] = fmax(inters.lower[2],aabb->lower[2]);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_trim_upper(struct AABB3f* aabb, const float* upper)
|
||||
{
|
||||
aabb->upper[0] = fmin(upper[0], aabb->upper[0]);
|
||||
aabb->upper[1] = fmin(upper[1], aabb->upper[1]);
|
||||
aabb->upper[2] = fmin(upper[2], aabb->upper[2]);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_set( struct AABB3f* aabb, float3 lower, float3 upper )
|
||||
{
|
||||
aabb->lower[0] = lower.x ;
|
||||
aabb->lower[1] = lower.y ;
|
||||
aabb->lower[2] = lower.z ;
|
||||
aabb->upper[0] = upper.x ;
|
||||
aabb->upper[1] = upper.y ;
|
||||
aabb->upper[2] = upper.z ;
|
||||
}
|
||||
|
||||
inline void AABB3f_extend_point(struct AABB3f *aabb, const float3 p)
|
||||
{
|
||||
aabb->lower[0] = fmin(aabb->lower[0], p.x);
|
||||
aabb->lower[1] = fmin(aabb->lower[1], p.y);
|
||||
aabb->lower[2] = fmin(aabb->lower[2], p.z);
|
||||
aabb->upper[0] = fmax(aabb->upper[0], p.x);
|
||||
aabb->upper[1] = fmax(aabb->upper[1], p.y);
|
||||
aabb->upper[2] = fmax(aabb->upper[2], p.z);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_extendlu(struct AABB3f *aabb, const float3 lower, const float3 upper)
|
||||
{
|
||||
aabb->lower[0] = fmin(aabb->lower[0], lower.x);
|
||||
aabb->lower[1] = fmin(aabb->lower[1], lower.y);
|
||||
aabb->lower[2] = fmin(aabb->lower[2], lower.z);
|
||||
aabb->upper[0] = fmax(aabb->upper[0], upper.x);
|
||||
aabb->upper[1] = fmax(aabb->upper[1], upper.y);
|
||||
aabb->upper[2] = fmax(aabb->upper[2], upper.z);
|
||||
}
|
||||
|
||||
GRL_INLINE float3 AABB3f_size(struct AABB3f* aabb)
|
||||
{
|
||||
return AABB3f_load_upper(aabb) - AABB3f_load_lower(aabb);
|
||||
}
|
||||
|
||||
GRL_INLINE float AABB3f_halfArea(struct AABB3f *aabb)
|
||||
{
|
||||
const float3 d = AABB3f_load_upper( aabb ) - AABB3f_load_lower( aabb );
|
||||
return d.x* (d.y + d.z) + (d.y * d.z);
|
||||
}
|
||||
|
||||
GRL_INLINE float halfArea_AABB3f(struct AABB3f *aabb) // TODO: Remove me
|
||||
{
|
||||
const float3 d = { aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2] };
|
||||
return fma(d.x, (d.y + d.z), d.y * d.z);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_set_lower(struct AABB3f* aabb, float3 lower)
|
||||
{
|
||||
aabb->lower[0] = lower.x;
|
||||
aabb->lower[1] = lower.y;
|
||||
aabb->lower[2] = lower.z;
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_set_upper(struct AABB3f* aabb, float3 upper)
|
||||
{
|
||||
aabb->upper[0] = upper.x;
|
||||
aabb->upper[1] = upper.y;
|
||||
aabb->upper[2] = upper.z;
|
||||
}
|
||||
|
||||
GRL_INLINE float3 conservativeExtent(float3 extent)
|
||||
{
|
||||
const float v = FLT_EPSILON * fmax(extent.x, fmax(extent.y, extent.z));
|
||||
float3 v3 = { v,v,v };
|
||||
extent = extent + v3;
|
||||
return extent;
|
||||
}
|
||||
|
||||
inline struct AABB3f GRL_OVERLOADABLE transform_aabb(float3 lower, float3 upper, const float* Transform)
|
||||
{
|
||||
#if 1
|
||||
// We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area
|
||||
// New AABB is center +- Extent.
|
||||
//
|
||||
// For derivation see:
|
||||
// https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/
|
||||
//
|
||||
|
||||
float3 Center = (upper + lower) * 0.5f;
|
||||
float3 Extent = (conservativeExtent(upper) - lower) * 0.5f;
|
||||
|
||||
float cx = Center.x * Transform[0] + Center.y * Transform[1] + Center.z * Transform[2] + Transform[3];
|
||||
float cy = Center.x * Transform[4] + Center.y * Transform[5] + Center.z * Transform[6] + Transform[7];
|
||||
float cz = Center.x * Transform[8] + Center.y * Transform[9] + Center.z * Transform[10] + Transform[11];
|
||||
float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]);
|
||||
float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]);
|
||||
float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]);
|
||||
|
||||
Center.x = cx; Center.y = cy; Center.z = cz;
|
||||
Extent.x = ex; Extent.y = ey; Extent.z = ez;
|
||||
|
||||
struct AABB3f box;
|
||||
AABB3f_set_lower(&box, Center - Extent);
|
||||
AABB3f_set_upper(&box, Center + Extent);
|
||||
return box;
|
||||
#else
|
||||
struct AffineSpace3f xfm = AffineSpace3f_load_row_major(Transform);
|
||||
|
||||
float3 plll = { lower.x, lower.y, lower.z };
|
||||
float3 pllu = { lower.x, lower.y, upper.z };
|
||||
float3 plul = { lower.x, upper.y, lower.z };
|
||||
float3 pluu = { lower.x, upper.y, upper.z };
|
||||
float3 pull = { upper.x, lower.y, lower.z };
|
||||
float3 pulu = { upper.x, lower.y, upper.z };
|
||||
float3 puul = { upper.x, upper.y, lower.z };
|
||||
float3 puuu = { upper.x, upper.y, upper.z };
|
||||
plll = xfmPoint(xfm, plll) ;
|
||||
pllu = xfmPoint(xfm, pllu) ;
|
||||
plul = xfmPoint(xfm, plul) ;
|
||||
pluu = xfmPoint(xfm, pluu) ;
|
||||
pull = xfmPoint(xfm, pull) ;
|
||||
pulu = xfmPoint(xfm, pulu) ;
|
||||
puul = xfmPoint(xfm, puul) ;
|
||||
puuu = xfmPoint(xfm, puuu) ;
|
||||
|
||||
float3 p1_min = fmin(plll, pull);
|
||||
float3 p2_min = fmin(pllu, pulu);
|
||||
float3 p3_min = fmin(plul, puul);
|
||||
float3 p4_min = fmin(pluu, puuu);
|
||||
float3 p1_max = fmax(plll, pull);
|
||||
float3 p2_max = fmax(pllu, pulu);
|
||||
float3 p3_max = fmax(plul, puul);
|
||||
float3 p4_max = fmax(pluu, puuu);
|
||||
p1_min = fmin(p1_min, p3_min);
|
||||
p2_min = fmin(p2_min, p4_min);
|
||||
p1_max = fmax(p1_max, p3_max);
|
||||
p2_max = fmax(p2_max, p4_max);
|
||||
p1_min = fmin(p1_min, p2_min);
|
||||
p1_max = fmax(p1_max, p2_max);
|
||||
|
||||
AABB3f out = {
|
||||
{p1_min.x,p1_min.y,p1_min.z},
|
||||
{p1_max.x,p1_max.y,p1_max.z}
|
||||
};
|
||||
return out;
|
||||
#endif
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB3f GRL_OVERLOADABLE transform_aabb(struct AABB3f box, const float* Transform)
|
||||
{
|
||||
float3 lower = { box.lower[0], box.lower[1], box.lower[2] };
|
||||
float3 upper = { box.upper[0], box.upper[1], box.upper[2] };
|
||||
return transform_aabb(lower, upper, Transform);
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB3f AABB3f_transform(struct AffineSpace3f xfm, struct AABB3f in)
|
||||
{
|
||||
struct AABB3f out;
|
||||
float rmTransform[12];
|
||||
load_row_major_from_AffineSpace3f(xfm, rmTransform);
|
||||
out = transform_aabb(in, rmTransform);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
GRL_INLINE bool AABB3f_isIn(struct AABB3f bigger, float3 contained)
|
||||
{
|
||||
bool iscontained =
|
||||
contained.x >= bigger.lower[0] &&
|
||||
contained.y >= bigger.lower[1] &&
|
||||
contained.z >= bigger.lower[2] &&
|
||||
contained.x <= bigger.upper[0] &&
|
||||
contained.y <= bigger.upper[1] &&
|
||||
contained.z <= bigger.upper[2];
|
||||
|
||||
return iscontained;
|
||||
}
|
||||
|
||||
GRL_INLINE bool AABB3f_isSubset(struct AABB3f bigger, struct AABB3f contained)
|
||||
{
|
||||
bool iscontained =
|
||||
contained.lower[0] >= bigger.lower[0] &&
|
||||
contained.lower[1] >= bigger.lower[1] &&
|
||||
contained.lower[2] >= bigger.lower[2] &&
|
||||
contained.upper[0] <= bigger.upper[0] &&
|
||||
contained.upper[1] <= bigger.upper[1] &&
|
||||
contained.upper[2] <= bigger.upper[2];
|
||||
|
||||
return iscontained;
|
||||
}
|
||||
|
||||
GRL_INLINE bool AABB3f_is_degenerate(struct AABB3f* box )
|
||||
{
|
||||
return box->lower[0] > box->upper[0] ||
|
||||
box->lower[1] > box->upper[1] ||
|
||||
box->lower[2] > box->upper[2];
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_print(struct AABB3f *aabb)
|
||||
{
|
||||
printf("AABB {\n");
|
||||
printf(" lower = %f, %f, %f\n", aabb->lower[0], aabb->lower[1], aabb->lower[2]);
|
||||
printf(" upper = %f, %f, %f\n", aabb->upper[0], aabb->upper[1], aabb->upper[2]);
|
||||
printf("}\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifdef __OPENCL_VERSION__
|
||||
GRL_INLINE struct AABB3f AABB3f_sub_group_shuffle(struct AABB3f *aabb, const uint slotID)
|
||||
{
|
||||
struct AABB3f bounds;
|
||||
bounds.lower[0] = intel_sub_group_shuffle(aabb->lower[0], slotID);
|
||||
bounds.lower[1] = intel_sub_group_shuffle(aabb->lower[1], slotID);
|
||||
bounds.lower[2] = intel_sub_group_shuffle(aabb->lower[2], slotID);
|
||||
bounds.upper[0] = intel_sub_group_shuffle(aabb->upper[0], slotID);
|
||||
bounds.upper[1] = intel_sub_group_shuffle(aabb->upper[1], slotID);
|
||||
bounds.upper[2] = intel_sub_group_shuffle(aabb->upper[2], slotID);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB3f AABB3f_sub_group_reduce(struct AABB3f *aabb)
|
||||
{
|
||||
struct AABB3f bounds;
|
||||
bounds.lower[0] = sub_group_reduce_min(aabb->lower[0]);
|
||||
bounds.lower[1] = sub_group_reduce_min(aabb->lower[1]);
|
||||
bounds.lower[2] = sub_group_reduce_min(aabb->lower[2]);
|
||||
bounds.upper[0] = sub_group_reduce_max(aabb->upper[0]);
|
||||
bounds.upper[1] = sub_group_reduce_max(aabb->upper[1]);
|
||||
bounds.upper[2] = sub_group_reduce_max(aabb->upper[2]);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB3f AABB3f_sub_group_scan_exclusive_min_max(struct AABB3f *aabb)
|
||||
{
|
||||
struct AABB3f bounds;
|
||||
bounds.lower[0] = sub_group_scan_exclusive_min(aabb->lower[0]);
|
||||
bounds.lower[1] = sub_group_scan_exclusive_min(aabb->lower[1]);
|
||||
bounds.lower[2] = sub_group_scan_exclusive_min(aabb->lower[2]);
|
||||
bounds.upper[0] = sub_group_scan_exclusive_max(aabb->upper[0]);
|
||||
bounds.upper[1] = sub_group_scan_exclusive_max(aabb->upper[1]);
|
||||
bounds.upper[2] = sub_group_scan_exclusive_max(aabb->upper[2]);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AABB3f AABB3f_sub_group_scan_inclusive_min_max(struct AABB3f *aabb)
|
||||
{
|
||||
struct AABB3f bounds;
|
||||
bounds.lower[0] = sub_group_scan_inclusive_min(aabb->lower[0]);
|
||||
bounds.lower[1] = sub_group_scan_inclusive_min(aabb->lower[1]);
|
||||
bounds.lower[2] = sub_group_scan_inclusive_min(aabb->lower[2]);
|
||||
bounds.upper[0] = sub_group_scan_inclusive_max(aabb->upper[0]);
|
||||
bounds.upper[1] = sub_group_scan_inclusive_max(aabb->upper[1]);
|
||||
bounds.upper[2] = sub_group_scan_inclusive_max(aabb->upper[2]);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_atomic_merge_local_nocheck(local struct AABB3f *aabb, const float4 lower, const float4 upper)
|
||||
{
|
||||
atomic_min((local float *)&aabb->lower + 0, lower.x);
|
||||
atomic_min((local float *)&aabb->lower + 1, lower.y);
|
||||
atomic_min((local float *)&aabb->lower + 2, lower.z);
|
||||
atomic_max((local float *)&aabb->upper + 0, upper.x);
|
||||
atomic_max((local float *)&aabb->upper + 1, upper.y);
|
||||
atomic_max((local float *)&aabb->upper + 2, upper.z);
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE void AABB3f_atomic_merge_global_lu( global struct AABB3f* aabb, const float3 lower, const float3 upper )
|
||||
{
|
||||
atomic_min( (global float*) & aabb->lower + 0, lower.x );
|
||||
atomic_min( (global float*) & aabb->lower + 1, lower.y );
|
||||
atomic_min( (global float*) & aabb->lower + 2, lower.z );
|
||||
atomic_max( (global float*) & aabb->upper + 0, upper.x );
|
||||
atomic_max( (global float*) & aabb->upper + 1, upper.y );
|
||||
atomic_max( (global float*) & aabb->upper + 2, upper.z );
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_atomic_merge_local_lu( local struct AABB3f* aabb, const float3 lower, const float3 upper )
|
||||
{
|
||||
atomic_min( (local float*) & aabb->lower + 0, lower.x );
|
||||
atomic_min( (local float*) & aabb->lower + 1, lower.y );
|
||||
atomic_min( (local float*) & aabb->lower + 2, lower.z );
|
||||
atomic_max( (local float*) & aabb->upper + 0, upper.x );
|
||||
atomic_max( (local float*) & aabb->upper + 1, upper.y );
|
||||
atomic_max( (local float*) & aabb->upper + 2, upper.z );
|
||||
}
|
||||
|
||||
GRL_INLINE void Uniform_AABB3f_atomic_merge_local_sub_group_lu(uniform local struct AABB3f* aabb, const float3 lower, const float3 upper)
|
||||
{
|
||||
float lx = sub_group_reduce_min(lower.x);
|
||||
float ly = sub_group_reduce_min(lower.y);
|
||||
float lz = sub_group_reduce_min(lower.z);
|
||||
|
||||
float ux = sub_group_reduce_max(upper.x);
|
||||
float uy = sub_group_reduce_max(upper.y);
|
||||
float uz = sub_group_reduce_max(upper.z);
|
||||
|
||||
if (get_sub_group_local_id() == 0)
|
||||
{
|
||||
atomic_min((local float*) & aabb->lower + 0, lx);
|
||||
atomic_min((local float*) & aabb->lower + 1, ly);
|
||||
atomic_min((local float*) & aabb->lower + 2, lz);
|
||||
atomic_max((local float*) & aabb->upper + 0, ux);
|
||||
atomic_max((local float*) & aabb->upper + 1, uy);
|
||||
atomic_max((local float*) & aabb->upper + 2, uz);
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_atomic_merge_global_sub_group_lu(uniform global struct AABB3f* aabb, const float3 lower, const float3 upper)
|
||||
{
|
||||
uint lane = get_sub_group_local_id();
|
||||
float l[3];
|
||||
l[0] = sub_group_reduce_min(lower.x);
|
||||
l[1] = sub_group_reduce_min(lower.y);
|
||||
l[2] = sub_group_reduce_min(lower.z);
|
||||
float u[3];
|
||||
u[0] = sub_group_reduce_max(upper.x);
|
||||
u[1] = sub_group_reduce_max(upper.y);
|
||||
u[2] = sub_group_reduce_max(upper.z);
|
||||
|
||||
if (lane < 3)
|
||||
{
|
||||
atomic_min((global float*)&aabb->lower + lane, l[lane]);
|
||||
atomic_max((global float*)&aabb->upper + lane, u[lane]);
|
||||
}
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_atomic_merge_global( global struct AABB3f* aabb, struct AABB3f* other )
|
||||
{
|
||||
float3 lower = AABB3f_load_lower( other );
|
||||
float3 upper = AABB3f_load_upper( other );
|
||||
atomic_min( (global float*) & aabb->lower + 0, lower.x );
|
||||
atomic_min( (global float*) & aabb->lower + 1, lower.y );
|
||||
atomic_min( (global float*) & aabb->lower + 2, lower.z );
|
||||
atomic_max( (global float*) & aabb->upper + 0, upper.x );
|
||||
atomic_max( (global float*) & aabb->upper + 1, upper.y );
|
||||
atomic_max( (global float*) & aabb->upper + 2, upper.z );
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_atomic_merge_localBB_nocheck( local struct AABB3f* aabb, struct AABB3f* bb )
|
||||
{
|
||||
atomic_min( (local float*) & aabb->lower + 0, bb->lower[0] );
|
||||
atomic_min( (local float*) & aabb->lower + 1, bb->lower[1] );
|
||||
atomic_min( (local float*) & aabb->lower + 2, bb->lower[2] );
|
||||
atomic_max( (local float*) & aabb->upper + 0, bb->upper[0] );
|
||||
atomic_max( (local float*) & aabb->upper + 1, bb->upper[1] );
|
||||
atomic_max( (local float*) & aabb->upper + 2, bb->upper[2] );
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_atomic_merge_local(local struct AABB3f *aabb, const float4 lower, const float4 upper)
|
||||
{
|
||||
if (lower.x < aabb->lower[0])
|
||||
atomic_min((local float *)&aabb->lower + 0, lower.x);
|
||||
if (lower.y < aabb->lower[1])
|
||||
atomic_min((local float *)&aabb->lower + 1, lower.y);
|
||||
if (lower.z < aabb->lower[2])
|
||||
atomic_min((local float *)&aabb->lower + 2, lower.z);
|
||||
if (upper.x > aabb->upper[0])
|
||||
atomic_max((local float *)&aabb->upper + 0, upper.x);
|
||||
if (upper.y > aabb->upper[1])
|
||||
atomic_max((local float *)&aabb->upper + 1, upper.y);
|
||||
if (upper.z > aabb->upper[2])
|
||||
atomic_max((local float *)&aabb->upper + 2, upper.z);
|
||||
}
|
||||
|
||||
GRL_INLINE void AABB3f_atomic_merge_global_local(global struct AABB3f *dest, local struct AABB3f *source)
|
||||
{
|
||||
float3 l = AABB3f_load_lower(source);
|
||||
float3 u = AABB3f_load_upper(source);
|
||||
atomic_min((global float *)&dest->lower + 0, l.x );
|
||||
atomic_min((global float *)&dest->lower + 1, l.y );
|
||||
atomic_min((global float *)&dest->lower + 2, l.z );
|
||||
atomic_max((global float *)&dest->upper + 0, u.x );
|
||||
atomic_max((global float *)&dest->upper + 1, u.y );
|
||||
atomic_max((global float *)&dest->upper + 2, u.z );
|
||||
}
|
||||
|
||||
|
||||
struct AABB3f AABB3f_construct( float3 min, float3 max )
|
||||
{
|
||||
struct AABB3f bb;
|
||||
bb.lower[0] = min.x; bb.lower[1] = min.y; bb.lower[2] = min.z;
|
||||
bb.upper[0] = max.x; bb.upper[1] = max.y; bb.upper[2] = max.z;
|
||||
return bb;
|
||||
}
|
||||
|
||||
struct AABB3f AABB3f_select( struct AABB3f left, struct AABB3f right, int3 cond )
|
||||
{
|
||||
float3 l = select( AABB3f_load_lower(&left), AABB3f_load_lower(&right), cond );
|
||||
float3 u = select( AABB3f_load_upper(&left), AABB3f_load_upper(&right), cond );
|
||||
return AABB3f_construct( l, u );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
GRL_NAMESPACE_END(RTAS)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
||||
|
|
@ -1,691 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
//
|
||||
// This file is to contain structure definitions related to the Gen12 QBVH6 acceleration structures
|
||||
//
|
||||
//
|
||||
|
||||
//********************************************************************************************
|
||||
// WARNING!!!!!
|
||||
// This file is shared by OpenCL and C++ source code and must be compatible.
|
||||
// There should only be C structure definitions and trivial GRL_INLINE functions here
|
||||
//
|
||||
//********************************************************************************************
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GRLRTASCommon.h"
|
||||
#include "GRLUtilities.h"
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
GRL_NAMESPACE_BEGIN(RTAS)
|
||||
GRL_NAMESPACE_BEGIN(GEN12)
|
||||
|
||||
enum_uint8(NodeType)
|
||||
{
|
||||
NODE_TYPE_MIXED = 0x0, // identifies a mixed internal node where each child can have a different type
|
||||
NODE_TYPE_INTERNAL = 0x0, // internal BVH node with 6 children
|
||||
NODE_TYPE_INSTANCE = 0x1, // instance leaf
|
||||
NODE_TYPE_PROCEDURAL = 0x3, // procedural leaf
|
||||
NODE_TYPE_QUAD = 0x4, // quad leaf
|
||||
NODE_TYPE_INVALID = 0x7 // indicates invalid node
|
||||
};
|
||||
|
||||
|
||||
typedef enum PrimLeafType
|
||||
{
|
||||
TYPE_NONE = 0,
|
||||
|
||||
TYPE_QUAD = 0,
|
||||
|
||||
/* For a node type of NODE_TYPE_PROCEDURAL we support enabling
|
||||
* and disabling the opaque/non_opaque culling. */
|
||||
|
||||
TYPE_OPACITY_CULLING_ENABLED = 0,
|
||||
TYPE_OPACITY_CULLING_DISABLED = 1
|
||||
} PrimLeafType;
|
||||
|
||||
#define BVH_MAGIC_MACRO "GEN12_RTAS_005" // If serialization-breaking or algorithm-breaking changes are made, increment the digits at the end
|
||||
static const char BVH_MAGIC[16] = BVH_MAGIC_MACRO;
|
||||
|
||||
typedef struct BVHBase
|
||||
{
|
||||
// TODO: Implement the "copy-first-node" trick... duplicate root node here
|
||||
|
||||
uint64_t rootNodeOffset;
|
||||
|
||||
uint32_t reserved;
|
||||
|
||||
uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
|
||||
uint32_t quadLeafStart;
|
||||
uint32_t quadLeafCur;
|
||||
uint32_t proceduralDataStart;
|
||||
uint32_t proceduralDataCur;
|
||||
uint32_t instanceLeafStart;
|
||||
uint32_t instanceLeafEnd;
|
||||
uint32_t backPointerDataStart; //
|
||||
uint32_t refitTreeletsDataStart; // refit structs
|
||||
uint32_t refitStartPointDataStart; //
|
||||
uint32_t BVHDataEnd;
|
||||
|
||||
// number of bottom treelets
|
||||
// if 1, then the bottom treelet is also tip treelet
|
||||
uint32_t refitTreeletCnt;
|
||||
uint32_t refitTreeletCnt2; // always 0, used for atomic updates
|
||||
// data layout:
|
||||
// @backPointerDataStart
|
||||
// 'backpointer' - a dword per inner node.
|
||||
// The bits are used as follows:
|
||||
// 2:0 --> Used as a refit counter during BVH refitting. MBZ
|
||||
// 5:3 --> Number of children
|
||||
// 31:6 --> Index of the parent node in the internal node array
|
||||
// The root node has a parent index of all ones
|
||||
// @refitTreeletsDataStart
|
||||
// RefitTreelet[], the last treelet is for top treelet all previous are for bottom
|
||||
// @refitStartPointDataStart
|
||||
// for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space
|
||||
// @backPointerDataEnd
|
||||
|
||||
uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
|
||||
uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
|
||||
uint32_t fatLeafTableStart;
|
||||
uint32_t innerTableStart;
|
||||
|
||||
uint32_t quadLeftoversCountNewAtomicUpdate; // number of quad leftovers for new atomic update
|
||||
uint32_t quadTableSizeNewAtomicUpdate; // size of quad Table including leftovers, padded to 256
|
||||
uint32_t quadIndicesDataStart;
|
||||
|
||||
uint32_t _pad[9];
|
||||
|
||||
struct RTASMetaData Meta;
|
||||
|
||||
} BVHBase;
|
||||
|
||||
GRL_INLINE struct GeoMetaData* BVHBase_GetGeoMetaData(BVHBase* base)
|
||||
{
|
||||
return (struct GeoMetaData*)(((char*)base) + base->Meta.geoDescsStart);
|
||||
}
|
||||
|
||||
#ifdef __OPENCL_VERSION__
|
||||
#define BVH_ROOT_NODE_OFFSET sizeof(BVHBase)
|
||||
#else
|
||||
#define BVH_ROOT_NODE_OFFSET sizeof(GRL::RTAS::GEN12::BVHBase)
|
||||
#endif
|
||||
|
||||
GRL_STATIC_ASSERT( sizeof(BVHBase) == BVH_ROOT_NODE_OFFSET, "Wrong size!");
|
||||
GRL_STATIC_ASSERT( (sizeof(BVHBase) % 64) == 0 , "Misaligned size!");
|
||||
|
||||
typedef struct BackPointers {
|
||||
} BackPointers;
|
||||
|
||||
// threshold for size of bottom treelets, note usually treelets will be 2-3x smaller than that number
|
||||
// means that no bottom treelet has more paths than this number
|
||||
#define TREELET_NUM_STARTPOINTS 1536
|
||||
|
||||
// threshold under which only one treelet will be created
|
||||
#define SINGLE_TREELET_THRESHOLD 3072
|
||||
|
||||
typedef struct LeafTableEntry {
|
||||
|
||||
uint backpointer;
|
||||
uint inner_node_index;
|
||||
uint leaf_index;
|
||||
} LeafTableEntry;
|
||||
|
||||
typedef struct InnerNodeTableEntry {
|
||||
|
||||
uint node_index_and_numchildren; // numchildren in 3 lsbs
|
||||
uint first_child;
|
||||
|
||||
} InnerNodeTableEntry;
|
||||
|
||||
typedef struct QuadDataIndices
|
||||
{
|
||||
uint header_data[4];
|
||||
uint vert_idx[4];
|
||||
} QuadDataIndices;
|
||||
|
||||
typedef struct RefitTreelet {
|
||||
uint32_t startpoint_offset;
|
||||
uint32_t numStartpoints;
|
||||
uint32_t numNonTrivialStartpoints;
|
||||
uint8_t maxDepth;
|
||||
uint8_t depthLess64; // depth from the bottom at which there are fewer than 64 paths
uint8_t depthLess128; // depth from the bottom at which there are fewer than 128 paths
uint8_t depthLess256; // depth from the bottom at which there are fewer than 256 paths
|
||||
} RefitTreelet;
|
||||
|
||||
// if RefitTreelet has number of startpoints == 1
|
||||
// it should be reinterpreted as:
|
||||
typedef struct RefitTreeletTrivial {
|
||||
uint32_t theOnlyNodeIndex;
|
||||
uint32_t numStartpoints; // have to be 1 or 0
|
||||
int32_t childrenOffsetOfTheNode; // 0th node based
|
||||
uint8_t maxDepth;
|
||||
uint8_t numChildrenOfTheNode;
|
||||
} RefitTreeletTrivial;
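
// Illustrative sketch (assumed helper, not part of the original header): a RefitTreelet
// whose numStartpoints is 0 or 1 is meant to be reinterpreted in place as the
// RefitTreeletTrivial layout above.
GRL_INLINE bool RefitTreelet_IsTrivial( const RefitTreelet* t )
{
    return t->numStartpoints <= 1;
}
// Usage sketch:
//   RefitTreelet desc = BVHBase_GetRefitTreeletDescs(bvh)[i];
//   if (RefitTreelet_IsTrivial(&desc)) {
//       RefitTreeletTrivial triv = *(RefitTreeletTrivial*)&desc; // same storage, different view
//       /* use triv.theOnlyNodeIndex / triv.childrenOffsetOfTheNode */
//   }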
|
||||
|
||||
// 5:0 - depth after you die
|
||||
// 31:6 - Index of the inner node
|
||||
typedef uint32_t StartPoint;
|
||||
|
||||
struct HwInstanceLeaf;
|
||||
struct QuadLeaf;
|
||||
struct ProceduralLeaf;
|
||||
struct InternalNode;
|
||||
|
||||
typedef struct HwInstanceLeaf HwInstanceLeaf;
|
||||
typedef struct InternalNode InternalNode;
|
||||
typedef struct QuadLeaf QuadLeaf;
|
||||
typedef struct ProceduralLeaf ProceduralLeaf;
|
||||
|
||||
GRL_INLINE uint32_t BackPointer_GetParentIndex( uint32_t bp )
|
||||
{
|
||||
return bp >> 6;
|
||||
}
|
||||
GRL_INLINE uint32_t BackPointer_GetNumChildren( uint32_t bp )
|
||||
{
|
||||
return (bp >> 3) & (7);
|
||||
}
|
||||
GRL_INLINE uint32_t BackPointer_GetRefitCount( uint32_t bp )
|
||||
{
|
||||
return bp & 7;
|
||||
}
|
||||
GRL_INLINE bool BackPointer_IsRoot( uint32_t bp )
|
||||
{
|
||||
return (bp >> 6) == 0x03FFFFFF;
|
||||
}
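
// Illustrative sketch (hypothetical helper, not in the original file): assembling a
// backpointer dword with the layout described in BVHBase (bits 2:0 refit counter,
// bits 5:3 child count, bits 31:6 parent node index) and decoding it with the
// accessors above.
GRL_INLINE uint32_t BackPointer_Pack( uint32_t parentIndex, uint32_t numChildren )
{
    return (parentIndex << 6) | ((numChildren & 7) << 3); // refit counter starts at 0 (MBZ)
}
// Example: bp = BackPointer_Pack(5, 2) gives
//   BackPointer_GetParentIndex(bp) == 5
//   BackPointer_GetNumChildren(bp) == 2
//   BackPointer_GetRefitCount(bp)  == 0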
|
||||
|
||||
GRL_INLINE InternalNode* BVHBase_GetRootNode( const BVHBase* p )
|
||||
{
|
||||
return (InternalNode*)( ((char*)p) + BVH_ROOT_NODE_OFFSET);
|
||||
}
|
||||
|
||||
GRL_INLINE AABB3f BVHBase_GetRootAABB(const BVHBase* p)
|
||||
{
|
||||
return p->Meta.bounds;
|
||||
}
|
||||
|
||||
GRL_INLINE InternalNode* BVHBase_GetInternalNodes(const BVHBase* p)
|
||||
{
|
||||
return (InternalNode*)(((char*)p) + BVH_ROOT_NODE_OFFSET);
|
||||
}
|
||||
GRL_INLINE InternalNode* BVHBase_GetInternalNodesEnd(const BVHBase* p)
|
||||
{
|
||||
return (InternalNode*)(((char*)p) + (size_t)(64u * p->nodeDataCur));
|
||||
}
|
||||
GRL_INLINE uint32_t BVHBase_GetNumInternalNodes(const BVHBase* p)
|
||||
{
|
||||
return p->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64;
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE QuadLeaf* BVHBase_GetQuadLeaves(const BVHBase* p)
|
||||
{
|
||||
return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafStart));
|
||||
}
|
||||
GRL_INLINE const QuadLeaf* BVHBase_GetQuadLeaves_End(const BVHBase* p)
|
||||
{
|
||||
return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafCur));
|
||||
}
|
||||
|
||||
GRL_INLINE const ProceduralLeaf* BVHBase_GetProceduralLeaves_End(const BVHBase* p)
|
||||
{
|
||||
return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataCur));
|
||||
}
|
||||
|
||||
GRL_INLINE ProceduralLeaf* BVHBase_GetProceduralLeaves(const BVHBase* p)
|
||||
{
|
||||
return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataStart));
|
||||
}
|
||||
|
||||
GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves(const BVHBase* p )
|
||||
{
|
||||
char* pRTASBits = (char*)p;
|
||||
return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafStart));
|
||||
}
|
||||
|
||||
GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves_End(const BVHBase* p )
|
||||
{
|
||||
char* pRTASBits = (char*) p;
|
||||
return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafEnd));
|
||||
}
|
||||
|
||||
GRL_INLINE uint BVHBase_GetNumHWInstanceLeaves( const BVHBase* p )
|
||||
{
|
||||
return (p->instanceLeafEnd - p->instanceLeafStart) / 2;
|
||||
}
|
||||
|
||||
GRL_INLINE uint* BVHBase_GetRefitStartPoints(const BVHBase* p)
|
||||
{
|
||||
return (uint32_t*)(((char*)p) + (size_t)(64u * p->refitStartPointDataStart));
|
||||
}
|
||||
|
||||
GRL_INLINE uint BVHBase_GetRefitStartPointsSize(const BVHBase* p)
|
||||
{
|
||||
return 64u * (p->fatLeafTableStart - p->refitStartPointDataStart);
|
||||
}
|
||||
|
||||
GRL_INLINE uint StartPoint_GetDepth(StartPoint s)
|
||||
{
|
||||
return s & ((1 << 6) - 1);
|
||||
}
|
||||
|
||||
GRL_INLINE uint StartPoint_GetNodeIdx(StartPoint s)
|
||||
{
|
||||
return s >> 6;
|
||||
}
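
// Illustrative counterpart to the two getters above (hypothetical helper, not in the
// original file): packing a StartPoint from an inner node index and a remaining depth,
// matching the "5:0 depth / 31:6 inner node index" layout documented earlier.
GRL_INLINE StartPoint StartPoint_Create( uint nodeIdx, uint depth )
{
    return (nodeIdx << 6) | (depth & ((1 << 6) - 1));
}
// Example: s = StartPoint_Create(12, 3) gives
//   StartPoint_GetNodeIdx(s) == 12 and StartPoint_GetDepth(s) == 3.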
|
||||
|
||||
GRL_INLINE RefitTreelet* BVHBase_GetRefitTreeletDescs(const BVHBase* p)
|
||||
{
|
||||
return (RefitTreelet*)(((char*)p) + (size_t)(64u * p->refitTreeletsDataStart));
|
||||
}
|
||||
|
||||
// This is the treelet count as it should be executed, i.e. the number of bottom treelets when there are both top and bottom treelets.
// To get the real number of all treelets, including the tip treelet, the formula is
// actualNumTreelets = refitTreeletCnt > 1 ? refitTreeletCnt + 1 : 1;
|
||||
GRL_INLINE uint32_t* BVHBase_GetRefitTreeletCntPtr(BVHBase* p)
|
||||
{
|
||||
return &p->refitTreeletCnt;
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t BVHBase_GetRefitTreeletCnt(const BVHBase* p)
|
||||
{
|
||||
return p->refitTreeletCnt;
|
||||
}
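
// Hedged helper sketch (not in the original header) implementing the formula from the
// comment above: the total number of treelets including the tip treelet.
GRL_INLINE uint32_t BVHBase_GetTotalTreeletCnt( const BVHBase* p )
{
    uint32_t cnt = p->refitTreeletCnt;
    return cnt > 1 ? cnt + 1 : 1;
}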
|
||||
|
||||
GRL_INLINE uint32_t BVHBase_IsSingleTreelet(const BVHBase* p)
|
||||
{
|
||||
return p->refitTreeletCnt == 1;
|
||||
}
|
||||
|
||||
GRL_INLINE BackPointers* BVHBase_GetBackPointers(const BVHBase* p)
|
||||
{
|
||||
return (BackPointers*)(((char*)p) + (size_t)(64u * p->backPointerDataStart));
|
||||
}
|
||||
|
||||
|
||||
GRL_INLINE LeafTableEntry* BVHBase_GetFatLeafTable(const BVHBase* p)
|
||||
{
|
||||
return (LeafTableEntry*)(((char*)p) + (size_t)(64u * p->fatLeafTableStart));
|
||||
}
|
||||
GRL_INLINE InnerNodeTableEntry* BVHBase_GetInnerNodeTable(const BVHBase* p)
|
||||
{
|
||||
return (InnerNodeTableEntry*)(((char*)p) + (size_t)(64u * p->innerTableStart));
|
||||
}
|
||||
GRL_INLINE QuadDataIndices* BVHBase_GetQuadDataIndicesTable(const BVHBase* p)
|
||||
{
|
||||
return (QuadDataIndices*)(((char*)p) + (size_t)(64u * p->quadIndicesDataStart));
|
||||
}
|
||||
|
||||
GRL_INLINE unsigned* InnerNode_GetBackPointer(
|
||||
BackPointers* backpointersStruct,
|
||||
uint32_t inodeOffset /*in 64B units, from the earliest Inner node*/)
|
||||
{
|
||||
uint* backpointersArray = (uint*)backpointersStruct;
|
||||
// BACKPOINTER_LAYOUT
|
||||
uint new_index = inodeOffset; //<-layout canonical
|
||||
//uint new_index = inodeOffset*16; //<-layout scattered
|
||||
// uint new_index = (inodeOffset & (~0xFFFF)) | (((inodeOffset & 0xFF) << 8) | ((inodeOffset & 0xFF00) >> 8)); //<-layout hashed
|
||||
|
||||
return backpointersArray + new_index;
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t BVHBase_GetRefitStructsDataSize(const BVHBase* p)
|
||||
{
|
||||
return 64u * (p->BVHDataEnd - p->backPointerDataStart);
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t BVHBase_GetBackpointersDataSize(const BVHBase* p)
|
||||
{
|
||||
return 64u * (p->refitTreeletsDataStart - p->backPointerDataStart);
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t* BVHBase_GetBVHDataEnd( const BVHBase* p )
|
||||
{
|
||||
return (uint32_t*)(((char*)p) + (size_t)(64u * p->BVHDataEnd));
|
||||
}
|
||||
|
||||
GRL_INLINE bool BVHBase_HasBackPointers( const BVHBase* p )
|
||||
{
|
||||
return p->refitTreeletsDataStart > p->backPointerDataStart;
|
||||
}
|
||||
|
||||
GRL_INLINE const size_t BVHBase_GetNumQuads(const BVHBase* p)
|
||||
{
|
||||
return p->quadLeafCur - p->quadLeafStart;
|
||||
}
|
||||
|
||||
GRL_INLINE const size_t BVHBase_GetNumProcedurals(const BVHBase* p)
|
||||
{
|
||||
return p->proceduralDataCur - p->proceduralDataStart;
|
||||
}
|
||||
|
||||
GRL_INLINE const size_t BVHBase_GetNumInstances(const BVHBase* p)
|
||||
{
|
||||
return (p->instanceLeafEnd - p->instanceLeafStart) / 2;
|
||||
}
|
||||
|
||||
GRL_INLINE const size_t BVHBase_totalBytes(const BVHBase* p)
|
||||
{
|
||||
return p->BVHDataEnd * 64u;
|
||||
}
|
||||
|
||||
|
||||
|
||||
struct HwInstanceLeaf
|
||||
{
|
||||
/* first 64 bytes accessed during traversal */
|
||||
struct Part0
|
||||
{
|
||||
//uint32_t shaderIndex : 24;
|
||||
//uint32_t geomMask : 8;
|
||||
uint32_t DW0;
|
||||
|
||||
// uint32_t instanceContributionToHitGroupIndex : 24;
|
||||
// uint32_t pad0 : 8
|
||||
//
|
||||
// NOTE: Traversal shaders are implemented by aliasing instance leaves as procedural and sending them through the procedural path
|
||||
// For a procedural instance, bit 29 should be set to 1, to disable "opaque culling"
|
||||
// and bits 30 and 31 must be zero. See also the definition of the 'PrimLeafDesc' structure
|
||||
uint32_t DW1;
|
||||
|
||||
// uint64_t rootNodePtr : 48;
|
||||
// uint64_t instFlags : 8;
|
||||
// uint64_t pad1 : 8;
|
||||
uint64_t DW2_DW3;
|
||||
|
||||
// Vec3f world2obj_vx; // 1st row of World2Obj transform
float world2obj_vx_x;
float world2obj_vx_y;
float world2obj_vx_z;

// Vec3f world2obj_vy; // 2nd row of World2Obj transform
float world2obj_vy_x;
float world2obj_vy_y;
float world2obj_vy_z;

// Vec3f world2obj_vz; // 3rd row of World2Obj transform
float world2obj_vz_x;
float world2obj_vz_y;
float world2obj_vz_z;

// Vec3f obj2world_p; // translation of Obj2World transform (kept on purpose in the first 64 bytes)
float obj2world_p_x;
float obj2world_p_y;
float obj2world_p_z;
} part0;
|
||||
|
||||
/* second 64 bytes accessed during shading */
|
||||
// NOTE: Everything in this block is under SW control
|
||||
struct Part1
|
||||
{
|
||||
// uint64_t bvhPtr : 48;
|
||||
// uint64_t pad : 16;
|
||||
uint64_t DW0_DW1;
|
||||
|
||||
uint32_t instanceID;
|
||||
uint32_t instanceIndex;
|
||||
|
||||
// Vec3f obj2world_vx; // 1st row of Obj2World transform
float obj2world_vx_x;
float obj2world_vx_y;
float obj2world_vx_z;

// Vec3f obj2world_vy; // 2nd row of Obj2World transform
float obj2world_vy_x;
float obj2world_vy_y;
float obj2world_vy_z;

// Vec3f obj2world_vz; // 3rd row of Obj2World transform
float obj2world_vz_x;
float obj2world_vz_y;
float obj2world_vz_z;

// Vec3f world2obj_p; // translation of World2Obj transform
float world2obj_p_x;
float world2obj_p_y;
float world2obj_p_z;
} part1;
|
||||
};
|
||||
|
||||
__constant const uint64_t c_one = 1ul;
|
||||
|
||||
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceMask( const HwInstanceLeaf* p )
|
||||
{
|
||||
return p->part0.DW0 >> 24;
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceContributionToHitGroupIndex( const HwInstanceLeaf* p )
|
||||
{
|
||||
return p->part0.DW1 & 0x00ffffff;
|
||||
}
|
||||
|
||||
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceFlags( const HwInstanceLeaf* p )
|
||||
{
|
||||
return (p->part0.DW2_DW3 >> 48) & 0xff;
|
||||
}
|
||||
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceID( const HwInstanceLeaf* p )
|
||||
{
|
||||
return p->part1.instanceID;
|
||||
}
|
||||
|
||||
GRL_INLINE gpuva_t HwInstanceLeaf_GetBVH( const HwInstanceLeaf* p ) { return p->part1.DW0_DW1 & ((c_one << 48) - 1); }
|
||||
GRL_INLINE gpuva_t HwInstanceLeaf_GetStartNode( const HwInstanceLeaf* p ) { return p->part0.DW2_DW3 & ((c_one << 48) - 1); }
|
||||
GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceIndex( const HwInstanceLeaf* p ) { return p->part1.instanceIndex; }
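
// Illustrative sketch (assumed helper, not in the original file): DW0 of part0 packs
// the shader index in bits 23:0 and the geometry mask in bits 31:24, which is what
// HwInstanceLeaf_GetInstanceMask() above decodes.
GRL_INLINE uint32_t HwInstanceLeaf_PackDW0( uint32_t shaderIndex, uint32_t geomMask )
{
    return (shaderIndex & 0x00ffffff) | (geomMask << 24);
}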
|
||||
|
||||
GRL_INLINE void HwInstanceLeaf_GetTransform(struct HwInstanceLeaf* p, float* transform)
|
||||
{
|
||||
transform[0] = p->part1.obj2world_vx_x;
|
||||
transform[1] = p->part1.obj2world_vy_x;
|
||||
transform[2] = p->part1.obj2world_vz_x;
|
||||
transform[3] = p->part0.obj2world_p_x;
|
||||
transform[4] = p->part1.obj2world_vx_y;
|
||||
transform[5] = p->part1.obj2world_vy_y;
|
||||
transform[6] = p->part1.obj2world_vz_y;
|
||||
transform[7] = p->part0.obj2world_p_y;
|
||||
transform[8] = p->part1.obj2world_vx_z;
|
||||
transform[9] = p->part1.obj2world_vy_z;
|
||||
transform[10] = p->part1.obj2world_vz_z;
|
||||
transform[11] = p->part0.obj2world_p_z;
|
||||
}
|
||||
|
||||
GRL_INLINE void HwInstanceLeaf_SetBVH( HwInstanceLeaf* p, gpuva_t b ) {
|
||||
uint64_t mask = ((c_one << 48) - 1);
|
||||
uint64_t v = p->part1.DW0_DW1;
|
||||
v = (b & mask) | (v & ~mask);
|
||||
p->part1.DW0_DW1 = v;
|
||||
}
|
||||
GRL_INLINE void HwInstanceLeaf_SetStartNode( HwInstanceLeaf* p, gpuva_t b ) {
|
||||
uint64_t mask = ((c_one << 48) - 1);
|
||||
uint64_t v = p->part0.DW2_DW3;
|
||||
v = (b & mask) | (v & ~mask);
|
||||
p->part0.DW2_DW3 = v;
|
||||
}
|
||||
GRL_INLINE void HwInstanceLeaf_SetStartNodeAndInstanceFlags( HwInstanceLeaf* p,
|
||||
gpuva_t root,
|
||||
uint8_t flags ) {
|
||||
uint64_t mask = ((1ull << 48) - 1);
|
||||
uint64_t v = (root & mask) | ((uint64_t)(flags)<<48);
|
||||
p->part1.DW0_DW1 = v;
|
||||
}
|
||||
|
||||
struct InternalNode
|
||||
{
|
||||
float lower[3]; // world space origin of quantization grid
|
||||
int32_t childOffset; // offset to all children in 64B multiples
|
||||
|
||||
uint8_t nodeType; // the type of the node
|
||||
uint8_t pad; // unused byte
|
||||
|
||||
int8_t exp_x; // 2^exp_x is the size of the grid in x dimension
|
||||
int8_t exp_y; // 2^exp_y is the size of the grid in y dimension
|
||||
int8_t exp_z; // 2^exp_z is the size of the grid in z dimension
|
||||
uint8_t nodeMask; // mask used for ray filtering
|
||||
|
||||
struct ChildData
|
||||
{
|
||||
//uint8_t blockIncr : 2; // size of child in 64 byte blocks. Must be ==2 for instance leaves, <=2 for quad leaves.
|
||||
//uint8_t startPrim : 4; // start primitive in fat leaf mode or child type in mixed mode
|
||||
//uint8_t pad : 2; // unused bits
|
||||
uint8_t bits;
|
||||
} childData[6];
|
||||
|
||||
uint8_t lower_x[6]; // the quantized lower bounds in x-dimension
|
||||
uint8_t upper_x[6]; // the quantized upper bounds in x-dimension
|
||||
uint8_t lower_y[6]; // the quantized lower bounds in y-dimension
|
||||
uint8_t upper_y[6]; // the quantized upper bounds in y-dimension
|
||||
uint8_t lower_z[6]; // the quantized lower bounds in z-dimension
|
||||
uint8_t upper_z[6]; // the quantized upper bounds in z-dimension
|
||||
};
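
// Worked sketch of the quantization scheme above (illustrative, mirroring
// InternalNode_GetChildAABB() further below): each child box is stored as 8-bit grid
// coordinates relative to 'lower', with a per-axis power-of-two scale:
//   child_lower_x = node->lower[0] + (float)node->lower_x[i] * 2^(node->exp_x - 8)
//   child_upper_x = node->lower[0] + (float)node->upper_x[i] * 2^(node->exp_x - 8)
// (and likewise for y/z), so exp_* - 8 is the ldexp exponent applied to the
// quantized 0..255 coordinates.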
|
||||
|
||||
GRL_INLINE uint InternalNode_GetChildBlockIncr( const InternalNode* p, uint idx )
|
||||
{
|
||||
return p->childData[idx].bits & 3;
|
||||
}
|
||||
GRL_INLINE uint InternalNode_GetChildStartPrim( const InternalNode* p, uint idx )
|
||||
{
|
||||
return (p->childData[idx].bits>>2) & 0xf;
|
||||
}
|
||||
|
||||
GRL_INLINE uint8_t InternalNode_GetChildType( const InternalNode* p, uint idx )
|
||||
{
|
||||
return (p->childData[idx].bits >> 2) & 0xF;
|
||||
}
|
||||
|
||||
GRL_INLINE void InternalNode_SetChildType( InternalNode* p, uint idx, uint type )
|
||||
{
|
||||
uint bits = p->childData[idx].bits;
|
||||
const uint mask = (0xF << 2);
|
||||
bits = ((type << 2) & mask) | (bits & ~mask);
|
||||
p->childData[idx].bits = (uint8_t)bits;
|
||||
}
|
||||
|
||||
GRL_INLINE bool InternalNode_IsChildValid( const InternalNode* p, size_t child )
|
||||
{
|
||||
bool lower = p->lower_x[child] & 0x80; // invalid nodes are indicated by setting lower_msb = 1 and upper_msb=0
|
||||
bool upper = p->upper_x[child] & 0x80;
|
||||
return !lower || upper;
|
||||
}
|
||||
|
||||
GRL_INLINE AABB3f InternalNode_GetChildAABB(const InternalNode* node, size_t i)
|
||||
{
|
||||
float4 lower, upper;
|
||||
const float4 base = { node->lower[0], node->lower[1], node->lower[2], 0.0f };
|
||||
const int4 lower_i = { node->lower_x[i], node->lower_y[i], node->lower_z[i], 0 };
|
||||
const int4 upper_i = { node->upper_x[i], node->upper_y[i], node->upper_z[i], 0 };
|
||||
const int4 exp_i = { node->exp_x, node->exp_y, node->exp_z, 0 };
|
||||
lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
|
||||
upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
|
||||
AABB3f aabb3f = {
|
||||
{ lower.x, lower.y, lower.z },
|
||||
{ upper.x, upper.y, upper.z } };
|
||||
return aabb3f;
|
||||
}
|
||||
|
||||
GRL_INLINE void* InternalNode_GetChildren( InternalNode* node)
|
||||
{
|
||||
return (void*)(((char*)node) + node->childOffset * 64);
|
||||
}
|
||||
|
||||
typedef struct PrimLeafDesc
|
||||
{
|
||||
//uint32_t shaderIndex : 24; // shader index used for shader record calculations
|
||||
//uint32_t geomMask : 8; // geometry mask used for ray masking
|
||||
uint32_t shaderIndex_geomMask;
|
||||
|
||||
//uint32_t geomIndex : 29; // the geometry index specifies the n'th geometry of the scene
|
||||
//PrimLeafType type : 1; // see above
|
||||
//GeometryFlags geomFlags : 2; // geometry flags of this geometry
|
||||
uint32_t geomIndex_flags;
|
||||
} PrimLeafDesc;
|
||||
|
||||
GRL_INLINE uint32_t PrimLeaf_GetShaderIndex( const PrimLeafDesc* p )
|
||||
{
|
||||
return p->shaderIndex_geomMask & ((1 << 24) - 1);
|
||||
}
|
||||
GRL_INLINE uint32_t PrimLeaf_GetGeoIndex( const PrimLeafDesc* p )
|
||||
{
|
||||
return p->geomIndex_flags & ((1<<29)-1);
|
||||
}
|
||||
GRL_INLINE uint32_t PrimLeaf_GetGeomFlags( const PrimLeafDesc* p )
|
||||
{
|
||||
return (p->geomIndex_flags >> 30);
|
||||
}
|
||||
GRL_INLINE uint32_t PrimLeaf_GetType(const PrimLeafDesc* p)
|
||||
{
|
||||
return (p->geomIndex_flags >> 29) & 1;
|
||||
}
|
||||
|
||||
struct QuadLeaf
|
||||
{
|
||||
PrimLeafDesc leafDesc;
|
||||
|
||||
uint32_t primIndex0;
|
||||
|
||||
//uint32_t primIndex1Delta : 16;
|
||||
//uint32_t j0 : 2;
|
||||
//uint32_t j1 : 2;
|
||||
//uint32_t j2 : 2;
|
||||
//uint32_t last : 1; // last quad in list
|
||||
//uint32_t pad : 9;
|
||||
uint32_t DW1;
|
||||
|
||||
float v[4][3];
|
||||
};
|
||||
|
||||
GRL_INLINE uint32_t QuadLeaf_GetPrimIndexDelta( const QuadLeaf* p )
|
||||
{
|
||||
return p->DW1 & 0x0000ffff;
|
||||
}
|
||||
GRL_INLINE uint32_t QuadLeaf_GetPrimIndex0( const QuadLeaf* p )
|
||||
{
|
||||
return p->primIndex0;
|
||||
}
|
||||
GRL_INLINE uint32_t QuadLeaf_GetPrimIndex1( const QuadLeaf* p )
|
||||
{
|
||||
return p->primIndex0 + QuadLeaf_GetPrimIndexDelta(p);
|
||||
}
|
||||
GRL_INLINE bool QuadLeaf_IsSingleTriangle( const QuadLeaf* p )
|
||||
{
|
||||
return QuadLeaf_GetPrimIndexDelta(p) == 0;
|
||||
}
|
||||
GRL_INLINE uint32_t QuadLeaf_GetSecondTriangleIndices( const QuadLeaf* p )
|
||||
{
|
||||
return (p->DW1>>16) & 0x3f;
|
||||
}
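
// Illustrative packing helper (assumption, not in the original file) matching the DW1
// layout documented in the struct: primIndex1Delta in bits 15:0, the second triangle's
// j0/j1/j2 vertex rotation in bits 21:16, and the 'last' flag in bit 22.
GRL_INLINE uint32_t QuadLeaf_PackDW1( uint32_t primIndex1Delta, uint32_t j0, uint32_t j1, uint32_t j2, bool last )
{
    uint32_t j = (j0 & 3) | ((j1 & 3) << 2) | ((j2 & 3) << 4);
    return (primIndex1Delta & 0xffff) | (j << 16) | ((last ? 1u : 0u) << 22);
}
// With this packing, QuadLeaf_GetPrimIndexDelta() returns primIndex1Delta and
// QuadLeaf_GetSecondTriangleIndices() returns the combined j0/j1/j2 bits.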
|
||||
|
||||
GRL_INLINE void QuadLeaf_SetVertices( QuadLeaf* quad, float3 v0, float3 v1, float3 v2, float3 v3 )
|
||||
{
|
||||
quad->v[0][0] = v0.x;
|
||||
quad->v[0][1] = v0.y;
|
||||
quad->v[0][2] = v0.z;
|
||||
quad->v[1][0] = v1.x;
|
||||
quad->v[1][1] = v1.y;
|
||||
quad->v[1][2] = v1.z;
|
||||
quad->v[2][0] = v2.x;
|
||||
quad->v[2][1] = v2.y;
|
||||
quad->v[2][2] = v2.z;
|
||||
quad->v[3][0] = v3.x;
|
||||
quad->v[3][1] = v3.y;
|
||||
quad->v[3][2] = v3.z;
|
||||
}
|
||||
|
||||
|
||||
struct ProceduralLeaf {
|
||||
PrimLeafDesc leafDesc;
|
||||
|
||||
// Number of primitives + "last" bits.
|
||||
// The meaning of this section is SW-defined and flexible
|
||||
uint32_t DW1 ;
|
||||
uint32_t _primIndex[13];
|
||||
} ;
|
||||
|
||||
GRL_NAMESPACE_END(GEN12)
|
||||
GRL_NAMESPACE_END(RTAS)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@ -1,152 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
//********************************************************************************************
|
||||
// WARNING!!!!!
|
||||
//
|
||||
// This file is shared by OpenCL and C++ source code and must be a pure C header
|
||||
// There should only be C structure definitions and trivial inline functions here
|
||||
//
|
||||
//********************************************************************************************
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GRLOCLCompatibility.h"
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
|
||||
typedef uint32_t dword;
|
||||
typedef uint64_t qword;
|
||||
typedef qword gpuva_t;
|
||||
|
||||
|
||||
enum_uint8( InstanceFlags )
|
||||
{
|
||||
INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1,
|
||||
INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2,
|
||||
INSTANCE_FLAG_FORCE_OPAQUE = 0x4,
|
||||
INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8,
|
||||
};
|
||||
|
||||
enum_uint8( GeometryFlags )
|
||||
{
|
||||
GEOMETRY_FLAG_NONE = 0x0,
|
||||
GEOMETRY_FLAG_OPAQUE = 0x1,
|
||||
GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2,
|
||||
};
|
||||
|
||||
enum_uint8( GeometryType )
|
||||
{
|
||||
GEOMETRY_TYPE_TRIANGLES = 0,
|
||||
GEOMETRY_TYPE_PROCEDURAL = 1,
|
||||
NUM_GEOMETRY_TYPES = 2
|
||||
};
|
||||
|
||||
// NOTE: Does NOT match DXR
|
||||
enum_uint8( IndexFormat )
|
||||
{
|
||||
INDEX_FORMAT_NONE = 0, // INDEX_FORMAT_NONE Indicates non-indexed geometry
|
||||
INDEX_FORMAT_R16_UINT = 2,
|
||||
INDEX_FORMAT_R32_UINT = 4,
|
||||
INDEX_FORMAT_END = INDEX_FORMAT_R32_UINT + 1
|
||||
};
|
||||
|
||||
// NOTE: Does NOT match DXR
|
||||
enum_uint8( VertexFormat )
|
||||
{
|
||||
VERTEX_FORMAT_R32G32_FLOAT = 0,
|
||||
VERTEX_FORMAT_R32G32B32_FLOAT = 1,
|
||||
VERTEX_FORMAT_R16G16_FLOAT = 2,
|
||||
VERTEX_FORMAT_R16G16B16A16_FLOAT = 3,
|
||||
VERTEX_FORMAT_R16G16_SNORM = 4,
|
||||
VERTEX_FORMAT_R16G16B16A16_SNORM = 5,
|
||||
VERTEX_FORMAT_R16G16B16A16_UNORM = 6,
|
||||
VERTEX_FORMAT_R16G16_UNORM = 7,
|
||||
VERTEX_FORMAT_R10G10B10A2_UNORM = 8,
|
||||
VERTEX_FORMAT_R8G8B8A8_UNORM = 9,
|
||||
VERTEX_FORMAT_R8G8_UNORM = 10,
|
||||
VERTEX_FORMAT_R8G8B8A8_SNORM = 11,
|
||||
VERTEX_FORMAT_R8G8_SNORM = 12,
|
||||
VERTEX_FORMAT_END = VERTEX_FORMAT_R8G8_SNORM + 1
|
||||
};
|
||||
|
||||
|
||||
|
||||
enum_uint32(RTASFlags)
|
||||
{
|
||||
// These flags match DXR
|
||||
BUILD_FLAG_ALLOW_UPDATE = 1<<0,
|
||||
BUILD_FLAG_ALLOW_COMPACTION = 1<<1,
|
||||
BUILD_FLAG_PREFER_FAST_TRACE = 1<<2,
|
||||
BUILD_FLAG_PREFER_FAST_BUILD = 1<<3,
|
||||
BUILD_FLAG_MINIMIZE_MEMORY = 1<<4,
|
||||
BUILD_FLAG_PERFORM_UPDATE = 1<<5,
|
||||
|
||||
// internal flags start here
|
||||
BUILD_FLAG_DISALLOW_REBRAID = 1<<16,
|
||||
|
||||
BUILD_FLAG_ALL = 0x0001003f
|
||||
};
|
||||
|
||||
enum_uint8(BVHType)
|
||||
{
|
||||
BVH_TYPE_NONE, // This is a sentinel for drivers to use when compiling out GRL on non-RT devices
|
||||
BVH_TYPE_GEN12,
|
||||
};
|
||||
|
||||
enum_uint8(PostBuildInfoType)
|
||||
{
|
||||
PBI_CURRENT_SIZE,
|
||||
PBI_COMPACTED_SIZE,
|
||||
PBI_DXR_TOOLS_VISUALIZATION_DESC,
|
||||
PBI_DXR_SERIALIZATION_DESC,
|
||||
};
|
||||
|
||||
enum_uint32(HazardTypes)
|
||||
{
|
||||
HAZARD_RTAS_READ = 1 << 0,
|
||||
HAZARD_RTAS_WRITE = 1 << 1,
|
||||
HAZARD_READ = 1 << 2,
|
||||
HAZARD_WRITE = 1 << 3,
|
||||
HAZARD_ALL = 0xf
|
||||
};
|
||||
|
||||
enum_uint32(RaytracingAccelerationStructureType)
|
||||
{
|
||||
TOP_LEVEL = 0x0,
|
||||
BOTTOM_LEVEL = 0x1,
|
||||
};
|
||||
|
||||
typedef struct PostbuildInfoCurrentSize
|
||||
{
|
||||
uint64_t CurrentSizeInBytes;
|
||||
} PostbuildInfoCurrentSize;
|
||||
|
||||
typedef struct PostbuildInfoCompactedSize
|
||||
{
|
||||
uint64_t CompactedSizeInBytes;
|
||||
} PostbuildInfoCompactedSize;
|
||||
|
||||
typedef struct PostbuildInfoToolsVisualizationDesc
|
||||
{
|
||||
uint64_t DecodedSizeInBytes;
|
||||
} PostbuildInfoToolsVisualizationDesc;
|
||||
|
||||
typedef struct PostbuildInfoSerializationDesc
|
||||
{
|
||||
uint64_t SerializedSizeInBytes;
|
||||
uint64_t NumBottomLevelAccelerationStructurePointers;
|
||||
} PostbuildInfoSerializationDesc;
|
||||
|
||||
typedef struct DecodeHeader
|
||||
{
|
||||
RaytracingAccelerationStructureType Type;
|
||||
uint32_t NumDesc;
|
||||
} DecodeHeader;
|
||||
|
||||
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@ -1,210 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef __OPENCL_VERSION__
|
||||
|
||||
typedef uchar uint8_t;
|
||||
typedef ushort uint16_t;
|
||||
typedef uint uint32_t;
|
||||
typedef ulong uint64_t;
|
||||
typedef char int8_t;
|
||||
typedef short int16_t;
|
||||
typedef int int32_t;
|
||||
typedef long int64_t;
|
||||
|
||||
#else
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
typedef uint8_t uchar;
|
||||
typedef uint16_t ushort;
|
||||
typedef uint32_t uint;
|
||||
typedef uint64_t ulong;
|
||||
|
||||
#define __constant
|
||||
#define __global
|
||||
|
||||
typedef struct uint2
|
||||
{
|
||||
#ifdef __cplusplus
|
||||
uint2() {};
|
||||
uint2( uint ix, uint iy ) : x( ix ), y( iy ) {};
|
||||
#endif
|
||||
uint x;
|
||||
uint y;
|
||||
} uint2;
|
||||
|
||||
typedef struct uint3
|
||||
{
|
||||
#ifdef __cplusplus
|
||||
uint3() {};
|
||||
uint3( uint ix, uint iy, uint iz ) : x( ix ), y( iy ), z( iz ) {};
|
||||
#endif
|
||||
uint x;
|
||||
uint y;
|
||||
uint z;
|
||||
} uint3;
|
||||
|
||||
typedef struct int3
|
||||
{
|
||||
int32_t x;
|
||||
int32_t y;
|
||||
int32_t z;
|
||||
|
||||
#ifdef __cplusplus
|
||||
int3() {};
|
||||
int3(int32_t ix, int32_t iy, int32_t iz) : x(ix), y(iy), z(iz) {};
|
||||
|
||||
int3 operator+(const int32_t i) const { return int3(this->x + i, this->y + i, this->z + i); }
|
||||
int3 operator<<(const int32_t i) const { return int3(this->x << i, this->y << i, this->z << i); }
|
||||
#endif
|
||||
} int3;
|
||||
|
||||
typedef struct int4
|
||||
{
|
||||
int32_t x;
|
||||
int32_t y;
|
||||
int32_t z;
|
||||
int32_t w;
|
||||
|
||||
#ifdef __cplusplus
|
||||
int4() {};
|
||||
int4(int32_t ix, int32_t iy, int32_t iz, int32_t iw) : x(ix), y(iy), z(iz), w(iw) {};
|
||||
|
||||
int4 operator+(const int32_t i) const { return int4(this->x + i, this->y + i, this->z + i, this->w + i); }
|
||||
int4 operator-(const int32_t i) const { return int4(this->x - i, this->y - i, this->z - i, this->w - i); }
|
||||
int4 operator<<(const int32_t i) const { return int4(this->x << i, this->y << i, this->z << i, this->w << i); }
|
||||
#endif
|
||||
} int4;
|
||||
|
||||
typedef struct float3
|
||||
{
|
||||
float x;
|
||||
float y;
|
||||
float z;
|
||||
|
||||
#ifdef __cplusplus
|
||||
float3(){};
|
||||
float3( float ix, float iy, float iz ) : x(ix), y(iy), z(iz){};
|
||||
|
||||
float3 operator+( const float3& f3 ) { return float3( this->x + f3.x, this->y + f3.y, this->z + f3.z ); }
|
||||
float3 operator*( const float& f ) { return float3( this->x * f, this->y * f, this->z * f ); }
|
||||
float3 operator*( const float3& f3 ) const { return float3(this->x * f3.x, this->y * f3.y, this->z * f3.z); }
|
||||
float3 operator-() { return float3(-this->x, -this->y, -this->z); }
|
||||
float3 operator-( const float3& f3) { return float3(this->x - f3.x, this->y - f3.y, this->z - f3.z); }
|
||||
#endif
|
||||
} float3;
|
||||
|
||||
typedef struct float4
|
||||
{
|
||||
float x;
|
||||
float y;
|
||||
float z;
|
||||
float w;
|
||||
|
||||
#ifdef __cplusplus
|
||||
float4() {};
|
||||
float4( float ix, float iy, float iz, float iw ) : x( ix ), y( iy ), z( iz ), w( iw ) {};
|
||||
|
||||
float4 operator+(const float4& f4) const { return float4(this->x + f4.x, this->y + f4.y, this->z + f4.z, this->w + f4.w); }
|
||||
float4 operator*(const float4& f4) const { return float4(this->x * f4.x, this->y * f4.y, this->z * f4.z, this->w * f4.w); }
|
||||
#endif
|
||||
} float4;
|
||||
|
||||
#endif /* ! __OPENCL_VERSION__ */
|
||||
|
||||
|
||||
#ifndef __cplusplus
|
||||
|
||||
#define GRL_NAMESPACE_BEGIN(x)
|
||||
#define GRL_NAMESPACE_END(x)
|
||||
#define GRL_OVERLOADABLE __attribute((overloadable))
|
||||
#define GRL_INLINE __attribute__((always_inline)) inline static
|
||||
|
||||
# define enum_uint8(name) \
|
||||
typedef uint8_t name; \
|
||||
enum name##_uint32
|
||||
# define enum_uint16(name) \
|
||||
typedef uint16_t name; \
|
||||
enum name##_uint32
|
||||
# define enum_uint32(name) \
|
||||
typedef uint32_t name; \
|
||||
enum name##_uint32
|
||||
|
||||
#define OCL_BYTE_ALIGN(n) __attribute__ ((aligned (n)))
|
||||
#define GRL_STATIC_ASSERT(condition,desc)
|
||||
|
||||
#else /* C++ */
|
||||
#ifdef __OPENCL_VERSION__
|
||||
#error "OpenCL C++ not supported by this header"
|
||||
#endif
|
||||
|
||||
#define GRL_NAMESPACE_BEGIN(x) namespace x {
|
||||
#define GRL_NAMESPACE_END(x) }
|
||||
#define GRL_OVERLOADABLE
|
||||
#define GRL_INLINE inline
|
||||
|
||||
#define enum_uint8(N) enum N : uint8_t
|
||||
#define enum_uint16(N) enum N : uint16_t
|
||||
#define enum_uint32(N) enum N : uint32_t
|
||||
|
||||
#define OCL_BYTE_ALIGN(n)
|
||||
#define GRL_STATIC_ASSERT(condition,desc) static_assert( condition, desc )
|
||||
|
||||
#include <cmath>
|
||||
|
||||
inline float3 fmin(float3 a, float3 b)
|
||||
{
|
||||
float3 o = { std::fmin(a.x, b.x), std::fmin(a.y, b.y), std::fmin(a.z, b.z) };
|
||||
return o;
|
||||
}
|
||||
|
||||
inline float3 fmax(float3 a, float3 b)
|
||||
{
|
||||
float3 o = { std::fmax(a.x, b.x), std::fmax(a.y, b.y), std::fmax(a.z, b.z) };
|
||||
return o;
|
||||
}
|
||||
|
||||
inline float3 operator/(const float3& f3, const float& f) { return float3(f3.x / f, f3.y / f, f3.z / f); }
|
||||
|
||||
inline float dot(const float3& a, const float3& b) {
|
||||
return a.x * b.x + a.y * b.y + a.z * b.z;
|
||||
}
|
||||
|
||||
inline float as_float(uint32_t i)
|
||||
{
|
||||
union { float f; uint32_t i; } fi;
|
||||
|
||||
fi.i = i;
|
||||
return fi.f;
|
||||
}
|
||||
|
||||
inline float3 as_float3(int3 i3)
|
||||
{
|
||||
float3 o = { as_float(i3.x), as_float(i3.y), as_float(i3.z) };
|
||||
return o;
|
||||
}
|
||||
|
||||
inline float4 as_float4(int4 i4)
|
||||
{
|
||||
float4 o = { as_float(i4.x), as_float(i4.y), as_float(i4.z), as_float(i4.w) };
|
||||
return o;
|
||||
}
|
||||
|
||||
inline float4 convert_float4_rtn(int4 i4)
|
||||
{
|
||||
return float4(static_cast<float>(i4.x), static_cast<float>(i4.y), static_cast<float>(i4.z), static_cast<float>(i4.w));
|
||||
}
|
||||
|
||||
inline float4 convert_float4_rtp(int4 i4)
|
||||
{
|
||||
return convert_float4_rtn(i4);
|
||||
}
|
||||
|
||||
#endif
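
// Usage sketch (illustrative, 'MyFlags' is a made-up name): the enum_uint8/16/32 macros
// above give enums an explicit storage type in both languages. In C / OpenCL C,
//   enum_uint8(MyFlags) { MY_FLAG_A = 1, MY_FLAG_B = 2 };
// expands to a uint8_t typedef plus a plain enum used only for the constants, while in
// C++ it expands to 'enum MyFlags : uint8_t { ... }', so sizeof(MyFlags) == 1 either way.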
|
||||
|
|
@ -1,142 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
//
|
||||
// This file is to contain structure definitions for RTAS-related metadata.
|
||||
// The structures here should be generic enough to apply to any acceleration structure.
|
||||
// If we ever move to KD-Trees or Octrees, this file should not need to change.
|
||||
//
|
||||
|
||||
//********************************************************************************************
|
||||
// WARNING!!!!!
|
||||
//
|
||||
// This file is shared by OpenCL and C++ source code and must be a pure C header
|
||||
// There should only be C structure definitions and trivial inline functions here
|
||||
//
|
||||
//********************************************************************************************
|
||||
|
||||
|
||||
#pragma once
|
||||
#include "GRLIntTypes.h"
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
GRL_NAMESPACE_BEGIN(RTAS)
|
||||
|
||||
typedef struct SerializationIdentifier
|
||||
{
|
||||
uint8_t Bytes[16];
|
||||
} SerializationIdentifier;
|
||||
|
||||
GRL_STATIC_ASSERT(sizeof(SerializationIdentifier) == 16, "Wrong size!");
|
||||
|
||||
|
||||
// Header structure for RTAS serialization.
|
||||
// This structure is binary-compatible with the DXR and Vulkan API definitions
|
||||
typedef struct SerializationHeader
|
||||
{
|
||||
SerializationIdentifier DriverID; // DXR 'DriverOpaqueGUID'. Vulkan: 'driverUUID'
|
||||
SerializationIdentifier GRLID; // DXR 'DriverOpaqueVersioningData'. Vulkan: 'accelerationStructureUUID'
|
||||
|
||||
uint64_t SerializedSizeInBytesIncludingHeader;
|
||||
uint64_t DeserializedSizeInBytes;
|
||||
uint64_t InstanceHandleCount;
|
||||
} SerializationHeader;
|
||||
|
||||
GRL_STATIC_ASSERT(sizeof(SerializationHeader) == 56, "Wrong size!");
|
||||
|
||||
// This structure is binary-compatible with DXR and Vulkan 'InstanceDesc' structures
|
||||
typedef struct InstanceDesc {
|
||||
float Transform[3][4];
|
||||
uint32_t InstanceIDAndMask; // mask in 8 msbs
|
||||
uint32_t InstanceContributionToHitGroupIndexAndFlags; // flags in 8 msbs
|
||||
gpuva_t AccelerationStructureGPUVA; // NOTE: In GRL this is always a VA. Vulkan CPU builds use handles here, and these may need to be translated
|
||||
} InstanceDesc;
|
||||
GRL_STATIC_ASSERT(sizeof(InstanceDesc) == 64, "Wrong size!");
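
// Illustrative decode helpers (assumed names, not part of the original header) for the
// packed dwords above: the instance mask and the instance flags both live in the 8 msbs
// of their respective fields.
GRL_INLINE uint32_t InstanceDesc_GetInstanceID( const InstanceDesc* d ) { return d->InstanceIDAndMask & 0x00ffffff; }
GRL_INLINE uint32_t InstanceDesc_GetMask( const InstanceDesc* d ) { return d->InstanceIDAndMask >> 24; }
GRL_INLINE uint32_t InstanceDesc_GetFlags( const InstanceDesc* d ) { return d->InstanceContributionToHitGroupIndexAndFlags >> 24; }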
|
||||
|
||||
typedef struct GeoMetaData{
|
||||
uint32_t PrimitiveCount;
|
||||
uint16_t Type;
|
||||
uint16_t Flags;
|
||||
} GeoMetaData;
|
||||
GRL_STATIC_ASSERT(sizeof(GeoMetaData) == 8, "Wrong size!");
|
||||
|
||||
typedef struct AABB3f {
|
||||
float lower[3];
|
||||
float upper[3];
|
||||
} AABB3f;
|
||||
GRL_STATIC_ASSERT(sizeof(AABB3f) == 24, "Wrong size!");
|
||||
|
||||
enum_uint32(error_t_) {
|
||||
error_t_no_error = 0x0,
|
||||
error_t_internal_node_child_OOB = 0x1,
|
||||
error_t_leaf_node_child_OOB = 0x2,
|
||||
error_t_unrecognised_node_t = 0x4,
|
||||
error_t_mixed_node_unsupported = 0x8,
|
||||
error_t_instance_pointers_inconsistent = 0x10,
|
||||
error_t_instance_pointed_root_not_internal = 0x20,
|
||||
error_t_leaf_node_instance_child_missed_by_64B = 0x40,
|
||||
error_t_internal_node_child_cycle = 0x80,
|
||||
error_t_input_geo_insane = 0x100,
|
||||
error_t_quad_leaf_broken = 0x200,
|
||||
error_t_backpointer_not_reset = 0x400,
|
||||
error_t_backpointer_wrong_children_num = 0x500,
|
||||
error_t_backpointer_inconsitent_parent_child = 0x600,
|
||||
error_t_backpointer_root_not_root_error = 0x700,
|
||||
error_t_backpointer_OOB = 0x800,
|
||||
error_t_backpointers_buffer_too_small = 0x900,
|
||||
error_t_atomic_update_struct_fatleaf_count_oob = 0x1000, // for this and following:
|
||||
error_t_atomic_update_struct_fatleaf_node_idx_oob = 0x2000, // offset_in_BVH is just index in fatleaf or inner node arrays
|
||||
error_t_atomic_update_struct_fatleaf_backpointer_mismatch = 0x3000,
|
||||
error_t_atomic_update_struct_fatleaf_num_children_error = 0x4000,
|
||||
error_t_atomic_update_struct_fatleaf_children_non_leaf = 0x5000,
|
||||
error_t_atomic_update_struct_inner_count_oob = 0x6000,
|
||||
error_t_atomic_update_struct_inner_node_idx_oob = 0x7000,
|
||||
error_t_atomic_update_struct_inner_node_child_idx_error = 0x8000,
|
||||
error_t_atomic_update_struct_inner_num_children_error = 0x9000,
|
||||
error_t_atomic_update_struct_inner_children_non_internal = 0xA000,
|
||||
error_t_unknown = 1u << 31,
|
||||
};
|
||||
|
||||
enum_uint32(error_phase_t) {
|
||||
error_phase_t_unknown = 0,
|
||||
error_phase_t_post_build_Morton = 1,
|
||||
error_phase_t_post_build_Trivial = 2,
|
||||
error_phase_t_post_build_NewSAH = 3,
|
||||
error_phase_t_post_update = 4,
|
||||
error_phase_t_pre_update = 5,
|
||||
error_phase_t_post_copy_op = 6,
|
||||
};
|
||||
|
||||
typedef struct ERROR_INFO {
|
||||
error_t_ type;
|
||||
uint offset_in_BVH; //in 64B units
|
||||
error_phase_t when;
|
||||
uint reserved;
|
||||
} ERROR_INFO;
|
||||
|
||||
// Meta-data common to all acceleration structures, which is needed to implement required functionality
|
||||
// All RTAS structures must contain a struct of this type named 'Meta'
|
||||
typedef struct RTASMetaData {
|
||||
struct AABB3f bounds;
|
||||
|
||||
uint32_t instanceDescsStart; // byte offset to array of original instance_descs used for build. Required for DXR visualization and serialization
|
||||
uint32_t instanceCount;
|
||||
|
||||
uint32_t geoDescsStart; // byte offset to array of 'GeoMetaData' matching input geos. Required for DXR visualization
|
||||
uint32_t geoCount;
|
||||
|
||||
uint64_t allocationSize; // Size of the memory allocation containing this RTAS
|
||||
// This is the size given to the app in the prebuild info when the RTAS was first created
|
||||
// If RTAS was compacted, this will be the compacted size
|
||||
|
||||
ERROR_INFO errors; // only used in debug mode
|
||||
} RTASMetaData;
|
||||
|
||||
GRL_STATIC_ASSERT( sizeof(RTASMetaData) == 64, "Wrong size!");
|
||||
|
||||
GRL_NAMESPACE_END(RTAS)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GRLIntTypes.h"
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
GRL_NAMESPACE_BEGIN(_INTERNAL)
|
||||
|
||||
struct GeometryTriangles
|
||||
{
|
||||
gpuva_t pTransformBuffer;
|
||||
gpuva_t pIndexBuffer;
|
||||
gpuva_t pVertexBuffer;
|
||||
qword VertexBufferByteStride;
|
||||
dword IndexCount;
|
||||
dword VertexCount;
|
||||
IndexFormat IndexFormat;
|
||||
VertexFormat VertexFormat;
|
||||
};
|
||||
|
||||
struct GeometryProcedural
|
||||
{
|
||||
gpuva_t pAABBs_GPUVA; ///< elements of pAABBs_GPUVA are in gpuAABB format.
|
||||
qword AABBByteStride;
|
||||
dword AABBCount;
|
||||
};
|
||||
|
||||
// TODO: the 'unsigned int ShaderIndex_Mask; // extension' field is missing here
|
||||
struct Geo
|
||||
{
|
||||
union
|
||||
{
|
||||
struct GeometryTriangles Triangles;
|
||||
struct GeometryProcedural Procedural;
|
||||
} Desc;
|
||||
|
||||
GeometryType Type;
|
||||
uint8_t Flags;
|
||||
};
|
||||
|
||||
// Matches the Vulkan VkAccelerationStructureBuildRangeInfoKHR structure
|
||||
// See Vulkan spec for data access rules:
|
||||
// https://registry.khronos.org/vulkan/specs/latest/man/html/VkAccelerationStructureBuildRangeInfoKHR.html
|
||||
//
|
||||
struct IndirectBuildRangeInfo
|
||||
{
|
||||
dword primitiveCount; // Number of primitives
|
||||
dword primitiveOffset; // Byte offset to primitive data
|
||||
dword firstVertex; // Index of first vertex
|
||||
dword transformOffset; // Byte offset to transform data (for triangle Geo with non-null transform)
|
||||
};
|
||||
|
||||
GRL_NAMESPACE_END(_INTERNAL)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GRLOCLCompatibility.h"
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
|
||||
GRL_INLINE float4 bitShiftLdexp4(float4 x, int4 y)
|
||||
{
|
||||
y = (y + 127) << 23;
|
||||
return x * as_float4(y);
|
||||
}
|
||||
|
||||
GRL_INLINE float3 bitShiftLdexp3(float3 x, int3 y)
|
||||
{
|
||||
y = (y + 127) << 23;
|
||||
return x * as_float3(y);
|
||||
}
|
||||
|
||||
GRL_INLINE float bitShiftLdexp(float x, int y)
|
||||
{
|
||||
y = (y + 127) << 23;
|
||||
return x * as_float(y);
|
||||
}
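
// Worked note on the trick above (illustrative): (y + 127) << 23 builds the IEEE-754
// bit pattern of the float 2^y (biased exponent in bits 30:23, zero mantissa), so the
// multiply by as_float(y) is a branch-free ldexp for exponents that keep y + 127 inside
// the normal range 1..254. Example: bitShiftLdexp(1.5f, 3) -> 1.5f * 8.0f = 12.0f.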
|
||||
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@ -1,192 +0,0 @@
|
|||
//
|
||||
// Copyright (C) 2009-2021 Intel Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GRLRTASCommon.h"
|
||||
|
||||
GRL_NAMESPACE_BEGIN(GRL)
|
||||
GRL_NAMESPACE_BEGIN(RTAS)
|
||||
inline float3 GRL_OVERLOADABLE cross(const float3 a, const float3 b)
|
||||
{
|
||||
float3 res = { a.y * b.z - a.z * b.y,
|
||||
a.z * b.x - a.x * b.z,
|
||||
a.x * b.y - a.y * b.x };
|
||||
return res;
|
||||
}
|
||||
|
||||
struct LinearSpace3f
|
||||
{
|
||||
float3 vx;
|
||||
float3 vy;
|
||||
float3 vz;
|
||||
};
|
||||
|
||||
/* compute the determinant of the matrix */
|
||||
GRL_INLINE struct LinearSpace3f LinearSpace3f_Constructor(const float3 vx, const float3 vy, const float3 vz)
|
||||
{
|
||||
struct LinearSpace3f xfm;
|
||||
xfm.vx = vx;
|
||||
xfm.vy = vy;
|
||||
xfm.vz = vz;
|
||||
return xfm;
|
||||
}
|
||||
|
||||
/* compute the determinant of the matrix */
|
||||
GRL_INLINE float LinearSpace3f_det(struct LinearSpace3f xfm)
|
||||
{
|
||||
return dot(xfm.vx, cross(xfm.vy, xfm.vz));
|
||||
}
|
||||
|
||||
/* compute transposed matrix */
|
||||
GRL_INLINE struct LinearSpace3f LinearSpace3f_transpose(struct LinearSpace3f in)
|
||||
{
|
||||
float3 x = { in.vx.x, in.vy.x, in.vz.x };
|
||||
float3 y = { in.vx.y, in.vy.y, in.vz.y };
|
||||
float3 z = { in.vx.z, in.vy.z, in.vz.z };
|
||||
|
||||
return LinearSpace3f_Constructor(x,
|
||||
y,
|
||||
z);
|
||||
}
|
||||
|
||||
/* compute adjoint matrix */
|
||||
GRL_INLINE const struct LinearSpace3f LinearSpace3f_adjoint(struct LinearSpace3f in)
|
||||
{
|
||||
return LinearSpace3f_transpose(LinearSpace3f_Constructor(cross(in.vy, in.vz),
|
||||
cross(in.vz, in.vx),
|
||||
cross(in.vx, in.vy)));
|
||||
}
|
||||
|
||||
/* compute inverse matrix */
|
||||
GRL_INLINE struct LinearSpace3f LinearSpace3f_invert(struct LinearSpace3f in)
|
||||
{
|
||||
const float det = LinearSpace3f_det(in);
|
||||
const struct LinearSpace3f adj = LinearSpace3f_adjoint(in);
|
||||
return LinearSpace3f_Constructor(adj.vx / det, adj.vy / det, adj.vz / det);
|
||||
}
|
||||
|
||||
GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct LinearSpace3f xfm, float3 p)
|
||||
{
|
||||
return xfm.vx * p.x + xfm.vy * p.y + xfm.vz * p.z;
|
||||
}
|
||||
|
||||
struct AffineSpace3f
|
||||
{
|
||||
struct LinearSpace3f l;
|
||||
float3 p;
|
||||
};
|
||||
|
||||
GRL_INLINE struct AffineSpace3f AffineSpace3f_Constructor(struct LinearSpace3f l, float3 p)
|
||||
{
|
||||
struct AffineSpace3f out;
|
||||
out.l = l;
|
||||
out.p = p;
|
||||
return out;
|
||||
}
|
||||
|
||||
GRL_INLINE struct AffineSpace3f AffineSpace3f_load_row_major(const float *in)
|
||||
{
|
||||
struct AffineSpace3f out;
|
||||
out.l.vx.x = in[0];
|
||||
out.l.vx.y = in[4];
|
||||
out.l.vx.z = in[8];
|
||||
out.l.vy.x = in[1];
|
||||
out.l.vy.y = in[5];
|
||||
out.l.vy.z = in[9];
|
||||
out.l.vz.x = in[2];
|
||||
out.l.vz.y = in[6];
|
||||
out.l.vz.z = in[10];
|
||||
out.p.x = in[3];
|
||||
out.p.y = in[7];
|
||||
out.p.z = in[11];
|
||||
return out;
|
||||
}
|
||||
|
||||
// Squared ratio of the surface of the oriented (transformed) box to the surface of the axis-aligned box that would contain it.
// The smaller this value is, the more overhead the transformation introduces.
|
||||
GRL_INLINE
|
||||
float transformation_bbox_surf_overhead(const float* Transform)
|
||||
{
|
||||
// We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area
|
||||
// New AABB is center +- Extent.
|
||||
//
|
||||
// For derivation see:
|
||||
// https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/
|
||||
//
|
||||
|
||||
|
||||
// Take a cube of side 1, transform it, and compare the surface of the AABB containing the result with the surface of the transformed (oriented) box itself.
|
||||
float ex = fabs(Transform[0]) + fabs(Transform[1]) + fabs(Transform[2]);
|
||||
float ey = fabs(Transform[4]) + fabs(Transform[5]) + fabs(Transform[6]);
|
||||
float ez = fabs(Transform[8]) + fabs(Transform[9]) + fabs(Transform[10]);
|
||||
|
||||
// we will compare squared sizes
|
||||
ex = ex * ex;
|
||||
ey = ey * ey;
|
||||
ez = ez * ez;
|
||||
|
||||
// surface of aabb containing oriented box;
|
||||
float aabb_sq_half_surf = ex * ey + ey * ez + ez * ex;
|
||||
|
||||
// ^2 lengths of transformed <1,0,0>, <0,1,0>, <0,0,1>
|
||||
float obx = Transform[0] * Transform[0] + Transform[4] * Transform[4] + Transform[8] * Transform[8];
|
||||
float oby = Transform[1] * Transform[1] + Transform[5] * Transform[5] + Transform[9] * Transform[9];
|
||||
float obz = Transform[2] * Transform[2] + Transform[6] * Transform[6] + Transform[10] * Transform[10];
|
||||
|
||||
float obb_sq_half_surf = obx * oby + oby * obz + obz * obx;
|
||||
|
||||
return obb_sq_half_surf / aabb_sq_half_surf;
|
||||
|
||||
// Example (axis-aligned scale by 2, i.e. Transform = diag(2,2,2)):
// ex = ey = ez = 2.0, squared: 4.0
// aabb_sq_half_surf = 4*4 + 4*4 + 4*4 = 48
// obx = oby = obz = 4.0
// obb_sq_half_surf = 4*4 + 4*4 + 4*4 = 48
// ratio = 48 / 48 = 1.0 (no overhead for an axis-aligned transform)
//
// Example (45-degree rotation about Z):
// ex = ey = sqrt(2), ez = 1, squared: 2, 2, 1
// aabb_sq_half_surf = 2*2 + 2*1 + 1*2 = 8
// obx = oby = obz = 1
// obb_sq_half_surf = 1 + 1 + 1 = 3
// ratio = 3 / 8 = 0.375 (the AABB is noticeably larger than the oriented box)
}
|
||||
|
||||
GRL_INLINE void load_row_major_from_AffineSpace3f(struct AffineSpace3f in, float* out)
|
||||
{
|
||||
out[0] = in.l.vx.x;
|
||||
out[4] = in.l.vx.y;
|
||||
out[8] = in.l.vx.z;
|
||||
out[1] = in.l.vy.x;
|
||||
out[5] = in.l.vy.y;
|
||||
out[9] = in.l.vy.z;
|
||||
out[2] = in.l.vz.x;
|
||||
out[6] = in.l.vz.y;
|
||||
out[10] = in.l.vz.z;
|
||||
|
||||
out[3] = in.p.x;
|
||||
out[7] = in.p.y;
|
||||
out[11] = in.p.z;
|
||||
}
|
||||
|
||||
GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct AffineSpace3f xfm, float3 p)
|
||||
{
|
||||
return xfmPoint(xfm.l, p) + xfm.p;
|
||||
}
|
||||
|
||||
/* compute inverse matrix */
|
||||
GRL_INLINE struct AffineSpace3f AffineSpace3f_invert(struct AffineSpace3f in)
|
||||
{
|
||||
const struct LinearSpace3f il = LinearSpace3f_invert(in.l);
|
||||
float3 ip = -xfmPoint(il, in.p);
|
||||
return AffineSpace3f_Constructor(il, ip);
|
||||
}
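
// Usage sketch (illustrative): for a non-degenerate transform, inverting and composing
// should round-trip a point up to floating point error. For some float3 p:
//   struct AffineSpace3f X = AffineSpace3f_load_row_major(Transform); // 3x4 row-major input
//   float3 p2 = xfmPoint(AffineSpace3f_invert(X), xfmPoint(X, p));    // p2 ~= p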
|
||||
|
||||
GRL_NAMESPACE_END(RTAS)
|
||||
GRL_NAMESPACE_END(GRL)
|
||||
|
|
@ -1,186 +0,0 @@
# Copyright © 2021 Intel Corporation
# SPDX-License-Identifier: MIT

grl_lib_files = [
  'gpu/libs/libraries.grl',
]

grl_grl_files = [
  'gpu/build_leaf.grl',
  'gpu/build_primref.grl',
  # 'gpu/build_refit.grl',
  'gpu/copy.grl',
  # 'gpu/grl_api_interface_verify.grl',
  'gpu/misc.grl',
  # 'gpu/morton_builder.grl',
  # 'gpu/msb_radix_bitonic_sort.grl',
  'gpu/new_sah_builder.grl',
  'gpu/postbuild_info.grl',
  # 'gpu/presplit.grl',
  # 'gpu/radix_sort.grl',
  # 'gpu/rebraid.grl',
  # 'gpu/traversal_shader.grl',
]

grl_lib_args = []
foreach libfile : grl_lib_files
  grl_lib_args += '--library'
  grl_lib_args += files(libfile)
endforeach

grl_genX_files = [
  'genX_grl_dispatch.c',
  'genX_grl_uuid.cpp',
]

grl_lib_args = []
foreach libfile : grl_lib_files
  grl_lib_args += '--library'
  grl_lib_args += files(libfile)
endforeach

grl_cl_kernel_h = custom_target(
  'grl_cl_kernel.h',
  input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
  output : 'grl_cl_kernel.h',
  command : [
    prog_python, '@INPUT0@', '--out-h', '@OUTPUT@',
    grl_lib_args, files(grl_grl_files),
  ],
)

has_ply = run_command(
  prog_python, '-c',
  '''
import ply
''', check : false)
if has_ply.returncode() != 0
  error('Python (3.x) ply module required to build GRL kernels.')
endif

r = run_command(prog_python, 'grl_cl_kernel_gen.py',
                grl_lib_args, '--ls-kernels', grl_grl_files, check : false)
assert(r.returncode() == 0, 'Failed to fetch GRL CL kernels')
grl_kernels = r.stdout().strip().split()

grl_metakernel_c = []
grl_metakernel_h = []
foreach grl_file : grl_grl_files
  base_outfile = 'grl_metakernel_' + fs.replace_suffix(fs.name(grl_file), '')
  outfiles = custom_target(
    base_outfile,
    input : ['grl_metakernel_gen.py', grl_file, grl_lib_files],
    output : [base_outfile + '.h', base_outfile + '.c'],
    command : [
      prog_python, '@INPUT0@', '--out-h', '@OUTPUT0@',
      '--out-c', '@OUTPUT1@', grl_lib_args, '@INPUT1@',
    ],
  )
  grl_metakernel_h += outfiles[0]
  grl_metakernel_c += outfiles[1]
endforeach

grl_genX_libs = []
foreach t : [['125', 'gfx125', 'dg2'], ['200', 'gfx20', 'lnl'],
             ['300', 'gfx30', 'ptl'], ]
  verX10 = t[0]
  genX_prefix = t[1]
  platform = t[2]

  grl_compiled_cl_kernels = []
  foreach k : grl_kernels
    # get_cl_files dumps out filename:entrypoint:libfile1,libfile2,libfile3
    cl_file = k.split(':')[0]
    entrypoint = k.split(':')[1]
    library_files = k.split(':')[2]
    kernel_prefix = '_'.join([
      genX_prefix,
      fs.replace_suffix(cl_file, '').replace('gpu/', '').replace('/', '_'),
      entrypoint
    ])
    input_args = [ files(cl_file), ]
    if library_files != ''
      foreach lib_file : library_files.split(',')
        input_args += [ lib_file ]
      endforeach
    endif
    prepended_input_args = []
    foreach input_arg : input_args
      prepended_input_args += ['--in', input_arg]
    endforeach
    outfile = kernel_prefix + '.h'
    grl_compiled_cl_kernels += custom_target(
      outfile,
      input : cl_file,
      output : outfile,
      command : [
        prog_intel_clc, '-p', platform, '--prefix', kernel_prefix,
        '-e', entrypoint, prepended_input_args, '-o', '@OUTPUT@', '--',
        '-cl-std=cl2.0', '-D__OPENCL_VERSION__=200',
        '-DMAX_HW_SIMD_WIDTH=16', '-DMAX_WORKGROUP_SIZE=16',
        '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
        '-I' + join_paths(meson.current_source_dir(), 'gpu'),
        '-I' + join_paths(meson.current_source_dir(), 'include'),
      ],
      env: ['MESA_SHADER_CACHE_DISABLE=true',
            'MESA_SPIRV_LOG_LEVEL=error'],
      depends : dep_prog_intel_clc
    )
  endforeach

  grl_cl_kernel_c = custom_target(
    'grl_@0@_cl_kernel.c'.format(genX_prefix),
    input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
    output : 'grl_@0@_cl_kernel.c'.format(genX_prefix),
    command : [
      prog_python, '@INPUT0@', '--out-c', '@OUTPUT@',
      grl_lib_args, '--prefix', genX_prefix, files(grl_grl_files),
    ],
  )

  grl_genX_libs += static_library(
    'grl_@0@'.format(genX_prefix),
    [grl_cl_kernel_h, grl_compiled_cl_kernels, grl_cl_kernel_c,
     grl_genX_files, grl_metakernel_c, grl_metakernel_h],
    include_directories : [
      inc_include, inc_src,
      inc_intel,
    ],
    c_args : [
      no_override_init_args, sse2_args,
      '-DGFX_VERx10=@0@'.format(verX10),
    ],
    cpp_args : [
      sse2_args,
      '-DGFX_VERx10=@0@'.format(verX10),
    ],
    dependencies : [
      dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers,
      idep_vulkan_runtime_headers, idep_anv_headers, idep_genxml,
    ],
    gnu_symbol_visibility : 'hidden',
  )
endforeach

libgrl_deps = [
  dep_valgrind,
  idep_nir_headers,
  idep_vulkan_util_headers,
  idep_vulkan_wsi_headers,
]

libgrl = static_library(
  'grl',
  [grl_cl_kernel_h],
  include_directories : [
    inc_include, inc_src, inc_intel,
  ],
  link_whole : [grl_genX_libs],
  dependencies : [libgrl_deps, idep_anv_headers],
)
idep_grl = declare_dependency(
  link_with : libgrl,
  dependencies : libgrl_deps,
  sources : [grl_metakernel_h, grl_cl_kernel_h],
  include_directories : include_directories('include', 'gpu'),
)
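To illustrate how the kernel list above is consumed (hypothetical values, not taken from the build): grl_cl_kernel_gen.py --ls-kernels prints one filename:entrypoint:libfiles entry per kernel, so an entry such as

    gpu/misc.cl:copy_instances:gpu/libs/libraries.grl

would, in the gfx125 iteration of the loop, produce kernel_prefix 'gfx125_misc_copy_instances' and an intel_clc invocation writing 'gfx125_misc_copy_instances.h', with the library file passed through an extra '--in' argument alongside the .cl source.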
@ -39,22 +39,10 @@ idep_anv_headers = declare_dependency(

bvh_spv = []
if with_intel_vk_rt
  if with_intel_bvh_grl
    subdir('grl')
    optional_libgrl = [libgrl]
    anv_flags += '-DANV_SUPPORT_RT_GRL=1'
  else
    subdir('bvh')
    idep_grl = null_dep
    optional_libgrl = []
    anv_flags += '-DANV_SUPPORT_RT_GRL=0'
  endif
  subdir('bvh')
  anv_flags += '-DANV_SUPPORT_RT=1'
else
  idep_grl = null_dep
  optional_libgrl = []
  anv_flags += '-DANV_SUPPORT_RT=0'
  anv_flags += '-DANV_SUPPORT_RT_GRL=0'
endif

intel_icd = custom_target(
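The -DANV_SUPPORT_RT define set in this hunk is an ordinary preprocessor switch; a minimal sketch of how such a flag might be consumed (illustrative only, not code from the anv sources):

    /* illustrative only -- not taken from the anv sources */
    #include <stdbool.h>

    #if ANV_SUPPORT_RT
    static const bool anv_has_rt = true;   /* ray-tracing paths compiled in */
    #else
    static const bool anv_has_rt = false;  /* ray-tracing paths compiled out */
    #endif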
@ -111,15 +99,9 @@ anv_per_hw_ver_files = files(
  'genX_simple_shader.c',
)
if with_intel_vk_rt
  if with_intel_bvh_grl
    anv_per_hw_ver_files += files(
      'genX_acceleration_structure_grl.c',
    )
  else
    anv_per_hw_ver_files += files(
      'genX_acceleration_structure.c',
    )
  endif
  anv_per_hw_ver_files += files(
    'genX_acceleration_structure.c',
  )
endif

foreach _gfx_ver : ['90', '110', '120', '125', '200', '300']
@ -135,7 +117,7 @@ foreach _gfx_ver : ['90', '110', '120', '125', '200', '300']
      dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml,
      idep_vulkan_util_headers, idep_vulkan_wsi_headers,
      idep_vulkan_runtime_headers, idep_mesautil,
      idep_intel_driver_ds_headers, idep_grl,
      idep_intel_driver_ds_headers,
      idep_intel_shaders, idep_intel_blorp,
    ],
  )
@ -271,7 +253,7 @@ libvulkan_intel = shared_library(
  include_directories : [
    inc_include, inc_src, inc_intel,
  ],
  link_whole : [libanv_common, libanv_per_hw_ver_libs] + optional_libgrl,
  link_whole : [libanv_common, libanv_per_hw_ver_libs],
  link_with : [
    libisl, libintel_perf,
  ],
@ -313,7 +295,7 @@ if with_tests
      link_with : [
        libanv_per_hw_ver_libs, libintel_common,
        libisl, libintel_perf,
      ] + optional_libgrl,
      ],
      dependencies : [
        dep_thread, dep_dl, dep_m, anv_deps,
        idep_nir, idep_vulkan_util, idep_vulkan_wsi, idep_vulkan_runtime,