gallium/swr: Remove driver source

The OpenSWR driver will be maintained on a classic/LTS branch.

Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11264>
Authored by Jan Zielinski on 2021-06-09 13:19:44 +02:00; committed by Marge Bot
parent d22d328859
commit 855793c6c6
178 changed files with 0 additions and 85594 deletions


@@ -1,64 +0,0 @@
---
Language: Cpp
AccessModifierOffset: -3
AlignAfterOpenBracket: true
AlignEscapedNewlinesLeft: false
AlignOperands: false
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AlwaysBreakAfterDefinitionReturnType: true
AlwaysBreakTemplateDeclarations: false
AlwaysBreakBeforeMultilineStrings: false
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: true
BinPackParameters: false
BinPackArguments: false
ColumnLimit: 78
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 3
DerivePointerAlignment: false
ExperimentalAutoDetectBinPacking: false
IndentCaseLabels: false
IndentWrappedFunctionNames: false
IndentFunctionDeclarationAfterType: false
MaxEmptyLinesToKeep: 2
KeepEmptyLinesAtTheStartOfBlocks: true
NamespaceIndentation: Inner
ObjCBlockIndentWidth: 3
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakString: 1000
PenaltyBreakFirstLessLess: 120
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 0
PointerAlignment: Right
SpacesBeforeTrailingComments: 1
Cpp11BracedListStyle: true
Standard: Cpp11
IndentWidth: 3
TabWidth: 8
UseTab: Never
BreakBeforeBraces: Linux
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpacesInAngles: false
SpaceInEmptyParentheses: false
SpacesInCStyleCastParentheses: false
SpaceAfterCStyleCast: false
SpacesInContainerLiterals: true
SpaceBeforeAssignmentOperators: true
ContinuationIndentWidth: 3
CommentPragmas: '^ IWYU pragma:'
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
SpaceBeforeParens: ControlStatements
DisableFormat: false
...
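For illustration only, a hypothetical snippet (not from the tree) showing roughly how clang-format lays out a function definition under the options above: the return type breaks onto its own line (AlwaysBreakAfterDefinitionReturnType), braces follow the Linux style, pointers bind right, and bodies indent by three spaces.

static uint32_t
CountSetBits(const uint32_t *pMasks, uint32_t count)
{
   uint32_t total = 0;
   for (uint32_t i = 0; i < count; ++i) {
      total += __builtin_popcount(pMasks[i]); // GCC/Clang builtin popcount
   }
   return total;
}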


@@ -1,411 +0,0 @@
# Copyright © 2017-2020 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
files_swr_common = files(
'rasterizer/common/formats.cpp',
'rasterizer/common/formats.h',
'rasterizer/common/intrin.h',
'rasterizer/common/isa.hpp',
'rasterizer/common/os.cpp',
'rasterizer/common/os.h',
'rasterizer/common/rdtsc_buckets.cpp',
'rasterizer/common/rdtsc_buckets.h',
'rasterizer/common/rdtsc_buckets_shared.h',
'rasterizer/common/simd16intrin.h',
'rasterizer/common/simdintrin.h',
'rasterizer/common/simdlib.hpp',
'rasterizer/common/simdlib_interface.hpp',
'rasterizer/common/simdlib_types.hpp',
'rasterizer/common/swr_assert.cpp',
'rasterizer/common/swr_assert.h',
)
files_swr_mesa = files(
'swr_loader.cpp',
'swr_clear.cpp',
'swr_context.cpp',
'swr_context.h',
'swr_draw.cpp',
'swr_public.h',
'swr_resource.h',
'swr_screen.cpp',
'swr_screen.h',
'swr_state.cpp',
'swr_state.h',
'swr_tex_sample.cpp',
'swr_tex_sample.h',
'swr_scratch.h',
'swr_scratch.cpp',
'swr_shader.cpp',
'swr_shader.h',
'swr_memory.h',
'swr_fence.h',
'swr_fence.cpp',
'swr_fence_work.h',
'swr_fence_work.cpp',
'swr_query.h',
'swr_query.cpp',
'rasterizer/jitter/blend_jit.cpp',
'rasterizer/jitter/blend_jit.h',
'rasterizer/jitter/builder.cpp',
'rasterizer/jitter/builder.h',
'rasterizer/jitter/builder_math.h',
'rasterizer/jitter/builder_mem.cpp',
'rasterizer/jitter/builder_mem.h',
'rasterizer/jitter/builder_gfx_mem.cpp',
'rasterizer/jitter/builder_gfx_mem.h',
'rasterizer/jitter/builder_misc.cpp',
'rasterizer/jitter/builder_misc.h',
'rasterizer/jitter/fetch_jit.cpp',
'rasterizer/jitter/fetch_jit.h',
'rasterizer/jitter/jit_api.h',
'rasterizer/jitter/JitManager.cpp',
'rasterizer/jitter/JitManager.h',
'rasterizer/jitter/streamout_jit.cpp',
'rasterizer/jitter/streamout_jit.h',
'rasterizer/jitter/shader_lib/DebugOutput.cpp',
'rasterizer/jitter/shader_lib/Scatter.cpp',
'rasterizer/jitter/functionpasses/lower_x86.cpp',
'rasterizer/memory/SurfaceState.h'
)
files_swr_arch = files(
'rasterizer/archrast/archrast.cpp',
'rasterizer/archrast/archrast.h',
'rasterizer/archrast/eventmanager.h',
'rasterizer/core/api.cpp',
'rasterizer/core/api.h',
'rasterizer/core/arena.h',
'rasterizer/core/backend.cpp',
'rasterizer/core/backend_clear.cpp',
'rasterizer/core/backend_sample.cpp',
'rasterizer/core/backend_singlesample.cpp',
'rasterizer/core/backend.h',
'rasterizer/core/backend_impl.h',
'rasterizer/core/binner.cpp',
'rasterizer/core/binner.h',
'rasterizer/core/blend.h',
'rasterizer/core/clip.cpp',
'rasterizer/core/clip.h',
'rasterizer/core/conservativeRast.h',
'rasterizer/core/context.h',
'rasterizer/core/depthstencil.h',
'rasterizer/core/fifo.hpp',
'rasterizer/core/format_conversion.h',
'rasterizer/core/format_traits.h',
'rasterizer/core/format_types.h',
'rasterizer/core/format_utils.h',
'rasterizer/core/frontend.cpp',
'rasterizer/core/frontend.h',
'rasterizer/core/knobs.h',
'rasterizer/core/knobs_init.h',
'rasterizer/core/multisample.h',
'rasterizer/core/pa_avx.cpp',
'rasterizer/core/pa.h',
'rasterizer/core/rasterizer.cpp',
'rasterizer/core/rasterizer.h',
'rasterizer/core/rasterizer_impl.h',
'rasterizer/core/rdtsc_core.cpp',
'rasterizer/core/rdtsc_core.h',
'rasterizer/core/ringbuffer.h',
'rasterizer/core/state.h',
'rasterizer/core/state_funcs.h',
'rasterizer/core/tessellator.h',
'rasterizer/core/tessellator.hpp',
'rasterizer/core/tessellator.cpp',
'rasterizer/core/threads.cpp',
'rasterizer/core/threads.h',
'rasterizer/core/tilemgr.cpp',
'rasterizer/core/tilemgr.h',
'rasterizer/core/tileset.h',
'rasterizer/core/utils.h',
'rasterizer/memory/ClearTile.cpp',
'rasterizer/memory/Convert.h',
'rasterizer/memory/LoadTile.cpp',
'rasterizer/memory/LoadTile.h',
'rasterizer/memory/LoadTile_Linear.cpp',
'rasterizer/memory/LoadTile_TileX.cpp',
'rasterizer/memory/LoadTile_TileY.cpp',
'rasterizer/memory/StoreTile.cpp',
'rasterizer/memory/StoreTile.h',
'rasterizer/memory/StoreTile_Linear2.cpp',
'rasterizer/memory/StoreTile_Linear.cpp',
'rasterizer/memory/StoreTile_TileW.cpp',
'rasterizer/memory/StoreTile_TileX2.cpp',
'rasterizer/memory/StoreTile_TileX.cpp',
'rasterizer/memory/StoreTile_TileY2.cpp',
'rasterizer/memory/StoreTile_TileY.cpp',
'rasterizer/memory/TilingFunctions.h',
'rasterizer/memory/tilingtraits.h',
'rasterizer/memory/InitMemory.h',
'rasterizer/memory/InitMemory.cpp',
'rasterizer/memory/SurfaceState.h'
)
swr_context_files = files('swr_context.h')
swr_state_files = files('rasterizer/core/state.h')
swr_surf_state_files = files('rasterizer/memory/SurfaceState.h')
swr_event_proto_files = files('rasterizer/archrast/events.proto')
swr_event_pproto_files = files('rasterizer/archrast/events_private.proto')
swr_gen_backend_files = files('rasterizer/codegen/templates/gen_backend.cpp')
swr_gen_rasterizer_files = files('rasterizer/codegen/templates/gen_rasterizer.cpp')
swr_gen_header_init_files = files('rasterizer/codegen/templates/gen_header_init.hpp')
swr_gen_llvm_ir_macros_py = files('rasterizer/codegen/gen_llvm_ir_macros.py')
swr_gen_backends_py = files('rasterizer/codegen/gen_backends.py')
swr_gen_builder_depends = files(
'rasterizer/codegen/templates/gen_builder.hpp',
'rasterizer/codegen/gen_common.py'
)
subdir('rasterizer/jitter')
subdir('rasterizer/codegen')
subdir('rasterizer/core/backends')
swr_incs = include_directories(
'rasterizer/codegen', 'rasterizer/core', 'rasterizer/jitter',
'rasterizer/archrast', 'rasterizer',
)
swr_cpp_args = []
if cpp.has_argument('-fno-strict-aliasing')
swr_cpp_args += '-fno-strict-aliasing'
endif
if cpp.has_argument('-Wno-aligned-new')
swr_cpp_args += '-Wno-aligned-new'
endif
swr_arch_libs = []
swr_defines = []
swr_avx_args = cpp.first_supported_argument(
'-target-cpu=sandybridge', '-mavx', '-march=core-avx', '-tp=sandybridge',
'/arch:AVX',
)
if swr_avx_args == []
error('Cannot find AVX support for swr. (These flags are required for SWR on all architectures.)')
endif
shared_swr = get_option('shared-swr')
if not shared_swr
if with_swr_arches.length() > 1
error('When SWR is linked statically only one architecture is allowed.')
endif
swr_defines += '-DHAVE_SWR_BUILTIN'
endif
if with_swr_arches.contains('skx')
swr_skx_args = cpp.first_supported_argument(
'-march=skylake-avx512', '-target-cpu=x86-skylake', '-xCORE-AVX512',
)
if swr_skx_args == []
error('Cannot find SKX support for swr.')
endif
swr_defines += '-DHAVE_SWR_SKX'
if shared_swr
swr_arch_libs += shared_library(
'swrSKX',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_skx_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX512',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
version : '0.0.0',
soversion : host_machine.system() == 'windows' ? '' : '0',
install : true,
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
)
else
swr_arch_libs += static_library(
'swrSKX',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_skx_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX512',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
)
endif
endif
if with_swr_arches.contains('knl')
swr_knl_args = cpp.first_supported_argument(
'-march=knl', '-target-cpu=mic-knl', '-xMIC-AVX512',
)
if swr_knl_args == []
error('Cannot find KNL support for swr.')
endif
swr_defines += '-DHAVE_SWR_KNL'
if shared_swr
swr_arch_libs += shared_library(
'swrKNL',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_knl_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
version : '0.0.0',
soversion : host_machine.system() == 'windows' ? '' : '0',
install : true,
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
)
else
swr_arch_libs += static_library(
'swrKNL',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_knl_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
)
endif
endif
if with_swr_arches.contains('avx2')
swr_avx2_args = cpp.first_supported_argument(
'-target-cpu=haswell', '-march=core-avx2', '-tp=haswell', '/arch:AVX2',
)
if swr_avx2_args == []
if cpp.has_argument(['-mavx2', '-mfma', '-mbmi2', '-mf16c'])
swr_avx2_args = ['-mavx2', '-mfma', '-mbmi2', '-mf16c']
else
error('Cannot find AVX2 support for swr.')
endif
endif
swr_defines += '-DHAVE_SWR_AVX2'
if shared_swr
swr_arch_libs += shared_library(
'swrAVX2',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX2',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
version : '0.0.0',
soversion : host_machine.system() == 'windows' ? '' : '0',
install : true,
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
)
else
swr_arch_libs += static_library(
'swrAVX2',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX2',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
)
endif
endif
if with_swr_arches.contains('avx')
swr_defines += '-DHAVE_SWR_AVX'
if shared_swr
swr_arch_libs += shared_library(
'swrAVX',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
version : '0.0.0',
soversion : host_machine.system() == 'windows' ? '' : '0',
install : true,
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
)
else
swr_arch_libs += static_library(
'swrAVX',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
)
endif
endif
if swr_arch_libs == []
error('SWR configured, but no SWR architectures configured')
endif
# The swr_avx_args are needed for intrinsic usage in the swr api headers.
libmesaswr = static_library(
'mesaswr',
[files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp,
gen_builder_hpp, gen_builder_meta_hpp, gen_builder_intrin_hpp],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
swr_defines,
],
gnu_symbol_visibility : 'hidden',
include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, swr_incs],
dependencies : [dep_llvm, idep_mesautil],
)
link_libs = [libmesaswr]
if not shared_swr
link_libs += swr_arch_libs
endif
driver_swr = declare_dependency(
compile_args : '-DGALLIUM_SWR',
link_with : link_libs
)


@@ -1,8 +0,0 @@
((prog-mode
(c-basic-offset . 4)
(c-file-style . "k&r")
(fill-column . 78)
(indent-tabs-mode . nil)
(show-trailing-whitespace . t)
)
)


@@ -1,114 +0,0 @@
---
Language: Cpp
# BasedOnStyle: LLVM
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: true
AfterControlStatement: true
AfterEnum: true
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: true
AfterStruct: true
AfterUnion: true
#AfterExternBlock: false
BeforeCatch: true
BeforeElse: true
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: AfterColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
#IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
- Regex: '^(<|"(gtest|gmock|isl|json)/)'
Priority: 3
- Regex: '.*'
Priority: 1
IncludeIsMainRegex: '(Test)?$'
IndentCaseLabels: false
#IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: All
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
#RawStringFormats:
# - Delimiter: pb
# Language: TextProto
# BasedOnStyle: google
ReflowComments: true
SortIncludes: false
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 4
UseTab: Never
...


@@ -1,708 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file archrast.cpp
*
* @brief Implementation for archrast.
*
******************************************************************************/
#include <sys/stat.h>
#include <atomic>
#include <map>
#include "common/os.h"
#include "archrast/archrast.h"
#include "archrast/eventmanager.h"
#include "gen_ar_event.hpp"
#include "gen_ar_eventhandlerfile.hpp"
namespace ArchRast
{
//////////////////////////////////////////////////////////////////////////
/// @brief struct that keeps track of depth and stencil event information
struct DepthStencilStats
{
uint32_t earlyZTestPassCount = 0;
uint32_t earlyZTestFailCount = 0;
uint32_t lateZTestPassCount = 0;
uint32_t lateZTestFailCount = 0;
uint32_t earlyStencilTestPassCount = 0;
uint32_t earlyStencilTestFailCount = 0;
uint32_t lateStencilTestPassCount = 0;
uint32_t lateStencilTestFailCount = 0;
};
struct CStats
{
uint32_t trivialRejectCount;
uint32_t trivialAcceptCount;
uint32_t mustClipCount;
};
struct TEStats
{
uint32_t inputPrims = 0;
//@todo: Change this to numPatches. Assumed: 1 patch per prim. If that holds, it's fine.
};
struct GSStateInfo
{
uint32_t inputPrimCount;
uint32_t primGeneratedCount;
uint32_t vertsInput;
};
struct RastStats
{
uint32_t rasterTiles = 0;
};
struct CullStats
{
uint32_t degeneratePrimCount = 0;
uint32_t backfacePrimCount = 0;
};
struct AlphaStats
{
uint32_t alphaTestCount = 0;
uint32_t alphaBlendCount = 0;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Event handler that handles API thread events. This is shared
/// between the API and its caller (e.g. the driver shim), but typically
/// there is only a single API thread per context, so state can be saved
/// in the class and reused when handling later events.
class EventHandlerApiStats : public EventHandlerFile
{
public:
EventHandlerApiStats(uint32_t id) : EventHandlerFile(id)
{
#if defined(_WIN32)
// Attempt to copy the events.proto file to the ArchRast output dir. It's common for
// tools to place the events.proto file in the DEBUG_OUTPUT_DIR when launching AR. If it
// exists, this will attempt to copy it the first time we get here to package it with
// the stats. Otherwise, the user would need to specify the events.proto location when
// parsing the stats in post.
std::stringstream eventsProtoSrcFilename, eventsProtoDstFilename;
eventsProtoSrcFilename << KNOB_DEBUG_OUTPUT_DIR << "\\events.proto" << std::ends;
eventsProtoDstFilename << mOutputDir.substr(0, mOutputDir.size() - 1)
<< "\\events.proto" << std::ends;
// If events.proto already exists at the destination, we're done; else do the copy
struct stat buf; // Use a POSIX stat for the file-existence check
if (stat(eventsProtoDstFilename.str().c_str(), &buf) != 0)
{
// Now check to make sure the events.proto source exists
if (stat(eventsProtoSrcFilename.str().c_str(), &buf) == 0)
{
std::ifstream srcFile;
srcFile.open(eventsProtoSrcFilename.str().c_str(), std::ios::binary);
if (srcFile.is_open())
{
// Just do a binary buffer copy
std::ofstream dstFile;
dstFile.open(eventsProtoDstFilename.str().c_str(), std::ios::binary);
dstFile << srcFile.rdbuf();
dstFile.close();
}
srcFile.close();
}
}
#endif
}
virtual void Handle(const DrawInstancedEvent& event)
{
DrawInfoEvent e(event.data.drawId,
ArchRast::Instanced,
event.data.topology,
event.data.numVertices,
0,
0,
event.data.startVertex,
event.data.numInstances,
event.data.startInstance,
event.data.tsEnable,
event.data.gsEnable,
event.data.soEnable,
event.data.soTopology,
event.data.splitId);
EventHandlerFile::Handle(e);
}
virtual void Handle(const DrawIndexedInstancedEvent& event)
{
DrawInfoEvent e(event.data.drawId,
ArchRast::IndexedInstanced,
event.data.topology,
0,
event.data.numIndices,
event.data.indexOffset,
event.data.baseVertex,
event.data.numInstances,
event.data.startInstance,
event.data.tsEnable,
event.data.gsEnable,
event.data.soEnable,
event.data.soTopology,
event.data.splitId);
EventHandlerFile::Handle(e);
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Event handler that handles worker thread events. There is one
/// event handler per thread. The python script will need to sum
/// up counters across all of the threads.
class EventHandlerWorkerStats : public EventHandlerFile
{
public:
EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false)
{
memset(mShaderStats, 0, sizeof(mShaderStats));
}
virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
{
// earlyZ test compute
mDSSingleSample.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSSingleSample.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test compute
mDSSingleSample.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSSingleSample.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
// earlyZ test single and multi sample
mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSCombined.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test single and multi sample
mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSCombined.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const EarlyDepthStencilInfoSampleRate& event)
{
// earlyZ test compute
mDSSampleRate.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSSampleRate.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test compute
mDSSampleRate.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSSampleRate.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
// earlyZ test single and multi sample
mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSCombined.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test single and multi sample
mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSCombined.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const EarlyDepthStencilInfoNullPS& event)
{
// earlyZ test compute
mDSNullPS.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSNullPS.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test compute
mDSNullPS.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSNullPS.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const LateDepthStencilInfoSingleSample& event)
{
// lateZ test compute
mDSSingleSample.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSSingleSample.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test compute
mDSSingleSample.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSSingleSample.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
// lateZ test single and multi sample
mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSCombined.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test single and multi sample
mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSCombined.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const LateDepthStencilInfoSampleRate& event)
{
// lateZ test compute
mDSSampleRate.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSSampleRate.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test compute
mDSSampleRate.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSSampleRate.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
// lateZ test single and multi sample
mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSCombined.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test single and multi sample
mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSCombined.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const LateDepthStencilInfoNullPS& event)
{
// lateZ test compute
mDSNullPS.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSNullPS.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test compute
mDSNullPS.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSNullPS.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const EarlyDepthInfoPixelRate& event)
{
// earlyZ test compute
mDSPixelRate.earlyZTestPassCount += event.data.depthPassCount;
mDSPixelRate.earlyZTestFailCount +=
(_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
mNeedFlush = true;
}
virtual void Handle(const LateDepthInfoPixelRate& event)
{
// lateZ test compute
mDSPixelRate.lateZTestPassCount += event.data.depthPassCount;
mDSPixelRate.lateZTestFailCount +=
(_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
mNeedFlush = true;
}
virtual void Handle(const ClipInfoEvent& event)
{
mClipper.mustClipCount += _mm_popcnt_u32(event.data.clipMask);
mClipper.trivialRejectCount +=
event.data.numInvocations - _mm_popcnt_u32(event.data.validMask);
mClipper.trivialAcceptCount +=
_mm_popcnt_u32(event.data.validMask & ~event.data.clipMask);
}
void UpdateStats(SWR_SHADER_STATS* pStatTotals, const SWR_SHADER_STATS* pStatUpdate)
{
pStatTotals->numInstExecuted += pStatUpdate->numInstExecuted;
pStatTotals->numSampleExecuted += pStatUpdate->numSampleExecuted;
pStatTotals->numSampleLExecuted += pStatUpdate->numSampleLExecuted;
pStatTotals->numSampleBExecuted += pStatUpdate->numSampleBExecuted;
pStatTotals->numSampleCExecuted += pStatUpdate->numSampleCExecuted;
pStatTotals->numSampleCLZExecuted += pStatUpdate->numSampleCLZExecuted;
pStatTotals->numSampleCDExecuted += pStatUpdate->numSampleCDExecuted;
pStatTotals->numGather4Executed += pStatUpdate->numGather4Executed;
pStatTotals->numGather4CExecuted += pStatUpdate->numGather4CExecuted;
pStatTotals->numGather4CPOExecuted += pStatUpdate->numGather4CPOExecuted;
pStatTotals->numGather4CPOCExecuted += pStatUpdate->numGather4CPOCExecuted;
pStatTotals->numLodExecuted += pStatUpdate->numLodExecuted;
}
virtual void Handle(const VSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_VERTEX], pStats);
}
virtual void Handle(const GSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_GEOMETRY], pStats);
}
virtual void Handle(const DSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_DOMAIN], pStats);
}
virtual void Handle(const HSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_HULL], pStats);
}
virtual void Handle(const PSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_PIXEL], pStats);
mNeedFlush = true;
}
virtual void Handle(const CSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_COMPUTE], pStats);
mNeedFlush = true;
}
// Flush cached events for this draw
virtual void FlushDraw(uint32_t drawId)
{
if (mNeedFlush == false)
return;
EventHandlerFile::Handle(PSInfo(drawId,
mShaderStats[SHADER_PIXEL].numInstExecuted,
mShaderStats[SHADER_PIXEL].numSampleExecuted,
mShaderStats[SHADER_PIXEL].numSampleLExecuted,
mShaderStats[SHADER_PIXEL].numSampleBExecuted,
mShaderStats[SHADER_PIXEL].numSampleCExecuted,
mShaderStats[SHADER_PIXEL].numSampleCLZExecuted,
mShaderStats[SHADER_PIXEL].numSampleCDExecuted,
mShaderStats[SHADER_PIXEL].numGather4Executed,
mShaderStats[SHADER_PIXEL].numGather4CExecuted,
mShaderStats[SHADER_PIXEL].numGather4CPOExecuted,
mShaderStats[SHADER_PIXEL].numGather4CPOCExecuted,
mShaderStats[SHADER_PIXEL].numLodExecuted));
EventHandlerFile::Handle(CSInfo(drawId,
mShaderStats[SHADER_COMPUTE].numInstExecuted,
mShaderStats[SHADER_COMPUTE].numSampleExecuted,
mShaderStats[SHADER_COMPUTE].numSampleLExecuted,
mShaderStats[SHADER_COMPUTE].numSampleBExecuted,
mShaderStats[SHADER_COMPUTE].numSampleCExecuted,
mShaderStats[SHADER_COMPUTE].numSampleCLZExecuted,
mShaderStats[SHADER_COMPUTE].numSampleCDExecuted,
mShaderStats[SHADER_COMPUTE].numGather4Executed,
mShaderStats[SHADER_COMPUTE].numGather4CExecuted,
mShaderStats[SHADER_COMPUTE].numGather4CPOExecuted,
mShaderStats[SHADER_COMPUTE].numGather4CPOCExecuted,
mShaderStats[SHADER_COMPUTE].numLodExecuted));
// singleSample
EventHandlerFile::Handle(EarlyZSingleSample(
drawId, mDSSingleSample.earlyZTestPassCount, mDSSingleSample.earlyZTestFailCount));
EventHandlerFile::Handle(LateZSingleSample(
drawId, mDSSingleSample.lateZTestPassCount, mDSSingleSample.lateZTestFailCount));
EventHandlerFile::Handle(
EarlyStencilSingleSample(drawId,
mDSSingleSample.earlyStencilTestPassCount,
mDSSingleSample.earlyStencilTestFailCount));
EventHandlerFile::Handle(
LateStencilSingleSample(drawId,
mDSSingleSample.lateStencilTestPassCount,
mDSSingleSample.lateStencilTestFailCount));
// sampleRate
EventHandlerFile::Handle(EarlyZSampleRate(
drawId, mDSSampleRate.earlyZTestPassCount, mDSSampleRate.earlyZTestFailCount));
EventHandlerFile::Handle(LateZSampleRate(
drawId, mDSSampleRate.lateZTestPassCount, mDSSampleRate.lateZTestFailCount));
EventHandlerFile::Handle(
EarlyStencilSampleRate(drawId,
mDSSampleRate.earlyStencilTestPassCount,
mDSSampleRate.earlyStencilTestFailCount));
EventHandlerFile::Handle(LateStencilSampleRate(drawId,
mDSSampleRate.lateStencilTestPassCount,
mDSSampleRate.lateStencilTestFailCount));
// combined
EventHandlerFile::Handle(
EarlyZ(drawId, mDSCombined.earlyZTestPassCount, mDSCombined.earlyZTestFailCount));
EventHandlerFile::Handle(
LateZ(drawId, mDSCombined.lateZTestPassCount, mDSCombined.lateZTestFailCount));
EventHandlerFile::Handle(EarlyStencil(drawId,
mDSCombined.earlyStencilTestPassCount,
mDSCombined.earlyStencilTestFailCount));
EventHandlerFile::Handle(LateStencil(drawId,
mDSCombined.lateStencilTestPassCount,
mDSCombined.lateStencilTestFailCount));
// pixelRate
EventHandlerFile::Handle(EarlyZPixelRate(
drawId, mDSPixelRate.earlyZTestPassCount, mDSPixelRate.earlyZTestFailCount));
EventHandlerFile::Handle(LateZPixelRate(
drawId, mDSPixelRate.lateZTestPassCount, mDSPixelRate.lateZTestFailCount));
// NullPS
EventHandlerFile::Handle(
EarlyZNullPS(drawId, mDSNullPS.earlyZTestPassCount, mDSNullPS.earlyZTestFailCount));
EventHandlerFile::Handle(EarlyStencilNullPS(
drawId, mDSNullPS.earlyStencilTestPassCount, mDSNullPS.earlyStencilTestFailCount));
// Rasterized Subspans
EventHandlerFile::Handle(RasterTiles(drawId, rastStats.rasterTiles));
// Alpha Subspans
EventHandlerFile::Handle(
AlphaEvent(drawId, mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount));
// Primitive Culling
EventHandlerFile::Handle(
CullEvent(drawId, mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount));
mDSSingleSample = {};
mDSSampleRate = {};
mDSCombined = {};
mDSPixelRate = {};
mDSNullPS = {};
rastStats = {};
mCullStats = {};
mAlphaStats = {};
mShaderStats[SHADER_PIXEL] = {};
mShaderStats[SHADER_COMPUTE] = {};
mNeedFlush = false;
}
virtual void Handle(const FrontendDrawEndEvent& event)
{
// Clipper
EventHandlerFile::Handle(ClipperEvent(event.data.drawId,
mClipper.trivialRejectCount,
mClipper.trivialAcceptCount,
mClipper.mustClipCount));
// Tessellator
EventHandlerFile::Handle(TessPrims(event.data.drawId, mTS.inputPrims));
// Geometry Shader
EventHandlerFile::Handle(GSInputPrims(event.data.drawId, mGS.inputPrimCount));
EventHandlerFile::Handle(GSPrimsGen(event.data.drawId, mGS.primGeneratedCount));
EventHandlerFile::Handle(GSVertsInput(event.data.drawId, mGS.vertsInput));
EventHandlerFile::Handle(VSInfo(event.data.drawId,
mShaderStats[SHADER_VERTEX].numInstExecuted,
mShaderStats[SHADER_VERTEX].numSampleExecuted,
mShaderStats[SHADER_VERTEX].numSampleLExecuted,
mShaderStats[SHADER_VERTEX].numSampleBExecuted,
mShaderStats[SHADER_VERTEX].numSampleCExecuted,
mShaderStats[SHADER_VERTEX].numSampleCLZExecuted,
mShaderStats[SHADER_VERTEX].numSampleCDExecuted,
mShaderStats[SHADER_VERTEX].numGather4Executed,
mShaderStats[SHADER_VERTEX].numGather4CExecuted,
mShaderStats[SHADER_VERTEX].numGather4CPOExecuted,
mShaderStats[SHADER_VERTEX].numGather4CPOCExecuted,
mShaderStats[SHADER_VERTEX].numLodExecuted));
EventHandlerFile::Handle(HSInfo(event.data.drawId,
mShaderStats[SHADER_HULL].numInstExecuted,
mShaderStats[SHADER_HULL].numSampleExecuted,
mShaderStats[SHADER_HULL].numSampleLExecuted,
mShaderStats[SHADER_HULL].numSampleBExecuted,
mShaderStats[SHADER_HULL].numSampleCExecuted,
mShaderStats[SHADER_HULL].numSampleCLZExecuted,
mShaderStats[SHADER_HULL].numSampleCDExecuted,
mShaderStats[SHADER_HULL].numGather4Executed,
mShaderStats[SHADER_HULL].numGather4CExecuted,
mShaderStats[SHADER_HULL].numGather4CPOExecuted,
mShaderStats[SHADER_HULL].numGather4CPOCExecuted,
mShaderStats[SHADER_HULL].numLodExecuted));
EventHandlerFile::Handle(DSInfo(event.data.drawId,
mShaderStats[SHADER_DOMAIN].numInstExecuted,
mShaderStats[SHADER_DOMAIN].numSampleExecuted,
mShaderStats[SHADER_DOMAIN].numSampleLExecuted,
mShaderStats[SHADER_DOMAIN].numSampleBExecuted,
mShaderStats[SHADER_DOMAIN].numSampleCExecuted,
mShaderStats[SHADER_DOMAIN].numSampleCLZExecuted,
mShaderStats[SHADER_DOMAIN].numSampleCDExecuted,
mShaderStats[SHADER_DOMAIN].numGather4Executed,
mShaderStats[SHADER_DOMAIN].numGather4CExecuted,
mShaderStats[SHADER_DOMAIN].numGather4CPOExecuted,
mShaderStats[SHADER_DOMAIN].numGather4CPOCExecuted,
mShaderStats[SHADER_DOMAIN].numLodExecuted));
EventHandlerFile::Handle(GSInfo(event.data.drawId,
mShaderStats[SHADER_GEOMETRY].numInstExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleLExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleBExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleCExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleCLZExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleCDExecuted,
mShaderStats[SHADER_GEOMETRY].numGather4Executed,
mShaderStats[SHADER_GEOMETRY].numGather4CExecuted,
mShaderStats[SHADER_GEOMETRY].numGather4CPOExecuted,
mShaderStats[SHADER_GEOMETRY].numGather4CPOCExecuted,
mShaderStats[SHADER_GEOMETRY].numLodExecuted));
mShaderStats[SHADER_VERTEX] = {};
mShaderStats[SHADER_HULL] = {};
mShaderStats[SHADER_DOMAIN] = {};
mShaderStats[SHADER_GEOMETRY] = {};
// Reset Internal Counters
mClipper = {};
mTS = {};
mGS = {};
}
virtual void Handle(const GSPrimInfo& event)
{
mGS.inputPrimCount += event.data.inputPrimCount;
mGS.primGeneratedCount += event.data.primGeneratedCount;
mGS.vertsInput += event.data.vertsInput;
}
virtual void Handle(const TessPrimCount& event) { mTS.inputPrims += event.data.primCount; }
virtual void Handle(const RasterTileCount& event)
{
rastStats.rasterTiles += event.data.rasterTiles;
}
virtual void Handle(const CullInfoEvent& event)
{
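// Note: for any masks v and m, v ^ (v & ~m) == v & m, so each count below
// is simply "valid primitives whose degenerate/backface bit is also set".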
mCullStats.degeneratePrimCount += _mm_popcnt_u32(
event.data.validMask ^ (event.data.validMask & ~event.data.degeneratePrimMask));
mCullStats.backfacePrimCount += _mm_popcnt_u32(
event.data.validMask ^ (event.data.validMask & ~event.data.backfacePrimMask));
}
virtual void Handle(const AlphaInfoEvent& event)
{
mAlphaStats.alphaTestCount += event.data.alphaTestEnable;
mAlphaStats.alphaBlendCount += event.data.alphaBlendEnable;
}
protected:
bool mNeedFlush;
// Per draw stats
DepthStencilStats mDSSingleSample = {};
DepthStencilStats mDSSampleRate = {};
DepthStencilStats mDSPixelRate = {};
DepthStencilStats mDSCombined = {};
DepthStencilStats mDSNullPS = {};
DepthStencilStats mDSOmZ = {};
CStats mClipper = {};
TEStats mTS = {};
GSStateInfo mGS = {};
RastStats rastStats = {};
CullStats mCullStats = {};
AlphaStats mAlphaStats = {};
SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES];
};
static EventManager* FromHandle(HANDLE hThreadContext)
{
return reinterpret_cast<EventManager*>(hThreadContext);
}
// Construct an event manager and associate a handler with it.
HANDLE CreateThreadContext(AR_THREAD type)
{
// Can we assume single threaded here?
static std::atomic<uint32_t> counter(0);
uint32_t id = counter.fetch_add(1);
EventManager* pManager = new EventManager();
if (pManager)
{
EventHandlerFile* pHandler = nullptr;
if (type == AR_THREAD::API)
{
pHandler = new EventHandlerApiStats(id);
pManager->Attach(pHandler);
pHandler->Handle(ThreadStartApiEvent());
}
else
{
pHandler = new EventHandlerWorkerStats(id);
pManager->Attach(pHandler);
pHandler->Handle(ThreadStartWorkerEvent());
}
pHandler->MarkHeader();
return pManager;
}
SWR_INVALID("Failed to register thread.");
return nullptr;
}
void DestroyThreadContext(HANDLE hThreadContext)
{
EventManager* pManager = FromHandle(hThreadContext);
SWR_ASSERT(pManager != nullptr);
delete pManager;
}
// Dispatch event for this thread.
void Dispatch(HANDLE hThreadContext, const Event& event)
{
if (event.IsEnabled())
{
EventManager* pManager = reinterpret_cast<EventManager*>(hThreadContext);
SWR_ASSERT(pManager != nullptr);
pManager->Dispatch(event);
}
}
// Flush for this thread.
void FlushDraw(HANDLE hThreadContext, uint32_t drawId)
{
EventManager* pManager = FromHandle(hThreadContext);
SWR_ASSERT(pManager != nullptr);
pManager->FlushDraw(drawId);
}
} // namespace ArchRast
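Taken together, the exported functions above form a small per-thread lifecycle API. The sketch below shows a hypothetical caller; the FrameEndEvent positional constructor is an assumption (following how DrawInfoEvent is constructed above), and none of this is verbatim driver code.

// Create one ArchRast context per API thread at startup.
HANDLE hCtx = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);

// Per draw/frame: dispatch generated events; disabled events are filtered
// inside Dispatch() via event.IsEnabled().
uint32_t frameId = 0, nextDrawId = 1, drawId = 0; // placeholder values
ArchRast::Dispatch(hCtx, ArchRast::FrameEndEvent(frameId, nextDrawId));

// At draw boundaries, let handlers emit their cached per-draw counters.
ArchRast::FlushDraw(hCtx, drawId);

// At shutdown, destroy the context (the EventManager deletes its handlers).
ArchRast::DestroyThreadContext(hCtx);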


@@ -1,49 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file archrast.h
*
* @brief Definitions for archrast.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "gen_ar_event.hpp"
#include "eventmanager.h"
namespace ArchRast
{
enum class AR_THREAD
{
API = 0,
WORKER = 1
};
HANDLE CreateThreadContext(AR_THREAD type);
void DestroyThreadContext(HANDLE hThreadContext);
// Dispatch event for this thread.
void Dispatch(HANDLE hThreadContext, const Event& event);
void FlushDraw(HANDLE hThreadContext, uint32_t drawId);
}; // namespace ArchRast


@@ -1,88 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file eventmanager.h
*
* @brief Definitions for the event manager.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "gen_ar_event.hpp"
#include "gen_ar_eventhandler.hpp"
#include <vector>
namespace ArchRast
{
//////////////////////////////////////////////////////////////////////////
/// EventManager - interface to dispatch events to handlers.
/// Event handling occurs only on a single thread.
//////////////////////////////////////////////////////////////////////////
class EventManager
{
public:
EventManager() {}
~EventManager()
{
// Event manager owns destroying handler objects once attached.
///@note See comment for Detach.
for (auto pHandler : mHandlers)
{
delete pHandler;
}
}
void Attach(EventHandler* pHandler)
{
SWR_ASSERT(pHandler != nullptr);
mHandlers.push_back(pHandler);
}
void Dispatch(const Event& event)
{
///@todo Add event filter check here.
for (auto pHandler : mHandlers)
{
event.Accept(pHandler);
}
}
void FlushDraw(uint32_t drawId)
{
for (auto pHandler : mHandlers)
{
pHandler->FlushDraw(drawId);
}
}
private:
// Handlers stay registered for life
void Detach(EventHandler* pHandler) { SWR_INVALID("Should not be called"); }
std::vector<EventHandler*> mHandlers;
};
}; // namespace ArchRast
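Dispatch() above relies on double dispatch: each concrete event overrides Accept() to invoke the handler's Handle() overload for its own type. The generated headers (gen_ar_event.hpp, produced by the codegen scripts referenced in meson.build) are not part of this hunk, so the following is only a guess at the shape they take, with a hypothetical member layout.

// Hypothetical sketch of a generated event class; the real code is generated
// from events.proto by rasterizer/codegen, and the details here are assumed.
class SwrSyncEvent : public Event
{
public:
    struct EventData { uint32_t drawId; } data;
    explicit SwrSyncEvent(uint32_t drawId) { data.drawId = drawId; }
    virtual void Accept(EventHandler* pHandler) const { pHandler->Handle(*this); }
};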


@@ -1,427 +0,0 @@
# Copyright (C) 2016 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Provides definitions for events.
enum AR_DRAW_TYPE
{
Instanced = 0,
IndexedInstanced = 1,
InstancedSplit = 2,
IndexedInstancedSplit = 3
};
event Framework::ThreadStartApiEvent
{
};
event Framework::ThreadStartWorkerEvent
{
};
///@brief Helper event to indicate end of frame. Not guaranteed to capture the end of frame on all APIs.
event ApiSwr::FrameEndEvent
{
uint32_t frameId; // current frame id
uint32_t nextDrawId; // next draw id (always incremental - does not reset)
};
///@brief Synchronization event.
event ApiSwr::SwrSyncEvent
{
uint32_t drawId;
};
///@brief Invalidate hot tiles (i.e. tile cache)
event ApiSwr::SwrInvalidateTilesEvent
{
uint32_t drawId;
};
///@brief Invalidate and discard hot tiles within pixel region
event ApiSwr::SwrDiscardRectEvent
{
uint32_t drawId;
};
///@brief Flush tiles out to memory that is typically owned by driver (e.g. Flush RT cache)
event ApiSwr::SwrStoreTilesEvent
{
uint32_t drawId;
};
event PipelineStats::DrawInfoEvent
{
uint32_t drawId;
AR_DRAW_TYPE type; // type of draw (indexed, instanced, etc)
uint32_t topology; // topology of draw
uint32_t numVertices; // number of vertices for draw
uint32_t numIndices; // number of indices for draw
int32_t indexOffset; // offset into index buffer
int32_t baseVertex; // which vertex to start with
uint32_t numInstances; // number of instances to draw
uint32_t startInstance; // which instance to start fetching
uint32_t tsEnable; // tessellation enabled
uint32_t gsEnable; // geometry shader enabled
uint32_t soEnable; // stream-out enabled
uint32_t soTopology; // topology of stream-out
uint32_t splitId; // split draw count or id
};
event PipelineStats::DispatchEvent
{
uint32_t drawId;
uint32_t threadGroupCountX; // num thread groups in X dimension
uint32_t threadGroupCountY; // num thread groups in Y dimension
uint32_t threadGroupCountZ; // num thread groups in Z dimension
};
event PipelineStats::FrontendStatsEvent
{
uint32_t drawId;
uint64_t IaVertices;
uint64_t IaPrimitives;
uint64_t VsInvocations;
uint64_t HsInvocations;
uint64_t DsInvocations;
uint64_t GsInvocations;
uint64_t GsPrimitives;
uint64_t CInvocations;
uint64_t CPrimitives;
uint64_t SoPrimStorageNeeded0;
uint64_t SoPrimStorageNeeded1;
uint64_t SoPrimStorageNeeded2;
uint64_t SoPrimStorageNeeded3;
uint64_t SoNumPrimsWritten0;
uint64_t SoNumPrimsWritten1;
uint64_t SoNumPrimsWritten2;
uint64_t SoNumPrimsWritten3;
};
event PipelineStats::BackendStatsEvent
{
uint32_t drawId;
uint64_t DepthPassCount;
uint64_t PsInvocations;
uint64_t CsInvocations;
};
event PipelineStats::EarlyZSingleSample
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateZSingleSample
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyStencilSingleSample
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateStencilSingleSample
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyZSampleRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateZSampleRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyStencilSampleRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateStencilSampleRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
// Total Early-Z counts, SingleSample and SampleRate
event PipelineStats::EarlyZ
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
// Total LateZ counts, SingleSample and SampleRate
event PipelineStats::LateZ
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
// Total EarlyStencil counts, SingleSample and SampleRate
event PipelineStats::EarlyStencil
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
// Total LateStencil counts, SingleSample and SampleRate
event PipelineStats::LateStencil
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyZNullPS
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyStencilNullPS
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyZPixelRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateZPixelRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyOmZ
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyOmStencil
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateOmZ
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateOmStencil
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::GSInputPrims
{
uint32_t drawId;
uint64_t inputPrimCount;
};
event PipelineStats::GSPrimsGen
{
uint32_t drawId;
uint64_t primGeneratedCount;
};
event PipelineStats::GSVertsInput
{
uint32_t drawId;
uint64_t vertsInput;
};
event PipelineStats::TessPrims
{
uint32_t drawId;
uint64_t primCount;
};
event PipelineStats::RasterTiles
{
uint32_t drawId;
uint32_t rastTileCount;
};
event PipelineStats::ClipperEvent
{
uint32_t drawId;
uint32_t trivialRejectCount;
uint32_t trivialAcceptCount;
uint32_t mustClipCount;
};
event PipelineStats::CullEvent
{
uint32_t drawId;
uint64_t backfacePrimCount;
uint64_t degeneratePrimCount;
};
event PipelineStats::AlphaEvent
{
uint32_t drawId;
uint32_t alphaTestCount;
uint32_t alphaBlendCount;
};
event ShaderStats::VSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::HSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::DSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::GSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::PSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::CSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};


@@ -1,212 +0,0 @@
# Copyright (C) 2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Provides definitions for private internal events that are used only within
# Rasty, for communicating information between Rasty and ArchRast. One goal of
# ArchRast is to avoid polluting the Rasty code with the calculations needed
# to compute per-draw statistics.
event PipelineStats::EarlyDepthStencilInfoSingleSample
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::EarlyDepthStencilInfoSampleRate
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::EarlyDepthStencilInfoNullPS
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::LateDepthStencilInfoSingleSample
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::LateDepthStencilInfoSampleRate
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::LateDepthStencilInfoNullPS
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::EarlyDepthInfoPixelRate
{
uint64_t depthPassCount;
uint64_t activeLanes;
};
event PipelineStats::LateDepthInfoPixelRate
{
uint64_t depthPassCount;
uint64_t activeLanes;
};
event PipelineStats::BackendDrawEndEvent
{
uint32_t drawId;
};
event PipelineStats::FrontendDrawEndEvent
{
uint32_t drawId;
};
event Memory::MemoryAccessEvent
{
uint32_t drawId;
uint64_t tsc;
uint64_t ptr;
uint32_t size;
uint8_t isRead;
uint8_t client;
};
event Memory::MemoryStatsEndEvent
{
uint32_t drawId;
};
event PipelineStats::TessPrimCount
{
uint64_t primCount;
};
event PipelineStats::RasterTileCount
{
uint32_t drawId;
uint64_t rasterTiles;
};
event PipelineStats::GSPrimInfo
{
uint64_t inputPrimCount;
uint64_t primGeneratedCount;
uint64_t vertsInput;
};
// validMask marks primitives that still need clip processing; they were not
// removed by trivial reject or discarded for NaN.
// clipMask marks primitives that must actually be clipped, so a trivially
// accepted primitive has its validMask bit set but its clipMask bit clear.
// Trivial reject count is numInvocations - pop_cnt32(validMask).
// Trivial accept mask is validMask & ~clipMask.
// Must clip count is pop_cnt32(clipMask).
// A worked example follows the event definition below.
event PipelineStats::ClipInfoEvent
{
uint32_t numInvocations;
uint32_t validMask;
uint32_t clipMask;
};
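// Worked example (hypothetical values): with numInvocations = 8,
// validMask = 0b00111110 (pop_cnt32 = 5) and clipMask = 0b00001100
// (pop_cnt32 = 2):
//   trivial reject = 8 - 5 = 3
//   trivial accept mask = 0b00111110 & ~0b00001100 = 0b00110010 (3 primitives)
//   must clip = 2, so 3 + 3 + 2 accounts for all 8 invocations.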
event PipelineStats::CullInfoEvent
{
uint32_t drawId;
uint64_t degeneratePrimMask;
uint64_t backfacePrimMask;
uint32_t validMask;
};
event PipelineStats::AlphaInfoEvent
{
uint32_t drawId;
uint32_t alphaTestEnable;
uint32_t alphaBlendEnable;
};
event PipelineStats::DrawInstancedEvent
{
uint32_t drawId;
uint32_t topology;
uint32_t numVertices;
int32_t startVertex;
uint32_t numInstances;
uint32_t startInstance;
uint32_t tsEnable;
uint32_t gsEnable;
uint32_t soEnable;
uint32_t soTopology;
uint32_t splitId; // Split draw count or id.
};
event PipelineStats::DrawIndexedInstancedEvent
{
uint32_t drawId;
uint32_t topology;
uint32_t numIndices;
int32_t indexOffset;
int32_t baseVertex;
uint32_t numInstances;
uint32_t startInstance;
uint32_t tsEnable;
uint32_t gsEnable;
uint32_t soEnable;
uint32_t soTopology;
uint32_t splitId; // Split draw count or id.
};
event ShaderStats::VSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::HSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::DSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::GSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::PSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::CSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};

View file

@ -1,327 +0,0 @@
# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
import os
import sys
import re
from gen_common import *
def parse_event_fields(lines, idx, event_dict):
"""
Parses lines from a proto file that contain an event definition and stores it in event_dict
"""
fields = []
end_of_event = False
# record all fields in event definition.
# note: we don't check if there's a leading brace.
while not end_of_event and idx < len(lines):
line = lines[idx].rstrip()
idx += 1
# ex 1: uint32_t numSampleCLZExecuted; // number of sample_cl_z instructions executed
# ex 2: char reason[256]; // size of reason
match = re.match(r'^(\s*)([\w\*]+)(\s+)([\w]+)(\[\d+\])*;\s*(\/\/.*)*$', line)
# group 1 -
# group 2 type
# group 3 -
# group 4 name
# group 5 [array size]
# group 6 //comment
if match:
field = {
"type": match.group(2),
"name": match.group(4),
"size": int(match.group(5)[1:-1]) if match.group(5) else 1,
"desc": match.group(6)[2:].strip() if match.group(6) else "",
}
fields.append(field)
end_of_event = re.match(r'(\s*)};', line)
event_dict['fields'] = fields
event_dict['num_fields'] = len(fields)
return idx
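# Worked example (hypothetical input): given the line
#     "uint32_t numSampleExecuted; // number of sample instructions"
# the regex above produces the field dict
#     {'type': 'uint32_t', 'name': 'numSampleExecuted', 'size': 1,
#      'desc': 'number of sample instructions'}
# and an array field such as "char reason[256];" yields size == 256.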
def parse_enums(lines, idx, event_dict):
"""
Parses lines from a proto file that contain an enum definition and stores it in event_dict
"""
enum_names = []
end_of_enum = False
# record all enum values in enumeration
# note: we don't check if there's a leading brace.
while not end_of_enum and idx < len(lines):
line = lines[idx].rstrip()
idx += 1
preprocessor = re.search(r'#if|#endif', line)
if not preprocessor:
enum = re.match(r'(\s*)(\w+)(\s*)', line)
if enum:
enum_names.append(line)
end_of_enum = re.match(r'(\s*)};', line)
event_dict['names'] = enum_names
return idx
def parse_protos(files, verbose=False):
"""
Parses a proto file and returns a dictionary of event definitions
"""
# Protos structure:
#
# {
# "events": {
# "defs": { // dict of event definitions where keys are 'group_name::event_name"
# ...,
# "ApiStat::DrawInfoEvent": {
# "id": 3,
# "group": "ApiStat",
# "name": "DrawInfoEvent", // name of event without 'group_name::' prefix
# "desc": "",
# "fields": [
# {
# "type": "uint32_t",
# "name": "drawId",
# "size": 1,
# "desc": "",
# },
# ...
# ]
# },
# ...
# },
# "groups": { // dict of groups with lists of event keys
# "ApiStat": [
# "ApiStat::DispatchEvent",
# "ApiStat::DrawInfoEvent",
# ...
# ],
# "Framework": [
# "Framework::ThreadStartApiEvent",
# "Framework::ThreadStartWorkerEvent",
# ...
# ],
# ...
# },
# "map": { // map of event ids to match archrast output to event key
# "1": "Framework::ThreadStartApiEvent",
# "2": "Framework::ThreadStartWorkerEvent",
# "3": "ApiStat::DrawInfoEvent",
# ...
# }
# },
# "enums": { ... } // enums follow similar defs, map (groups?) structure
# }
protos = {
'events': {
'defs': {}, # event dictionary containing events with their fields
'map': {}, # dictionary to map event ids to event names
'groups': {} # event keys stored by groups
},
'enums': {
'defs': {},
'map': {}
}
}
event_id = 0
enum_id = 0
if not isinstance(files, list):
files = [files]
for filename in files:
if verbose:
print("Parsing proto file: %s" % os.path.normpath(filename))
with open(filename, 'r') as f:
lines = f.readlines()
in_brief = False
brief = []
idx = 0
while idx < len(lines):
line = lines[idx].strip()
idx += 1
# If currently processing a brief, keep processing or change state
if in_brief:
match = re.match(r'^\s*\/\/\/\s*(.*)$', line) # i.e. "/// more event desc..."
if match:
brief.append(match.group(1).strip())
continue
else:
in_brief = False
# Match event/enum brief
match = re.match(r'^\s*\/\/\/\s*@(brief|breif)\s*(.*)$', line) # i.e. "///@brief My event desc..."
if match:
in_brief = True
brief.append(match.group(2).strip())
continue
# Match event definition
match = re.match(r'event(\s*)(((\w*)::){0,1}(\w+))', line) # i.e. "event SWTag::CounterEvent"
if match:
event_id += 1
# Parse event attributes
event_key = match.group(2) # i.e. SWTag::CounterEvent
event_group = match.group(4) if match.group(4) else "" # i.e. SWTag
event_name = match.group(5) # i.e. CounterEvent
# Define event attributes
event = {
'id': event_id,
'group': event_group,
'name': event_name,
'desc': ' '.join(brief)
}
# Add period at end of event desc if necessary
if event["desc"] and event["desc"][-1] != '.':
event["desc"] += '.'
# Reset brief
brief = []
# Now add event fields
idx = parse_event_fields(lines, idx, event)
# Register event and mapping
protos['events']['defs'][event_key] = event
protos['events']['map'][event_id] = event_key
continue
# Match enum definition
match = re.match(r'enum(\s*)(\w+)', line)
if match:
enum_id += 1
# Parse enum attributes
enum_name = match.group(2)
# Define enum attr
enum = {
'name': enum_name,
'desc': ' '.join(brief)
}
# Add period at end of event desc if necessary
if enum["desc"] and enum["desc"][-1] != '.':
enum["desc"] += '.'
# Reset brief
brief = []
# Now add enum fields
idx = parse_enums(lines, idx, enum)
# Register enum and mapping
protos['enums']['defs'][enum_name] = enum
protos['enums']['map'][enum_id] = enum_name
continue
# Sort and group events
event_groups = protos['events']['groups']
for key in sorted(protos['events']['defs']):
group = protos['events']['defs'][key]['group']
if group not in event_groups:
event_groups[group] = []
event_groups[group].append(key)
return protos
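# Usage sketch (hypothetical file names and event id): after
#     protos = parse_protos(['events.proto', 'events_private.proto'])
# an event id recorded in archrast output can be resolved back to its
# definition:
#     key = protos['events']['map'][3]        # e.g. 'ApiStat::DrawInfoEvent'
#     fields = protos['events']['defs'][key]['fields']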
def main():
# Parse args...
parser = ArgumentParser()
parser.add_argument("--proto", "-p", dest="protos", nargs='+', help="Path to all proto file(s) to process. Accepts one or more paths (i.e. events.proto and events_private.proto)", required=True)
parser.add_argument("--output-dir", help="Output dir (defaults to ./codegen). Will create folder if it does not exist.", required=False, default="codegen")
parser.add_argument("--verbose", "-v", help="Verbose", action="store_true")
args = parser.parse_args()
if not os.path.exists(args.output_dir):
MakeDir(args.output_dir)
for f in args.protos:
if not os.path.exists(f):
print('Error: Could not find proto file %s' % f, file=sys.stderr)
return 1
# Parse each proto file and add to protos container
protos = parse_protos(args.protos, args.verbose)
files = [
["gen_ar_event.hpp", ""],
["gen_ar_event.cpp", ""],
["gen_ar_eventhandler.hpp", "gen_ar_event.hpp"],
["gen_ar_eventhandlerfile.hpp", "gen_ar_eventhandler.hpp"]
]
rval = 0
try:
# Delete existing files
for f in files:
filename = f[0]
output_fullpath = os.path.join(args.output_dir, filename)
if os.path.exists(output_fullpath):
if args.verbose:
print("Deleting existing file: %s" % output_fullpath)
os.remove(output_fullpath)
# Generate files from templates
print("Generating c++ from proto files...")
for f in files:
filename = f[0]
event_header = f[1]
curdir = os.path.dirname(os.path.abspath(__file__))
template_file = os.path.join(curdir, 'templates', filename)
output_fullpath = os.path.join(args.output_dir, filename)
if args.verbose:
print("Generating: %s" % output_fullpath)
MakoTemplateWriter.to_file(template_file, output_fullpath,
cmdline=sys.argv,
filename=filename,
protos=protos,
event_header=event_header)
except Exception as e:
print(e)
rval = 1
return rval
if __name__ == '__main__':
sys.exit(main())

View file

@ -1,164 +0,0 @@
# Copyright (C) 2017-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the 'Software'),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
import itertools
import os
import sys
from gen_common import *
def main(args=sys.argv[1:]):
thisDir = os.path.dirname(os.path.realpath(__file__))
parser = ArgumentParser('Generate files and initialization functions for all permutations of BackendPixelRate.')
parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True)
parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir)
parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default='512')
parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default='0')
parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False)
parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False)
parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False)
parser.add_argument('--rast', help='Generate rasterizer functions instead of normal backend', action='store_true', default=False)
args = parser.parse_args(args)
class backendStrs :
def __init__(self) :
self.outFileName = 'gen_BackendPixelRate%s.cpp'
self.outHeaderName = 'gen_BackendPixelRate.hpp'
self.functionTableName = 'gBackendPixelRateTable'
self.funcInstanceHeader = ' = BackendPixelRate<SwrBackendTraits<'
self.template = 'gen_backend.cpp'
self.hpp_template = 'gen_header_init.hpp'
self.cmakeFileName = 'gen_backends.cmake'
self.cmakeSrcVar = 'GEN_BACKEND_SOURCES'
self.tableName = 'BackendPixelRate'
if args.rast:
self.outFileName = 'gen_rasterizer%s.cpp'
self.outHeaderName = 'gen_rasterizer.hpp'
self.functionTableName = 'gRasterizerFuncs'
self.funcInstanceHeader = ' = RasterizeTriangle<RasterizerTraits<'
self.template = 'gen_rasterizer.cpp'
self.cmakeFileName = 'gen_rasterizer.cmake'
self.cmakeSrcVar = 'GEN_RASTERIZER_SOURCES'
self.tableName = 'RasterizerFuncs'
backend = backendStrs()
output_list = []
for x in args.dim:
output_list.append(list(range(x)))
# generate all permutations possible for template parameter inputs
output_combinations = list(itertools.product(*output_list))
output_list = []
# for each permutation
for x in range(len(output_combinations)):
# separate each template param into its own list member
new_list = [output_combinations[x][i] for i in range(len(output_combinations[x]))]
tempStr = backend.functionTableName
# append each list member as an index into the multidimensional array
for i in new_list:
tempStr += '[' + str(i) + ']'
# join the permutation values into the template instantiation string
tempStr += backend.funcInstanceHeader + ','.join(map(str, output_combinations[x])) + '>>;'
# append the line of C++ code to the list of output lines
output_list.append(tempStr)
# how many files should we split the global template initialization into?
if (args.split == 0):
numFiles = 1
else:
numFiles = (len(output_list) + args.split - 1) // args.split
if (args.numfiles != 0):
numFiles = args.numfiles
linesPerFile = (len(output_list) + numFiles - 1) // numFiles
chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
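# Worked example (hypothetical sizes): with 100 generated lines and
# --split 30, numFiles = (100 + 30 - 1) // 30 = 4 and
# linesPerFile = (100 + 4 - 1) // 4 = 25, so the initializers land in
# four files of 25 lines each.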
tmp_output_dir = MakeTmpDir('_codegen')
if not os.path.exists(args.outdir):
try:
os.makedirs(args.outdir)
except OSError as err:
if err.errno != errno.EEXIST:
print('ERROR: Could not create directory:', args.outdir, file=sys.stderr)
return 1
rval = 0
# generate .cpp files
try:
if args.cpp:
baseCppName = os.path.join(tmp_output_dir, backend.outFileName)
templateCpp = os.path.join(thisDir, 'templates', backend.template)
for fileNum in range(numFiles):
filename = baseCppName % str(fileNum)
MakoTemplateWriter.to_file(
templateCpp,
filename,
cmdline=sys.argv,
fileNum=fileNum,
funcList=chunkedList[fileNum])
if args.hpp:
baseHppName = os.path.join(tmp_output_dir, backend.outHeaderName)
templateHpp = os.path.join(thisDir, 'templates', backend.hpp_template)
MakoTemplateWriter.to_file(
templateHpp,
baseHppName,
cmdline=sys.argv,
numFiles=numFiles,
filename=backend.outHeaderName,
tableName=backend.tableName)
# generate gen_backend.cmake file
if args.cmake:
templateCmake = os.path.join(thisDir, 'templates', 'gen_backend.cmake')
cmakeFile = os.path.join(tmp_output_dir, backend.cmakeFileName)
MakoTemplateWriter.to_file(
templateCmake,
cmakeFile,
cmdline=sys.argv,
srcVar=backend.cmakeSrcVar,
numFiles=numFiles,
# derive the name from backend.outFileName so --cmake works without --cpp
baseCppName='${RASTY_GEN_SRC_DIR}/backends/' + os.path.basename(backend.outFileName))
rval = CopyDirFilesIfDifferent(tmp_output_dir, args.outdir)
except:
rval = 1
finally:
DeleteDirTree(tmp_output_dir)
return rval
if __name__ == '__main__':
sys.exit(main())

View file

@ -1,291 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
import os
import errno
import sys
import argparse
import tempfile
import filecmp
import shutil
import shlex  # used by ArgumentParser.convert_arg_line_to_args below
from mako.template import Template
from mako.exceptions import RichTraceback
#==============================================================================
def ConcatLists(list_of_lists):
output = []
for l in list_of_lists: output += l
return output
#==============================================================================
def MakeTmpDir(suffix=''):
'''
Create temporary directory for use in codegen scripts.
'''
return tempfile.mkdtemp(suffix)
#==============================================================================
def MakeDir(dir_path):
'''
Create a directory if it doesn't exist
returns 0 on success, non-zero on failure
'''
dir_path = os.path.abspath(dir_path)
if not os.path.exists(dir_path):
try:
os.makedirs(dir_path)
except OSError as err:
if err.errno != errno.EEXIST:
return 1
else:
if not os.path.isdir(dir_path):
return 1
return 0
#==============================================================================
def DeleteDirTree(dir_path):
'''
Delete directory tree.
returns 0 on success, non-zero on failure
'''
rval = 0
try:
shutil.rmtree(dir_path, False)
except:
rval = 1
return rval
#==============================================================================
def CopyFileIfDifferent(src, dst, verbose = False):
'''
Copy <src> file to <dst> file if the <dst>
file either doesn't exist or the file
contents are different.
returns 0 on success, non-zero on failure
'''
assert os.path.isfile(src)
assert (False == os.path.exists(dst) or os.path.isfile(dst))
need_copy = not os.path.exists(dst)
if not need_copy:
need_copy = not filecmp.cmp(src, dst)
if need_copy:
try:
shutil.copy2(src, dst)
except:
print('ERROR: Could not copy %s to %s' % (src, dst), file=sys.stderr)
return 1
if verbose:
print(src, '-->', dst)
return 0
#==============================================================================
def CopyDirFilesIfDifferent(src, dst, recurse = True, verbose = False, orig_dst = None):
'''
Copy files from <src> directory to <dst> directory if the <dst>
directory either doesn't contain the file or the file
contents are different.
Optionally recurses into subdirectories.
returns 0 on success, non-zero on failure
'''
assert os.path.isdir(src)
assert os.path.isdir(dst)
src = os.path.abspath(src)
dst = os.path.abspath(dst)
if not orig_dst:
orig_dst = dst
for f in os.listdir(src):
src_path = os.path.join(src, f)
dst_path = os.path.join(dst, f)
# prevent recursion
if src_path == orig_dst:
continue
if os.path.isdir(src_path):
if recurse:
if MakeDir(dst_path):
print('ERROR: Could not create directory:', dst_path, file=sys.stderr)
return 1
if verbose:
print('mkdir', dst_path)
rval = CopyDirFilesIfDifferent(src_path, dst_path, recurse, verbose, orig_dst)
else:
rval = CopyFileIfDifferent(src_path, dst_path, verbose)
if rval:
return rval
return 0
#==============================================================================
class MakoTemplateWriter:
'''
MakoTemplateWriter - Class (namespace) for functions to generate strings
or files using the Mako template module.
See http://docs.makotemplates.org/en/latest/ for
mako documentation.
'''
@staticmethod
def to_string(template_filename, **kwargs):
'''
Write template data to a string object and return the string
'''
try:
template = Template(filename=template_filename)
# Split + Join fixes line-endings for whatever platform you are using
return '\n'.join(template.render(**kwargs).splitlines())
except:
traceback = RichTraceback()
for (filename, lineno, function, line) in traceback.traceback:
print('File %s, line %s, in %s' % (filename, lineno, function))
print(line, '\n')
print('%s: %s' % (str(traceback.error.__class__.__name__), traceback.error))
raise
@staticmethod
def to_file(template_filename, output_filename, **kwargs):
'''
Write template data to a file
'''
if MakeDir(os.path.dirname(output_filename)):
return 1
with open(output_filename, 'w') as outfile:
print(MakoTemplateWriter.to_string(template_filename, **kwargs), file=outfile)
return 0
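# Usage sketch (hypothetical template and output names, assuming a Mako
# file 'hello.template' exists next to the script):
#     MakoTemplateWriter.to_file('hello.template', 'out/hello.hpp',
#                                cmdline=sys.argv, filename='hello.hpp')
# renders the template with the given keyword arguments and writes the
# result to out/hello.hpp, creating the output directory if needed.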
#==============================================================================
class ArgumentParser(argparse.ArgumentParser):
'''
Subclass of argparse.ArgumentParser
Allows parsing from command files that start with @
Example:
>bt run @myargs.txt
Contents of myargs.txt:
-m <machine>
--target cdv_win7
convert_arg_line_to_args() below allows multiple args to be placed on the
same text-file line (the default is one token per line, which is a little
cumbersome) and ignores all characters after a '#' character.
'''
#==============================================================================
class _HelpFormatter(argparse.RawTextHelpFormatter):
''' Better help formatter for argument parser '''
def _split_lines(self, text, width):
''' optimized split lines algorithm, indents split lines '''
lines = text.splitlines()
out_lines = []
if len(lines):
out_lines.append(lines[0])
for line in lines[1:]:
out_lines.append(' ' + line)
return out_lines
#==============================================================================
def __init__(self, *args, **kwargs):
''' Constructor. Compatible with argparse.ArgumentParser(),
but with some modifications for better usage and help display.
'''
super(ArgumentParser, self).__init__(
*args,
fromfile_prefix_chars='@',
formatter_class=ArgumentParser._HelpFormatter,
**kwargs)
#==========================================================================
def convert_arg_line_to_args(self, arg_line):
''' convert one line of parsed file to arguments '''
arg_line = arg_line.split('#', 1)[0]
if sys.platform == 'win32':
arg_line = arg_line.replace('\\', '\\\\')
for arg in shlex.split(arg_line):
if not arg.strip():
continue
yield arg
#==========================================================================
def _read_args_from_files(self, arg_strings):
''' read arguments from files '''
# expand arguments referencing files
new_arg_strings = []
for arg_string in arg_strings:
# for regular arguments, just add them back into the list
if arg_string[0] not in self.fromfile_prefix_chars:
new_arg_strings.append(arg_string)
# replace arguments referencing files with the file content
else:
filename = arg_string[1:]
# Search in sys.path
if not os.path.exists(filename):
for path in sys.path:
filename = os.path.join(path, arg_string[1:])
if os.path.exists(filename):
break
try:
args_file = open(filename)
try:
arg_strings = []
for arg_line in args_file.read().splitlines():
for arg in self.convert_arg_line_to_args(arg_line):
arg_strings.append(arg)
arg_strings = self._read_args_from_files(arg_strings)
new_arg_strings.extend(arg_strings)
finally:
args_file.close()
except IOError:
err = sys.exc_info()[1]
self.error(str(err))
# return the modified argument list
return new_arg_strings

View file

@ -1,80 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
import os
import sys
import knob_defs
from gen_common import *
def main(args=sys.argv[1:]):
# parse args
parser = ArgumentParser()
parser.add_argument("--output", "-o", help="Path to output file", required=True)
parser.add_argument("--gen_h", "-gen_h", help="Generate gen_knobs.h", action="store_true", default=False)
parser.add_argument("--gen_cpp", "-gen_cpp", help="Generate gen_knobs.cpp", action="store_true", required=False)
args = parser.parse_args(args)
cur_dir = os.path.dirname(os.path.abspath(__file__))
template_cpp = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp')
template_h = os.path.join(cur_dir, 'templates', 'gen_knobs.h')
output_filename = os.path.basename(args.output)
output_dir = MakeTmpDir('_codegen')
output_file = os.path.join(output_dir, output_filename)
rval = 0
try:
if args.gen_h:
MakoTemplateWriter.to_file(
template_h,
output_file,
cmdline=sys.argv,
filename='gen_knobs',
knobs=knob_defs.KNOBS)
if args.gen_cpp:
MakoTemplateWriter.to_file(
template_cpp,
output_file,
cmdline=sys.argv,
filename='gen_knobs',
knobs=knob_defs.KNOBS,
includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'])
rval = CopyFileIfDifferent(output_file, args.output)
except:
rval = 1
finally:
# ignore errors from delete of tmp directory
DeleteDirTree(output_dir)
return rval
if __name__ == '__main__':
sys.exit(main())

View file

@ -1,362 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
import os, sys, re
from gen_common import *
from argparse import FileType
inst_aliases = {
'SHUFFLE_VECTOR': 'VSHUFFLE',
'INSERT_ELEMENT': 'VINSERT',
'EXTRACT_ELEMENT': 'VEXTRACT',
'MEM_SET': 'MEMSET',
'MEM_CPY': 'MEMCOPY',
'MEM_MOVE': 'MEMMOVE',
'L_SHR': 'LSHR',
'A_SHR': 'ASHR',
'BIT_CAST': 'BITCAST',
'U_DIV': 'UDIV',
'S_DIV': 'SDIV',
'U_REM': 'UREM',
'S_REM': 'SREM',
'BIN_OP': 'BINOP',
}
intrinsics = [
['VGATHERPD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
['VGATHERPS', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
['VGATHERDD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
['VSCATTERPS', ['pBase', 'mask', 'indices', 'src', 'scale'], 'src'],
['VRCPPS', ['a'], 'a'],
['VROUND', ['a', 'rounding'], 'a'],
['BEXTR_32', ['src', 'control'], 'src'],
['VPSHUFB', ['a', 'b'], 'a'],
['VPERMD', ['a', 'idx'], 'a'],
['VPERMPS', ['idx', 'a'], 'a'],
['VCVTPD2PS', ['a'], 'getVectorType(mFP32Ty, VEC_GET_NUM_ELEMS)'],
['VCVTPS2PH', ['a', 'round'], 'mSimdInt16Ty'],
['VHSUBPS', ['a', 'b'], 'a'],
['VPTESTC', ['a', 'b'], 'mInt32Ty'],
['VPTESTZ', ['a', 'b'], 'mInt32Ty'],
['VPHADDD', ['a', 'b'], 'a'],
['PDEP32', ['a', 'b'], 'a'],
['RDTSC', [], 'mInt64Ty'],
]
llvm_intrinsics = [
['CTTZ', 'cttz', ['a', 'flag'], ['a']],
['CTLZ', 'ctlz', ['a', 'flag'], ['a']],
['VSQRTPS', 'sqrt', ['a'], ['a']],
['STACKSAVE', 'stacksave', [], []],
['STACKRESTORE', 'stackrestore', ['a'], []],
['VMINPS', 'minnum', ['a', 'b'], ['a']],
['VMAXPS', 'maxnum', ['a', 'b'], ['a']],
['VFMADDPS', 'fmuladd', ['a', 'b', 'c'], ['a']],
['DEBUGTRAP', 'debugtrap', [], []],
['POPCNT', 'ctpop', ['a'], ['a']],
['LOG2', 'log2', ['a'], ['a']],
['FABS', 'fabs', ['a'], ['a']],
['EXP2', 'exp2', ['a'], ['a']],
['COS', 'cos', ['a'], ['a']],
['SIN', 'sin', ['a'], ['a']],
['FLOOR', 'floor', ['a'], ['a']],
['POW', 'pow', ['a', 'b'], ['a']]
]
this_dir = os.path.dirname(os.path.abspath(__file__))
template = os.path.join(this_dir, 'templates', 'gen_builder.hpp')
def convert_uppercamel(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).upper()
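# Worked examples: convert_uppercamel('AtomicCmpXchg') == 'ATOMIC_CMP_XCHG'
# and convert_uppercamel('FCmpHelper') == 'F_CMP_HELPER'; parse_ir_builder
# below later collapses a leading 'F_' or 'I_' to 'F' or 'I'.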
'''
Given an input file (e.g. IRBuilder.h), returns a list of descriptions of
the Create* functions to wrap.
'''
def parse_ir_builder(input_file):
functions = []
lines = input_file.readlines()
deprecated = None
idx = 0
while idx < len(lines) - 1:
line = lines[idx].rstrip()
idx += 1
if deprecated is None:
deprecated = re.search(r'LLVM_ATTRIBUTE_DEPRECATED', line)
#match = re.search(r'\*Create', line)
match = re.search(r'[\*\s]Create(\w*)\(', line)
if match is not None:
#print('Line: %s' % match.group(1))
# Skip function if LLVM_ATTRIBUTE_DEPRECATED found before
if deprecated is not None:
deprecated = None
continue
if re.search(r'^\s*Create', line) is not None:
func_sig = lines[idx-2].rstrip() + line
else:
func_sig = line
end_of_args = False
while not end_of_args:
end_paren = re.search(r'\)', line)
if end_paren is not None:
end_of_args = True
else:
line = lines[idx].rstrip()
func_sig += line
idx += 1
delfunc = re.search(r'LLVM_DELETED_FUNCTION|= delete;', func_sig)
if not delfunc:
func = re.search(r'(.*?)\*[\n\s]*(Create\w*)\((.*?)\)', func_sig)
if func is not None:
return_type = func.group(1).strip() + '*'
func_name = func.group(2)
arguments = func.group(3)
func_args = []
arg_names = []
args = arguments.split(',')
for arg in args:
arg = arg.strip()
if arg:
func_args.append(arg)
split_args = arg.split('=')
arg_name = split_args[0].rsplit(None, 1)[-1]
reg_arg = re.search(r'[\&\*]*(\w*)', arg_name)
if reg_arg:
arg_names += [reg_arg.group(1)]
ignore = False
# The following functions need to be ignored in openswr.
# API change in llvm-5.0 breaks baked autogen files
if func_name in ('CreateFence',
'CreateAtomicCmpXchg',
'CreateAtomicRMW'):
ignore = True
# The following functions need to be ignored.
if func_name in ('CreateInsertNUWNSWBinOp',
'CreateMaskedIntrinsic',
'CreateAlignmentAssumptionHelper',
'CreateGEP',
'CreateLoad',
'CreateMaskedLoad',
'CreateStore',
'CreateMaskedStore',
'CreateFCmpHelper',
'CreateElementUnorderedAtomicMemCpy'):
ignore = True
# Convert CamelCase to CAMEL_CASE
func_mod = re.search(r'Create(\w*)', func_name)
if func_mod:
func_mod = func_mod.group(1)
func_mod = convert_uppercamel(func_mod)
if func_mod[0:2] == 'F_' or func_mod[0:2] == 'I_':
func_mod = func_mod[0] + func_mod[2:]
# Substitute alias based on CAMEL_CASE name.
func_alias = inst_aliases.get(func_mod)
if not func_alias:
func_alias = func_mod
if func_name == 'CreateCall' or func_name == 'CreateGEP':
arglist = re.search(r'ArrayRef', ', '.join(func_args))
if arglist:
func_alias = func_alias + 'A'
if not ignore:
functions.append({
'name' : func_name,
'alias' : func_alias,
'return' : return_type,
'args' : ', '.join(func_args),
'arg_names' : arg_names,
})
return functions
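# Worked example (hypothetical declaration): a line such as
#     Value *CreateFAdd(Value *L, Value *R, const Twine &Name = "");
# is recorded as
#     {'name': 'CreateFAdd', 'alias': 'FADD', 'return': 'Value*',
#      'args': 'Value *L, Value *R, const Twine &Name = ""',
#      'arg_names': ['L', 'R', 'Name']}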
'''
Auto-generates macros for LLVM IR
'''
def generate_gen_h(functions, output_dir):
filename = 'gen_builder.hpp'
output_filename = os.path.join(output_dir, filename)
templfuncs = []
for func in functions:
decl = '%s %s(%s)' % (func['return'], func['alias'], func['args'])
templfuncs.append({
'decl' : decl,
'intrin' : func['name'],
'args' : func['arg_names'],
})
MakoTemplateWriter.to_file(
template,
output_filename,
cmdline=sys.argv,
comment='Builder IR Wrappers',
filename=filename,
functions=templfuncs,
isX86=False, isIntrin=False)
'''
Auto-generates builder wrappers for the meta intrinsics table above
'''
def generate_meta_h(output_dir):
filename = 'gen_builder_meta.hpp'
output_filename = os.path.join(output_dir, filename)
functions = []
for inst in intrinsics:
name = inst[0]
args = inst[1]
ret = inst[2]
#print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2])))
if len(args) != 0:
declargs = 'Value* ' + ', Value* '.join(args)
decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (name, declargs)
else:
decl = 'Value* %s(const llvm::Twine& name = "")' % (name)
# determine the return type of the intrinsic. It can either be:
# - type of one of the input arguments
# - snippet of code to set the return type
if ret in args:
returnTy = ret + '->getType()'
else:
returnTy = ret
functions.append({
'decl' : decl,
'name' : name,
'args' : args,
'returnType': returnTy
})
MakoTemplateWriter.to_file(
template,
output_filename,
cmdline=sys.argv,
comment='meta intrinsics',
filename=filename,
functions=functions,
isX86=True, isIntrin=False)
def generate_intrin_h(output_dir):
filename = 'gen_builder_intrin.hpp'
output_filename = os.path.join(output_dir, filename)
functions = []
for inst in llvm_intrinsics:
#print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2])))
if len(inst[2]) != 0:
declargs = 'Value* ' + ', Value* '.join(inst[2])
decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], declargs)
else:
decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0])
functions.append({
'decl' : decl,
'intrin' : inst[1],
'args' : inst[2],
'types' : inst[3],
})
MakoTemplateWriter.to_file(
template,
output_filename,
cmdline=sys.argv,
comment='llvm intrinsics',
filename=filename,
functions=functions,
isX86=False, isIntrin=True)
'''
Command-line entry point. Parses the arguments that tell this script how
to behave and runs the requested generators.
'''
def main():
# Parse args...
parser = ArgumentParser()
parser.add_argument('--input', '-i', type=FileType('r'), help='Path to IRBuilder.h', required=False)
parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True)
parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False)
parser.add_argument('--gen_meta_h', help='Generate meta intrinsics. No input is needed.', action='store_true', default=False)
parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False)
args = parser.parse_args()
if not os.path.exists(args.output):
os.makedirs(args.output)
final_output_dir = args.output
args.output = MakeTmpDir('_codegen')
rval = 0
try:
if args.input:
functions = parse_ir_builder(args.input)
if args.gen_h:
generate_gen_h(functions, args.output)
elif args.gen_h:
print('Need to specify --input for --gen_h!')
if args.gen_meta_h:
generate_meta_h(args.output)
if args.gen_intrin_h:
generate_intrin_h(args.output)
rval = CopyDirFilesIfDifferent(args.output, final_output_dir)
except:
print('ERROR: Could not generate llvm_ir_macros', file=sys.stderr)
rval = 1
finally:
DeleteDirTree(args.output)
return rval
if __name__ == '__main__':
sys.exit(main())
# END OF FILE

View file

@ -1,360 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
import os, sys, re
from gen_common import *
from argparse import FileType
'''
Maps a single struct member to the C++ expression that constructs its
LLVM type, returning a dict with the member name, line number, and type.
'''
def gen_llvm_type(type, name, idx, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file):
llvm_type = ''
if is_llvm_struct:
if is_pointer or is_pointer_pointer:
llvm_type = 'Type::getInt32Ty(ctx)'
else:
llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type
elif is_llvm_enum:
llvm_type = 'Type::getInt32Ty(ctx)'
elif is_llvm_pfn:
llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)'
else:
if type == 'BYTE' or type == 'char' or type == 'uint8_t' or type == 'int8_t' or type == 'bool':
llvm_type = 'Type::getInt8Ty(ctx)'
elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t' or type == 'gfxptr_t':
llvm_type = 'Type::getInt64Ty(ctx)'
elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t':
llvm_type = 'Type::getInt16Ty(ctx)'
elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t':
llvm_type = 'Type::getInt32Ty(ctx)'
elif type == 'float' or type == 'FLOAT':
llvm_type = 'Type::getFloatTy(ctx)'
elif type == 'double' or type == 'DOUBLE':
llvm_type = 'Type::getDoubleTy(ctx)'
elif type == 'void' or type == 'VOID':
llvm_type = 'Type::getInt32Ty(ctx)'
elif type == 'HANDLE':
llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)'
elif type == 'simdscalar':
llvm_type = 'getVectorType(Type::getFloatTy(ctx), pJitMgr->mVWidth)'
elif type == 'simdscalari':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), pJitMgr->mVWidth)'
elif type == 'simd16scalar':
llvm_type = 'getVectorType(Type::getFloatTy(ctx), 16)'
elif type == 'simd16scalari':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 16)'
elif type == '__m128i':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 4)'
elif type == 'SIMD256::Float':
llvm_type = 'getVectorType(Type::getFloatTy(ctx), 8)'
elif type == 'SIMD256::Integer':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 8)'
elif type == 'SIMD512::Float':
llvm_type = 'getVectorType(Type::getFloatTy(ctx), 16)'
elif type == 'SIMD512::Integer':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 16)'
elif type == 'simdvector':
llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)'
elif type == 'simd16vector':
llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)'
elif type == 'SIMD256::Vec4':
llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)'
elif type == 'SIMD512::Vec4':
llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)'
else:
llvm_type = 'Gen_%s(pJitMgr)' % type
if is_pointer:
llvm_type = 'PointerType::get(%s, 0)' % llvm_type
if is_pointer_pointer:
llvm_type = 'PointerType::get(%s, 0)' % llvm_type
if is_array_array:
llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count)
elif is_array:
llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count)
return {
'name' : name,
'lineNum' : idx,
'type' : llvm_type,
}
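# Worked example (hypothetical fields): a member declared as
#     uint32_t pad[4];
# maps to the type string 'ArrayType::get(Type::getInt32Ty(ctx), 4)',
# and a 'HANDLE context' member maps to
#     'PointerType::get(Type::getInt32Ty(ctx), 0)'.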
'''
Parses struct definitions from input_file and writes LLVM type
descriptions for them to output_file via the gen_llvm.hpp template.
'''
def gen_llvm_types(input_file, output_file):
lines = input_file.readlines()
types = []
for idx in range(len(lines)):
line = lines[idx].rstrip()
if 'gen_llvm_types FINI' in line:
break
match = re.match(r'(\s*)struct(\s*)(\w+)', line)
if match:
llvm_args = []
# Detect start of structure
is_fwd_decl = re.search(r';', line)
if not is_fwd_decl:
# Extract the command name
struct_name = match.group(3).strip()
type_entry = {
'name' : struct_name,
'lineNum' : idx+1,
'members' : [],
}
end_of_struct = False
while not end_of_struct and idx < len(lines)-1:
idx += 1
line = lines[idx].rstrip()
is_llvm_typedef = re.search(r'@llvm_typedef', line)
if is_llvm_typedef is not None:
is_llvm_typedef = True
continue
else:
is_llvm_typedef = False
###########################################
# Is field a llvm struct? Tells script to treat type as an array of bytes the size of the structure.
is_llvm_struct = re.search(r'@llvm_struct', line)
if is_llvm_struct is not None:
is_llvm_struct = True
else:
is_llvm_struct = False
###########################################
# Is field the start of a function? Tells script to ignore it
is_llvm_func_start = re.search(r'@llvm_func_start', line)
if is_llvm_func_start is not None:
while not end_of_struct and idx < len(lines)-1:
idx += 1
line = lines[idx].rstrip()
is_llvm_func_end = re.search(r'@llvm_func_end', line)
if is_llvm_func_end is not None:
break
continue
###########################################
# Is field a function? Tells script to ignore it
is_llvm_func = re.search(r'@llvm_func', line)
if is_llvm_func is not None:
continue
###########################################
# Is field a llvm enum? Tells script to treat type as an enum and replace it with a uint32 type.
is_llvm_enum = re.search(r'@llvm_enum', line)
if is_llvm_enum is not None:
is_llvm_enum = True
else:
is_llvm_enum = False
###########################################
# Is field a llvm function pointer? Tells script to replace the type with a generic byte pointer.
is_llvm_pfn = re.search(r'@llvm_pfn', line)
if is_llvm_pfn is not None:
is_llvm_pfn = True
else:
is_llvm_pfn = False
###########################################
# Is field const?
is_const = re.search(r'\s+const\s+', line)
if is_const is not None:
is_const = True
else:
is_const = False
###########################################
# Is field a pointer to a pointer?
is_pointer_pointer = re.search(r'\*\*', line)
if is_pointer_pointer is not None:
is_pointer_pointer = True
else:
is_pointer_pointer = False
###########################################
# Is field a pointer?
is_pointer = re.search(r'\*', line)
if is_pointer is not None:
is_pointer = True
else:
is_pointer = False
###########################################
# Is field an array of arrays?
# TODO: Can add this to a list.
is_array_array = re.search(r'\[(\w*)\]\[(\w*)\]', line)
array_count = '0'
array_count1 = '0'
if is_array_array is not None:
array_count = is_array_array.group(1)
array_count1 = is_array_array.group(2)
is_array_array = True
else:
is_array_array = False
###########################################
# Is field an array?
is_array = re.search(r'\[(\w*)\]', line)
if is_array is not None:
array_count = is_array.group(1)
is_array = True
else:
is_array = False
is_scoped = re.search('::', line)
if is_scoped is not None:
is_scoped = True
else:
is_scoped = False
type = None
name = None
if is_const and is_pointer:
if is_scoped:
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)', line)
if field_match:
type = '%s%s' % (field_match.group(4), field_match.group(5))
name = field_match.group(7)
else:
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)', line)
if field_match:
type = field_match.group(4)
name = field_match.group(6)
elif is_pointer:
field_match = re.match(r'(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)', line)
if field_match:
type = field_match.group(3)
name = field_match.group(5)
elif is_const:
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)', line)
if field_match:
type = field_match.group(4)
name = field_match.group(6)
else:
if is_scoped:
field_match = re.match(r'\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)', line)
if field_match:
type = field_match.group(1) + '::' + field_match.group(2)
name = field_match.group(3)
else:
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)', line)
if field_match:
type = field_match.group(2)
name = field_match.group(4)
if is_llvm_typedef is False:
if type is not None:
type_entry['members'].append(
gen_llvm_type(
type, name, idx+1, is_pointer, is_pointer_pointer, is_array, is_array_array,
array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file))
# Detect end of structure
end_of_struct = re.match(r'(\s*)};', line)
if end_of_struct:
types.append(type_entry)
cur_dir = os.path.dirname(os.path.abspath(__file__))
template = os.path.join(cur_dir, 'templates', 'gen_llvm.hpp')
MakoTemplateWriter.to_file(
template,
output_file,
cmdline=sys.argv,
filename=os.path.basename(output_file),
types=types,
input_dir=os.path.dirname(input_file.name),
input_file=os.path.basename(input_file.name))
'''
Command-line entry point. Parses the arguments that tell this script how
to behave and runs the type generation.
'''
def main():
# Parse args...
parser = ArgumentParser()
parser.add_argument('--input', '-i', type=FileType('r'),
help='Path to input file containing structs', required=True)
parser.add_argument('--output', '-o', action='store',
help='Path to output file', required=True)
args = parser.parse_args()
final_output_dir = os.path.dirname(args.output)
if MakeDir(final_output_dir):
return 1
final_output_file = args.output
tmp_dir = MakeTmpDir('_codegen')
args.output = os.path.join(tmp_dir, os.path.basename(args.output))
rval = 0
try:
gen_llvm_types(args.input, args.output)
rval = CopyFileIfDifferent(args.output, final_output_file)
except:
print('ERROR: Could not generate llvm types', file=sys.stderr)
rval = 1
finally:
DeleteDirTree(tmp_dir)
return rval
if __name__ == '__main__':
sys.exit(main())
# END OF FILE

View file

@ -1,383 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
import sys
# Python source
KNOBS = [
['ENABLE_ASSERT_DIALOGS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Use dialogs when asserts fire.',
'Asserts are only enabled in debug builds'],
'category' : 'debug',
}],
['SINGLE_THREADED', {
'type' : 'bool',
'default' : 'false',
'desc' : ['If enabled will perform all rendering on the API thread.',
'This is useful mainly for debugging purposes.'],
'category' : 'debug',
}],
['DUMP_SHADER_IR', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
'category' : 'debug',
}],
['USE_GENERIC_STORETILE', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Always use generic function for performing StoreTile.',
'Will be slightly slower than using optimized (jitted) path'],
'category' : 'debug_adv',
}],
['FAST_CLEAR', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and',
'defer clear execution to first backend op on hottile, or hottile store'],
'category' : 'perf_adv',
}],
['MAX_NUMA_NODES', {
'type' : 'uint32_t',
'default' : '1' if sys.platform == 'win32' else '0',
'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
' 0 == ALL NUMA-nodes in the system',
' N == Use at most N NUMA-nodes for rendering'],
'category' : 'perf',
}],
['MAX_CORES_PER_NUMA_NODE', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Maximum # of cores per NUMA-node used for worker threads.',
' 0 == ALL non-API thread cores per NUMA-node',
' N == Use at most N cores per NUMA-node'],
'category' : 'perf',
}],
['MAX_THREADS_PER_CORE', {
'type' : 'uint32_t',
'default' : '1',
'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.',
' 0 == ALL hyper-threads per core',
' N == Use at most N hyper-threads per physical core'],
'category' : 'perf',
}],
['MAX_WORKER_THREADS', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Maximum worker threads to spawn.',
'',
'IMPORTANT: If this is non-zero, no worker threads will be bound to',
'specific HW threads. They will all be "floating" SW threads.',
'In this case, the above 3 KNOBS will be ignored.'],
'category' : 'perf',
}],
['BASE_NUMA_NODE', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Starting NUMA node index to use when allocating compute resources.',
'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'],
'category' : 'perf',
}],
['BASE_CORE', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Starting core index to use when allocating compute resources.',
'Setting this to a non-zero value will reduce the maximum # of cores used.'],
'category' : 'perf',
}],
['BASE_THREAD', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Starting thread index to use when allocating compute resources.',
'Setting this to a non-zero value will reduce the maximum # of threads used.'],
'category' : 'perf',
}],
['BUCKETS_START_FRAME', {
'type' : 'uint32_t',
'default' : '1200',
'desc' : ['Frame from when to start saving buckets data.',
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
'category' : 'perf_adv',
}],
['BUCKETS_END_FRAME', {
'type' : 'uint32_t',
'default' : '1400',
'desc' : ['Frame at which to stop saving buckets data.',
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
'category' : 'perf_adv',
}],
['WORKER_SPIN_LOOP_COUNT', {
'type' : 'uint32_t',
'default' : '5000',
'desc' : ['Number of spin-loop iterations worker threads will perform',
'before going to sleep when waiting for work'],
'category' : 'perf_adv',
}],
['MAX_DRAWS_IN_FLIGHT', {
'type' : 'uint32_t',
'default' : '256',
'desc' : ['Maximum number of draws outstanding before API thread blocks.',
'This value MUST be evenly divisible into 2^32'],
'category' : 'perf_adv',
}],
['MAX_PRIMS_PER_DRAW', {
'type' : 'uint32_t',
'default' : '49152',
'desc' : ['Maximum primitives in a single Draw().',
'Larger draws are split into smaller Draw calls.',
'Should be a multiple of (3 * vectorWidth).'],
'category' : 'perf_adv',
}],
['MAX_TESS_PRIMS_PER_DRAW', {
'type' : 'uint32_t',
'default' : '16',
'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
'Larger draws are split into smaller Draw calls.',
'Should be a multiple of (vectorWidth).'],
'category' : 'perf_adv',
}],
['DEBUG_OUTPUT_DIR', {
'type' : 'std::string',
'default' : r'%TEMP%\Rast\DebugOutput' if sys.platform == 'win32' else '/tmp/Rast/DebugOutput',
'desc' : ['Output directory for debug data.'],
'category' : 'debug',
}],
['JIT_ENABLE_CACHE', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Enables caching of compiled shaders'],
'category' : 'debug_adv',
}],
['JIT_OPTIMIZATION_LEVEL', {
'type' : 'int',
'default' : '-1',
'desc' : ['JIT compile optimization level:',],
'category' : 'debug',
'control' : 'dropdown',
'choices' : [
{
'name' : 'Automatic',
'desc' : 'Automatic based on other KNOB and build settings',
'value' : -1,
},
{
'name' : 'Debug',
'desc' : 'No optimization: -O0',
'value' : 0,
},
{
'name' : 'Less',
'desc' : 'Some optimization: -O1',
'value' : 1,
},
{
'name' : 'Optimize',
'desc' : 'Default Clang / LLVM optimizations: -O2',
'value' : 2,
},
{
'name' : 'Aggressive',
'desc' : 'Maximum optimization: -O3',
'value' : 3,
},
],
}],
['JIT_CACHE_DIR', {
'type' : 'std::string',
'default' : r'%TEMP%\SWR\JitCache' if sys.platform == 'win32' else '${HOME}/.swr/jitcache',
'desc' : ['Cache directory for compiled shaders.'],
'category' : 'debug',
}],
['TOSS_DRAW', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Disable per-draw/dispatch execution'],
'category' : 'perf',
}],
['TOSS_QUEUE_FE', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at worker FE',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_FETCH', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at vertex fetch',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_IA', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at input assembler',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_VS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at vertex shader',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_SETUP_TRIS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at primitive setup',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_BIN_TRIS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at primitive binning',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_RS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at rasterizer',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['DISABLE_SPLIT_DRAW', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Don\'t split large draws into smaller draws.',
'MAX_PRIMS_PER_DRAW and MAX_TESS_PRIMS_PER_DRAW can be used to control split size.',
'',
'Useful to disable split draws for gathering archrast stats.'],
'category' : 'perf_adv',
}],
['AR_ENABLE_PIPELINE_STATS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable pipeline stats when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SHADER_STATS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable shader stats when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SWTAG_DATA', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Enable SWTag data when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SWR_EVENTS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable internal SWR events when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_PIPELINE_EVENTS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable pipeline events when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SHADER_EVENTS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable shader events when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SWTAG_EVENTS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Enable SWTag events when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_MEMORY_EVENTS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Enable memory events when using Archrast'],
'category' : 'archrast',
}],
['AR_MEM_SET_BYTE_GRANULARITY', {
'type' : 'uint32_t',
'default' : '64',
'desc' : ['Granularity and alignment used when tracking memory accesses',
'ONLY ACTIVE UNDER ArchRast.'],
'category' : 'archrast',
}],
]
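Each definition in this list is consumed by the gen_knobs templates further down in this commit. As a rough illustration (reconstructed from those templates, not quoted from actual generated output), the TOSS_DRAW entry above would yield the following pieces of generated code:
// In gen_knobs.h, inside struct GlobalKnobs:
//-----------------------------------------------------------
// KNOB_TOSS_DRAW
//
// Disable per-draw/dispatch execution
//
DEFINE_KNOB(TOSS_DRAW, bool);
// After the struct, a convenience accessor macro:
#define KNOB_TOSS_DRAW GET_KNOB(TOSS_DRAW)
// And in gen_knobs.cpp, the static default:
bool GlobalKnobs::Knob_TOSS_DRAW::m_default = false;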

View file

@ -1,77 +0,0 @@
# Copyright © 2017-2018 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
gen_knobs_cpp = custom_target(
'gen_knobs.cpp',
input : ['gen_knobs.py'],
output : 'gen_knobs.cpp',
command : [prog_python, '@INPUT0@', '--output', '@OUTPUT@', '--gen_cpp'],
depend_files : files(
'knob_defs.py', 'gen_common.py',
'templates/gen_knobs.cpp',
),
)
gen_knobs_h = custom_target(
'gen_knobs.h',
input : ['gen_knobs.py'],
output : 'gen_knobs.h',
command : [prog_python, '@INPUT0@', '--output', '@OUTPUT@', '--gen_h'],
depend_files : files(
'knob_defs.py', 'gen_common.py',
'templates/gen_knobs.h',
),
)
# The generators above are needed individually, while the generators below
# are all inputs to the same lib, so they don't need unique names.
files_swr_common += [
gen_builder_hpp, gen_builder_meta_hpp, gen_knobs_h, gen_knobs_cpp
]
foreach x : [[swr_context_files, 'gen_swr_context_llvm.h'],
[swr_state_files, 'gen_state_llvm.h'],
[swr_surf_state_files, 'gen_surf_state_llvm.h']]
files_swr_common += custom_target(
x[1],
input : ['gen_llvm_types.py', x[0]],
output : x[1],
command : [prog_python, '@INPUT0@', '--input', '@INPUT1@', '--output', '@OUTPUT@'],
depend_files : files(
'templates/gen_llvm.hpp',
'gen_common.py',
),
)
endforeach
ar_output_filenames = ['gen_ar_event.hpp', 'gen_ar_event.cpp', 'gen_ar_eventhandler.hpp', 'gen_ar_eventhandlerfile.hpp']
ar_template_filenames = []
foreach fname : ar_output_filenames
ar_template_filenames += join_paths('templates', fname)
endforeach
files_swr_common += custom_target(
'gen_archrast',
input : ['gen_archrast.py', swr_event_proto_files, swr_event_pproto_files],
output : ar_output_filenames,
command : [prog_python, '@INPUT0@', '--proto', '@INPUT1@', '@INPUT2@', '--output-dir', meson.current_build_dir()],
depend_files : files('gen_common.py', ar_template_filenames)
)

View file

@ -1,55 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief Implementation for events. auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#include "common/os.h"
#include "gen_ar_event.hpp"
#include "gen_ar_eventhandler.hpp"
using namespace ArchRast;
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
event = protos['events']['defs'][event_key]
%>
void ${event['name']}::Accept(EventHandler* pHandler) const
{
pHandler->Handle(*this);
}
% endfor
% endfor
// clang-format on

View file

@ -1,168 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief Definitions for events. auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#pragma once
#include "common/os.h"
#include "core/state.h"
<%
always_enabled_knob_groups = ['Framework', 'SWTagFramework', 'ApiSwr']
group_knob_remap_table = {
"ShaderStats": "KNOB_AR_ENABLE_SHADER_STATS",
"PipelineStats" : "KNOB_AR_ENABLE_PIPELINE_STATS",
"SWTagData" : "KNOB_AR_ENABLE_SWTAG_DATA",
}
%>
namespace ArchRast
{
<% sorted_enums = sorted(protos['enums']['defs']) %>
% for name in sorted_enums:
enum ${name}
{<% names = protos['enums']['defs'][name]['names'] %>
% for i in range(len(names)):
${names[i].lstrip()}
% endfor
};
% endfor
// Forward decl
class EventHandler;
//////////////////////////////////////////////////////////////////////////
/// Event - interface for handling events.
//////////////////////////////////////////////////////////////////////////
struct Event
{
const uint32_t eventId = {0xFFFFFFFF};
Event() {}
virtual ~Event() {}
virtual bool IsEnabled() const { return true; };
virtual const uint32_t GetEventId() const = 0;
virtual void Accept(EventHandler* pHandler) const = 0;
};
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
event = protos['events']['defs'][event_key]
%>
//////////////////////////////////////////////////////////////////////////
/// ${event_key}Data
//////////////////////////////////////////////////////////////////////////
#pragma pack(push, 1)
struct ${event['name']}Data
{<%
fields = event['fields'] %>
// Fields
% for i in range(len(fields)):
% if fields[i]['size'] > 1:
${fields[i]['type']} ${fields[i]['name']}[${fields[i]['size']}];
% else:
${fields[i]['type']} ${fields[i]['name']};
% endif
% endfor
};
#pragma pack(pop)
//////////////////////////////////////////////////////////////////////////
/// ${event_key}
//////////////////////////////////////////////////////////////////////////
struct ${event['name']} : Event
{<%
fields = event['fields'] %>
const uint32_t eventId = {${ event['id'] }};
${event['name']}Data data;
// Constructor
${event['name']}(
% for i in range(len(fields)):
% if i < len(fields)-1:
% if fields[i]['size'] > 1:
${fields[i]['type']}* ${fields[i]['name']},
uint32_t ${fields[i]['name']}_size,
% else:
${fields[i]['type']} ${fields[i]['name']},
% endif
% endif
% if i == len(fields)-1:
% if fields[i]['size'] > 1:
${fields[i]['type']}* ${fields[i]['name']},
uint32_t ${fields[i]['name']}_size
% else:
${fields[i]['type']} ${fields[i]['name']}
% endif
% endif
% endfor
)
{
% for i in range(len(fields)):
% if fields[i]['size'] > 1:
% if fields[i]['type'] == 'char':
// Copy size of string (null-terminated) followed by string into entire buffer
SWR_ASSERT(${fields[i]['name']}_size + 1 < ${fields[i]['size']} - sizeof(uint32_t), "String length must be less than size of char buffer - size(uint32_t)!");
memcpy(data.${fields[i]['name']}, &${fields[i]['name']}_size, sizeof(uint32_t));
strcpy_s(data.${fields[i]['name']} + sizeof(uint32_t), ${fields[i]['name']}_size + 1, ${fields[i]['name']});
% else:
memcpy(data.${fields[i]['name']}, ${fields[i]['name']}, ${fields[i]['name']}_size);
% endif
% else:
data.${fields[i]['name']} = ${fields[i]['name']};
% endif
% endfor
}
virtual void Accept(EventHandler* pHandler) const;
inline const uint32_t GetEventId() const { return eventId; }
% if group not in always_enabled_knob_groups:
<%
if group in group_knob_remap_table:
group_knob_define = group_knob_remap_table[group]
else:
group_knob_define = 'KNOB_AR_ENABLE_' + group.upper() + '_EVENTS'
%>
bool IsEnabled() const
{
static const bool IsEventEnabled = true; // TODO: Replace with knob for each event
return ${group_knob_define} && IsEventEnabled;
}
% endif
};
% endfor
% endfor
} // namespace ArchRast
// clang-format on
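As a concrete illustration of what this template expands to, consider a hypothetical single-field event (the name, id, and field are invented for the example; an event in an always-enabled group like ApiSwr gets no IsEnabled override):
#pragma pack(push, 1)
struct DrawInfoEventData
{
    // Fields
    uint32_t drawId;
};
#pragma pack(pop)
struct DrawInfoEvent : Event
{
    const uint32_t eventId = {42};
    DrawInfoEventData data;
    // Constructor
    DrawInfoEvent(
        uint32_t drawId
    )
    {
        data.drawId = drawId;
    }
    virtual void Accept(EventHandler* pHandler) const;
    inline const uint32_t GetEventId() const { return eventId; }
};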

View file

@ -1,61 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief Event handler interface. auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#pragma once
#include "${event_header}"
namespace ArchRast
{
//////////////////////////////////////////////////////////////////////////
/// EventHandler - interface for handling events.
//////////////////////////////////////////////////////////////////////////
class EventHandler
{
public:
EventHandler() {}
virtual ~EventHandler() {}
virtual void FlushDraw(uint32_t drawId) {}
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
event = protos['events']['defs'][event_key]
%> virtual void Handle(const ${event['name']}& event) {}
% endfor
% endfor
};
} // namespace ArchRast
// clang-format on

View file

@ -1,174 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief Event handler interface. auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#pragma once
#include "common/os.h"
#include "${event_header}"
#include <fstream>
#include <sstream>
#include <iostream>
#include <thread>
namespace ArchRast
{
//////////////////////////////////////////////////////////////////////////
/// EventHandlerFile - interface for handling events.
//////////////////////////////////////////////////////////////////////////
class EventHandlerFile : public EventHandler
{
public:
EventHandlerFile(uint32_t id) : mBufOffset(0)
{
#if defined(_WIN32)
DWORD pid = GetCurrentProcessId();
TCHAR procname[MAX_PATH];
GetModuleFileName(NULL, procname, MAX_PATH);
const char* pBaseName = strrchr(procname, '\\');
std::stringstream outDir;
outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
mOutputDir = outDir.str();
if (CreateDirectory(mOutputDir.c_str(), NULL))
{
std::cout << std::endl
<< "ArchRast Dir: " << mOutputDir << std::endl
<< std::endl
<< std::flush;
}
// There could be multiple threads creating thread pools. We
// want to make sure they are uniquely identified by adding in
// the creator's thread id into the filename.
std::stringstream fstr;
fstr << outDir.str().c_str() << "\\ar_event" << std::this_thread::get_id();
fstr << "_" << id << ".bin" << std::ends;
mFilename = fstr.str();
#else
// There could be multiple threads creating thread pools. We
// want to make sure they are uniquely identified by adding in
// the creator's thread id into the filename.
std::stringstream fstr;
fstr << "/tmp/ar_event" << std::this_thread::get_id();
fstr << "_" << id << ".bin" << std::ends;
mFilename = fstr.str();
#endif
}
virtual ~EventHandlerFile() { FlushBuffer(); }
//////////////////////////////////////////////////////////////////////////
/// @brief Flush buffer to file.
bool FlushBuffer()
{
if (mBufOffset > 0)
{
if (mBufOffset == mHeaderBufOffset)
{
// Nothing to flush. Only the header has been generated.
return false;
}
std::ofstream file;
file.open(mFilename, std::ios::out | std::ios::app | std::ios::binary);
if (!file.is_open())
{
SWR_INVALID("ArchRast: Could not open event file!");
return false;
}
file.write((char*)mBuffer, mBufOffset);
file.close();
mBufOffset = 0;
mHeaderBufOffset = 0; // Reset header offset so it's no longer considered.
}
return true;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Write event and its payload to the memory buffer.
void Write(uint32_t eventId, const char* pBlock, uint32_t size)
{
if ((mBufOffset + size + sizeof(eventId)) > mBufferSize)
{
if (!FlushBuffer())
{
// Don't corrupt what's already in the buffer?
/// @todo Maybe add corrupt marker to buffer here in case we can open file in
/// future?
return;
}
}
memcpy(&mBuffer[mBufOffset], (char*)&eventId, sizeof(eventId));
mBufOffset += sizeof(eventId);
memcpy(&mBuffer[mBufOffset], pBlock, size);
mBufOffset += size;
}
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
event = protos['events']['defs'][event_key]
%>
//////////////////////////////////////////////////////////////////////////
/// @brief Handle ${event_key} event
virtual void Handle(const ${event['name']}& event)
{
% if event['num_fields'] == 0:
Write(event.eventId, (char*)&event.data, 0);
% else:
Write(event.eventId, (char*)&event.data, sizeof(event.data));
% endif
}
% endfor
% endfor
//////////////////////////////////////////////////////////////////////////
/// @brief Everything written to the buffer up to this point is the header.
virtual void MarkHeader()
{
mHeaderBufOffset = mBufOffset;
}
std::string mFilename;
std::string mOutputDir;
static const uint32_t mBufferSize = 1024;
uint8_t mBuffer[mBufferSize];
uint32_t mBufOffset{0};
uint32_t mHeaderBufOffset{0};
};
} // namespace ArchRast
// clang-format on

View file

@ -1,42 +0,0 @@
//============================================================================
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice (including the next
// paragraph) shall be included in all copies or substantial portions of the
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// @file BackendPixelRate${fileNum}.cpp
//
// @brief auto-generated file
//
// DO NOT EDIT
//
// Generation Command Line:
// ${'\n// '.join(cmdline)}
//
//============================================================================
#include "core/backend.h"
#include "core/backend_impl.h"
void InitBackendPixelRate${fileNum}()
{
%for func in funcList:
${func}
%endfor
}

View file

@ -1,84 +0,0 @@
//============================================================================
// Copyright (C) 2014-2020 Intel Corporation. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice (including the next
// paragraph) shall be included in all copies or substantial portions of the
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// @file ${filename}
//
// @brief auto-generated file
//
// DO NOT EDIT
//
// Generation Command Line:
// ${'\n// '.join(cmdline)}
//
//============================================================================
// clang-format off
#pragma once
//============================================================================
// Auto-generated ${comment}
//============================================================================
%for func in functions:
<%argList = ', '.join(func['args'])%>\
${func['decl']}
{
%if isX86:
%if len(func['args']) != 0:
SmallVector<Type*, ${len(func['args'])}> argTypes;
%for arg in func['args']:
argTypes.push_back(${arg}->getType());
%endfor
#if LLVM_VERSION_MAJOR >= 12
#define VEC_GET_NUM_ELEMS cast<FixedVectorType>(a->getType())->getNumElements()
#elif LLVM_VERSION_MAJOR >= 11
#define VEC_GET_NUM_ELEMS cast<VectorType>(a->getType())->getNumElements()
#else
#define VEC_GET_NUM_ELEMS a->getType()->getVectorNumElements()
#endif
FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, argTypes, false);
%else:
FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, {}, false);
%endif
#if LLVM_VERSION_MAJOR >= 9
Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy).getCallee());
#else
Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy));
#endif
return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
%elif isIntrin:
%if len(func['types']) != 0:
SmallVector<Type*, ${len(func['types'])}> args;
%for arg in func['types']:
args.push_back(${arg}->getType());
%endfor
Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args);
return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
%else:
Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
%endif
%else:
return IRB()->${func['intrin']}(${argList});
%endif
}
% endfor
// clang-format on
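For reference, a hypothetical isIntrin entry with one overloaded argument type would expand to roughly the following method (the VCTLZ name and the ctlz intrinsic are chosen for illustration; JM() and CALL are the builder's own helpers referenced by the template):
Value* Builder::VCTLZ(Value* a, Value* isZeroUndef, const llvm::Twine& name)
{
    SmallVector<Type*, 1> args;
    args.push_back(a->getType());
    Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctlz, args);
    return CALL(pFunc, std::initializer_list<Value*>{a, isZeroUndef}, name);
}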

View file

@ -1,46 +0,0 @@
//============================================================================
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice (including the next
// paragraph) shall be included in all copies or substantial portions of the
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// @file ${filename}
//
// @brief auto-generated file
//
// DO NOT EDIT
//
// Generation Command Line:
// ${'\n// '.join(cmdline)}
//
//============================================================================
// clang-format off
%for num in range(numFiles):
void Init${tableName}${num}();
%endfor
static INLINE void Init${tableName}()
{
%for num in range(numFiles):
Init${tableName}${num}();
%endfor
}
// clang-format on
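Instantiated with, say, tableName='RasterizerFuncs' and numFiles=2 (the file count is illustrative; the per-file definitions come from the gen_rasterizer template below), this expands to:
void InitRasterizerFuncs0();
void InitRasterizerFuncs1();
static INLINE void InitRasterizerFuncs()
{
    InitRasterizerFuncs0();
    InitRasterizerFuncs1();
}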

View file

@ -1,143 +0,0 @@
/******************************************************************************
* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}.cpp
*
* @brief Dynamic Knobs for Core.
*
* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
<% calc_max_knob_len(knobs) %>
% for inc in includes:
#include <${inc}>
% endfor
#include <regex>
#include <core/utils.h>
//========================================================
// Implementation
//========================================================
void KnobBase::autoExpandEnvironmentVariables(std::string& text)
{
size_t start;
while ((start = text.find("${'${'}")) != std::string::npos)
{
size_t end = text.find("}");
if (end == std::string::npos)
break;
const std::string var = GetEnv(text.substr(start + 2, end - start - 2));
text.replace(start, end - start + 1, var);
}
// win32 style variable replacement
while ((start = text.find("%")) != std::string::npos)
{
size_t end = text.find("%", start + 1);
if (end == std::string::npos)
break;
const std::string var = GetEnv(text.substr(start + 1, end - start - 1));
text.replace(start, end - start + 1, var);
}
}
//========================================================
// Static Data Members
//========================================================
% for knob in knobs:
% if knob[1]['type'] == 'std::string':
${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = "${repr(knob[1]['default'])[1:-1]}";
% else:
${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = ${knob[1]['default']};
% endif
% endfor
GlobalKnobs g_GlobalKnobs;
//========================================================
// Knob Initialization
//========================================================
GlobalKnobs::GlobalKnobs()
{
% for knob in knobs :
InitKnob(${ knob[0] });
% endfor
}
//========================================================
// Knob Display (Convert to String)
//========================================================
std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
{
std::basic_stringstream<char> str;
str << std::showbase << std::setprecision(1) << std::fixed;
if (optPerLinePrefix == nullptr)
{
optPerLinePrefix = "";
}
% for knob in knobs:
str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
% if knob[1]['type'] == 'bool':
str << (KNOB_${knob[0]} ? "+\n" : "-\n");
% elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
str << std::dec << KNOB_${knob[0]} << "\n";
% else:
str << KNOB_${knob[0]} << "\n";
% endif
% endfor
str << std::ends;
return str.str();
}
<%!
# Globally available python
max_len = 0
def calc_max_knob_len(knobs):
global max_len
max_len = 0
for knob in knobs:
if len(knob[0]) > max_len: max_len = len(knob[0])
max_len += len('KNOB_ ')
if max_len % 4: max_len += 4 - (max_len % 4)
def space_knob(knob):
knob_len = len('KNOB_' + knob)
return ' '*(max_len - knob_len)
def calc_max_name_len(choices_array):
_max_len = 0
for choice in choices_array:
if len(choice['name']) > _max_len: _max_len = len(choice['name'])
if _max_len % 4: _max_len += 4 - (_max_len % 4)
return _max_len
def space_name(name, max_len):
name_len = len(name)
return ' '*(max_len - name_len)
%>
// clang-format on
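The two loops in autoExpandEnvironmentVariables handle the Unix-style ${VAR} and Windows-style %VAR% defaults seen in knob_defs.py. A minimal sketch of the observable effect, assuming HOME=/home/user and using the SET_KNOB / GET_KNOB accessors from the companion gen_knobs.h template below:
// Fragment, not driver code: assigning to a std::string knob routes the
// value through expandEnvironmentVariables().
SET_KNOB(JIT_CACHE_DIR, "${HOME}/.swr/jitcache");
assert(GET_KNOB(JIT_CACHE_DIR) == "/home/user/.swr/jitcache");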

View file

@ -1,154 +0,0 @@
/******************************************************************************
* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}.h
*
* @brief Dynamic Knobs for Core.
*
* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
<% calc_max_knob_len(knobs) %>
#pragma once
#include <string>
struct KnobBase
{
private:
// Update the input string.
static void autoExpandEnvironmentVariables(std::string& text);
protected:
// Leave input alone and return new string.
static std::string expandEnvironmentVariables(std::string const& input)
{
std::string text = input;
autoExpandEnvironmentVariables(text);
return text;
}
template <typename T>
static T expandEnvironmentVariables(T const& input)
{
return input;
}
};
template <typename T>
struct Knob : KnobBase
{
public:
const T& Value() const { return m_Value; }
const T& Value(T const& newValue)
{
m_Value = expandEnvironmentVariables(newValue);
return Value();
}
private:
T m_Value;
};
#define DEFINE_KNOB(_name, _type) \\
struct Knob_##_name : Knob<_type> \\
{ \\
static const char* Name() { return "KNOB_" #_name; } \\
static _type DefaultValue() { return (m_default); } \\
private: \\
static _type m_default; \\
} _name;
#define GET_KNOB(_name) g_GlobalKnobs._name.Value()
#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue)
struct GlobalKnobs
{
% for knob in knobs:
//-----------------------------------------------------------
// KNOB_${knob[0]}
//
% for line in knob[1]['desc']:
// ${line}
% endfor
% if knob[1].get('choices'):
<%
choices = knob[1].get('choices')
_max_len = calc_max_name_len(choices) %>//
% for i in range(len(choices)):
// ${choices[i]['name']}${space_name(choices[i]['name'], _max_len)} = ${format(choices[i]['value'], '#010x')}
% endfor
% endif
//
DEFINE_KNOB(${knob[0]}, ${knob[1]['type']});
% endfor
std::string ToString(const char* optPerLinePrefix="");
GlobalKnobs();
};
extern GlobalKnobs g_GlobalKnobs;
#undef DEFINE_KNOB
% for knob in knobs:
#define KNOB_${knob[0]}${space_knob(knob[0])} GET_KNOB(${knob[0]})
% endfor
<%!
# Globally available python
max_len = 0
def calc_max_knob_len(knobs):
global max_len
max_len = 0
for knob in knobs:
if len(knob[0]) > max_len: max_len = len(knob[0])
max_len += len('KNOB_ ')
if max_len % 4: max_len += 4 - (max_len % 4)
def space_knob(knob):
knob_len = len('KNOB_' + knob)
return ' '*(max_len - knob_len)
def calc_max_name_len(choices_array):
_max_len = 0
for choice in choices_array:
if len(choice['name']) > _max_len: _max_len = len(choice['name'])
if _max_len % 4: _max_len += 4 - (_max_len % 4)
return _max_len
def space_name(name, max_len):
name_len = len(name)
return ' '*(max_len - name_len)
%>
// clang-format on
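A minimal usage sketch (the function is invented for illustration; the macros are the generated ones defined above):
#include "gen_knobs.h"
void ProcessDraw()
{
    // KNOB_TOSS_DRAW expands to g_GlobalKnobs.TOSS_DRAW.Value()
    if (KNOB_TOSS_DRAW)
    {
        return; // toss the draw entirely
    }
    // ... normal draw processing ...
}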

View file

@ -1,109 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#pragma once
#include <llvm/IR/DerivedTypes.h>
namespace SwrJit
{
using namespace llvm;
%for type in types:
INLINE static StructType* Gen_${type['name']}(JitManager* pJitMgr)
{
%if needs_ctx(type):
LLVMContext& ctx = pJitMgr->mContext;
%endif
#if LLVM_VERSION_MAJOR >= 12
StructType* pRetType = StructType::getTypeByName(pJitMgr->mContext, "${type['name']}");
#else
StructType* pRetType = pJitMgr->mpCurrentModule->getTypeByName("${type['name']}");
#endif
if (pRetType == nullptr)
{
std::vector<Type*> members =<% (max_type_len, max_name_len) = calc_max_len(type['members']) %>
{
%for member in type['members']:
/* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ ${member['type']},
%endfor
};
pRetType = StructType::create(members, "${type['name']}", false);
// Compute debug metadata
llvm::DIBuilder builder(*pJitMgr->mpCurrentModule);
llvm::DIFile* pFile = builder.createFile("${input_file}", "${os.path.normpath(input_dir).replace('\\', '/')}");
std::vector<std::pair<std::string, uint32_t>> dbgMembers =
{
%for member in type['members']:
std::make_pair("${member['name']}", ${pad(len(member['name']), max_name_len)}${member['lineNum']}),
%endfor
};
pJitMgr->CreateDebugStructType(pRetType, "${type['name']}", pFile, ${type['lineNum']}, dbgMembers);
}
return pRetType;
}
%for member in type['members']:
static const uint32_t ${type['name']}_${member['name']} ${pad(len(member['name']), max_name_len)}= ${loop.index};
%endfor
%endfor
} // namespace SwrJit
<%! # Global function definitions
import os
def needs_ctx(struct_type):
for m in struct_type.get('members', []):
if '(ctx)' in m.get('type', ''):
return True
return False
def calc_max_len(fields):
max_type_len = 0
max_name_len = 0
for f in fields:
if len(f['type']) > max_type_len: max_type_len = len(f['type'])
if len(f['name']) > max_name_len: max_name_len = len(f['name'])
return (max_type_len, max_name_len)
def pad(cur_len, max_len):
pad_amt = max_len - cur_len
return ' '*pad_amt
%>
// clang-format on

View file

@ -1,44 +0,0 @@
//============================================================================
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice (including the next
// paragraph) shall be included in all copies or substantial portions of the
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// @file gen_rasterizer${fileNum}.cpp
//
// @brief auto-generated file
//
// DO NOT EDIT
//
// Generation Command Line:
// ${'\n// '.join(cmdline)}
//
//============================================================================
// clang-format off
#include "core/rasterizer.h"
#include "core/rasterizer_impl.h"
void InitRasterizerFuncs${fileNum}()
{
%for func in funcList:
${func}
%endfor
}
// clang-format on

File diff suppressed because it is too large

View file

@ -1,268 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file formats.h
*
* @brief auto-generated file
*
* DO NOT EDIT
*
******************************************************************************/
#pragma once
#include "common/os.h"
//////////////////////////////////////////////////////////////////////////
/// SWR_TYPE - Format component type
//////////////////////////////////////////////////////////////////////////
enum SWR_TYPE
{
SWR_TYPE_UNKNOWN,
SWR_TYPE_UNUSED,
SWR_TYPE_UNORM,
SWR_TYPE_SNORM,
SWR_TYPE_UINT,
SWR_TYPE_SINT,
SWR_TYPE_FLOAT,
SWR_TYPE_SSCALED,
SWR_TYPE_USCALED,
SWR_TYPE_SFIXED,
};
//////////////////////////////////////////////////////////////////////////
/// SWR_FORMAT
//////////////////////////////////////////////////////////////////////////
enum SWR_FORMAT
{
R32G32B32A32_FLOAT = 0x0,
R32G32B32A32_SINT = 0x1,
R32G32B32A32_UINT = 0x2,
R64G64_FLOAT = 0x5,
R32G32B32X32_FLOAT = 0x6,
R32G32B32A32_SSCALED = 0x7,
R32G32B32A32_USCALED = 0x8,
R32G32B32A32_SFIXED = 0x20,
R32G32B32_FLOAT = 0x40,
R32G32B32_SINT = 0x41,
R32G32B32_UINT = 0x42,
R32G32B32_SSCALED = 0x45,
R32G32B32_USCALED = 0x46,
R32G32B32_SFIXED = 0x50,
R16G16B16A16_UNORM = 0x80,
R16G16B16A16_SNORM = 0x81,
R16G16B16A16_SINT = 0x82,
R16G16B16A16_UINT = 0x83,
R16G16B16A16_FLOAT = 0x84,
R32G32_FLOAT = 0x85,
R32G32_SINT = 0x86,
R32G32_UINT = 0x87,
R32_FLOAT_X8X24_TYPELESS = 0x88,
X32_TYPELESS_G8X24_UINT = 0x89,
L32A32_FLOAT = 0x8A,
R64_FLOAT = 0x8D,
R16G16B16X16_UNORM = 0x8E,
R16G16B16X16_FLOAT = 0x8F,
L32X32_FLOAT = 0x91,
I32X32_FLOAT = 0x92,
R16G16B16A16_SSCALED = 0x93,
R16G16B16A16_USCALED = 0x94,
R32G32_SSCALED = 0x95,
R32G32_USCALED = 0x96,
R32G32_SFIXED = 0xA0,
B8G8R8A8_UNORM = 0xC0,
B8G8R8A8_UNORM_SRGB = 0xC1,
R10G10B10A2_UNORM = 0xC2,
R10G10B10A2_UNORM_SRGB = 0xC3,
R10G10B10A2_UINT = 0xC4,
R8G8B8A8_UNORM = 0xC7,
R8G8B8A8_UNORM_SRGB = 0xC8,
R8G8B8A8_SNORM = 0xC9,
R8G8B8A8_SINT = 0xCA,
R8G8B8A8_UINT = 0xCB,
R16G16_UNORM = 0xCC,
R16G16_SNORM = 0xCD,
R16G16_SINT = 0xCE,
R16G16_UINT = 0xCF,
R16G16_FLOAT = 0xD0,
B10G10R10A2_UNORM = 0xD1,
B10G10R10A2_UNORM_SRGB = 0xD2,
R11G11B10_FLOAT = 0xD3,
R10G10B10_FLOAT_A2_UNORM = 0xD5,
R32_SINT = 0xD6,
R32_UINT = 0xD7,
R32_FLOAT = 0xD8,
R24_UNORM_X8_TYPELESS = 0xD9,
X24_TYPELESS_G8_UINT = 0xDA,
L32_UNORM = 0xDD,
L16A16_UNORM = 0xDF,
I24X8_UNORM = 0xE0,
L24X8_UNORM = 0xE1,
I32_FLOAT = 0xE3,
L32_FLOAT = 0xE4,
A32_FLOAT = 0xE5,
B8G8R8X8_UNORM = 0xE9,
B8G8R8X8_UNORM_SRGB = 0xEA,
R8G8B8X8_UNORM = 0xEB,
R8G8B8X8_UNORM_SRGB = 0xEC,
R9G9B9E5_SHAREDEXP = 0xED,
B10G10R10X2_UNORM = 0xEE,
L16A16_FLOAT = 0xF0,
R10G10B10X2_USCALED = 0xF3,
R8G8B8A8_SSCALED = 0xF4,
R8G8B8A8_USCALED = 0xF5,
R16G16_SSCALED = 0xF6,
R16G16_USCALED = 0xF7,
R32_SSCALED = 0xF8,
R32_USCALED = 0xF9,
B5G6R5_UNORM = 0x100,
B5G6R5_UNORM_SRGB = 0x101,
B5G5R5A1_UNORM = 0x102,
B5G5R5A1_UNORM_SRGB = 0x103,
B4G4R4A4_UNORM = 0x104,
B4G4R4A4_UNORM_SRGB = 0x105,
R8G8_UNORM = 0x106,
R8G8_SNORM = 0x107,
R8G8_SINT = 0x108,
R8G8_UINT = 0x109,
R16_UNORM = 0x10A,
R16_SNORM = 0x10B,
R16_SINT = 0x10C,
R16_UINT = 0x10D,
R16_FLOAT = 0x10E,
I16_UNORM = 0x111,
L16_UNORM = 0x112,
A16_UNORM = 0x113,
L8A8_UNORM = 0x114,
I16_FLOAT = 0x115,
L16_FLOAT = 0x116,
A16_FLOAT = 0x117,
L8A8_UNORM_SRGB = 0x118,
B5G5R5X1_UNORM = 0x11A,
B5G5R5X1_UNORM_SRGB = 0x11B,
R8G8_SSCALED = 0x11C,
R8G8_USCALED = 0x11D,
R16_SSCALED = 0x11E,
R16_USCALED = 0x11F,
A1B5G5R5_UNORM = 0x124,
A4B4G4R4_UNORM = 0x125,
L8A8_UINT = 0x126,
L8A8_SINT = 0x127,
R8_UNORM = 0x140,
R8_SNORM = 0x141,
R8_SINT = 0x142,
R8_UINT = 0x143,
A8_UNORM = 0x144,
I8_UNORM = 0x145,
L8_UNORM = 0x146,
R8_SSCALED = 0x149,
R8_USCALED = 0x14A,
L8_UNORM_SRGB = 0x14C,
L8_UINT = 0x152,
L8_SINT = 0x153,
I8_UINT = 0x154,
I8_SINT = 0x155,
DXT1_RGB_SRGB = 0x180,
YCRCB_SWAPUVY = 0x183,
BC1_UNORM = 0x186,
BC2_UNORM = 0x187,
BC3_UNORM = 0x188,
BC4_UNORM = 0x189,
BC5_UNORM = 0x18A,
BC1_UNORM_SRGB = 0x18B,
BC2_UNORM_SRGB = 0x18C,
BC3_UNORM_SRGB = 0x18D,
YCRCB_SWAPUV = 0x18F,
DXT1_RGB = 0x191,
R8G8B8_UNORM = 0x193,
R8G8B8_SNORM = 0x194,
R8G8B8_SSCALED = 0x195,
R8G8B8_USCALED = 0x196,
R64G64B64A64_FLOAT = 0x197,
R64G64B64_FLOAT = 0x198,
BC4_SNORM = 0x199,
BC5_SNORM = 0x19A,
R16G16B16_FLOAT = 0x19B,
R16G16B16_UNORM = 0x19C,
R16G16B16_SNORM = 0x19D,
R16G16B16_SSCALED = 0x19E,
R16G16B16_USCALED = 0x19F,
BC6H_SF16 = 0x1A1,
BC7_UNORM = 0x1A2,
BC7_UNORM_SRGB = 0x1A3,
BC6H_UF16 = 0x1A4,
R8G8B8_UNORM_SRGB = 0x1A8,
R16G16B16_UINT = 0x1B0,
R16G16B16_SINT = 0x1B1,
R32_SFIXED = 0x1B2,
R10G10B10A2_SNORM = 0x1B3,
R10G10B10A2_USCALED = 0x1B4,
R10G10B10A2_SSCALED = 0x1B5,
R10G10B10A2_SINT = 0x1B6,
B10G10R10A2_SNORM = 0x1B7,
B10G10R10A2_USCALED = 0x1B8,
B10G10R10A2_SSCALED = 0x1B9,
B10G10R10A2_UINT = 0x1BA,
B10G10R10A2_SINT = 0x1BB,
R8G8B8_UINT = 0x1C8,
R8G8B8_SINT = 0x1C9,
RAW = 0x1FF,
NUM_SWR_FORMATS = 0x200,
};
//////////////////////////////////////////////////////////////////////////
/// SWR_FORMAT_INFO - Format information
//////////////////////////////////////////////////////////////////////////
struct SWR_FORMAT_INFO
{
const char* name;
SWR_TYPE type[4];
uint32_t defaults[4];
uint32_t swizzle[4]; ///< swizzle per component
uint32_t bpc[4]; ///< bits per component
uint32_t bpp; ///< bits per pixel
uint32_t Bpp; ///< bytes per pixel
uint32_t numComps; ///< number of components
bool isSRGB;
bool isBC;
bool isSubsampled;
bool isLuminance;
bool isNormalized[4];
float toFloat[4];
uint32_t bcWidth;
uint32_t bcHeight;
};
extern const SWR_FORMAT_INFO gFormatInfo[NUM_SWR_FORMATS];
//////////////////////////////////////////////////////////////////////////
/// @brief Retrieves format info struct for given format.
/// @param format - SWR format
INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format)
{
SWR_ASSERT(format < NUM_SWR_FORMATS, "Invalid Surface Format: %d", format);
SWR_ASSERT(gFormatInfo[format].name != nullptr, "Invalid Surface Format: %d", format);
return gFormatInfo[format];
}
// lookup table for unorm8 srgb -> float conversion
extern const uint32_t srgb8Table[256];
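A small illustration of the lookup helper (the RowSizeBytes function is invented for the example; it only holds for uncompressed formats, since BC formats are sized in bcWidth x bcHeight blocks):
#include "common/formats.h"
uint32_t RowSizeBytes(SWR_FORMAT fmt, uint32_t width)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(fmt);
    return width * info.Bpp; // Bpp = bytes per pixel
}
// e.g. RowSizeBytes(R8G8B8A8_UNORM, 256) == 1024: four 8-bit components,
// so 32 bits / 4 bytes per pixel.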

View file

@ -1,120 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_INTRIN_H__
#define __SWR_INTRIN_H__
#include "os.h"
#if !defined(SIMD_ARCH)
#define SIMD_ARCH KNOB_ARCH
#endif
#include "simdlib_types.hpp"
typedef SIMDImpl::SIMD128Impl::Float simd4scalar;
typedef SIMDImpl::SIMD128Impl::Double simd4scalard;
typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector;
typedef SIMDImpl::SIMD128Impl::Mask simd4mask;
typedef SIMDImpl::SIMD256Impl::Float simd8scalar;
typedef SIMDImpl::SIMD256Impl::Double simd8scalard;
typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector;
typedef SIMDImpl::SIMD256Impl::Mask simd8mask;
typedef SIMDImpl::SIMD512Impl::Float simd16scalar;
typedef SIMDImpl::SIMD512Impl::Double simd16scalard;
typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector;
typedef SIMDImpl::SIMD512Impl::Mask simd16mask;
#if KNOB_SIMD_WIDTH == 8
typedef simd8scalar simdscalar;
typedef simd8scalard simdscalard;
typedef simd8scalari simdscalari;
typedef simd8vector simdvector;
typedef simd8mask simdmask;
#else
#error Unsupported vector width
#endif
INLINE
UINT pdep_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
return _pdep_u32(a, mask);
#else
UINT result = 0;
// copied from http://wm.ite.pl/articles/pdep-soft-emu.html
// using bsf instead of funky loop
unsigned long maskIndex = 0;
while (_BitScanForward(&maskIndex, mask))
{
// 1. isolate lowest set bit of mask
const UINT lowest = 1 << maskIndex;
// 2. populate LSB from src
const UINT LSB = (UINT)((int)(a << 31) >> 31);
// 3. copy bit from mask
result |= LSB & lowest;
// 4. clear lowest bit
mask &= ~lowest;
// 5. prepare for next iteration
a >>= 1;
}
return result;
#endif
}
INLINE
UINT pext_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
return _pext_u32(a, mask);
#else
UINT result = 0;
unsigned long maskIndex;
uint32_t currentBit = 0;
while (_BitScanForward(&maskIndex, mask))
{
// 1. isolate lowest set bit of mask
const UINT lowest = 1 << maskIndex;
// 2. copy bit from mask
result |= ((a & lowest) > 0) << currentBit++;
// 3. clear lowest bit
mask &= ~lowest;
}
return result;
#endif
}
#endif //__SWR_INTRIN_H__
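A worked example of the deposit/extract pair above (standalone sketch; on AVX2 builds these compile straight to the PDEP/PEXT instructions):
#include "common/intrin.h"
#include <cassert>
int main()
{
    // mask 0b11010 has bits 1, 3 and 4 set; pdep scatters the low bits of
    // 'a' into those positions, and pext gathers them back out.
    assert(pdep_u32(0b101u, 0b11010u) == 0b10010u); // 5 -> 18
    assert(pext_u32(0b10010u, 0b11010u) == 0b101u); // 18 -> 5
    return 0;
}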

View file

@ -1,231 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include <iostream>
#include <vector>
#include <bitset>
#include <array>
#include <string>
#include <algorithm>
// Clang for Windows does supply an intrin.h with __cpuid intrinsics; however,
// it seems not to realize that a write to "b" (ebx) will clobber the value in rbx.
// This attempts to use the "native" clang / gcc intrinsics instead of the
// Windows-compatible ones.
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#else
#include <string.h>
#if !defined(__cpuid)
#include <cpuid.h>
#endif
#endif
class InstructionSet
{
public:
InstructionSet() : CPU_Rep(){};
// getters
std::string Vendor(void) { return CPU_Rep.vendor_; }
std::string Brand(void) { return CPU_Rep.brand_; }
bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; }
bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; }
bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; }
bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; }
bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; }
bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; }
bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; }
bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; }
bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; }
bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; }
bool AES(void) { return CPU_Rep.f_1_ECX_[25]; }
bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; }
bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; }
bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; }
bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; }
bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; }
bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; }
bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; }
bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; }
bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; }
bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; }
bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; }
bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; }
bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; }
bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; }
bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; }
bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; }
bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; }
bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; }
bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; }
bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; }
bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; }
bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; }
bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; }
bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; }
bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; }
bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; }
bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; }
bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; }
bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; }
bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; }
bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; }
bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; }
bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; }
bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; }
bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; }
bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; }
bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; }
bool AVX512F(void) { return CPU_Rep.f_7_EBX_[16]; }
bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; }
bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; }
bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; }
private:
class InstructionSet_Internal
{
public:
InstructionSet_Internal() :
nIds_{0}, nExIds_{0}, isIntel_{false}, isAMD_{false}, f_1_ECX_{0}, f_1_EDX_{0},
f_7_EBX_{0}, f_7_ECX_{0}, f_81_ECX_{0}, f_81_EDX_{0}, data_{}, extdata_{}
{
// int cpuInfo[4] = {-1};
std::array<int, 4> cpui;
// Calling __cpuid with 0x0 as the function_id argument
// gets the number of the highest valid function ID.
#if defined(_MSC_VER) && !defined(__clang__)
__cpuid(cpui.data(), 0);
nIds_ = cpui[0];
#else
nIds_ = __get_cpuid_max(0, NULL);
#endif
for (int i = 0; i <= nIds_; ++i)
{
#if defined(_MSC_VER) && !defined(__clang__)
__cpuidex(cpui.data(), i, 0);
#else
int* data = cpui.data();
__cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
#endif
data_.push_back(cpui);
}
// Capture vendor string
char vendor[0x20];
memset(vendor, 0, sizeof(vendor));
*reinterpret_cast<int*>(vendor) = data_[0][1];
*reinterpret_cast<int*>(vendor + 4) = data_[0][3];
*reinterpret_cast<int*>(vendor + 8) = data_[0][2];
vendor_ = vendor;
if (vendor_ == "GenuineIntel")
{
isIntel_ = true;
}
else if (vendor_ == "AuthenticAMD")
{
isAMD_ = true;
}
// load bitset with flags for function 0x00000001
if (nIds_ >= 1)
{
f_1_ECX_ = data_[1][2];
f_1_EDX_ = data_[1][3];
}
// load bitset with flags for function 0x00000007
if (nIds_ >= 7)
{
f_7_EBX_ = data_[7][1];
f_7_ECX_ = data_[7][2];
}
// Calling __cpuid with 0x80000000 as the function_id argument
// gets the number of the highest valid extended ID.
#if defined(_MSC_VER) && !defined(__clang__)
__cpuid(cpui.data(), 0x80000000);
nExIds_ = cpui[0];
#else
nExIds_ = __get_cpuid_max(0x80000000, NULL);
#endif
char brand[0x40];
memset(brand, 0, sizeof(brand));
for (unsigned i = 0x80000000; i <= nExIds_; ++i)
{
#if defined(_MSC_VER) && !defined(__clang__)
__cpuidex(cpui.data(), i, 0);
#else
int* data = cpui.data();
__cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
#endif
extdata_.push_back(cpui);
}
// load bitset with flags for function 0x80000001
if (nExIds_ >= 0x80000001)
{
f_81_ECX_ = extdata_[1][2];
f_81_EDX_ = extdata_[1][3];
}
// Interpret CPU brand string if reported
if (nExIds_ >= 0x80000004)
{
memcpy(brand, extdata_[2].data(), sizeof(cpui));
memcpy(brand + 16, extdata_[3].data(), sizeof(cpui));
memcpy(brand + 32, extdata_[4].data(), sizeof(cpui));
brand_ = brand;
}
};
int nIds_;
unsigned nExIds_;
std::string vendor_;
std::string brand_;
bool isIntel_;
bool isAMD_;
std::bitset<32> f_1_ECX_;
std::bitset<32> f_1_EDX_;
std::bitset<32> f_7_EBX_;
std::bitset<32> f_7_ECX_;
std::bitset<32> f_81_ECX_;
std::bitset<32> f_81_EDX_;
std::vector<std::array<int, 4>> data_;
std::vector<std::array<int, 4>> extdata_;
};
const InstructionSet_Internal CPU_Rep;
};
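Typical usage is a single instance queried for feature bits (a minimal sketch, not taken from the driver):
#include <iostream>
int main()
{
    InstructionSet cpu;
    std::cout << cpu.Vendor() << " / " << cpu.Brand() << "\n";
    if (cpu.AVX2())
        std::cout << "AVX2 supported\n";
    return 0;
}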

View file

@ -1,314 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#include "common/os.h"
#include <vector>
#include <array>
#include <sstream>
#if defined(_WIN32)
#include <shlobj.h>
#endif // Windows
#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
#include <pthread.h>
#endif // Linux
#if defined(_MSC_VER)
static const DWORD MS_VC_EXCEPTION = 0x406D1388;
#pragma pack(push, 8)
typedef struct tagTHREADNAME_INFO
{
DWORD dwType; // Must be 0x1000.
LPCSTR szName; // Pointer to name (in user addr space).
DWORD dwThreadID; // Thread ID (-1=caller thread).
DWORD dwFlags; // Reserved for future use, must be zero.
} THREADNAME_INFO;
#pragma pack(pop)
void LegacySetThreadName(const char* pThreadName)
{
THREADNAME_INFO info;
info.dwType = 0x1000;
info.szName = pThreadName;
info.dwThreadID = GetCurrentThreadId();
info.dwFlags = 0;
if (!IsDebuggerPresent())
{
// No debugger attached to interpret the exception; no need to raise it
return;
}
#pragma warning(push)
#pragma warning(disable : 6320 6322)
__try
{
RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info);
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
}
#pragma warning(pop)
}
#endif // _WIN32
void SWR_API SetCurrentThreadName(const char* pThreadName)
{
#if defined(_MSC_VER)
// The SetThreadDescription API was introduced in version 1607 of Windows 10.
typedef HRESULT(WINAPI * PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription);
// The SetThreadDescription API works even if no debugger is attached.
auto pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription"));
if (!pfnSetThreadDescription)
{
// try KernelBase.dll
pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription"));
}
if (pfnSetThreadDescription)
{
std::string utf8Name = pThreadName;
std::wstring wideName;
wideName.resize(utf8Name.size() + 1);
swprintf_s(&(wideName.front()), wideName.size(), L"%S", utf8Name.c_str());
HRESULT hr = pfnSetThreadDescription(GetCurrentThread(), wideName.c_str());
SWR_ASSERT(SUCCEEDED(hr), "Failed to set thread name to %s", pThreadName);
// Fall through - some debuggers only recognize the exception-based method below
}
// Fall back to the exception-based hack
LegacySetThreadName(pThreadName);
#endif // _WIN32
#if defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
pthread_setname_np(pthread_self(), pThreadName);
#endif // Linux
}
#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
static void
SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken)
{
out_segments.clear();
std::istringstream f(input);
std::string s;
while (std::getline(f, s, splitToken))
{
if (s.size())
{
out_segments.push_back(s);
}
}
}
#endif // Unix
void SWR_API CreateDirectoryPath(const std::string& path)
{
#if defined(_WIN32)
SHCreateDirectoryExA(nullptr, path.c_str(), nullptr);
#endif // Windows
#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
std::vector<std::string> pathSegments;
SplitString(pathSegments, path, '/');
std::string tmpPath;
for (auto const& segment : pathSegments)
{
tmpPath.push_back('/');
tmpPath += segment;
int result = mkdir(tmpPath.c_str(), 0777);
if (result == -1 && errno != EEXIST)
{
break;
}
}
#endif // Unix
}
/// Execute Command (block until finished)
/// @returns process exit value
int SWR_API ExecCmd(const std::string& cmd, ///< (In) Command line string
const char* pOptEnvStrings, ///< (Optional In) Environment block for new process
std::string* pOptStdOut, ///< (Optional Out) Standard Output text
std::string* pOptStdErr, ///< (Optional Out) Standard Error text
const std::string* pOptStdIn) ///< (Optional In) Standard Input text
{
int rvalue = -1;
#if defined(_WIN32)
struct WinPipe
{
HANDLE hRead;
HANDLE hWrite;
};
std::array<WinPipe, 3> hPipes = {};
SECURITY_ATTRIBUTES saAttr = {sizeof(SECURITY_ATTRIBUTES)};
saAttr.bInheritHandle = TRUE; // Pipe handles are inherited by child process.
saAttr.lpSecurityDescriptor = NULL;
{
bool bFail = false;
for (WinPipe& p : hPipes)
{
if (!CreatePipe(&p.hRead, &p.hWrite, &saAttr, 0))
{
bFail = true;
}
}
if (bFail)
{
for (WinPipe& p : hPipes)
{
CloseHandle(p.hRead);
CloseHandle(p.hWrite);
}
return rvalue;
}
}
STARTUPINFOA StartupInfo{};
StartupInfo.cb = sizeof(STARTUPINFOA);
StartupInfo.dwFlags = STARTF_USESTDHANDLES;
StartupInfo.dwFlags |= STARTF_USESHOWWINDOW;
StartupInfo.wShowWindow = SW_HIDE;
if (pOptStdIn)
{
StartupInfo.hStdInput = hPipes[0].hRead;
}
StartupInfo.hStdOutput = hPipes[1].hWrite;
StartupInfo.hStdError = hPipes[2].hWrite;
PROCESS_INFORMATION procInfo{};
// CreateProcess can modify the string
std::string local_cmd = cmd;
BOOL ProcessValue = CreateProcessA(NULL,
(LPSTR)local_cmd.c_str(),
NULL,
NULL,
TRUE,
0,
(LPVOID)pOptEnvStrings,
NULL,
&StartupInfo,
&procInfo);
if (ProcessValue && procInfo.hProcess)
{
auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr) {
char buf[1024];
DWORD dwRead = 0;
DWORD dwAvail = 0;
while (true)
{
if (!::PeekNamedPipe(hPipe, NULL, 0, NULL, &dwAvail, NULL))
{
break;
}
if (!dwAvail) // no data available, return
{
break;
}
if (!::ReadFile(hPipe,
buf,
std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)),
&dwRead,
NULL) ||
!dwRead)
{
// error - the child process might have ended
break;
}
buf[dwRead] = 0;
if (pOutStr)
{
(*pOutStr) += buf;
}
}
};
bool bProcessEnded = false;
size_t bytesWritten = 0;
do
{
if (pOptStdIn && (pOptStdIn->size() > bytesWritten))
{
DWORD bytesToWrite = static_cast<DWORD>(pOptStdIn->size()) - bytesWritten;
if (!::WriteFile(hPipes[0].hWrite,
pOptStdIn->data() + bytesWritten,
bytesToWrite,
&bytesToWrite,
nullptr))
{
// Failed to write to pipe
break;
}
bytesWritten += bytesToWrite;
}
// Yield a small timeslice (50ms) so we don't spin at 100% CPU.
bProcessEnded = (WaitForSingleObject(procInfo.hProcess, 50) == WAIT_OBJECT_0);
ReadFromPipe(hPipes[1].hRead, pOptStdOut);
ReadFromPipe(hPipes[2].hRead, pOptStdErr);
} while (!bProcessEnded);
DWORD exitVal = 0;
if (!GetExitCodeProcess(procInfo.hProcess, &exitVal))
{
exitVal = 1;
}
CloseHandle(procInfo.hProcess);
CloseHandle(procInfo.hThread);
rvalue = exitVal;
}
for (WinPipe& p : hPipes)
{
CloseHandle(p.hRead);
CloseHandle(p.hWrite);
}
#else
// Non-Windows implementation not provided; rvalue stays -1
#endif
return rvalue;
}
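A hedged usage sketch for ExecCmd (the non-Windows branch above is a stub that leaves rvalue at -1, so meaningful output requires Windows; the command string is illustrative):
std::string out, err;
int exitCode = ExecCmd("cmd /c echo hello", nullptr, &out, &err); // illustrative command
if (exitCode == 0)
    printf("child wrote: %s", out.c_str());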

View file

@ -1,365 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_OS_H__
#define __SWR_OS_H__
#include <cstddef>
#include "core/knobs.h"
#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
#define SWR_API __cdecl
#define SWR_VISIBLE __declspec(dllexport)
#ifndef NOMINMAX
#undef UNICODE
#define NOMINMAX
#include <windows.h>
#undef NOMINMAX
#define UNICODE
#else
#undef UNICODE
#include <windows.h>
#define UNICODE
#endif
#include <intrin.h>
#include <cstdint>
#if defined(MemoryFence)
// Windows.h defines MemoryFence as _mm_mfence, but this conflicts with llvm::sys::MemoryFence
#undef MemoryFence
#endif
#if defined(_MSC_VER)
#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD
#elif defined(__GNUC__)
#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
#endif
#if defined(_DEBUG)
// We compile Debug builds with inline function expansion enabled. This allows
// functions compiled with __forceinline to be inlined even in Debug builds.
// The inline_depth(0) pragma below will disable inline function expansion for
// normal INLINE / inline functions, but not for __forceinline functions.
// Our SIMD function wrappers (see simdlib.hpp) use __forceinline even in
// Debug builds.
#define INLINE inline
#pragma inline_depth(0)
#else
// Use of __forceinline increases compile time dramatically in release builds
// and provides almost 0 measurable benefit. Disable until we have a compelling
// use-case
// #define INLINE __forceinline
#define INLINE inline
#endif
#ifndef FORCEINLINE
#define FORCEINLINE __forceinline
#endif
#define DEBUGBREAK __debugbreak()
#define PRAGMA_WARNING_PUSH_DISABLE(...) \
__pragma(warning(push)); \
__pragma(warning(disable : __VA_ARGS__));
#define PRAGMA_WARNING_POP() __pragma(warning(pop))
static inline void* AlignedMalloc(size_t _Size, size_t _Alignment)
{
return _aligned_malloc(_Size, _Alignment);
}
static inline void AlignedFree(void* p)
{
return _aligned_free(p);
}
#if defined(_WIN64)
#define BitScanReverseSizeT BitScanReverse64
#define BitScanForwardSizeT BitScanForward64
#define _mm_popcount_sizeT _mm_popcnt_u64
#else
#define BitScanReverseSizeT BitScanReverse
#define BitScanForwardSizeT BitScanForward
#define _mm_popcount_sizeT _mm_popcnt_u32
#endif
#if !defined(_WIN64)
extern "C" {
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
if (Mask == 0)
return 0;
#ifdef __GNUC__
*Index = __builtin_ctzll(Mask);
#else
*Index = 0;
for (int i = 0; i < 64; ++i)
{
    if ((1ULL << i) & Mask)
    {
        *Index = i;
        break; // forward scan: report the lowest set bit
    }
}
#endif
return 1;
}
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{
if (Mask == 0)
return 0;
#ifdef __GNUC__
*Index = 63 - __builtin_clzll(Mask);
#else
*Index = 0;
for (int i = 63; i >= 0; --i)
{
    if ((1ULL << i) & Mask)
    {
        *Index = i;
        break; // reverse scan: report the highest set bit
    }
}
#endif
return 1;
}
}
#endif
#elif defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
#define SWR_API
#define SWR_VISIBLE __attribute__((visibility("default")))
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/stat.h>
#include <stdio.h>
#include <limits.h>
typedef void VOID;
typedef void* LPVOID;
typedef int INT;
typedef unsigned int UINT;
typedef void* HANDLE;
typedef int LONG;
typedef unsigned int DWORD;
#undef FALSE
#define FALSE 0
#undef TRUE
#define TRUE 1
#define MAX_PATH PATH_MAX
#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
#ifndef INLINE
#define INLINE __inline
#endif
#ifndef FORCEINLINE
#define FORCEINLINE INLINE
#endif
#define DEBUGBREAK asm("int $3")
#if !defined(__CYGWIN__)
#ifndef __cdecl
#define __cdecl
#endif
#ifndef __stdcall
#define __stdcall
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#define __declspec(x) __declspec_##x
#define __declspec_align(y) __attribute__((aligned(y)))
#define __declspec_deprecated __attribute__((deprecated))
#define __declspec_dllexport
#define __declspec_dllimport
#define __declspec_noinline __attribute__((__noinline__))
#define __declspec_nothrow __attribute__((nothrow))
#define __declspec_novtable
#define __declspec_thread __thread
#else
#define __declspec(X)
#endif
#endif
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500)
inline uint64_t __rdtsc()
{
long low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return (low | ((uint64_t)high << 32));
}
#endif
#if !defined(__clang__) && !defined(__INTEL_COMPILER)
// Intrinsic not defined in gcc < 10
#if (__GNUC__) && (GCC_VERSION < 100000)
static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
{
_mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a));
_mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1));
}
#endif
// gcc prior to 4.9 doesn't have _mm*_undefined_*
#if (__GNUC__) && (GCC_VERSION < 40900)
#define _mm_undefined_si128 _mm_setzero_si128
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
if (Mask == 0)
return 0;
*Index = __builtin_ctzll(Mask);
return 1;
}
inline unsigned char _BitScanForward(unsigned long* Index, uint32_t Mask)
{
if (Mask == 0)
return 0;
*Index = __builtin_ctz(Mask);
return 1;
}
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{
if (Mask == 0)
return 0;
*Index = 63 - __builtin_clzll(Mask);
return 1;
}
inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask)
{
if (Mask == 0)
return 0;
*Index = 31 - __builtin_clz(Mask);
return 1;
}
inline void* AlignedMalloc(size_t size, size_t alignment)
{
void* ret;
if (posix_memalign(&ret, alignment, size))
{
return NULL;
}
return ret;
}
static inline void AlignedFree(void* p)
{
free(p);
}
#define _countof(a) (sizeof(a) / sizeof(*(a)))
#define sprintf_s sprintf
#define strcpy_s(dst, size, src) strncpy(dst, src, size)
#define GetCurrentProcessId getpid
#define InterlockedCompareExchange(Dest, Exchange, Comparand) \
__sync_val_compare_and_swap(Dest, Comparand, Exchange)
#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
#define InterlockedAdd(Addend, Value) __sync_add_and_fetch(Addend, Value)
#define InterlockedAdd64(Addend, Value) __sync_add_and_fetch(Addend, Value)
#define _ReadWriteBarrier() asm volatile("" ::: "memory")
#define PRAGMA_WARNING_PUSH_DISABLE(...)
#define PRAGMA_WARNING_POP()
#define ZeroMemory(dst, size) memset(dst, 0, size)
#else
#error Unsupported OS/system.
#endif
#define THREAD thread_local
// Universal types
typedef uint8_t KILOBYTE[1024];
typedef KILOBYTE MEGABYTE[1024];
typedef MEGABYTE GIGABYTE[1024];
#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
#define OSALIGNSIMD16(RWORD) OSALIGN(RWORD, KNOB_SIMD16_BYTES)
#include "common/swr_assert.h"
#ifdef __GNUC__
#define ATTR_UNUSED __attribute__((unused))
#else
#define ATTR_UNUSED
#endif
#define SWR_FUNC(_retType, _funcName, /* args */...) \
typedef _retType(SWR_API* PFN##_funcName)(__VA_ARGS__); \
_retType SWR_API _funcName(__VA_ARGS__);
// Defined in os.cpp
void SWR_API SetCurrentThreadName(const char* pThreadName);
void SWR_API CreateDirectoryPath(const std::string& path);
/// Execute Command (block until finished)
/// @returns process exit value
int SWR_API
ExecCmd(const std::string& cmd, ///< (In) Command line string
const char* pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process
std::string* pOptStdOut = nullptr, ///< (Optional Out) Standard Output text
std::string* pOptStdErr = nullptr, ///< (Optional Out) Standard Error text
const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text
/// Helper for setting up FP state
/// @returns old csr state
static INLINE uint32_t SetOptimalVectorCSR()
{
uint32_t oldCSR = _mm_getcsr();
uint32_t newCSR = (oldCSR & ~(_MM_ROUND_MASK | _MM_DENORMALS_ZERO_MASK | _MM_FLUSH_ZERO_MASK));
newCSR |= (_MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
_mm_setcsr(newCSR);
return oldCSR;
}
/// Set Vector CSR state.
/// @param csrState - should be value returned from SetOptimalVectorCSR()
static INLINE void RestoreVectorCSR(uint32_t csrState)
{
_mm_setcsr(csrState);
}
#endif //__SWR_OS_H__
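A sketch of the intended save/modify/restore pattern for the MXCSR helpers above (the worker-function name is illustrative):
static void RasterWorkerBody()
{
    uint32_t savedCSR = SetOptimalVectorCSR(); // round-to-nearest, FTZ and DAZ enabled
    // ... SIMD-heavy work that benefits from flush-to-zero ...
    RestoreVectorCSR(savedCSR); // restore the caller's FP environment
}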

View file

@ -1,192 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rdtsc_buckets.cpp
*
* @brief implementation of rdtsc buckets.
*
* Notes:
*
******************************************************************************/
#include "rdtsc_buckets.h"
#include <inttypes.h>
#if defined(_WIN32)
#define PATH_SEPARATOR "\\"
#elif defined(__unix__) || defined(__APPLE__)
#define PATH_SEPARATOR "/"
#else
#error "Unsupported platform"
#endif
THREAD UINT tlsThreadId = 0;
BucketManager::~BucketManager()
{
}
void BucketManager::RegisterThread(const std::string& name)
{
BUCKET_THREAD newThread;
newThread.name = name;
newThread.root.children.reserve(mBuckets.size());
newThread.root.id = 0;
newThread.root.pParent = nullptr;
newThread.pCurrent = &newThread.root;
mThreadMutex.lock();
// assign unique thread id for this thread
size_t id = mThreads.size();
newThread.id = (UINT)id;
tlsThreadId = (UINT)id;
// store new thread
mThreads.push_back(newThread);
mThreadMutex.unlock();
}
UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
{
mThreadMutex.lock();
size_t id = mBuckets.size();
mBuckets.push_back(desc);
mThreadMutex.unlock();
return (UINT)id;
}
void BucketManager::PrintBucket(
FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
{
const char* arrows[] = {
"",
"|-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> ",
};
// compute percent of total cycles used by this bucket
float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0);
// compute percent of parent cycles used by this bucket
float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
// compute average cycle count per invocation
uint64_t CPE = bucket.elapsed / bucket.count;
BUCKET_DESC& desc = mBuckets[bucket.id];
// construct hierarchy visualization
std::string str = arrows[level];
str += desc.name;
char hier[80];
strcpy_s(hier, sizeof(hier)-1, str.c_str());
// print out
fprintf(f,
"%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n",
percentTotal,
percentParent,
bucket.elapsed,
CPE,
bucket.count,
(unsigned long)0,
(uint32_t)0,
hier);
// dump all children of this bucket
for (const BUCKET& child : bucket.children)
{
if (child.count)
{
PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child);
}
}
}
void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
{
// print header
fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str());
fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n");
// compute thread level total cycle counts across all buckets from root
const BUCKET& root = thread.root;
uint64_t totalCycles = 0;
for (const BUCKET& child : root.children)
{
totalCycles += child.elapsed;
}
for (const BUCKET& child : root.children)
{
if (child.count)
{
PrintBucket(f, 0, totalCycles, totalCycles, child);
}
}
}
void BucketManager::PrintReport(const std::string& filename)
{
{
FILE* f = fopen(filename.c_str(), "w");
assert(f);
mThreadMutex.lock();
for (const BUCKET_THREAD& thread : mThreads)
{
PrintThread(f, thread);
fprintf(f, "\n");
}
mThreadMutex.unlock();
fclose(f);
}
}
void BucketManager::StartCapture()
{
printf("Capture Starting\n");
mCapturing = true;
}
void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id)
{
pBucketMgr->StartBucket(id);
}
void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id)
{
pBucketMgr->StopBucket(id);
}

View file

@ -1,227 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rdtsc_buckets.h
*
* @brief declaration for rdtsc buckets.
*
* Notes:
*
******************************************************************************/
#pragma once
#include "os.h"
#include <vector>
#include <mutex>
#include <sstream>
#include "rdtsc_buckets_shared.h"
// unique thread id stored in thread local storage
extern THREAD UINT tlsThreadId;
//////////////////////////////////////////////////////////////////////////
/// @brief BucketManager encapsulates a single instance of the buckets
/// functionality. There can be one or many bucket managers active
/// at any time. The manager owns all the threads and
/// bucket information that have been registered to it.
class BucketManager
{
public:
uint32_t mCurrentFrame;
std::vector<uint32_t> mBucketMap;
bool mBucketsInitialized;
std::string mBucketMgrName;
BucketManager(std::string name) : mCurrentFrame(0), mBucketsInitialized(false), mBucketMgrName(name)
{
mBucketMap.clear();
}
~BucketManager();
// removes all registered thread data
void ClearThreads()
{
mThreadMutex.lock();
mThreads.clear();
mThreadMutex.unlock();
}
// removes all registered buckets
void ClearBuckets()
{
mThreadMutex.lock();
mBuckets.clear();
mThreadMutex.unlock();
}
/// Registers a new thread with the manager.
/// @param name - name of thread, used for labels in reports and threadviz
void RegisterThread(const std::string& name);
/// Registers a new bucket type with the manager. Returns a unique
/// id which should be used in subsequent calls to start/stop the bucket
/// @param desc - description of the bucket
/// @return unique id
UINT RegisterBucket(const BUCKET_DESC& desc);
// print report
void PrintReport(const std::string& filename);
// start capturing
void StartCapture();
// stop capturing
INLINE void StopCapture()
{
mCapturing = false;
// wait for all threads to pop back to root bucket
bool stillCapturing = true;
while (stillCapturing)
{
stillCapturing = false;
for (const BUCKET_THREAD& t : mThreads)
{
if (t.level > 0)
{
stillCapturing = true;
continue;
}
}
}
mDoneCapturing = true;
printf("Capture Stopped\n");
}
// start a bucket
// @param id generated by RegisterBucket
INLINE void StartBucket(UINT id)
{
if (!mCapturing)
return;
SWR_ASSERT(tlsThreadId < mThreads.size());
BUCKET_THREAD& bt = mThreads[tlsThreadId];
uint64_t tsc = __rdtsc();
{
if (bt.pCurrent->children.size() < mBuckets.size())
{
bt.pCurrent->children.resize(mBuckets.size());
}
BUCKET& child = bt.pCurrent->children[id];
child.pParent = bt.pCurrent;
child.id = id;
child.start = tsc;
// update thread's currently executing bucket
bt.pCurrent = &child;
}
bt.level++;
}
// stop the currently executing bucket
INLINE void StopBucket(UINT id)
{
SWR_ASSERT(tlsThreadId < mThreads.size());
BUCKET_THREAD& bt = mThreads[tlsThreadId];
if (bt.level == 0)
{
return;
}
uint64_t tsc = __rdtsc();
{
if (bt.pCurrent->start == 0)
return;
SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected");
bt.pCurrent->elapsed += (tsc - bt.pCurrent->start);
bt.pCurrent->count++;
// pop to parent
bt.pCurrent = bt.pCurrent->pParent;
}
bt.level--;
}
INLINE void AddEvent(uint32_t id, uint32_t count)
{
if (!mCapturing)
return;
SWR_ASSERT(tlsThreadId < mThreads.size());
BUCKET_THREAD& bt = mThreads[tlsThreadId];
// don't record events for threadviz
{
if (bt.pCurrent->children.size() < mBuckets.size())
{
bt.pCurrent->children.resize(mBuckets.size());
}
BUCKET& child = bt.pCurrent->children[id];
child.pParent = bt.pCurrent;
child.id = id;
child.count += count;
}
}
private:
void PrintBucket(
FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
void PrintThread(FILE* f, const BUCKET_THREAD& thread);
// list of active threads that have registered with this manager
std::vector<BUCKET_THREAD> mThreads;
// list of buckets registered with this manager
std::vector<BUCKET_DESC> mBuckets;
// is capturing currently enabled
volatile bool mCapturing{false};
// has capturing completed
volatile bool mDoneCapturing{false};
std::mutex mThreadMutex;
std::string mThreadVizDir;
};
// C helpers for jitter
void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id);
void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id);
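A minimal usage sketch for the BucketManager API above (bucket, thread, and file names are illustrative; with a single registered thread, StopCapture's wait loop exits immediately):
BucketManager mgr("example");
mgr.RegisterThread("main");
UINT frame = mgr.RegisterBucket({"Frame", "whole frame", false, 0xffffffffu});
mgr.StartCapture();
mgr.StartBucket(frame);
// ... timed work ...
mgr.StopBucket(frame);
mgr.StopCapture();
mgr.PrintReport("buckets.txt");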

View file

@ -1,169 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
 * @file rdtsc_buckets_shared.h
 *
 * @brief shared structures for rdtsc buckets.
*
* Notes:
*
******************************************************************************/
#pragma once
#include <vector>
#include <cassert>
struct BUCKET
{
uint32_t id{0};
uint64_t start{0};
uint64_t elapsed{0};
uint32_t count{0};
BUCKET* pParent{nullptr};
std::vector<BUCKET> children;
};
struct BUCKET_DESC
{
// name of bucket, used in reports
std::string name;
// description of bucket, used in threadviz
std::string description;
// enable for threadviz dumping
bool enableThreadViz;
// threadviz color of bucket, in RGBA8_UNORM format
uint32_t color;
};
struct BUCKET_THREAD
{
// name of thread, used in reports
std::string name;
// id for this thread, assigned by the thread manager
uint32_t id{0};
// root of the bucket hierarchy for this thread
BUCKET root;
// currently executing bucket somewhere in the hierarchy
BUCKET* pCurrent{nullptr};
// currently executing hierarchy level
uint32_t level{0};
// threadviz file object
FILE* vizFile{nullptr};
BUCKET_THREAD() {}
BUCKET_THREAD(const BUCKET_THREAD& that)
{
name = that.name;
id = that.id;
root = that.root;
pCurrent = &root;
vizFile = that.vizFile;
}
};
enum VIZ_TYPE
{
VIZ_START = 0,
VIZ_STOP = 1,
VIZ_DATA = 2
};
struct VIZ_START_DATA
{
uint8_t type;
uint32_t bucketId;
uint64_t timestamp;
};
struct VIZ_STOP_DATA
{
uint8_t type;
uint64_t timestamp;
};
inline void Serialize(FILE* f, const VIZ_START_DATA& data)
{
fwrite(&data, sizeof(VIZ_START_DATA), 1, f);
}
inline void Deserialize(FILE* f, VIZ_START_DATA& data)
{
fread(&data, sizeof(VIZ_START_DATA), 1, f);
assert(data.type == VIZ_START);
}
inline void Serialize(FILE* f, const VIZ_STOP_DATA& data)
{
fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f);
}
inline void Deserialize(FILE* f, VIZ_STOP_DATA& data)
{
fread(&data, sizeof(VIZ_STOP_DATA), 1, f);
assert(data.type == VIZ_STOP);
}
inline void Serialize(FILE* f, const std::string& string)
{
assert(string.size() < 256); // length is stored as uint8_t, so 256 would wrap to 0
uint8_t length = (uint8_t)string.size();
fwrite(&length, sizeof(length), 1, f);
fwrite(string.c_str(), string.size(), 1, f);
}
inline void Deserialize(FILE* f, std::string& string)
{
char cstr[256];
uint8_t length;
fread(&length, sizeof(length), 1, f);
fread(cstr, length, 1, f);
cstr[length] = 0;
string.assign(cstr);
}
inline void Serialize(FILE* f, const BUCKET_DESC& desc)
{
Serialize(f, desc.name);
Serialize(f, desc.description);
fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
fwrite(&desc.color, sizeof(desc.color), 1, f);
}
inline void Deserialize(FILE* f, BUCKET_DESC& desc)
{
Deserialize(f, desc.name);
Deserialize(f, desc.description);
fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
fread(&desc.color, sizeof(desc.color), 1, f);
}
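A round-trip sketch for the Serialize/Deserialize pair above (the file name is illustrative; error handling is omitted, as in the originals):
BUCKET_DESC in{"Raster", "rasterizer bucket", false, 0xff00ff00u};
FILE* f = fopen("desc.bin", "wb");
Serialize(f, in);
fclose(f);
BUCKET_DESC out;
f = fopen("desc.bin", "rb");
Deserialize(f, out);
fclose(f);
assert(out.name == in.name && out.color == in.color);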

View file

@ -1,168 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__
#if KNOB_SIMD16_WIDTH == 16
typedef SIMD512 SIMD16;
#else
#error Unsupported vector width
#endif // KNOB_SIMD16_WIDTH == 16
#define _simd16_setzero_ps SIMD16::setzero_ps
#define _simd16_setzero_si SIMD16::setzero_si
#define _simd16_set1_ps SIMD16::set1_ps
#define _simd16_set1_epi8 SIMD16::set1_epi8
#define _simd16_set1_epi32 SIMD16::set1_epi32
#define _simd16_set_ps SIMD16::set_ps
#define _simd16_set_epi32 SIMD16::set_epi32
#define _simd16_load_ps SIMD16::load_ps
#define _simd16_loadu_ps SIMD16::loadu_ps
#if 1
#define _simd16_load1_ps SIMD16::broadcast_ss
#endif
#define _simd16_load_si SIMD16::load_si
#define _simd16_loadu_si SIMD16::loadu_si
#define _simd16_broadcast_ss(m) SIMD16::broadcast_ss((float const*)m)
#define _simd16_store_ps SIMD16::store_ps
#define _simd16_store_si SIMD16::store_si
#define _simd16_extract_ps(a, imm8) SIMD16::extract_ps<imm8>(a)
#define _simd16_extract_si(a, imm8) SIMD16::extract_si<imm8>(a)
#define _simd16_insert_ps(a, b, imm8) SIMD16::insert_ps<imm8>(a, b)
#define _simd16_insert_si(a, b, imm8) SIMD16::insert_si<imm8>(a, b)
#define _simd16_maskstore_ps SIMD16::maskstore_ps
#define _simd16_blend_ps(a, b, mask) SIMD16::blend_ps<mask>(a, b)
#define _simd16_blendv_ps SIMD16::blendv_ps
#define _simd16_blendv_epi32 SIMD16::blendv_epi32
#define _simd16_mul_ps SIMD16::mul_ps
#define _simd16_div_ps SIMD16::div_ps
#define _simd16_add_ps SIMD16::add_ps
#define _simd16_sub_ps SIMD16::sub_ps
#define _simd16_rsqrt_ps SIMD16::rsqrt_ps
#define _simd16_min_ps SIMD16::min_ps
#define _simd16_max_ps SIMD16::max_ps
#define _simd16_movemask_ps SIMD16::movemask_ps
#define _simd16_movemask_pd SIMD16::movemask_pd
#define _simd16_cvtps_epi32 SIMD16::cvtps_epi32
#define _simd16_cvttps_epi32 SIMD16::cvttps_epi32
#define _simd16_cvtepi32_ps SIMD16::cvtepi32_ps
#define _simd16_cmp_ps(a, b, comp) SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
#define _simd16_cmplt_ps SIMD16::cmplt_ps
#define _simd16_cmpgt_ps SIMD16::cmpgt_ps
#define _simd16_cmpneq_ps SIMD16::cmpneq_ps
#define _simd16_cmpeq_ps SIMD16::cmpeq_ps
#define _simd16_cmpge_ps SIMD16::cmpge_ps
#define _simd16_cmple_ps SIMD16::cmple_ps
#define _simd16_castsi_ps SIMD16::castsi_ps
#define _simd16_castps_si SIMD16::castps_si
#define _simd16_castsi_pd SIMD16::castsi_pd
#define _simd16_castpd_si SIMD16::castpd_si
#define _simd16_castpd_ps SIMD16::castpd_ps
#define _simd16_castps_pd SIMD16::castps_pd
#define _simd16_and_ps SIMD16::and_ps
#define _simd16_andnot_ps SIMD16::andnot_ps
#define _simd16_or_ps SIMD16::or_ps
#define _simd16_xor_ps SIMD16::xor_ps
#define _simd16_round_ps(a, mode) SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
#define _simd16_mul_epi32 SIMD16::mul_epi32
#define _simd16_mullo_epi32 SIMD16::mullo_epi32
#define _simd16_sub_epi32 SIMD16::sub_epi32
#define _simd16_sub_epi64 SIMD16::sub_epi64
#define _simd16_min_epi32 SIMD16::min_epi32
#define _simd16_max_epi32 SIMD16::max_epi32
#define _simd16_min_epu32 SIMD16::min_epu32
#define _simd16_max_epu32 SIMD16::max_epu32
#define _simd16_add_epi32 SIMD16::add_epi32
#define _simd16_and_si SIMD16::and_si
#define _simd16_andnot_si SIMD16::andnot_si
#define _simd16_or_si SIMD16::or_si
#define _simd16_xor_si SIMD16::xor_si
#define _simd16_cmpeq_epi32 SIMD16::cmpeq_epi32
#define _simd16_cmpgt_epi32 SIMD16::cmpgt_epi32
#define _simd16_cmplt_epi32 SIMD16::cmplt_epi32
#define _simd16_testz_ps SIMD16::testz_ps
#define _simd16_unpacklo_ps SIMD16::unpacklo_ps
#define _simd16_unpackhi_ps SIMD16::unpackhi_ps
#define _simd16_unpacklo_pd SIMD16::unpacklo_pd
#define _simd16_unpackhi_pd SIMD16::unpackhi_pd
#define _simd16_unpacklo_epi8 SIMD16::unpacklo_epi8
#define _simd16_unpackhi_epi8 SIMD16::unpackhi_epi8
#define _simd16_unpacklo_epi16 SIMD16::unpacklo_epi16
#define _simd16_unpackhi_epi16 SIMD16::unpackhi_epi16
#define _simd16_unpacklo_epi32 SIMD16::unpacklo_epi32
#define _simd16_unpackhi_epi32 SIMD16::unpackhi_epi32
#define _simd16_unpacklo_epi64 SIMD16::unpacklo_epi64
#define _simd16_unpackhi_epi64 SIMD16::unpackhi_epi64
#define _simd16_slli_epi32(a, i) SIMD16::slli_epi32<i>(a)
#define _simd16_srli_epi32(a, i) SIMD16::srli_epi32<i>(a)
#define _simd16_srai_epi32(a, i) SIMD16::srai_epi32<i>(a)
#define _simd16_fmadd_ps SIMD16::fmadd_ps
#define _simd16_fmsub_ps SIMD16::fmsub_ps
#define _simd16_adds_epu8 SIMD16::adds_epu8
#define _simd16_subs_epu8 SIMD16::subs_epu8
#define _simd16_add_epi8 SIMD16::add_epi8
#define _simd16_shuffle_epi8 SIMD16::shuffle_epi8
#define _simd16_i32gather_ps(m, index, scale) \
SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index)
#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) \
SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
#define _simd16_abs_epi32 SIMD16::abs_epi32
#define _simd16_cmpeq_epi64 SIMD16::cmpeq_epi64
#define _simd16_cmpgt_epi64 SIMD16::cmpgt_epi64
#define _simd16_cmpeq_epi16 SIMD16::cmpeq_epi16
#define _simd16_cmpgt_epi16 SIMD16::cmpgt_epi16
#define _simd16_cmpeq_epi8 SIMD16::cmpeq_epi8
#define _simd16_cmpgt_epi8 SIMD16::cmpgt_epi8
#define _simd16_permute_ps_i(a, i) SIMD16::permute_ps<i>(a)
#define _simd16_permute_ps SIMD16::permute_ps
#define _simd16_permute_epi32 SIMD16::permute_epi32
#define _simd16_sllv_epi32 SIMD16::sllv_epi32
#define _simd16_srlv_epi32 SIMD16::srlv_epi32
#define _simd16_permute2f128_ps(a, b, i) SIMD16::permute2f128_ps<i>(a, b)
#define _simd16_permute2f128_pd(a, b, i) SIMD16::permute2f128_pd<i>(a, b)
#define _simd16_permute2f128_si(a, b, i) SIMD16::permute2f128_si<i>(a, b)
#define _simd16_shuffle_ps(a, b, i) SIMD16::shuffle_ps<i>(a, b)
#define _simd16_shuffle_pd(a, b, i) SIMD16::shuffle_pd<i>(a, b)
#define _simd16_shuffle_epi32(a, b, imm8) SIMD16::shuffle_epi32<imm8>(a, b)
#define _simd16_shuffle_epi64(a, b, imm8) SIMD16::shuffle_epi64<imm8>(a, b)
#define _simd16_cvtepu8_epi16 SIMD16::cvtepu8_epi16
#define _simd16_cvtepu8_epi32 SIMD16::cvtepu8_epi32
#define _simd16_cvtepu16_epi32 SIMD16::cvtepu16_epi32
#define _simd16_cvtepu16_epi64 SIMD16::cvtepu16_epi64
#define _simd16_cvtepu32_epi64 SIMD16::cvtepu32_epi64
#define _simd16_packus_epi16 SIMD16::packus_epi16
#define _simd16_packs_epi16 SIMD16::packs_epi16
#define _simd16_packus_epi32 SIMD16::packus_epi32
#define _simd16_packs_epi32 SIMD16::packs_epi32
#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>
#define _simd16_int2mask(mask) simd16mask(mask)
#define _simd16_mask2int(mask) int(mask)
#define _simd16_vmask_ps SIMD16::vmask_ps
#endif //__SWR_SIMD16INTRIN_H__
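A brief usage sketch for the wrappers above, written against the SIMD16 type directly (values are illustrative):
SIMD16::Float a = _simd16_set1_ps(1.0f);
SIMD16::Float b = _simd16_set1_ps(2.0f);
SIMD16::Float r = _simd16_fmadd_ps(a, b, _simd16_set1_ps(0.5f)); // each of the 16 lanes = 1*2 + 0.5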

View file

@ -1,322 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_SIMDINTRIN_H__
#define __SWR_SIMDINTRIN_H__
#include "common/intrin.h"
#include "common/simdlib.hpp"
#if KNOB_SIMD_WIDTH == 8
typedef SIMD256 SIMD;
#else
#error Unsupported vector width
#endif // KNOB_SIMD_WIDTH == 8
#define _simd128_maskstore_ps SIMD128::maskstore_ps
#define _simd128_fmadd_ps SIMD128::fmadd_ps
#define _simd_load_ps SIMD::load_ps
#define _simd_load1_ps SIMD::broadcast_ss
#define _simd_loadu_ps SIMD::loadu_ps
#define _simd_setzero_ps SIMD::setzero_ps
#define _simd_set1_ps SIMD::set1_ps
#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b)
#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b)
#define _simd_blendv_ps SIMD::blendv_ps
#define _simd_store_ps SIMD::store_ps
#define _simd_mul_ps SIMD::mul_ps
#define _simd_add_ps SIMD::add_ps
#define _simd_sub_ps SIMD::sub_ps
#define _simd_rsqrt_ps SIMD::rsqrt_ps
#define _simd_min_ps SIMD::min_ps
#define _simd_max_ps SIMD::max_ps
#define _simd_movemask_ps SIMD::movemask_ps
#define _simd_cvtps_epi32 SIMD::cvtps_epi32
#define _simd_cvttps_epi32 SIMD::cvttps_epi32
#define _simd_cvtepi32_ps SIMD::cvtepi32_ps
#define _simd_cmplt_ps SIMD::cmplt_ps
#define _simd_cmpgt_ps SIMD::cmpgt_ps
#define _simd_cmpneq_ps SIMD::cmpneq_ps
#define _simd_cmpeq_ps SIMD::cmpeq_ps
#define _simd_cmpge_ps SIMD::cmpge_ps
#define _simd_cmple_ps SIMD::cmple_ps
#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
#define _simd_and_ps SIMD::and_ps
#define _simd_or_ps SIMD::or_ps
#define _simd_rcp_ps SIMD::rcp_ps
#define _simd_div_ps SIMD::div_ps
#define _simd_castsi_ps SIMD::castsi_ps
#define _simd_castps_pd SIMD::castps_pd
#define _simd_castpd_ps SIMD::castpd_ps
#define _simd_andnot_ps SIMD::andnot_ps
#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a)
#define _simd_castpd_ps SIMD::castpd_ps
#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const*)(a))
#define _simd_stream_ps SIMD::stream_ps
#define _simd_movemask_pd SIMD::movemask_pd
#define _simd_castsi_pd SIMD::castsi_pd
#define _simd_mul_epi32 SIMD::mul_epi32
#define _simd_mullo_epi32 SIMD::mullo_epi32
#define _simd_sub_epi32 SIMD::sub_epi32
#define _simd_sub_epi64 SIMD::sub_epi64
#define _simd_min_epi32 SIMD::min_epi32
#define _simd_min_epu32 SIMD::min_epu32
#define _simd_max_epi32 SIMD::max_epi32
#define _simd_max_epu32 SIMD::max_epu32
#define _simd_add_epi32 SIMD::add_epi32
#define _simd_and_si SIMD::and_si
#define _simd_andnot_si SIMD::andnot_si
#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32
#define _simd_cmplt_epi32 SIMD::cmplt_epi32
#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32
#define _simd_or_si SIMD::or_si
#define _simd_xor_si SIMD::xor_si
#define _simd_castps_si SIMD::castps_si
#define _simd_adds_epu8 SIMD::adds_epu8
#define _simd_subs_epu8 SIMD::subs_epu8
#define _simd_add_epi8 SIMD::add_epi8
#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64
#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64
#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8
#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8
#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16
#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16
#define _simd_movemask_epi8 SIMD::movemask_epi8
#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a)
#define _simd_permute_ps SIMD::permute_ps
#define _simd_permute_epi32 SIMD::permute_epi32
#define _simd_srlv_epi32 SIMD::srlv_epi32
#define _simd_sllv_epi32 SIMD::sllv_epi32
#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8
#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8
#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16
#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16
#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32
#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32
#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64
#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64
#define _simd_slli_epi32(a, i) SIMD::slli_epi32<i>(a)
#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a)
#define _simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a)
#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a)
#define _simd_fmadd_ps SIMD::fmadd_ps
#define _simd_fmsub_ps SIMD::fmsub_ps
#define _simd_shuffle_epi8 SIMD::shuffle_epi8
#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
#define _simd_mask_i32gather_ps(r, p, o, m, s) \
SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
#define _simd_abs_epi32 SIMD::abs_epi32
#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16
#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32
#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32
#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64
#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64
#define _simd_packus_epi16 SIMD::packus_epi16
#define _simd_packs_epi16 SIMD::packs_epi16
#define _simd_packus_epi32 SIMD::packus_epi32
#define _simd_packs_epi32 SIMD::packs_epi32
#define _simd_unpacklo_ps SIMD::unpacklo_ps
#define _simd_unpackhi_ps SIMD::unpackhi_ps
#define _simd_unpacklo_pd SIMD::unpacklo_pd
#define _simd_unpackhi_pd SIMD::unpackhi_pd
#define _simd_insertf128_ps SIMD::insertf128_ps
#define _simd_insertf128_pd SIMD::insertf128_pd
#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b)
#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a)
#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a)
#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a)
#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b)
#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b)
#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b)
#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b)
#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b)
#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b)
#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b)
#define _simd_set1_epi32 SIMD::set1_epi32
#define _simd_set_epi32 SIMD::set_epi32
#define _simd_set_ps SIMD::set_ps
#define _simd_set1_epi8 SIMD::set1_epi8
#define _simd_setzero_si SIMD::setzero_si
#define _simd_cvttps_epi32 SIMD::cvttps_epi32
#define _simd_store_si SIMD::store_si
#define _simd_broadcast_ss SIMD::broadcast_ss
#define _simd_maskstore_ps SIMD::maskstore_ps
#define _simd_load_si SIMD::load_si
#define _simd_loadu_si SIMD::loadu_si
#define _simd_sub_ps SIMD::sub_ps
#define _simd_testz_ps SIMD::testz_ps
#define _simd_testz_si SIMD::testz_si
#define _simd_xor_ps SIMD::xor_ps
#define _simd_loadu2_si SIMD::loadu2_si
#define _simd_storeu2_si SIMD::storeu2_si
#define _simd_blendv_epi32 SIMD::blendv_epi32
#define _simd_vmask_ps SIMD::vmask_ps
template <int mask>
SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD128::Integer const& b)
{
return SIMD128::castps_si(
SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
}
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
SIMDINLINE simdscalar vplaneps(simdscalar const& vA,
simdscalar const& vB,
simdscalar const& vC,
simdscalar const& vX,
simdscalar const& vY)
{
simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
vOut = _simd_fmadd_ps(vB, vY, vOut);
return vOut;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
SIMDINLINE simd4scalar vplaneps(simd4scalar const& vA,
simd4scalar const& vB,
simd4scalar const& vC,
simd4scalar const& vX,
simd4scalar const& vY)
{
simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
vOut = _simd128_fmadd_ps(vB, vY, vOut);
return vOut;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
static SIMDINLINE simdscalar InterpolateComponent(simdscalar const& vI,
simdscalar const& vJ,
const float* pInterpBuffer)
{
const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
if ((pInterpA[0] == pInterpB[0]) && (pInterpA[0] == pInterpC[0]))
{
// Ensure constant attribs are constant. Required for proper
// 3D resource copies.
return _simd_broadcast_ss(pInterpA);
}
simdscalar vA = _simd_broadcast_ss(pInterpA);
simdscalar vB = _simd_broadcast_ss(pInterpB);
simdscalar vC = _simd_broadcast_ss(pInterpC);
simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
vC = _simd_mul_ps(vk, vC);
return vplaneps(vA, vB, vC, vI, vJ);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component (flat shade).
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
static SIMDINLINE simdscalar InterpolateComponentFlat(const float* pInterpBuffer)
{
const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
simdscalar vA = _simd_broadcast_ss(pInterpA);
return vA;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component (flat shade).
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
static SIMDINLINE simdscalari InterpolateComponentFlatInt(const uint32_t* pInterpBuffer)
{
const uint32_t interpA = pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
simdscalari vA = _simd_set1_epi32(interpA);
return vA;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const& vI,
simd4scalar const& vJ,
const float* pInterpBuffer)
{
const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
if ((pInterpA[0] == pInterpB[0]) && (pInterpA[0] == pInterpC[0]))
{
// Ensure constant attribs are constant. Required for proper
// 3D resource copies.
return SIMD128::broadcast_ss(pInterpA);
}
simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
vC = SIMD128::mul_ps(vk, vC);
return vplaneps(vA, vB, vC, vI, vJ);
}
static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const& a)
{
simd4scalari ai = SIMD128::castps_si(a);
return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
}
static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a)
{
simdscalari ai = _simd_castps_si(a);
return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
}
#include "simd16intrin.h"
#endif //__SWR_SIMDINTRIN_H__
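A worked sketch of the plane evaluation above: vplaneps computes vA * vX + vB * vY + vC per lane with two FMAs, so constant inputs make the result easy to check by hand:
simdscalar vA = _simd_set1_ps(2.0f);
simdscalar vB = _simd_set1_ps(3.0f);
simdscalar vC = _simd_set1_ps(1.0f);
simdscalar vX = _simd_set1_ps(0.25f);
simdscalar vY = _simd_set1_ps(0.5f);
simdscalar vR = vplaneps(vA, vB, vC, vX, vY); // every lane: 2*0.25 + 3*0.5 + 1 = 3.0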

View file

@ -1,234 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include "simdlib_types.hpp"
// For documentation, please see the following include...
// #include "simdlib_interface.hpp"
namespace SIMDImpl
{
namespace SIMD128Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
struct AVXImpl
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_128_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
struct AVX2Impl : AVXImpl
{
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_128_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVX2Impl
{
#if defined(SIMD_OPT_128_AVX512)
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_128_avx512.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_128_avx512_knights.inl"
#else // optimize for core
#include "simdlib_128_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
#endif // SIMD_OPT_128_AVX512
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD128Impl::Float;
using Double = SIMD128Impl::Double;
using Integer = SIMD128Impl::Integer;
using Vec4 = SIMD128Impl::Vec4;
using Mask = SIMD128Impl::Mask;
};
} // namespace SIMD128Impl
namespace SIMD256Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
struct AVXImpl
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_256_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
struct AVX2Impl : AVXImpl
{
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_256_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVX2Impl
{
#if defined(SIMD_OPT_256_AVX512)
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_256_avx512.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_256_avx512_knights.inl"
#else // optimize for core
#include "simdlib_256_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
#endif // SIMD_OPT_256_AVX512
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD256Impl::Float;
using Double = SIMD256Impl::Double;
using Integer = SIMD256Impl::Integer;
using Vec4 = SIMD256Impl::Vec4;
using Mask = SIMD256Impl::Mask;
};
} // namespace SIMD256Impl
namespace SIMD512Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
template <typename SIMD256T>
struct AVXImplBase
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_512_emu.inl"
#include "simdlib_512_emu_masks.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImplBase
using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_512_avx512.inl"
#include "simdlib_512_avx512_masks.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_512_avx512_knights.inl"
#include "simdlib_512_avx512_masks_knights.inl"
#else // optimize for core
#include "simdlib_512_avx512_core.inl"
#include "simdlib_512_avx512_masks_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD512Impl::Float;
using Double = SIMD512Impl::Double;
using Integer = SIMD512Impl::Integer;
using Vec4 = SIMD512Impl::Vec4;
using Mask = SIMD512Impl::Mask;
};
} // namespace SIMD512Impl
} // namespace SIMDImpl
template <typename Traits>
struct SIMDBase : Traits::IsaImpl
{
using CompareType = typename Traits::CompareType;
using ScaleFactor = typename Traits::ScaleFactor;
using RoundMode = typename Traits::RoundMode;
using SIMD = typename Traits::IsaImpl;
using Float = typename Traits::Float;
using Double = typename Traits::Double;
using Integer = typename Traits::Integer;
using Vec4 = typename Traits::Vec4;
using Mask = typename Traits::Mask;
}; // struct SIMDBase
using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
template <typename SIMD_T>
using CompareType = typename SIMD_T::CompareType;
template <typename SIMD_T>
using ScaleFactor = typename SIMD_T::ScaleFactor;
template <typename SIMD_T>
using RoundMode = typename SIMD_T::RoundMode;
template <typename SIMD_T>
using Float = typename SIMD_T::Float;
template <typename SIMD_T>
using Double = typename SIMD_T::Double;
template <typename SIMD_T>
using Integer = typename SIMD_T::Integer;
template <typename SIMD_T>
using Vec4 = typename SIMD_T::Vec4;
template <typename SIMD_T>
using Mask = typename SIMD_T::Mask;
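// A minimal usage sketch (illustrative only, built from the wrappers defined
// in the per-ISA .inl files): client code picks a width through the
// SIMD128/SIMD256/SIMD512 facades and calls the static methods directly:
//
//     SIMD256::Float a = SIMD256::set1_ps(1.0f);
//     SIMD256::Float b = SIMD256::set1_ps(2.0f);
//     SIMD256::Float r = SIMD256::fmadd_ps(a, b, b); // (1.0f * 2.0f) + 2.0f per lane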

View file

@ -1,593 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (1) implementation
//============================================================================
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float a) { return _mm_##op(a); }
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b); }
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b); }
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm_##op(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return _mm_##op(a, b, ImmT); \
}
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm_##op(a); }
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return intrin(a, b); }
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm_##op(a, b); }
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return castps_si(intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return _mm_##op(a, b, ImmT); \
}
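// For illustration, a wrapper such as SIMD_WRAPPER_2(add_ps) expands to a
// thin forwarding function around the corresponding intrinsic:
//
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return _mm_add_ps(a, b);
//     }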
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
return add_ps(mul_ps(a, b), c);
}
static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
{
return sub_ps(mul_ps(a, b), c);
}
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return _mm_round_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
{
return round_ps<RoundMode::CEIL_NOEXC>(a);
}
static SIMDINLINE Float SIMDCALL floor_ps(Float a)
{
return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
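// For example, per 32-bit lane: 0x00010001 * 0x00010001 = 0x0000000100020001
// as a 64-bit intermediate, so mullo_epi32 stores 0x00020001.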
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
{
int32_t a, count;
a = _mm_extract_epi32(vA, 0);
count = _mm_extract_epi32(vB, 0);
a <<= count;
vA = _mm_insert_epi32(vA, a, 0);
a = _mm_extract_epi32(vA, 1);
count = _mm_extract_epi32(vB, 1);
a <<= count;
vA = _mm_insert_epi32(vA, a, 1);
a = _mm_extract_epi32(vA, 2);
count = _mm_extract_epi32(vB, 2);
a <<= count;
vA = _mm_insert_epi32(vA, a, 2);
a = _mm_extract_epi32(vA, 3);
count = _mm_extract_epi32(vB, 3);
a <<= count;
vA = _mm_insert_epi32(vA, a, 3);
return vA;
}
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
static SIMDINLINE Integer SIMDCALL srl_epi64(Integer a, Integer n)
{
return _mm_srl_epi64(a, n);
}
template <int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32)
{
int32_t a, count;
a = _mm_extract_epi32(vA, 0);
count = _mm_extract_epi32(vB, 0);
a >>= count;
vA = _mm_insert_epi32(vA, a, 0);
a = _mm_extract_epi32(vA, 1);
count = _mm_extract_epi32(vB, 1);
a >>= count;
vA = _mm_insert_epi32(vA, a, 1);
a = _mm_extract_epi32(vA, 2);
count = _mm_extract_epi32(vB, 2);
a >>= count;
vA = _mm_insert_epi32(vA, a, 2);
a = _mm_extract_epi32(vA, 3);
count = _mm_extract_epi32(vB, 3);
a >>= count;
vA = _mm_insert_epi32(vA, a, 3);
return vA;
}
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm_castps_si128(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm_castsi128_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm_castps_pd(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm_castsi128_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm_cvtepi32_ps(a);
}
static SIMDINLINE int32_t SIMDCALL cvtsi128_si32(Integer a) // return a.v[0]
{
return _mm_cvtsi128_si32(a);
}
static SIMDINLINE Integer SIMDCALL cvtsi32_si128(int32_t n) // return a[0] = n, a[1]...a[3] = 0
{
return _mm_cvtsi32_si128(n);
}
SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL
cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
{
return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
{
return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
{
return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
{
return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
{
return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
{
return cmp_ps<CompareType::LE_OQ>(a, b);
}
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a,
Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != _mm_testz_ps(a, b);
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a,
Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != _mm_testz_si128(a, b);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
Integer b,
Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
Integer b,
Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL
broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
return _mm_broadcast_ss(p);
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
static SIMDINLINE Integer SIMDCALL
permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
}
static SIMDINLINE Float SIMDCALL
permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm_permutevar_ps(a, swiz);
}
SIMD_IWRAPPER_1I(shuffle_epi32);
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult;
float* pResult = (float*)&vResult;
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
uint32_t offset = pOffsets[i];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
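// A minimal usage sketch (pArray is a hypothetical float array with at least
// four elements); with the default ScaleFactor::SF_1 the indices are raw byte
// offsets:
//
//     Integer idx = set_epi32(12, 8, 4, 0);                    // byte offsets
//     Float v = i32gather_ps<ScaleFactor::SF_1>(pArray, idx);  // pArray[0..3]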
static SIMDINLINE Float SIMDCALL
load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return _mm_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return _mm_load_si128(&p->v);
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm_lddqu_si128(&p->v);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult = old;
float* pResult = (float*)&vResult;
unsigned long index;
uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask))
{
umask &= ~(1 << index);
uint32_t offset = pOffsets[index];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[index] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
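// The loop above walks the set bits of the movemask with _BitScanForward,
// loading only those lanes whose sign bit was set in the vector mask.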
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
_mm_maskstore_ps(p, mask, src);
}
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
return static_cast<uint32_t>(_mm_movemask_epi8(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
return static_cast<uint32_t>(_mm_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
return static_cast<uint32_t>(_mm_movemask_ps(a));
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm_set1_ps(f);
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm_setzero_si128();
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
_mm_store_si128(&p->v, a);
}
static SIMDINLINE void SIMDCALL
storeu_si(Integer* p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
_mm_storeu_si128(&p->v, a);
}
static SIMDINLINE void SIMDCALL
stream_ps(float* p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm_stream_ps(p, a);
}
static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
{
return _mm_set_ps(in3, in2, in1, in0);
}
static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0)
{
return _mm_set_epi32(in3, in2, in1, in0);
}
template <int ImmT>
static SIMDINLINE float SIMDCALL extract_ps(Float a)
{
int tmp = _mm_extract_ps(a, ImmT);
return *reinterpret_cast<float*>(&tmp);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
Integer vec = set1_epi32(mask);
const Integer bit = set_epi32(0x08, 0x04, 0x02, 0x01);
vec = and_si(vec, bit);
vec = cmplt_epi32(setzero_si(), vec);
return castsi_ps(vec);
}
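// Sketch of what vmask_ps computes: the scalar mask is broadcast, ANDed with
// the per-lane bits {0x1, 0x2, 0x4, 0x8}, and the signed compare (0 < lane)
// turns each surviving bit into an all-ones lane. For example, vmask_ps(0b0101)
// yields lanes {~0, 0, ~0, 0} (element 0 first), reinterpreted as Float.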
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I

View file

@ -1,66 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below are ones that replace AVX (1) operations.
// Only two shifts and two gathers were introduced with AVX2, along with
// native support for FMA operations.
//============================================================================
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
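// These forward to the native FMA intrinsics (_mm_fmadd_ps / _mm_fmsub_ps),
// replacing the mul+add emulation inherited from the AVX (1) implementation.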
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
{
return _mm_sllv_epi32(vA, vB);
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32)
{
return _mm_srlv_epi32(vA, vB);
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
}
#undef SIMD_WRAPPER_3

View file

@ -1,368 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
private:
static SIMDINLINE __m512 __conv(Float r)
{
return _mm512_castps128_ps512(r.v);
}
static SIMDINLINE __m512d __conv(Double r)
{
return _mm512_castpd128_pd512(r.v);
}
static SIMDINLINE __m512i __conv(Integer r)
{
return _mm512_castsi128_si512(r.v);
}
static SIMDINLINE Float __conv(__m512 r)
{
return _mm512_castps512_ps128(r);
}
static SIMDINLINE Double __conv(__m512d r)
{
return _mm512_castpd512_pd128(r);
}
static SIMDINLINE Integer __conv(__m512i r)
{
return _mm512_castsi512_si128(r);
}
public:
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
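// For illustration, SIMD_WRAPPER_2(add_ps) here expands to a masked 512-bit
// operation that touches only the low four lanes:
//
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return __conv(_mm512_maskz_add_ps(__mmask16(0xf), __conv(a), __conv(b)));
//     }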
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf)); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf)); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf)); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf)); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
// use AVX2 version
// SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi8); // return a > b (int8)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
// return cmpgt_epi32(b, a);
//}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
//{
//    return _mm256_permutevar8x32_ps(a, swiz);
//}
SIMD_IWRAPPER_1I_32(shuffle_epi32);
// template<int ImmT>
// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
// SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return __conv(_mm512_mask_i32gather_ps(
_mm512_setzero_ps(), __mmask16(0xf), __conv(idx), p, static_cast<int>(ScaleT)));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
__mmask16 m = 0xf;
m = _mm512_mask_test_epi32_mask(
m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
return __conv(
_mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
}
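// The test-mask above converts the floating-point vector mask into a
// __mmask16 by testing each lane's sign bit (0x80000000), restricted to the
// low four lanes by the initial 0xf mask.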
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
// {
// __mmask64 m = 0xffffull;
// return static_cast<uint32_t>(
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
// }
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
__mmask16 m = 0xf;
m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
_mm512_mask_storeu_ps(p, m, __conv(src));
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_mask_storeu_ps(p, __mmask16(0xf), __conv(a));
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
_mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xf), -1)));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View file

@ -1,196 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and
// _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and
// _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and
// _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and
// _mm512_packus_epi32
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffull;
return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
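// (Each of the low 16 bytes is tested against 0x80, i.e. its sign bit,
// mirroring what _mm_movemask_epi8 reports for a 128-bit register.)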
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View file

@ -1,34 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation for Knights Family
//
// Since this implementation inherits from the AVX512Base implementation,
// the only operations below are ones that replace AVX512F / AVX512CD operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================

View file

@ -1,826 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
using SIMD128T = SIMD128Impl::AVXImpl;
//============================================================================
// SIMD256 AVX (1) implementation
//============================================================================
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a) { return _mm256_##op(a); }
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return _mm256_##op(a, b); \
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
{ \
return _mm256_##op(a, b); \
}
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return _mm256_##op(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
{ \
return _mm256_##op(a, b, ImmT); \
}
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
{ \
return _mm256_##op(a, b, c); \
}
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##op(a, b); \
}
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return castps_si(intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IFWRAPPER_2I(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return castps_si(intrin(castsi_ps(a), castsi_ps(b), ImmT)); \
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##intrin(a, b, ImmT); \
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
#define SIMD_IWRAPPER_3(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
{ \
return _mm256_##op(a, b, c); \
}
// emulated integer simd
#define SIMD_EMU_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return Integer{ \
SIMD128T::op(a.v4[0]), \
SIMD128T::op(a.v4[1]), \
}; \
}
#define SIMD_EMU_IWRAPPER_1L(op, shift) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return Integer{ \
SIMD128T::op(a.v4[0]), \
SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
}; \
} \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer const& a) \
{ \
return Integer{ \
SIMD128T::op(a), \
SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
}; \
}
#define SIMD_EMU_IWRAPPER_1I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return Integer{ \
SIMD128T::template op<ImmT>(a.v4[0]), \
SIMD128T::template op<ImmT>(a.v4[1]), \
}; \
}
#define SIMD_EMU_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD128T::op(a.v4[0], b.v4[0]), \
SIMD128T::op(a.v4[1], b.v4[1]), \
}; \
}
#define SIMD_EMU_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]), \
SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]), \
}; \
}
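// For illustration, SIMD_EMU_IWRAPPER_2(add_epi32) expands to an emulated
// 256-bit add built from two 128-bit halves:
//
//     static SIMDINLINE Integer SIMDCALL add_epi32(Integer const& a, Integer const& b)
//     {
//         return Integer{
//             SIMD128T::add_epi32(a.v4[0], b.v4[0]),
//             SIMD128T::add_epi32(a.v4[1], b.v4[1]),
//         };
//     }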
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
Float const& b,
Float const& c) // return (a * b) + c
{
return add_ps(mul_ps(a, b), c);
}
static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a,
Float const& b,
Float const& c) // return (a * b) - c
{
return sub_ps(mul_ps(a, b), c);
}
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
{
return _mm256_round_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
{
return round_ps<RoundMode::CEIL_NOEXC>(a);
}
static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
{
return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_EMU_IWRAPPER_2(mullo_epi32);
SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IFWRAPPER_2(and_si, _mm256_and_ps); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IFWRAPPER_2(or_si, _mm256_or_ps); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA,
Integer const& vCount) // return a << b (uint32)
{
int32_t aHi, aLow, countHi, countLow;
__m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
__m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
__m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
__m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
aHi = _mm_extract_epi32(vAHi, 0);
countHi = _mm_extract_epi32(vCountHi, 0);
aHi <<= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 0);
aLow = _mm_extract_epi32(vALow, 0);
countLow = _mm_extract_epi32(vCountLow, 0);
aLow <<= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 0);
aHi = _mm_extract_epi32(vAHi, 1);
countHi = _mm_extract_epi32(vCountHi, 1);
aHi <<= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 1);
aLow = _mm_extract_epi32(vALow, 1);
countLow = _mm_extract_epi32(vCountLow, 1);
aLow <<= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 1);
aHi = _mm_extract_epi32(vAHi, 2);
countHi = _mm_extract_epi32(vCountHi, 2);
aHi <<= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 2);
aLow = _mm_extract_epi32(vALow, 2);
countLow = _mm_extract_epi32(vCountLow, 2);
aLow <<= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 2);
aHi = _mm_extract_epi32(vAHi, 3);
countHi = _mm_extract_epi32(vCountHi, 3);
aHi <<= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 3);
aLow = _mm_extract_epi32(vALow, 3);
countLow = _mm_extract_epi32(vCountLow, 3);
aLow <<= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 3);
__m256i ret = _mm256_set1_epi32(0);
ret = _mm256_insertf128_si256(ret, vAHi, 1);
ret = _mm256_insertf128_si256(ret, vALow, 0);
return ret;
}
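// AVX (1) has no per-lane variable-shift instruction (vpsllvd/vpsrlvd arrived
// with AVX2), hence the lane-by-lane extract/insert emulation above.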
SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_EMU_IWRAPPER_1I(srli_si); // return a >> (ImmT*8) (uint)
template <int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA,
Integer const& vCount) // return a >> b (uint32)
{
int32_t aHi, aLow, countHi, countLow;
__m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
__m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
__m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
__m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
aHi = _mm_extract_epi32(vAHi, 0);
countHi = _mm_extract_epi32(vCountHi, 0);
aHi >>= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 0);
aLow = _mm_extract_epi32(vALow, 0);
countLow = _mm_extract_epi32(vCountLow, 0);
aLow >>= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 0);
aHi = _mm_extract_epi32(vAHi, 1);
countHi = _mm_extract_epi32(vCountHi, 1);
aHi >>= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 1);
aLow = _mm_extract_epi32(vALow, 1);
countLow = _mm_extract_epi32(vCountLow, 1);
aLow >>= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 1);
aHi = _mm_extract_epi32(vAHi, 2);
countHi = _mm_extract_epi32(vCountHi, 2);
aHi >>= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 2);
aLow = _mm_extract_epi32(vALow, 2);
countLow = _mm_extract_epi32(vCountLow, 2);
aLow >>= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 2);
aHi = _mm_extract_epi32(vAHi, 3);
countHi = _mm_extract_epi32(vCountHi, 3);
aHi >>= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 3);
aLow = _mm_extract_epi32(vALow, 3);
countLow = _mm_extract_epi32(vCountLow, 3);
aLow >>= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 3);
__m256i ret = _mm256_set1_epi32(0);
ret = _mm256_insertf128_si256(ret, vAHi, 1);
ret = _mm256_insertf128_si256(ret, vALow, 0);
return ret;
}
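// As with sllv_epi32, the emulation above reduces to the per-lane sketch:
//     for (int i = 0; i < 8; ++i)
//         dst[i] = a[i] >> count[i];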
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
{
return _mm256_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
{
return _mm256_castps_si256(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
{
return _mm256_castsi256_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
{
return _mm256_castps_pd(a);
}
static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a) // return *(Integer*)(&a)
{
return _mm256_castpd_si256(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
{
return _mm256_castsi256_ps(a);
}
static SIMDINLINE Float SIMDCALL
cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float)
{
return _mm256_cvtepi32_ps(a);
}
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16)
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64)
SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64)
static SIMDINLINE Integer SIMDCALL
cvtps_epi32(Float const& a) // return (int32)a (float --> int32)
{
return _mm256_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL
cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm256_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
{
return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::LE_OQ>(a, b);
}
SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL
testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != _mm256_testz_ps(a, b);
}
static SIMDINLINE bool SIMDCALL
testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != _mm256_testz_si256(a, b);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
Integer const& b,
Float const& mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
Integer const& b,
Integer const& mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL
broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
return _mm256_broadcast_ss(p);
}
SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_EMU_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_EMU_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
template <int ImmT>
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
{
return _mm256_permute_ps(a, ImmT);
}
static SIMDINLINE Integer SIMDCALL permute_epi32(
Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
Integer result;
// Ugly slow implementation
uint32_t const* pA = reinterpret_cast<uint32_t const*>(&a);
uint32_t const* pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
uint32_t* pResult = reinterpret_cast<uint32_t*>(&result);
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
pResult[i] = pA[0x7 & pSwiz[i]]; // mask the swizzle index to the 8 valid lanes
}
return result;
}
static SIMDINLINE Float SIMDCALL
permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
Float result;
// Ugly slow implementation
float const* pA = reinterpret_cast<float const*>(&a);
uint32_t const* pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
float* pResult = reinterpret_cast<float*>(&result);
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
pResult[i] = pA[0x7 & pSwiz[i]]; // mask the swizzle index to the 8 valid lanes
}
return result;
}
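// Usage sketch (hypothetical values): with swiz = {7, 6, 5, 4, 3, 2, 1, 0}
// the loop above reverses the lanes, i.e. result[i] = a[7 - i].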
SIMD_WRAPPER_2I(permute2f128_ps);
SIMD_DWRAPPER_2I(permute2f128_pd);
SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_EMU_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult;
float* pResult = (float*)&vResult;
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
uint32_t offset = pOffsets[i];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return i32gather_ps<ScaleT>(p, idx);
}
static SIMDINLINE Float SIMDCALL
load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return _mm256_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return _mm256_load_si256(&p->v);
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm256_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm256_lddqu_si256(&p->v);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult = old;
float* pResult = (float*)&vResult;
unsigned long index = 0;
uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask))
{
umask &= ~(1 << index);
uint32_t offset = pOffsets[index];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[index] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
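// The _BitScanForward loop visits exactly the lanes whose sign bit is set in
// 'mask' (movemask_ps packs those sign bits into 'umask'); every other lane
// keeps its value from 'old'. A hypothetical call gathering lanes 0 and 2:
//     Float v = mask_i32gather_ps(old, pBase, idx, vmask_ps(0x5));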
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
return mask_i32gather_ps<ScaleT>(old, p, idx, mask);
}
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
{
_mm256_maskstore_ps(p, mask, src);
}
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
{
return SIMD128T::movemask_epi8(a.v4[0]) | (SIMD128T::movemask_epi8(a.v4[1]) << 16);
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
{
return static_cast<uint32_t>(_mm256_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
{
return static_cast<uint32_t>(_mm256_movemask_ps(a));
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm256_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm256_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm256_set1_ps(f);
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm256_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm256_setzero_si256();
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory)
{
_mm256_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
{
_mm256_store_si256(&p->v, a);
}
static SIMDINLINE void SIMDCALL
stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm256_stream_ps(p, a);
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p)
{
return _mm256_broadcast_ps(&p->v);
}
template <int ImmT>
static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a)
{
return _mm256_extractf128_pd(a, ImmT);
}
template <int ImmT>
static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a)
{
return _mm256_extractf128_ps(a, ImmT);
}
template <int ImmT>
static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a)
{
return _mm256_extractf128_si256(a, ImmT);
}
template <int ImmT>
static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b)
{
return _mm256_insertf128_pd(a, b, ImmT);
}
template <int ImmT>
static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b)
{
return _mm256_insertf128_ps(a, b, ImmT);
}
template <int ImmT>
static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer const& b)
{
return _mm256_insertf128_si256(a, b, ImmT);
}
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif
#ifndef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
/* SIMD128Impl::Integer const* */ loaddr) \
_mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif
static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi,
SIMD128Impl::Integer const* plo)
{
return _mm256_loadu2_m128i(&phi->v, &plo->v);
}
static SIMDINLINE Integer SIMDCALL
set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL
set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi,
SIMD128Impl::Integer* plo,
Integer const& src)
{
_mm256_storeu2_m128i(&phi->v, &plo->v, src);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
Integer vec = set1_epi32(mask);
const Integer bit = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
vec = and_si(vec, bit);
vec = cmplt_epi32(setzero_si(), vec);
return castsi_ps(vec);
}
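// Each bit i of 'mask' selects 32-bit lane i: the and/cmplt sequence turns a
// set bit into an all-ones lane. For example, vmask_ps(0x81) yields all-ones
// in lanes 0 and 7 and zero elsewhere.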
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IFWRAPPER_2I
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_3
#undef SIMD_EMU_IWRAPPER_1
#undef SIMD_EMU_IWRAPPER_1I
#undef SIMD_EMU_IWRAPPER_2
#undef SIMD_EMU_IWRAPPER_2I

View file

@ -1,255 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below are the ones that replace AVX (1) operations.
// Mostly these are integer operations that are no longer emulated with SSE.
//============================================================================
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }
#define SIMD_IWRAPPER_1L(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return _mm256_##op(_mm256_castsi256_si128(a)); \
}
#define SIMD_IWRAPPER_1I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return _mm256_##op(a, ImmT); \
}
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return _mm256_##intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##intrin(a, b); \
}
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##op(a, b); \
}
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##op(a, b, ImmT); \
}
//-----------------------------------------------------------------------
// Floating point arithmetic operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
Float const& b,
Float const& c) // return (a * b) + c
{
return _mm256_fmadd_ps(a, b, c);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
#if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version]
// Some versions of MSVC 2019 don't handle constant folding of and_si() correctly.
// Using and_ps instead inhibits the compiler's constant folding and actually issues
// the and intrinsic even though both inputs are constant values.
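// Leaving and_si undefined here means the AVX (1) version (implemented via
// _mm256_and_ps above) is inherited, which sidesteps the miscompiled
// constant folding.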
#else
// Use native integer and intrinsic
SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
#endif
SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
template <int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1L(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1L(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const& a,
Integer const& b) // return a < b (int32)
{
return cmpgt_epi32(b, a);
}
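// AVX2 has no native cmplt intrinsic, so a < b is expressed as b > a.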
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
template <int ImmT>
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
{
return _mm256_permute_ps(a, ImmT);
}
SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
static SIMDINLINE Float SIMDCALL
permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm256_permutevar8x32_ps(a, swiz);
}
SIMD_IWRAPPER_1I(shuffle_epi32);
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
}
#if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version]
// Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register
// correctly in early versions of MSVC 2019
#else
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
// g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
// Only for this intrinsic - not sure why. :(
return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
}
#endif
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
{
return static_cast<uint32_t>(_mm256_movemask_epi8(a));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1L
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I

View file

@ -1,349 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are the ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
private:
static SIMDINLINE __m512 __conv(Float r)
{
return _mm512_castps256_ps512(r.v);
}
static SIMDINLINE __m512d __conv(Double r)
{
return _mm512_castpd256_pd512(r.v);
}
static SIMDINLINE __m512i __conv(Integer r)
{
return _mm512_castsi256_si512(r.v);
}
static SIMDINLINE Float __conv(__m512 r)
{
return _mm512_castps512_ps256(r);
}
static SIMDINLINE Double __conv(__m512d r)
{
return _mm512_castpd512_pd256(r);
}
static SIMDINLINE Integer __conv(__m512i r)
{
return _mm512_castsi512_si256(r);
}
public:
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \
}
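// As a concrete example of the pattern, SIMD_WRAPPER_2(add_ps) below expands
// (roughly) to:
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return __conv(_mm512_maskz_add_ps(__mmask16(0xff), __conv(a), __conv(b)));
//     }
// i.e. the full 512-bit instruction executes with only the low 8 lanes enabled.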
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xff)); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff)); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff)); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff)); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff)); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
// use AVX2 version
// SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi8); // return a > b (int8)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
// return cmpgt_epi32(b, a);
//}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
// {
//     return _mm256_permutevar8x32_ps(a, swiz);
// }
SIMD_IWRAPPER_1I_32(shuffle_epi32);
// template<int ImmT>
// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
// SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return __conv(_mm512_mask_i32gather_ps(
_mm512_setzero_ps(), __mmask16(0xff), __conv(idx), p, static_cast<int>(ScaleT)));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
__mmask16 m = 0xff;
m = _mm512_mask_test_epi32_mask(
m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
return __conv(
_mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
}
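// The test against 0x80000000 converts the legacy vector mask (sign bit per
// 32-bit lane) into the __mmask16 predicate the AVX512 gather expects, while
// the initial 0xff keeps the operation confined to the 8 active lanes.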
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
// {
// __mmask64 m = 0xffffffffull;
// return static_cast<uint32_t>(
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
// }
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
__mmask16 m = 0xff;
m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
_mm512_mask_storeu_ps(p, m, __conv(src));
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_mask_storeu_ps(p, __mmask16(0xff), __conv(a));
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
_mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xff), -1)));
}
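// Here the bit mask maps directly onto AVX512 lane masking; e.g. a
// hypothetical vmask_ps(0x5) yields all-ones in lanes 0 and 2, zero elsewhere.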
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2I

View file

@ -1,129 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation for Core processors
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are the ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and
// _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and
// _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and
// _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and
// _mm512_packus_epi32
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffffffull;
return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_64

View file

@ -1,34 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation for Knights Family
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are the ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================

View file

@ -1,699 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
// gcc as of 7.1 was missing these intrinsics
#ifndef _mm512_cmpneq_ps_mask
#define _mm512_cmpneq_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_NEQ_UQ)
#endif
#ifndef _mm512_cmplt_ps_mask
#define _mm512_cmplt_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_LT_OS)
#endif
#ifndef _mm512_cmplt_pd_mask
#define _mm512_cmplt_pd_mask(a, b) _mm512_cmp_pd_mask((a), (b), _CMP_LT_OS)
#endif
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation (compatible with Knights and Core
// processors)
//
//============================================================================
static const int TARGET_SIMD_WIDTH = 16;
using SIMD256T = SIMD256Impl::AVX2Impl;
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_castsi512_ps( \
_mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
#define SIMD_WRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
#define SIMD_DWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
private:
static SIMDINLINE Integer vmask(__mmask16 m)
{
return _mm512_maskz_set1_epi32(m, -1);
}
static SIMDINLINE Integer vmask(__mmask8 m)
{
return _mm512_maskz_set1_epi64(m, -1LL);
}
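// These helpers widen an AVX512 predicate back into a legacy all-ones /
// all-zeros vector mask; e.g. vmask(__mmask16(0x3)) sets 32-bit lanes 0 and 1
// to -1 and clears the rest.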
public:
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return _mm512_roundscale_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
{
return round_ps<RoundMode::CEIL_NOEXC>(a);
}
static SIMDINLINE Float SIMDCALL floor_ps(Float a)
{
return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
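// Usage sketch (hypothetical values): for lanes holding 1.25f, ceil_ps
// returns 2.0f per lane and floor_ps returns 1.0f per lane, via
// round_ps<RoundMode::CEIL_NOEXC> and round_ps<RoundMode::FLOOR_NOEXC>.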
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
// SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
// SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
// SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si512); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int)
// SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
// SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
// SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
// SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
#if 0
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
#endif
SIMD_IWRAPPER_2(srlv_epi32);
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm512_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm512_castps_si512(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm512_castsi512_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm512_castps_pd(a);
}
static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
return _mm512_castpd_si512(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm512_castsi512_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm512_cvtepi32_ps(a);
}
// SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm512_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL
cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm512_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template <CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
}
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
// Legacy vector mask generator
__mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
return castsi_ps(vmask(result));
}
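// Illustrative sketch (not compiled): the widened legacy mask from cmp_ps can
// feed blendv_ps directly, whereas cmp_ps_mask pairs with the native masked
// _mm512_mask_* forms. Names a, b are placeholders.
#if 0
Float sel = cmp_ps<CompareType::LT_OQ>(a, b); // all-ones lanes where a < b
Float r = blendv_ps(b, a, sel); // per-lane min(a, b)
#endif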
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
{
return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
{
return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
{
return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
{
return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
{
return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
{
return cmp_ps<CompareType::LE_OQ>(a, b);
}
template <CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template <CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
// SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a,
Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a,
Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template <int ImmT>
static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a (float)
{
return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
}
template <int ImmT>
static SIMDINLINE Integer blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
{
return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
}
static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float)
{
return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
Integer b,
Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
Integer b,
Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL
broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
return _mm512_set1_ps(*p);
}
template <int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}
template <int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
return _mm512_extractf64x4_pd(a, imm);
}
template <int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
return _mm512_extracti64x4_epi64(a, imm);
}
template <int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}
template <int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
return _mm512_insertf64x4(a, b, imm);
}
template <int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
return _mm512_inserti64x4(a, b, imm);
}
// SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm512_packs_epi16
// SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm512_packs_epi32
// SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16
// SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32
template <int ImmT>
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
{
return _mm512_permute_ps(a, ImmT);
}
static SIMDINLINE Integer SIMDCALL
permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
return _mm512_permutexvar_epi32(swiz, a);
}
static SIMDINLINE Float SIMDCALL
permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm512_permutexvar_ps(swiz, a);
}
SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);
SIMD_IWRAPPER_1I(shuffle_epi32);
// SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi16);
// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi64);
// SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
// SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
// SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
}
static SIMDINLINE Float SIMDCALL
load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return _mm512_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return _mm512_load_si512(&p->v);
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm512_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm512_loadu_si512(p);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
__mmask16 k = _mm512_test_epi32_mask(castps_si(mask), set1_epi32(0x80000000));
return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}
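// Illustrative sketch (not compiled): the per-lane semantics of
// mask_i32gather_ps restated in scalar pseudocode; idx scaling is in bytes.
#if 0
for (int lane = 0; lane < 16; ++lane)
{
if (mask[lane] & 0x80000000) // float sign bit marks active lanes
dst[lane] = *(float const*)((int8 const*)p + idx[lane] * (int)ScaleT);
else
dst[lane] = old[lane]; // inactive lanes keep the previous value
}
#endif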
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
_mm512_mask_store_ps(p, m, src);
}
// static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
//{
// __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
// return static_cast<uint64_t>(m);
//}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
__mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi64(0x8000000000000000LL));
return static_cast<uint32_t>(m);
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
return static_cast<uint32_t>(m);
}
static SIMDINLINE Integer SIMDCALL set1_epi64(long long i) // return i (all elements are same value)
{
return _mm512_set1_epi64(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm512_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm512_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm512_set1_ps(f);
}
static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
{
return _mm512_setzero_pd();
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm512_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm512_setzero_si512();
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
_mm512_store_si512(&p->v, a);
}
static SIMDINLINE void SIMDCALL
storeu_si(Integer* p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
_mm512_storeu_si512(&p->v, a);
}
static SIMDINLINE void SIMDCALL
stream_ps(float* p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm512_stream_ps(p, a);
}
static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
int i14,
int i13,
int i12,
int i11,
int i10,
int i9,
int i8,
int i7,
int i6,
int i5,
int i4,
int i3,
int i2,
int i1,
int i0)
{
return _mm512_set_epi32(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Integer SIMDCALL
set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
}
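// Note: the 8-argument overload zero-fills lanes 8-15, so SIMD256-style
// callers keep working on this 16-wide target.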
static SIMDINLINE Float SIMDCALL set_ps(float i15,
float i14,
float i13,
float i12,
float i11,
float i10,
float i9,
float i8,
float i7,
float i6,
float i5,
float i4,
float i3,
float i2,
float i1,
float i0)
{
return _mm512_set_ps(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL
set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
}
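// Illustrative sketch (not compiled): vmask_ps expands one bit per lane into a
// full-width legacy mask, e.g. vmask_ps(0x00FF) makes lanes 0-7 all-ones and
// lanes 8-15 zero, ready for blendv_ps-style selection. Names x, y are
// placeholders.
#if 0
Float m = vmask_ps(0x00FF); // lanes 0-7 selected
Float r = blendv_ps(x, y, m); // y in lanes 0-7, x in lanes 8-15
#endif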
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I


@ -1,186 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation for Core processors
//
//============================================================================
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_castsi512_ps( \
_mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
#define SIMD_WRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
#define SIMD_DWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
private:
static SIMDINLINE Integer vmask(__mmask32 m)
{
return _mm512_maskz_set1_epi16(m, -1);
}
static SIMDINLINE Integer vmask(__mmask64 m)
{
return _mm512_maskz_set1_epi8(m, -1);
}
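// These overloads widen AVX512BW compare masks (one bit per 8- or 16-bit lane)
// into legacy all-ones/zero vector masks, mirroring the __mmask16/__mmask8
// vmask helpers of the base implementation.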
public:
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
template <CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template <CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8)
SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8)
SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32
SIMD_IWRAPPER_2(unpackhi_epi8); // See documentation for _mm512_unpackhi_epi8
SIMD_IWRAPPER_2(unpacklo_epi16); // See documentation for _mm512_unpacklo_epi16
SIMD_IWRAPPER_2(unpacklo_epi8); // See documentation for _mm512_unpacklo_epi8
SIMD_IWRAPPER_2(shuffle_epi8);
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
return static_cast<uint64_t>(m);
}
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I


@ -1,132 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation for Knights Family Processors
//
//============================================================================
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_castsi512_ps( \
_mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
#define SIMD_WRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
#define SIMD_DWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
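// AVX512F-only (Knights-class) parts lack the AVX512DQ float logical ops
// (_mm512_and_ps etc.), so these route through the integer _mm512_*_epi32
// forms with casts.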
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I


@ -1,27 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions


@ -1,27 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions


@ -1,27 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions


@ -1,852 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX (1) implementation
//============================================================================
static const int TARGET_SIMD_WIDTH = 8;
using SIMD128T = SIMD128Impl::AVXImpl;
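// This target has no native 512-bit registers, so SIMD16 operations are
// double-pumped: each wrapper below splits its arguments into the two 256-bit
// halves (v8[0], v8[1]), applies the SIMD256T operation to each half, and
// reassembles the result.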
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a) \
{ \
return Float{ \
SIMD256T::op(a.v8[0]), \
SIMD256T::op(a.v8[1]), \
}; \
}
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return Float{ \
SIMD256T::op(a.v8[0], b.v8[0]), \
SIMD256T::op(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return Float{ \
SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_WRAPPER_2I_1(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return Float{ \
SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
{ \
return Float{ \
SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \
SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return Integer{ \
SIMD256T::op(a.v8[0]), \
SIMD256T::op(a.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD256T::op(a.v8[0], b.v8[0]), \
SIMD256T::op(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_2I_1(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_2I_2(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_3(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
{ \
return Integer{ \
SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \
SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \
}; \
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
{
return Float{
SIMD256T::template round_ps<RMT>(a.v8[0]),
SIMD256T::template round_ps<RMT>(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
{
return round_ps<RoundMode::CEIL_NOEXC>(a);
}
static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
{
return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IWRAPPER_2(and_si); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IWRAPPER_2(or_si); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
template <int ImmT>
static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
{
return Integer{
SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
};
}
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
template <int ImmT>
static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT (int32)
{
return Integer{
SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
};
}
template <int ImmT>
static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT (uint32)
{
return Integer{
SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
};
}
template <int ImmT> // for each 128-bit lane:
static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) // return a >> (ImmT*8) (uint)
{
return Integer{
SIMD256T::template srli_si<ImmT>(a.v8[0]),
SIMD256T::template srli_si<ImmT>(a.v8[1]),
};
}
template <int ImmT>
static SIMDINLINE Float SIMDCALL
srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
{
return Float{
SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
};
}
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
{
return Float{
SIMD256T::castpd_ps(a.v8[0]),
SIMD256T::castpd_ps(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
{
return Integer{
SIMD256T::castps_si(a.v8[0]),
SIMD256T::castps_si(a.v8[1]),
};
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
{
return Double{
SIMD256T::castsi_pd(a.v8[0]),
SIMD256T::castsi_pd(a.v8[1]),
};
}
static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
{
return Double{
SIMD256T::castps_pd(a.v8[0]),
SIMD256T::castps_pd(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
{
return Float{
SIMD256T::castsi_ps(a.v8[0]),
SIMD256T::castsi_ps(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL
cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float)
{
return Float{
SIMD256T::cvtepi32_ps(a.v8[0]),
SIMD256T::cvtepi32_ps(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a (uint8 --> int16)
{
return Integer{
SIMD256T::cvtepu8_epi16(a.v4[0]),
SIMD256T::cvtepu8_epi16(a.v4[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint8 --> int32)
{
return Integer{
SIMD256T::cvtepu8_epi32(a.v4[0]),
SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint16 --> int32)
{
return Integer{
SIMD256T::cvtepu16_epi32(a.v4[0]),
SIMD256T::cvtepu16_epi32(a.v4[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint16 --> int64)
{
return Integer{
SIMD256T::cvtepu16_epi64(a.v4[0]),
SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint32 --> int64)
{
return Integer{
SIMD256T::cvtepu32_epi64(a.v4[0]),
SIMD256T::cvtepu32_epi64(a.v4[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvtps_epi32(Float const& a) // return (int32)a (float --> int32)
{
return Integer{
SIMD256T::cvtps_epi32(a.v8[0]),
SIMD256T::cvtps_epi32(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return Integer{
SIMD256T::cvttps_epi32(a.v8[0]),
SIMD256T::cvttps_epi32(a.v8[1]),
};
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
{
return Float{
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::LE_OQ>(a, b);
}
template <CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
{
return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
}
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL
testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1]));
}
static SIMDINLINE bool SIMDCALL
testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1]));
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
Integer const& b,
Float const& mask) // return mask ? b : a (int)
{
return Integer{
SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
Integer const& b,
Integer const& mask) // return mask ? b : a (int)
{
return Integer{
SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL
broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
float f = *p;
return Float{
SIMD256T::set1_ps(f),
SIMD256T::set1_ps(f),
};
}
template <int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template <int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template <int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template <int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
Float r = a;
r.v8[imm] = b;
return r;
}
template <int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
Double r = a;
r.v8[imm] = b;
return r;
}
template <int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
Integer r = a;
r.v8[imm] = b;
return r;
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
template <int ImmT>
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
{
return Float{
SIMD256T::template permute_ps<ImmT>(a.v8[0]),
SIMD256T::template permute_ps<ImmT>(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL permute_epi32(
Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
return castps_si(permute_ps(castsi_ps(a), swiz));
}
static SIMDINLINE Float SIMDCALL
permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
const auto mask = SIMD256T::set1_epi32(7);
auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
return Float{
SIMD256T::blendv_ps(
lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
SIMD256T::blendv_ps(
hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
};
}
// All of the 512-bit permute2f128_XX intrinsics do the following:
//
// SELECT4(src, control) {
// CASE(control[1:0])
// 0 : tmp[127:0] : = src[127:0]
// 1 : tmp[127:0] : = src[255:128]
// 2 : tmp[127:0] : = src[383:256]
// 3 : tmp[127:0] : = src[511:384]
// ESAC
// RETURN tmp[127:0]
// }
//
// dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
// dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
// dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
// dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
// dst[MAX:512] : = 0
//
// Since the 256-bit AVX instructions use a 4-bit control field (instead
// of 2-bit for AVX512), we need to expand the control bits sent to the
// AVX instructions for emulation.
//
template <int shuf>
static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
{
return Float{
SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
a.v8[1]),
SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
b.v8[1]),
};
}
template <int shuf>
static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
{
return Double{
SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
a.v8[1]),
SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
b.v8[1]),
};
}
template <int shuf>
static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
{
return Integer{
SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
a.v8[1]),
SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
b.v8[1]),
};
}
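// Worked example (illustrative): shuf = 0xE4 selects {a.lane0, a.lane1,
// b.lane2, b.lane3}. The low-half control expands to 0x10
// ((0xE4 & 0x03) | ((0xE4 & 0x0C) << 2)), picking the low then high 128-bit
// lane of a.v8[0]; the high-half control expands to 0x32, picking the low
// then high lane of b.v8[1].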
SIMD_IWRAPPER_2I_1(shuffle_epi32);
SIMD_IWRAPPER_2I_2(shuffle_epi64);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_WRAPPER_2I_1(shuffle_pd);
SIMD_WRAPPER_2I_1(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_WRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_WRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return Float{
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
};
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return Float{
SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL
load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return Integer{
SIMD256T::load_si(&p->v8[0]),
SIMD256T::load_si(&p->v8[1]),
};
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return Integer{
SIMD256T::loadu_si(&p->v8[0]),
SIMD256T::loadu_si(&p->v8[1]),
};
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
return Float{
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
};
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
return Float{
SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
};
}
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
{
SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
}
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
{
uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
return mask;
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
return mask;
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
return mask;
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory)
{
SIMD256T::store_ps(p, a.v8[0]);
SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
{
SIMD256T::store_si(&p->v8[0], a.v8[0]);
SIMD256T::store_si(&p->v8[1], a.v8[1]);
}
static SIMDINLINE void SIMDCALL
stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
SIMD256T::stream_ps(p, a.v8[0]);
SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
int i14,
int i13,
int i12,
int i11,
int i10,
int i9,
int i8,
int i7,
int i6,
int i5,
int i4,
int i3,
int i2,
int i1,
int i0)
{
return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
}
static SIMDINLINE Integer SIMDCALL
set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL set_ps(float i15,
float i14,
float i13,
float i12,
float i11,
float i10,
float i9,
float i8,
float i7,
float i6,
float i5,
float i4,
float i3,
float i2,
float i1,
float i0)
{
return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
}
static SIMDINLINE Float SIMDCALL
set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
}
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_3


@ -1,27 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// no backwards compatibility for simd mask-enabled functions


@ -1,332 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if 0
//===========================================================================
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
//=======================================================================
// SIMD Types
//
// These typedefs are examples. The SIMD256 and SIMD16 implementations will
// use different base types with this same naming.
using Float = __m256; // Packed single-precision float vector
using Double = __m256d; // Packed double-precision float vector
using Integer = __m256i; // Packed integer vector (mutable element widths)
using Mask = uint8_t; // Integer representing mask bits
//=======================================================================
// Standard interface
// (available in both SIMD256 and SIMD16 widths)
//=======================================================================
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
static Float add_ps(Float a, Float b); // return a + b
static Float div_ps(Float a, Float b); // return a / b
static Float fmadd_ps(Float a, Float b, Float c); // return (a * b) + c
static Float fmsub_ps(Float a, Float b, Float c); // return (a * b) - c
static Float max_ps(Float a, Float b); // return (a > b) ? a : b
static Float min_ps(Float a, Float b); // return (a < b) ? a : b
static Float mul_ps(Float a, Float b); // return a * b
static Float rcp_ps(Float a); // return 1.0f / a
static Float rsqrt_ps(Float a); // return 1.0f / sqrt(a)
static Float sub_ps(Float a, Float b); // return a - b
enum class RoundMode
{
TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
TO_NEG_INF = 0x01, // Round to negative infinity
TO_POS_INF = 0x02, // Round to positive infinity
TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
RAISE_EXC = 0x00, // Raise exception on overflow
NO_EXC = 0x08, // Suppress exceptions
NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
};
// return round_func(a)
//
// round_func is chosen based on the RMT template parameter. See the documentation
// for the RoundMode enumeration above.
template <RoundMode RMT>
static Float round_ps(Float a); // return round(a)
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
static Integer abs_epi32(Integer a); // return absolute_value(a) (int32)
static Integer add_epi32(Integer a, Integer b); // return a + b (int32)
static Integer add_epi8(Integer a, Integer b); // return a + b (int8)
static Integer adds_epu8(Integer a, Integer b); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
static Integer max_epi32(Integer a, Integer b); // return (a > b) ? a : b (int32)
static Integer max_epu32(Integer a, Integer b); // return (a > b) ? a : b (uint32)
static Integer min_epi32(Integer a, Integer b); // return (a < b) ? a : b (int32)
static Integer min_epu32(Integer a, Integer b); // return (a < b) ? a : b (uint32)
static Integer mul_epi32(Integer a, Integer b); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
static Integer mullo_epi32(Integer a, Integer b);
static Integer sub_epi32(Integer a, Integer b); // return a - b (int32)
static Integer sub_epi64(Integer a, Integer b); // return a - b (int64)
static Integer subs_epu8(Integer a, Integer b); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
static Float and_ps(Float a, Float b); // return a & b (float treated as int)
static Integer and_si(Integer a, Integer b); // return a & b (int)
static Float andnot_ps(Float a, Float b); // return (~a) & b (float treated as int)
static Integer andnot_si(Integer a, Integer b); // return (~a) & b (int)
static Float or_ps(Float a, Float b); // return a | b (float treated as int)
static Integer or_si(Integer a, Integer b); // return a | b (int)
static Float xor_ps(Float a, Float b); // return a ^ b (float treated as int)
static Integer xor_si(Integer a, Integer b); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
template<int ImmT>
static Integer slli_epi32(Integer a); // return a << ImmT
static Integer sllv_epi32(Integer a, Integer b); // return a << b
template<int ImmT>
static Integer srai_epi32(Integer a); // return a >> ImmT (int32)
template<int ImmT>
static Integer srli_epi32(Integer a); // return a >> ImmT (uint32)
template<int ImmT> // for each 128-bit lane:
static Integer srli_si(Integer a); // return a >> (ImmT*8) (uint)
template<int ImmT>
static Float srlisi_ps(Float a); // same as srli_si, but with Float cast to int
static Integer srlv_epi32(Integer a, Integer b); // return a >> b (uint32)
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static Float castpd_ps(Double a); // return *(Float*)(&a)
static Integer castps_si(Float a); // return *(Integer*)(&a)
static Double castsi_pd(Integer a); // return *(Double*)(&a)
static Double castps_pd(Float a); // return *(Double*)(&a)
static Float castsi_ps(Integer a); // return *(Float*)(&a)
static Float cvtepi32_ps(Integer a); // return (float)a (int32 --> float)
static Integer cvtepu8_epi16(Integer a); // return (int16)a (uint8 --> int16)
static Integer cvtepu8_epi32(Integer a); // return (int32)a (uint8 --> int32)
static Integer cvtepu16_epi32(Integer a); // return (int32)a (uint16 --> int32)
static Integer cvtepu16_epi64(Integer a); // return (int64)a (uint16 --> int64)
static Integer cvtepu32_epi64(Integer a); // return (int64)a (uint32 --> int64)
static Integer cvtps_epi32(Float a); // return (int32)a (float --> int32)
static Integer cvttps_epi32(Float a); // return (int32)a (rnd_to_zero(float) --> int32)
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
// Comparison types used with cmp_ps:
// - ordered comparisons are always false if either operand is NaN
// - unordered comparisons are always true if either operand is NaN
// - signaling comparisons raise an exception if either operand is NaN
// - non-signaling comparisons will never raise an exception
//
// Ordered: return (a != NaN) && (b != NaN) && (a cmp b)
// Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
enum class CompareType
{
EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
LT_OS = 0x01, // Less-than (ordered, signaling)
LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
UNORD_Q = 0x03, // Unordered (nonsignaling)
NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
NLT_US = 0x05, // Not-less-than (unordered, signaling)
NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
ORD_Q = 0x07, // Ordered (nonsignaling)
EQ_UQ = 0x08, // Equal (unordered, non-signaling)
NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
GT_OS = 0x0E, // Greater-than (ordered, signaling)
TRUE_UQ = 0x0F, // True (unordered, non-signaling)
EQ_OS = 0x10, // Equal (ordered, signaling)
LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
UNORD_S = 0x13, // Unordered (signaling)
NEQ_US = 0x14, // Not-equal (unordered, signaling)
NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
ORD_S = 0x17, // Ordered (signaling)
EQ_US = 0x18, // Equal (unordered, signaling)
NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
FALSE_OS = 0x1B, // False (ordered, signaling)
NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
TRUE_US = 0x1F, // True (unordered, signaling)
};
// return a (CmpTypeT) b (float)
//
// See documentation for CompareType above for valid values for CmpTypeT.
template<CompareType CmpTypeT>
static Float cmp_ps(Float a, Float b); // return a (CmpTypeT) b (see above)
static Float cmpgt_ps(Float a, Float b); // return cmp_ps<CompareType::GT_OQ>(a, b)
static Float cmple_ps(Float a, Float b); // return cmp_ps<CompareType::LE_OQ>(a, b)
static Float cmplt_ps(Float a, Float b); // return cmp_ps<CompareType::LT_OQ>(a, b)
static Float cmpneq_ps(Float a, Float b); // return cmp_ps<CompareType::NEQ_OQ>(a, b)
static Float cmpeq_ps(Float a, Float b); // return cmp_ps<CompareType::EQ_OQ>(a, b)
static Float cmpge_ps(Float a, Float b); // return cmp_ps<CompareType::GE_OQ>(a, b)
static Integer cmpeq_epi8(Integer a, Integer b); // return a == b (int8)
static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16)
static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32)
static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64)
static Integer cmpgt_epi8(Integer a, Integer b); // return a > b (int8)
static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16)
static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32)
static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64)
static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32)
static bool testz_ps(Float a, Float b); // return all_lanes_zero(a & b) ? 1 : 0 (float)
static bool testz_si(Integer a, Integer b); // return all_lanes_zero(a & b) ? 1 : 0 (int)
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template<int ImmT>
static Float blend_ps(Float a, Float b); // return ImmT ? b : a (float)
static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
static Float blendv_ps(Float a, Float b, Float mask); // return mask ? b : a (float)
static Float broadcast_ss(float const *p); // return *p (all elements in vector get same value)
static Integer packs_epi16(Integer a, Integer b); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
static Integer packs_epi32(Integer a, Integer b); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
static Integer packus_epi16(Integer a, Integer b); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
static Integer packus_epi32(Integer a, Integer b); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
static Integer permute_epi32(Integer a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (int32)
static Float permute_ps(Float a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (float)
template<int SwizT>
static Integer shuffle_epi32(Integer a, Integer b);
template<int SwizT>
static Integer shuffle_epi64(Integer a, Integer b);
static Integer shuffle_epi8(Integer a, Integer b);
template<int SwizT>
static Float shuffle_pd(Double a, Double b);
template<int SwizT>
static Float shuffle_ps(Float a, Float b);
static Integer unpackhi_epi16(Integer a, Integer b);
static Integer unpackhi_epi32(Integer a, Integer b);
static Integer unpackhi_epi64(Integer a, Integer b);
static Integer unpackhi_epi8(Integer a, Integer b);
static Float unpackhi_pd(Double a, Double b);
static Float unpackhi_ps(Float a, Float b);
static Integer unpacklo_epi16(Integer a, Integer b);
static Integer unpacklo_epi32(Integer a, Integer b);
static Integer unpacklo_epi64(Integer a, Integer b);
static Integer unpacklo_epi8(Integer a, Integer b);
static Float unpacklo_pd(Double a, Double b);
static Float unpacklo_ps(Float a, Float b);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
enum class ScaleFactor
{
SF_1, // No scaling
SF_2, // Scale offset by 2
SF_4, // Scale offset by 4
SF_8, // Scale offset by 8
};
template<ScaleFactor ScaleT = ScaleFactor::SF_1>
static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
static Float load1_ps(float const *p); // return *p (broadcast 1 value to all elements)
static Float load_ps(float const *p); // return *p (loads SIMD width elements from memory)
static Integer load_si(Integer const *p); // return *p
static Float loadu_ps(float const *p); // return *p (same as load_ps but allows for unaligned mem)
static Integer loadu_si(Integer const *p); // return *p (same as load_si but allows for unaligned mem)
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
static void maskstore_ps(float *p, Integer mask, Float src);
static int movemask_epi8(Integer a);
static int movemask_pd(Double a);
static int movemask_ps(Float a);
static Integer set1_epi32(int i); // return i (all elements are same value)
static Integer set1_epi8(char i); // return i (all elements are same value)
static Float set1_ps(float f); // return f (all elements are same value)
static Float setzero_ps(); // return 0 (float)
static Integer setzero_si(); // return 0 (integer)
static void store_ps(float *p, Float a); // *p = a (stores all elements contiguously in memory)
static void store_si(Integer *p, Integer a); // *p = a
static void stream_ps(float *p, Float a); // *p = a (same as store_ps, but doesn't keep memory in cache)
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
static Float broadcast_ps(__m128 const *p);
template<int ImmT>
static __m128d extractf128_pd(Double a);
template<int ImmT>
static __m128 extractf128_ps(Float a);
template<int ImmT>
static __m128i extractf128_si(Integer a);
template<int ImmT>
static Double insertf128_pd(Double a, __m128d b);
template<int ImmT>
static Float insertf128_ps(Float a, __m128 b);
template<int ImmT>
static Integer insertf128_si(Integer a, __m128i b);
static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
template<int ImmT>
static Double permute2f128_pd(Double a, Double b);
template<int ImmT>
static Float permute2f128_ps(Float a, Float b);
template<int ImmT>
static Integer permute2f128_si(Integer a, Integer b);
static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
static void storeu2_si(__m128i *phi, __m128i *plo, Integer src);
//=======================================================================
// Advanced masking interface (currently available only in SIMD16 width)
//=======================================================================
};
#endif // #if 0
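Editor's note: the ordered/unordered distinction documented for CompareType above is easy to get wrong, so here is a scalar reference (illustrative only, plain C++) of the NaN semantics it describes: ordered comparisons are false when either operand is NaN, unordered comparisons are true.

#include <cmath>
#include <cstdio>

static bool cmp_lt_ordered(float a, float b)    // LT_OQ / LT_OS semantics
{
    return !std::isnan(a) && !std::isnan(b) && (a < b);
}

static bool cmp_nlt_unordered(float a, float b) // NLT_UQ / NLT_US semantics
{
    return std::isnan(a) || std::isnan(b) || !(a < b);
}

int main()
{
    float nan = std::nanf("");
    // With a NaN operand the ordered compare is false, the unordered true:
    std::printf("%d %d\n", cmp_lt_ordered(nan, 1.0f), cmp_nlt_unordered(nan, 1.0f)); // 0 1
}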

View file

@ -1,457 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if !defined(__cplusplus)
#error C++ compilation required
#endif
#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>
#define SIMD_ARCH_AVX 0
#define SIMD_ARCH_AVX2 1
#define SIMD_ARCH_AVX512 2
#if !defined(SIMD_ARCH)
#define SIMD_ARCH SIMD_ARCH_AVX
#endif
#if defined(_MSC_VER)
#define SIMDCALL __vectorcall
#define SIMDINLINE __forceinline
#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
#else
#define SIMDCALL
#define SIMDINLINE inline
#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
#endif
// For documentation, please see the following include...
// #include "simdlib_interface.hpp"
namespace SIMDImpl
{
enum class CompareType
{
EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
LT_OS = 0x01, // Less-than (ordered, signaling)
LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
UNORD_Q = 0x03, // Unordered (nonsignaling)
NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
NLT_US = 0x05, // Not-less-than (unordered, signaling)
NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
ORD_Q = 0x07, // Ordered (nonsignaling)
EQ_UQ = 0x08, // Equal (unordered, non-signaling)
NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
GT_OS = 0x0E, // Greater-than (ordered, signaling)
TRUE_UQ = 0x0F, // True (unordered, non-signaling)
EQ_OS = 0x10, // Equal (ordered, signaling)
LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
UNORD_S = 0x13, // Unordered (signaling)
NEQ_US = 0x14, // Not-equal (unordered, signaling)
NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
ORD_S = 0x17, // Ordered (signaling)
EQ_US = 0x18, // Equal (unordered, signaling)
NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
FALSE_OS = 0x1B, // False (ordered, signaling)
NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
TRUE_US = 0x1F, // True (unordered, signaling)
};
#if SIMD_ARCH >= SIMD_ARCH_AVX512
enum class CompareTypeInt
{
EQ = _MM_CMPINT_EQ, // Equal
LT = _MM_CMPINT_LT, // Less than
LE = _MM_CMPINT_LE, // Less than or Equal
NE = _MM_CMPINT_NE, // Not Equal
GE = _MM_CMPINT_GE, // Greater than or Equal
GT = _MM_CMPINT_GT, // Greater than
};
#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
enum class ScaleFactor
{
SF_1 = 1, // No scaling
SF_2 = 2, // Scale offset by 2
SF_4 = 4, // Scale offset by 4
SF_8 = 8, // Scale offset by 8
};
enum class RoundMode
{
TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5)
TO_NEG_INF = 0x01, // Round to negative infinity
TO_POS_INF = 0x02, // Round to positive infinity
TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
RAISE_EXC = 0x00, // Raise exception on overflow
NO_EXC = 0x08, // Suppress exceptions
NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
};
struct Traits
{
using CompareType = SIMDImpl::CompareType;
using ScaleFactor = SIMDImpl::ScaleFactor;
using RoundMode = SIMDImpl::RoundMode;
};
// Attribute, 4-dimensional attribute in SIMD SOA layout
template <typename Float, typename Integer, typename Double>
union Vec4
{
Float v[4];
Integer vi[4];
Double vd[4];
struct
{
Float x;
Float y;
Float z;
Float w;
};
SIMDINLINE Float& SIMDCALL operator[](const int i) { return v[i]; }
SIMDINLINE Float const& SIMDCALL operator[](const int i) const { return v[i]; }
SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& in)
{
v[0] = in.v[0];
v[1] = in.v[1];
v[2] = in.v[2];
v[3] = in.v[3];
return *this;
}
};
namespace SIMD128Impl
{
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m128 in) : v(in) {}
SIMDINLINE Float& SIMDCALL operator=(__m128 in)
{
v = in;
return *this;
}
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m128() const { return v; }
SIMDALIGN(__m128, 16) v;
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m128i in) : v(in) {}
SIMDINLINE Integer& SIMDCALL operator=(__m128i in)
{
v = in;
return *this;
}
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m128i() const { return v; }
SIMDALIGN(__m128i, 16) v;
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m128d in) : v(in) {}
SIMDINLINE Double& SIMDCALL operator=(__m128d in)
{
v = in;
return *this;
}
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m128d() const { return v; }
SIMDALIGN(__m128d, 16) v;
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t;
static const uint32_t SIMD_WIDTH = 4;
} // namespace SIMD128Impl
namespace SIMD256Impl
{
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m256 in) : v(in) {}
SIMDINLINE Float(SIMD128Impl::Float const& in_lo,
SIMD128Impl::Float const& in_hi = _mm_setzero_ps())
{
v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
}
SIMDINLINE Float& SIMDCALL operator=(__m256 in)
{
v = in;
return *this;
}
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m256() const { return v; }
SIMDALIGN(__m256, 32) v;
SIMD128Impl::Float v4[2];
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m256i in) : v(in) {}
SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo,
SIMD128Impl::Integer const& in_hi = _mm_setzero_si128())
{
v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
}
SIMDINLINE Integer& SIMDCALL operator=(__m256i in)
{
v = in;
return *this;
}
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m256i() const { return v; }
SIMDALIGN(__m256i, 32) v;
SIMD128Impl::Integer v4[2];
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m256d const& in) : v(in) {}
SIMDINLINE Double(SIMD128Impl::Double const& in_lo,
SIMD128Impl::Double const& in_hi = _mm_setzero_pd())
{
v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
}
SIMDINLINE Double& SIMDCALL operator=(__m256d in)
{
v = in;
return *this;
}
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m256d() const { return v; }
SIMDALIGN(__m256d, 32) v;
SIMD128Impl::Double v4[2];
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t;
static const uint32_t SIMD_WIDTH = 8;
} // namespace SIMD256Impl
namespace SIMD512Impl
{
#if !(defined(__AVX512F__) || defined(_ZMMINTRIN_H_INCLUDED))
// Define AVX512 types if not included via immintrin.h.
// All data members of these types are ONLY to be viewed
// in a debugger. Do NOT access them via code!
union __m512
{
private:
float m512_f32[16];
};
struct __m512d
{
private:
double m512d_f64[8];
};
union __m512i
{
private:
int8_t m512i_i8[64];
int16_t m512i_i16[32];
int32_t m512i_i32[16];
int64_t m512i_i64[8];
uint8_t m512i_u8[64];
uint16_t m512i_u16[32];
uint32_t m512i_u32[16];
uint64_t m512i_u64[8];
};
using __mmask16 = uint16_t;
#endif
#if defined(__INTEL_COMPILER) || (SIMD_ARCH >= SIMD_ARCH_AVX512)
#define SIMD_ALIGNMENT_BYTES 64
#else
#define SIMD_ALIGNMENT_BYTES 32
#endif
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m512 in) : v(in) {}
SIMDINLINE Float(SIMD256Impl::Float const& in_lo,
SIMD256Impl::Float const& in_hi = _mm256_setzero_ps())
{
v8[0] = in_lo;
v8[1] = in_hi;
}
SIMDINLINE Float& SIMDCALL operator=(__m512 in)
{
v = in;
return *this;
}
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE SIMDCALL operator __m512() const { return v; }
SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Float v8[2];
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m512i in) : v(in) {}
SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo,
SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256())
{
v8[0] = in_lo;
v8[1] = in_hi;
}
SIMDINLINE Integer& SIMDCALL operator=(__m512i in)
{
v = in;
return *this;
}
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE SIMDCALL operator __m512i() const { return v; }
SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Integer v8[2];
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m512d in) : v(in) {}
SIMDINLINE Double(SIMD256Impl::Double const& in_lo,
SIMD256Impl::Double const& in_hi = _mm256_setzero_pd())
{
v8[0] = in_lo;
v8[1] = in_hi;
}
SIMDINLINE Double& SIMDCALL operator=(__m512d in)
{
v = in;
return *this;
}
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE SIMDCALL operator __m512d() const { return v; }
SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Double v8[2];
};
typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
using Mask = __mmask16;
static const uint32_t SIMD_WIDTH = 16;
#undef SIMD_ALIGNMENT_BYTES
} // namespace SIMD512Impl
} // namespace SIMDImpl
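Editor's note: the Vec4 union above stores a 4-dimensional attribute in SIMD structure-of-arrays layout: each of x, y, z and w is itself a full SIMD vector, so one Vec4 holds SIMD_WIDTH complete attributes whose components can be processed a register at a time. A small sketch with plain arrays (hypothetical names, not the SWR types):

#include <array>
#include <cstdio>

constexpr int SIMD_WIDTH = 8;               // e.g. SIMD256Impl::SIMD_WIDTH
using Lane = std::array<float, SIMD_WIDTH>; // stand-in for a SIMD Float

struct Vec4SoA
{
    Lane x, y, z, w; // component c of attribute j lives at c[j]
};

int main()
{
    Vec4SoA attr{};
    for (int j = 0; j < SIMD_WIDTH; ++j)
        attr.x[j] = float(j); // touch the x component of 8 attributes at once
    std::printf("%f\n", attr.x[3]); // 3.000000
}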

View file

@ -1,299 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#include "common/os.h"
#include <stdarg.h>
#include <stdio.h>
#include <assert.h>
#include <algorithm>
#include <mutex>
#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
#if defined(_MSC_VER)
#pragma comment(lib, "user32.lib")
#endif // _MSC_VER
namespace ConsoleUtils
{
enum class TextColor
{
BLACK = 0,
#if defined(_WIN32)
RED = 4,
GREEN = 2,
BLUE = 1,
#else
RED = 1,
GREEN = 2,
BLUE = 4,
#endif // _WIN32
PURPLE = static_cast<uint32_t>(RED) | static_cast<uint32_t>(BLUE),
CYAN = static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE),
YELLOW = static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN),
WHITE =
static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE),
};
enum class TextStyle
{
NORMAL = 0,
INTENSITY = 1,
};
void SetTextColor(FILE* stream,
TextColor color = TextColor::WHITE,
TextStyle style = TextStyle::NORMAL)
{
#if defined(_WIN32)
HANDLE hConsoleHandle = nullptr;
if (stream == stderr)
{
hConsoleHandle = GetStdHandle(STD_ERROR_HANDLE);
}
else if (stream == stdout)
{
hConsoleHandle = GetStdHandle(STD_OUTPUT_HANDLE);
}
else
{
// Not a console stream, do nothing
return;
}
WORD textAttributes = static_cast<WORD>(color);
if (style == TextStyle::INTENSITY)
{
textAttributes |= FOREGROUND_INTENSITY;
}
SetConsoleTextAttribute(hConsoleHandle, textAttributes);
#else // !_WIN32
// Print ANSI codes
uint32_t cc =
30 + ((style == TextStyle::INTENSITY) ? 60 : 0) + static_cast<uint32_t>(color);
fprintf(stream, "\033[0m\033[%d;%dm", static_cast<uint32_t>(style), cc);
#endif
}
void ResetTextColor(FILE* stream)
{
#if defined(_WIN32)
SetTextColor(stream);
#else // !_WIN32
// Print ANSI codes
fprintf(stream, "\033[0m");
#endif
}
static std::mutex g_stderrMutex;
} // namespace ConsoleUtils
bool SwrAssert(bool chkDebugger,
bool& enabled,
const char* pExpression,
const char* pFileName,
uint32_t lineNum,
const char* pFunction,
const char* pFmtString,
...)
{
using namespace ConsoleUtils;
std::lock_guard<std::mutex> l(g_stderrMutex);
SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL);
fprintf(stderr, "%s(%d): ", pFileName, lineNum);
SetTextColor(stderr, TextColor::RED, TextStyle::INTENSITY);
fprintf(stderr, "ASSERT: %s\n", pExpression);
SetTextColor(stderr, TextColor::CYAN, TextStyle::INTENSITY);
fprintf(stderr, "\t%s\n", pFunction);
if (pFmtString)
{
SetTextColor(stderr, TextColor::YELLOW, TextStyle::INTENSITY);
fprintf(stderr, "\t");
va_list args;
va_start(args, pFmtString);
vfprintf(stderr, pFmtString, args);
va_end(args);
fprintf(stderr, "\n");
}
ResetTextColor(stderr);
fflush(stderr);
#if defined(_WIN32)
static const int MAX_MESSAGE_LEN = 2048;
char msgBuf[MAX_MESSAGE_LEN];
sprintf_s(msgBuf, "%s(%d): ASSERT: %s\n", pFileName, lineNum, pExpression);
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
OutputDebugStringA(msgBuf);
sprintf_s(msgBuf, "\t%s\n", pFunction);
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
OutputDebugStringA(msgBuf);
int offset = 0;
if (pFmtString)
{
va_list args;
va_start(args, pFmtString);
offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args);
va_end(args);
if (offset < 0)
{
return true;
}
OutputDebugStringA("\t");
OutputDebugStringA(msgBuf);
OutputDebugStringA("\n");
}
if (enabled && KNOB_ENABLE_ASSERT_DIALOGS)
{
int retval = sprintf_s(&msgBuf[offset],
MAX_MESSAGE_LEN - offset,
"\n\n"
"File: %s\n"
"Line: %d\n"
"\n"
"Expression: %s\n\n"
"Cancel: Disable this assert for the remainder of the process\n"
"Try Again: Break into the debugger\n"
"Continue: Continue execution (but leave assert enabled)",
pFileName,
lineNum,
pExpression);
if (retval < 0)
{
return true;
}
offset += retval;
if (!IsDebuggerPresent())
{
sprintf_s(&msgBuf[offset],
MAX_MESSAGE_LEN - offset,
"\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a "
"program crash!");
}
retval = MessageBoxA(nullptr,
msgBuf,
"Assert Failed",
MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION | MB_SETFOREGROUND);
switch (retval)
{
case IDCANCEL:
enabled = false;
return false;
case IDTRYAGAIN:
return true;
case IDCONTINUE:
return false;
}
}
else
{
return (IsDebuggerPresent() || !chkDebugger) && enabled;
}
#endif // _WIN32
return enabled;
}
void SwrTrace(
const char* pFileName, uint32_t lineNum, const char* pFunction, const char* pFmtString, ...)
{
using namespace ConsoleUtils;
std::lock_guard<std::mutex> l(g_stderrMutex);
SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL);
fprintf(stderr, "%s(%d): TRACE in %s:\n", pFileName, lineNum, pFunction);
if (pFmtString)
{
SetTextColor(stderr, TextColor::PURPLE, TextStyle::INTENSITY);
fprintf(stderr, "\t");
va_list args;
va_start(args, pFmtString);
vfprintf(stderr, pFmtString, args);
va_end(args);
fprintf(stderr, "\n");
}
ResetTextColor(stderr);
fflush(stderr);
#if defined(_WIN32)
static const int MAX_MESSAGE_LEN = 2048;
char msgBuf[MAX_MESSAGE_LEN];
sprintf_s(msgBuf, "%s(%d): TRACE in %s\n", pFileName, lineNum, pFunction);
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
OutputDebugStringA(msgBuf);
int offset = 0;
if (pFmtString)
{
va_list args;
va_start(args, pFmtString);
offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args);
va_end(args);
if (offset < 0)
{
return;
}
OutputDebugStringA("\t");
OutputDebugStringA(msgBuf);
OutputDebugStringA("\n");
}
#endif // _WIN32
}
#endif // SWR_ENABLE_ASSERTS
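Editor's note: the non-Windows branch of SetTextColor above builds its escape sequence from the ANSI foreground range: codes 30-37 select the normal colors and 90-97 the high-intensity ones, hence the 30 + (intensity ? 60 : 0) + color computation. A standalone sketch of just that path (hypothetical names):

#include <cstdio>

enum Color { RED = 1, GREEN = 2, BLUE = 4 }; // POSIX bit values, as above

static void set_color(std::FILE* s, int color, bool intense)
{
    // "\033[0m" resets first; "%d;%dm" is style (0 normal / 1 bold) then
    // the computed foreground code, exactly as SetTextColor emits it.
    std::fprintf(s, "\033[0m\033[%d;%dm",
                 intense ? 1 : 0, 30 + (intense ? 60 : 0) + color);
}

int main()
{
    set_color(stderr, RED | GREEN, true); // bright yellow, as the assert path uses
    std::fprintf(stderr, "ASSERT-style message\n");
    std::fprintf(stderr, "\033[0m"); // reset, mirroring ResetTextColor
}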

View file

@ -1,242 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_ASSERT_H__
#define __SWR_ASSERT_H__
#if !defined(__SWR_OS_H__)
#error swr_assert.h should not be included directly, please include "common/os.h" instead.
#endif
//=============================================================================
//
// MACROS defined in this file:
//
// - SWR_ASSUME(expression, ...): Tell compiler that the expression is true.
// Helps with static code analysis as well.
// DO NOT USE if code after this dynamically
// checks for errors and handles them. The
// compiler may optimize out the error check.
//
// - SWR_ASSERT(expression, ...): Inform the user if expression is false.
// This check is only conditionally made,
// usually only in debug mode.
//
// - SWR_REL_ASSERT(expression, ...): Unconditionally enabled version of SWR_ASSERT
//
// - SWR_ASSUME_ASSERT(expression, ...): Conditionally enabled SWR_ASSERT. Uses
// SWR_ASSUME if SWR_ASSERT is disabled.
// DO NOT USE in combination with actual
// error checking (see SWR_ASSUME)
//
// - SWR_REL_ASSUME_ASSERT(expression, ...): Same as SWR_REL_ASSERT.
//
//=============================================================================
// Stupid preprocessor tricks to avoid -Wall / -W4 warnings
#if defined(_MSC_VER)
#define _SWR_WARN_DISABLE __pragma(warning(push)) __pragma(warning(disable : 4127))
#define _SWR_WARN_RESTORE __pragma(warning(pop))
#else // ! MSVC compiler
#define _SWR_WARN_DISABLE
#define _SWR_WARN_RESTORE
#endif
#define _SWR_MACRO_START \
do \
{
#define _SWR_MACRO_END \
_SWR_WARN_DISABLE \
} \
while (0) \
_SWR_WARN_RESTORE
#if defined(_MSC_VER)
#define SWR_ASSUME(e, ...) \
_SWR_MACRO_START __assume(e); \
_SWR_MACRO_END
#elif defined(__clang__)
#define SWR_ASSUME(e, ...) \
_SWR_MACRO_START __builtin_assume(e); \
_SWR_MACRO_END
#elif defined(__GNUC__)
#define SWR_ASSUME(e, ...) \
_SWR_MACRO_START((e) ? ((void)0) : __builtin_unreachable()); \
_SWR_MACRO_END
#else
#define SWR_ASSUME(e, ...) \
_SWR_MACRO_START ASSUME(e); \
_SWR_MACRO_END
#endif
#if !defined(SWR_ENABLE_ASSERTS)
#if !defined(NDEBUG)
#define SWR_ENABLE_ASSERTS 1
#else
#define SWR_ENABLE_ASSERTS 0
#endif // NDEBUG
#endif // SWR_ENABLE_ASSERTS
#if !defined(SWR_ENABLE_REL_ASSERTS)
#define SWR_ENABLE_REL_ASSERTS 1
#endif
#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
#include "assert.h"
#if !defined(__cplusplus)
#pragma message("C++ is required for SWR Asserts, falling back to assert.h")
#if SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) assert(e)
#endif
#if SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) assert(e)
#endif
#else
bool SwrAssert(bool chkDebugger,
bool& enabled,
const char* pExpression,
const char* pFileName,
uint32_t lineNum,
const char* function,
const char* pFmtString = nullptr,
...);
void SwrTrace(
const char* pFileName, uint32_t lineNum, const char* function, const char* pFmtString, ...);
#define _SWR_ASSERT(chkDebugger, e, ...) \
_SWR_MACRO_START \
bool expFailed = !(e); \
if (expFailed) \
{ \
static bool swrAssertEnabled = true; \
expFailed = SwrAssert( \
chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
if (expFailed) \
{ \
DEBUGBREAK; \
} \
} \
_SWR_MACRO_END
#define _SWR_INVALID(chkDebugger, ...) \
_SWR_MACRO_START \
static bool swrAssertEnabled = true; \
bool expFailed = SwrAssert( \
chkDebugger, swrAssertEnabled, "", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
if (expFailed) \
{ \
DEBUGBREAK; \
} \
_SWR_MACRO_END
#define _SWR_TRACE(_fmtstr, ...) SwrTrace(__FILE__, __LINE__, __FUNCTION__, _fmtstr, ##__VA_ARGS__);
#if SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__)
#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSERT(e, ##__VA_ARGS__)
#define SWR_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
#endif // SWR_ENABLE_ASSERTS
#if SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__)
#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_REL_ASSERT(e, ##__VA_ARGS__)
#define SWR_REL_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
// SWR_INVALID is always enabled
// Funky handling to allow 0 arguments with g++/gcc
// This is needed because you can't "swallow commas" with ##__VA_ARGS__ unless
// there is a first argument to the macro. So having a macro that can optionally
// accept 0 arguments is tricky.
#define _SWR_INVALID_0() _SWR_INVALID(false)
#define _SWR_INVALID_1(...) _SWR_INVALID(false, ##__VA_ARGS__)
#define _SWR_INVALID_VARGS_(_10, _9, _8, _7, _6, _5, _4, _3, _2, _1, N, ...) N
#define _SWR_INVALID_VARGS(...) _SWR_INVALID_VARGS_(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
#define _SWR_INVALID_VARGS_0() 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#define _SWR_INVALID_CONCAT_(a, b) a##b
#define _SWR_INVALID_CONCAT(a, b) _SWR_INVALID_CONCAT_(a, b)
#define SWR_INVALID(...) \
_SWR_INVALID_CONCAT(_SWR_INVALID_, _SWR_INVALID_VARGS(_SWR_INVALID_VARGS_0 __VA_ARGS__())) \
(__VA_ARGS__)
#define SWR_STATIC_ASSERT(expression, ...) \
static_assert((expression), "Failed:\n " #expression "\n " __VA_ARGS__);
#endif // SWR_ENABLE_REL_ASSERTS
#endif // C++
#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
// Needed to allow passing bitfield members to sizeof() in disabled asserts
template <typename T>
static bool SwrSizeofWorkaround(T)
{
return false;
}
#if !SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) \
_SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
_SWR_MACRO_END
#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
#define SWR_TRACE(_fmtstr, ...) \
_SWR_MACRO_START(void)(0); \
_SWR_MACRO_END
#endif
#if !SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) \
_SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
_SWR_MACRO_END
#define SWR_INVALID(...) \
_SWR_MACRO_START(void)(0); \
_SWR_MACRO_END
#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
#define SWR_REL_TRACE(_fmtstr, ...) \
_SWR_MACRO_START(void)(0); \
_SWR_MACRO_END
#define SWR_STATIC_ASSERT(e, ...) \
_SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
_SWR_MACRO_END
#endif
#if defined(_MSC_VER)
#define SWR_FUNCTION_DECL __FUNCSIG__
#elif (defined(__GNUC__) || defined(__clang__))
#define SWR_FUNCTION_DECL __PRETTY_FUNCTION__
#else
#define SWR_FUNCTION_DECL __FUNCTION__
#endif
#define SWR_NOT_IMPL SWR_INVALID("%s not implemented", SWR_FUNCTION_DECL)
#endif //__SWR_ASSERT_H__
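Editor's note: the SWR_INVALID machinery above works by detecting whether the macro received any arguments at all. A self-contained demo of the same trick (hypothetical macro names); like the original, it relies on compilers that accept empty variadic macro arguments (gcc/clang, or C++20):

#include <cstdio>

#define PICK_11TH(_10, _9, _8, _7, _6, _5, _4, _3, _2, _1, N, ...) N
#define SELECT(...) PICK_11TH(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
#define PROBE() 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#define VARIANT(...) SELECT(PROBE __VA_ARGS__())

int main()
{
    // With no arguments, PROBE() expands and injects ten extra arguments,
    // shifting PICK_11TH onto the 0; with arguments, PROBE never meets a
    // '(' and stays unexpanded, so the selector lands on a 1.
    std::printf("%d %d\n", VARIANT(), VARIANT("fmt", 1)); // prints: 0 1
}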

File diff suppressed because it is too large

View file

@ -1,772 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file api.h
*
* @brief API definitions
*
******************************************************************************/
#ifndef __SWR_API_H__
#define __SWR_API_H__
#include "common/os.h"
#include <assert.h>
#include <algorithm>
#include "common/intrin.h"
#include "common/formats.h"
#include "core/state.h"
typedef void(SWR_API* PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3);
//////////////////////////////////////////////////////////////////////////
/// @brief Rectangle structure
struct SWR_RECT
{
int32_t xmin; ///< inclusive
int32_t ymin; ///< inclusive
int32_t xmax; ///< exclusive
int32_t ymax; ///< exclusive
bool operator==(const SWR_RECT& rhs)
{
return (this->ymin == rhs.ymin && this->ymax == rhs.ymax && this->xmin == rhs.xmin &&
this->xmax == rhs.xmax);
}
bool operator!=(const SWR_RECT& rhs) { return !(*this == rhs); }
SWR_RECT& Intersect(const SWR_RECT& other)
{
this->xmin = std::max(this->xmin, other.xmin);
this->ymin = std::max(this->ymin, other.ymin);
this->xmax = std::min(this->xmax, other.xmax);
this->ymax = std::min(this->ymax, other.ymax);
if (xmax - xmin < 0 || ymax - ymin < 0)
{
// Zero area
ymin = ymax = xmin = xmax = 0;
}
return *this;
}
SWR_RECT& operator&=(const SWR_RECT& other) { return Intersect(other); }
SWR_RECT& Union(const SWR_RECT& other)
{
this->xmin = std::min(this->xmin, other.xmin);
this->ymin = std::min(this->ymin, other.ymin);
this->xmax = std::max(this->xmax, other.xmax);
this->ymax = std::max(this->ymax, other.ymax);
return *this;
}
SWR_RECT& operator|=(const SWR_RECT& other) { return Union(other); }
void Translate(int32_t x, int32_t y)
{
xmin += x;
ymin += y;
xmax += x;
ymax += y;
}
};
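#if 0 // Editor's note: illustrative usage sketch, not part of the original header.
// SWR_RECT uses inclusive mins and exclusive maxes; a non-overlapping
// Intersect collapses the result to the zero rectangle.
static void ExampleRectUsage()
{
    SWR_RECT a{0, 0, 100, 100};
    SWR_RECT b{50, 50, 200, 200};
    a &= b;             // a == {50, 50, 100, 100}
    SWR_RECT c{200, 200, 300, 300};
    a.Intersect(c);     // no overlap -> {0, 0, 0, 0}
    a.Translate(10, 5); // shift the whole rect by (+10, +5)
}
#endif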
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for load hot tiles
/// @param hDC - handle to DRAW_CONTEXT
/// @param dstFormat - format of the hot tile
/// @param renderTargetIndex - render target to load, can be color, depth or stencil
/// @param x - destination x coordinate
/// @param y - destination y coordinate
/// @param pDstHotTile - pointer to the hot tile surface
typedef void(SWR_API* PFN_LOAD_TILE)(HANDLE hDC,
HANDLE hWorkerPrivateData,
SWR_FORMAT dstFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
uint32_t x,
uint32_t y,
uint32_t renderTargetArrayIndex,
uint8_t* pDstHotTile);
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for store hot tiles
/// @param hDC - handle to DRAW_CONTEXT
/// @param srcFormat - format of the hot tile
/// @param renderTargetIndex - render target to store, can be color, depth or stencil
/// @param x - destination x coordinate
/// @param y - destination y coordinate
/// @param pSrcHotTile - pointer to the hot tile surface
typedef void(SWR_API* PFN_STORE_TILE)(HANDLE hDC,
HANDLE hWorkerPrivateData,
SWR_FORMAT srcFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
uint32_t x,
uint32_t y,
uint32_t renderTargetArrayIndex,
uint8_t* pSrcHotTile);
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for clearing a hot tile from the hot tile's clear value
/// @param hPrivateContext - handle to private data
/// @param rtIndex - render target to clear, can be color, depth or stencil
/// @param x - destination x coordinate
/// @param y - destination y coordinate
/// @param renderTargetArrayIndex - render target array offset from arrayIndex
/// @param pClearColor - pointer to the hot tile's clear value
typedef void(SWR_API* PFN_CLEAR_TILE)(HANDLE hPrivateContext,
HANDLE hWorkerPrivateData,
SWR_RENDERTARGET_ATTACHMENT rtIndex,
uint32_t x,
uint32_t y,
uint32_t renderTargetArrayIndex,
const float* pClearColor);
typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_READ)(HANDLE hPrivateContext,
gfxptr_t xpAddr,
bool* pbNullTileAccessed,
HANDLE hPrivateWorkerData);
typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_WRITE)(HANDLE hPrivateContext,
gfxptr_t xpAddr,
bool* pbNullTileAccessed,
HANDLE hPrivateWorkerData);
typedef gfxptr_t(SWR_API* PFN_MAKE_GFXPTR)(HANDLE hPrivateContext, void* sysAddr);
typedef HANDLE(SWR_API* PFN_CREATE_MEMORY_CONTEXT)(HANDLE hExternalMemory);
typedef void(SWR_API* PFN_DESTROY_MEMORY_CONTEXT)(HANDLE hExternalMemory, HANDLE hMemoryContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update their copy of streamout write offset.
/// This call is made for any draw operation that has streamout enabled
/// and has updated the write offset.
/// @param hPrivateContext - handle to private data
/// @param soBufferSlot - buffer slot for write offset
/// @param soWriteOffset - update value for so write offset.
typedef void(SWR_API* PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext,
uint32_t soBufferSlot,
uint32_t soWriteOffset);
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update their copy of stats.
/// @param hPrivateContext - handle to private data
/// @param pStats - pointer to draw stats
typedef void(SWR_API* PFN_UPDATE_STATS)(HANDLE hPrivateContext, const SWR_STATS* pStats);
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update their copy of FE stats.
/// @note It's optimal to have a separate callback for FE stats since
/// there is only one DC per FE thread. This means we do not have
/// to sum up the stats across all of the workers.
/// @param hPrivateContext - handle to private data
/// @param pStats - pointer to draw stats
typedef void(SWR_API* PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, const SWR_STATS_FE* pStats);
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update StreamOut status
/// @param hPrivateContext - handle to private data
/// @param numPrims - number of primitives written to StreamOut buffer
typedef void(SWR_API* PFN_UPDATE_STREAMOUT)(HANDLE hPrivateContext, uint64_t numPrims);
//////////////////////////////////////////////////////////////////////////
/// BucketManager
/// Forward Declaration (see rdtsc_buckets.h for full definition)
/////////////////////////////////////////////////////////////////////////
class BucketManager;
//////////////////////////////////////////////////////////////////////////
/// SWR_THREADING_INFO
/////////////////////////////////////////////////////////////////////////
struct SWR_THREADING_INFO
{
uint32_t BASE_NUMA_NODE;
uint32_t BASE_CORE;
uint32_t BASE_THREAD;
uint32_t MAX_WORKER_THREADS;
uint32_t MAX_NUMA_NODES;
uint32_t MAX_CORES_PER_NUMA_NODE;
uint32_t MAX_THREADS_PER_CORE;
bool SINGLE_THREADED;
};
//////////////////////////////////////////////////////////////////////////
/// SWR_API_THREADING_INFO
/// Data used to reserve HW threads for API use
/// API Threads are reserved from numa nodes / cores used for
/// SWR Worker threads. Specifying reserved threads here can reduce
/// the total number of SWR worker threads.
/////////////////////////////////////////////////////////////////////////
struct SWR_API_THREADING_INFO
{
uint32_t numAPIReservedThreads; // Default is 1 if SWR_API_THREADING_INFO is not sent
uint32_t bindAPIThread0; // Default is true if numAPIReservedThreads is > 0,
// binds thread used in SwrCreateContext to API Reserved
// thread 0
uint32_t numAPIThreadsPerCore; // 0 - means use all threads per core, else clamp to this number.
// Independent of KNOB_MAX_THREADS_PER_CORE.
};
//////////////////////////////////////////////////////////////////////////
/// SWR_CONTEXT
/// Forward Declaration (see context.h for full definition)
/////////////////////////////////////////////////////////////////////////
struct SWR_CONTEXT;
//////////////////////////////////////////////////////////////////////////
/// SWR_WORKER_PRIVATE_STATE
/// Data used to allocate per-worker thread private data. A pointer
/// to this data will be passed in to each shader function.
/// The first field of this private data must be SWR_WORKER_DATA
/// perWorkerPrivateStateSize must be >= sizeof SWR_WORKER_DATA
/////////////////////////////////////////////////////////////////////////
struct SWR_WORKER_PRIVATE_STATE
{
typedef void(SWR_API* PFN_WORKER_DATA)(SWR_CONTEXT* pContext, HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
size_t perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker
PFN_WORKER_DATA pfnInitWorkerData; ///< Init function for worker data. If null
///< worker data will be initialized to 0.
PFN_WORKER_DATA pfnFinishWorkerData; ///< Finish / destroy function for worker data.
///< Can be null.
};
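#if 0 // Editor's note: illustrative sketch (hypothetical size and stub), not original code.
// A driver requesting 64 bytes of per-worker private data with a simple
// zero-fill init hook; pfnFinishWorkerData may remain null.
static void SWR_API ExampleInitWorkerData(SWR_CONTEXT* pContext,
                                          HANDLE hWorkerPrivateData,
                                          uint32_t iWorkerNum)
{
    memset(hWorkerPrivateData, 0, 64); // this worker's private block
}

static SWR_WORKER_PRIVATE_STATE ExampleWorkerState()
{
    SWR_WORKER_PRIVATE_STATE state = {};
    state.perWorkerPrivateStateSize = 64; // must be >= sizeof(SWR_WORKER_DATA)
    state.pfnInitWorkerData         = &ExampleInitWorkerData;
    return state;
}
#endif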
//////////////////////////////////////////////////////////////////////////
/// SWR_CREATECONTEXT_INFO
/////////////////////////////////////////////////////////////////////////
struct SWR_CREATECONTEXT_INFO
{
// External functions (e.g. sampler) need per draw context state.
// Use SwrGetPrivateContextState() to access private state.
size_t privateStateSize;
// Optional per-worker state, can be NULL for no worker-private data
SWR_WORKER_PRIVATE_STATE* pWorkerPrivateState;
// Callback functions
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
PFN_MAKE_GFXPTR pfnMakeGfxPtr;
PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
PFN_UPDATE_STREAMOUT pfnUpdateStreamOut;
// Pointer to rdtsc buckets mgr returned to the caller.
// Only populated when KNOB_ENABLE_RDTSC is set
BucketManager* pBucketMgr;
// Output: size of the memory block required for SwrSaveState / SwrRestoreState
size_t contextSaveSize;
// ArchRast event manager.
HANDLE hArEventManager;
// handle to external memory for worker data to create memory contexts
HANDLE hExternalMemory;
// Input (optional): Threading info that overrides any set KNOB values.
SWR_THREADING_INFO* pThreadInfo;
// Input (optional): Info for reserving API threads
SWR_API_THREADING_INFO* pApiThreadInfo;
// Input: if set to non-zero value, overrides KNOB value for maximum
// number of draws in flight
uint32_t MAX_DRAWS_IN_FLIGHT;
std::string contextName;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Create SWR Context.
/// @param pCreateInfo - pointer to creation info.
SWR_FUNC(HANDLE, SwrCreateContext, SWR_CREATECONTEXT_INFO* pCreateInfo);
//////////////////////////////////////////////////////////////////////////
/// @brief Destroys SWR Context.
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrDestroyContext, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Bind current thread to an API reserved HW thread
/// @param hContext - Handle passed back from SwrCreateContext
/// @param apiThreadId - index of reserved HW thread to bind to.
SWR_FUNC(void, SwrBindApiThread, HANDLE hContext, uint32_t apiThreadId);
//////////////////////////////////////////////////////////////////////////
/// @brief Saves API state associated with hContext
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pOutputStateBlock - Memory block to receive API state data
/// @param memSize - Size of memory pointed to by pOutputStateBlock
SWR_FUNC(void, SwrSaveState, HANDLE hContext, void* pOutputStateBlock, size_t memSize);
//////////////////////////////////////////////////////////////////////////
/// @brief Restores API state to hContext previously saved with SwrSaveState
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pStateBlock - Memory block to read API state data from
/// @param memSize - Size of memory pointed to by pStateBlock
SWR_FUNC(void, SwrRestoreState, HANDLE hContext, const void* pStateBlock, size_t memSize);
//////////////////////////////////////////////////////////////////////////
/// @brief Sync cmd. Executes the callback func when all rendering up to this sync
/// has been completed
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFunc - pointer to callback function,
/// @param userData - user data to pass back
SWR_FUNC(void,
SwrSync,
HANDLE hContext,
PFN_CALLBACK_FUNC pfnFunc,
uint64_t userData,
uint64_t userData2,
uint64_t userData3);
//////////////////////////////////////////////////////////////////////////
/// @brief Stall cmd. Stalls the backend until all previous work has been completed.
/// Frontend work can continue to make progress
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrStallBE, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Blocks until all rendering has been completed.
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrWaitForIdle, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Blocks until all FE rendering has been completed.
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrWaitForIdleFE, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Set vertex buffer state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param numBuffers - Number of vertex buffer state descriptors.
/// @param pVertexBuffers - Array of vertex buffer state descriptors.
SWR_FUNC(void,
SwrSetVertexBuffers,
HANDLE hContext,
uint32_t numBuffers,
const SWR_VERTEX_BUFFER_STATE* pVertexBuffers);
//////////////////////////////////////////////////////////////////////////
/// @brief Set index buffer
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pIndexBuffer - Index buffer.
SWR_FUNC(void, SwrSetIndexBuffer, HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer);
//////////////////////////////////////////////////////////////////////////
/// @brief Set fetch shader pointer.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFetchFunc - Pointer to shader.
SWR_FUNC(void, SwrSetFetchFunc, HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set streamout shader pointer.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnSoFunc - Pointer to shader.
/// @param streamIndex - specifies stream
SWR_FUNC(void, SwrSetSoFunc, HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex);
//////////////////////////////////////////////////////////////////////////
/// @brief Set streamout state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pSoState - Pointer to streamout state.
SWR_FUNC(void, SwrSetSoState, HANDLE hContext, SWR_STREAMOUT_STATE* pSoState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set streamout buffer state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pSoBuffer - Pointer to streamout buffer.
/// @param slot - Slot to bind SO buffer to.
SWR_FUNC(void, SwrSetSoBuffers, HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot);
//////////////////////////////////////////////////////////////////////////
/// @brief Set vertex shader pointer.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnVertexFunc - Pointer to shader.
SWR_FUNC(void, SwrSetVertexFunc, HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set frontend state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state
SWR_FUNC(void, SwrSetFrontendState, HANDLE hContext, SWR_FRONTEND_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set geometry shader state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state
SWR_FUNC(void, SwrSetGsState, HANDLE hContext, SWR_GS_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set geometry shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnGsFunc - Pointer to geometry shader function
SWR_FUNC(void, SwrSetGsFunc, HANDLE hContext, PFN_GS_FUNC pfnGsFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set compute shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnCsFunc - Pointer to compute shader function
/// @param totalThreadsInGroup - product of thread group dimensions.
/// @param totalSpillFillSize - size in bytes needed for spill/fill.
/// @param scratchSpaceSizePerInstance - size of the scratch space needed per simd instance
/// @param numInstances - number of simd instances that are run per execution of the shader
SWR_FUNC(void,
SwrSetCsFunc,
HANDLE hContext,
PFN_CS_FUNC pfnCsFunc,
uint32_t totalThreadsInGroup,
uint32_t totalSpillFillSize,
uint32_t scratchSpaceSizePerInstance,
uint32_t numInstances);
//////////////////////////////////////////////////////////////////////////
/// @brief Set tessellation state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state
SWR_FUNC(void, SwrSetTsState, HANDLE hContext, SWR_TS_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set hull shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFunc - Pointer to shader function
SWR_FUNC(void, SwrSetHsFunc, HANDLE hContext, PFN_HS_FUNC pfnFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set domain shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFunc - Pointer to shader function
SWR_FUNC(void, SwrSetDsFunc, HANDLE hContext, PFN_DS_FUNC pfnFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set depth stencil state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetDepthStencilState, HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set backend state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetBackendState, HANDLE hContext, SWR_BACKEND_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set depth bounds state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetDepthBoundsState, HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set pixel shader state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetPixelShaderState, HANDLE hContext, SWR_PS_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set blend state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetBlendState, HANDLE hContext, SWR_BLEND_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set blend function
/// @param hContext - Handle passed back from SwrCreateContext
/// @param renderTarget - render target index
/// @param pfnBlendFunc - function pointer
SWR_FUNC(
void, SwrSetBlendFunc, HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDraw
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param startVertex - Specifies start vertex in vertex buffer for draw.
/// @param primCount - Number of vertices.
SWR_FUNC(void,
SwrDraw,
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t startVertex,
uint32_t primCount);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDrawInstanced
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
/// @param numInstances - How many instances to render.
/// @param startVertex - Specifies start vertex for draw. (vertex data)
/// @param startInstance - Which instance to start sequentially fetching from in each buffer
/// (instanced data)
SWR_FUNC(void,
SwrDrawInstanced,
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t numVertsPerInstance,
uint32_t numInstances,
uint32_t startVertex,
uint32_t startInstance);
//////////////////////////////////////////////////////////////////////////
/// @brief DrawIndexed
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numIndices - Number of indices to read sequentially from index buffer.
/// @param indexOffset - Starting index into index buffer.
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
SWR_FUNC(void,
SwrDrawIndexed,
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t numIndices,
uint32_t indexOffset,
int32_t baseVertex);
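//////////////////////////////////////////////////////////////////////////
/// Illustrative call (hypothetical values): draw 36 indices as a triangle
/// list from the currently bound index and vertex buffers, starting at the
/// beginning of the index buffer with no base vertex bias:
///
///     SwrDrawIndexed(hContext, TOP_TRIANGLE_LIST, 36, 0, 0);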
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDrawIndexedInstanced
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numIndices - Number of indices to read sequentially from index buffer.
/// @param numInstances - Number of instances to render.
/// @param indexOffset - Starting index into index buffer.
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
/// @param startInstance - Which instance to start sequentially fetching from in each buffer
/// (instanced data)
SWR_FUNC(void,
SwrDrawIndexedInstanced,
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t numIndices,
uint32_t numInstances,
uint32_t indexOffset,
int32_t baseVertex,
uint32_t startInstance);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrInvalidateTiles
/// @param hContext - Handle passed back from SwrCreateContext
/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
/// invalidate.
/// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to
/// be hottile size-aligned.
SWR_FUNC(void,
SwrInvalidateTiles,
HANDLE hContext,
uint32_t attachmentMask,
const SWR_RECT& invalidateRect);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDiscardRect
/// @param hContext - Handle passed back from SwrCreateContext
/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
/// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be
/// discarded.
SWR_FUNC(void, SwrDiscardRect, HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDispatch
/// @param hContext - Handle passed back from SwrCreateContext
/// @param threadGroupCountX - Number of thread groups dispatched in X direction
/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
SWR_FUNC(void,
SwrDispatch,
HANDLE hContext,
uint32_t threadGroupCountX,
uint32_t threadGroupCountY,
uint32_t threadGroupCountZ);
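//////////////////////////////////////////////////////////////////////////
/// Illustrative pairing (hypothetical values): a compute shader bound for an
/// 8x8x1 thread group (64 threads) and dispatched over a 16x16x1 grid:
///
///     SwrSetCsFunc(hContext, pfnCs, 64 /* 8*8*1 */, spillFillSize,
///                  scratchPerInstance, numInstances);
///     SwrDispatch(hContext, 16, 16, 1);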
/// @note this enum needs to be kept in sync with HOTTILE_STATE!
enum SWR_TILE_STATE
{
SWR_TILE_INVALID = 0, // tile is in uninitialized state and should be loaded with surface contents
// before rendering
SWR_TILE_DIRTY = 2, // tile contains newer data than surface it represents
SWR_TILE_RESOLVED = 3, // tile is in sync with the surface it represents
};
/// @todo Add a good description for what attachments are and when and why you would use the
/// different SWR_TILE_STATEs.
SWR_FUNC(void,
SwrStoreTiles,
HANDLE hContext,
uint32_t attachmentMask,
SWR_TILE_STATE postStoreTileState,
const SWR_RECT& storeRect);
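//////////////////////////////////////////////////////////////////////////
/// Illustrative end-of-frame store (hypothetical extents): flush color
/// attachment 0 back to its surface and leave the hottile marked as in sync
/// with it. The (1 << attachment) mask convention mirrors how the backend
/// walks attachmentMask with _BitScanForward.
///
///     SWR_RECT fullSurface = {0, 0, (int32_t)width, (int32_t)height};
///     SwrStoreTiles(hContext,
///                   1 << SWR_ATTACHMENT_COLOR0,
///                   SWR_TILE_RESOLVED,
///                   fullSurface);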
//////////////////////////////////////////////////////////////////////////
/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
/// @param hContext - Handle passed back from SwrCreateContext
/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
/// @param renderTargetArrayIndex - the RT array index to clear
/// @param clearColor - color used for clearing render targets
/// @param z - depth value used for clearing depth buffer
/// @param stencil - stencil value used for clearing stencil buffer
/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
SWR_FUNC(void,
SwrClearRenderTarget,
HANDLE hContext,
uint32_t attachmentMask,
uint32_t renderTargetArrayIndex,
const float clearColor[4],
float z,
uint8_t stencil,
const SWR_RECT& clearRect);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetRastState
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pRastState - New SWR_RASTSTATE used for SwrDraw* commands
SWR_FUNC(void, SwrSetRastState, HANDLE hContext, const SWR_RASTSTATE* pRastState);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetViewports
/// @param hContext - Handle passed back from SwrCreateContext
/// @param numViewports - number of viewports passed in
/// @param pViewports - Specifies extents of viewport.
/// @param pMatrices - If not specified then SWR computes a default one.
SWR_FUNC(void,
SwrSetViewports,
HANDLE hContext,
uint32_t numViewports,
const SWR_VIEWPORT* pViewports,
const SWR_VIEWPORT_MATRICES* pMatrices);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetScissorRects
/// @param hContext - Handle passed back from SwrCreateContext
/// @param numScissors - number of scissors passed in
/// @param pScissors - array of scissors
SWR_FUNC(
void, SwrSetScissorRects, HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors);
//////////////////////////////////////////////////////////////////////////
/// @brief Returns a pointer to the private context state for the current
/// draw operation. This is used for external components such as the
/// sampler.
///
/// @note Client needs to resend private state prior to each draw call.
/// Also, SWR is responsible for the private state memory.
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void*, SwrGetPrivateContextState, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Clients can use this to allocate memory for draw/dispatch
/// operations. The memory will automatically be freed once operation
/// has completed. Client can use this to allocate binding tables,
/// etc. needed for shader execution.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param size - Size of allocation
/// @param align - Alignment needed for allocation.
SWR_FUNC(void*, SwrAllocDrawContextMemory, HANDLE hContext, uint32_t size, uint32_t align);
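//////////////////////////////////////////////////////////////////////////
/// Illustrative sketch (hypothetical sizes): a transient binding table whose
/// memory is reclaimed automatically once the draw retires:
///
///     uint32_t* pBindingTable = (uint32_t*)SwrAllocDrawContextMemory(
///         hContext, 16 * sizeof(uint32_t), sizeof(uint32_t));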
//////////////////////////////////////////////////////////////////////////
/// @brief Enables stats counting
/// @param hContext - Handle passed back from SwrCreateContext
/// @param enable - If true then counts are incremented.
SWR_FUNC(void, SwrEnableStatsFE, HANDLE hContext, bool enable);
//////////////////////////////////////////////////////////////////////////
/// @brief Enables stats counting
/// @param hContext - Handle passed back from SwrCreateContext
/// @param enable - If true then counts are incremented.
SWR_FUNC(void, SwrEnableStatsBE, HANDLE hContext, bool enable);
//////////////////////////////////////////////////////////////////////////
/// @brief Mark end of frame - used for performance profiling
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrEndFrame, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Initialize swr backend and memory internal tables
SWR_FUNC(void, SwrInit);
struct SWR_INTERFACE
{
PFNSwrCreateContext pfnSwrCreateContext;
PFNSwrDestroyContext pfnSwrDestroyContext;
PFNSwrBindApiThread pfnSwrBindApiThread;
PFNSwrSaveState pfnSwrSaveState;
PFNSwrRestoreState pfnSwrRestoreState;
PFNSwrSync pfnSwrSync;
PFNSwrStallBE pfnSwrStallBE;
PFNSwrWaitForIdle pfnSwrWaitForIdle;
PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE;
PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers;
PFNSwrSetIndexBuffer pfnSwrSetIndexBuffer;
PFNSwrSetFetchFunc pfnSwrSetFetchFunc;
PFNSwrSetSoFunc pfnSwrSetSoFunc;
PFNSwrSetSoState pfnSwrSetSoState;
PFNSwrSetSoBuffers pfnSwrSetSoBuffers;
PFNSwrSetVertexFunc pfnSwrSetVertexFunc;
PFNSwrSetFrontendState pfnSwrSetFrontendState;
PFNSwrSetGsState pfnSwrSetGsState;
PFNSwrSetGsFunc pfnSwrSetGsFunc;
PFNSwrSetCsFunc pfnSwrSetCsFunc;
PFNSwrSetTsState pfnSwrSetTsState;
PFNSwrSetHsFunc pfnSwrSetHsFunc;
PFNSwrSetDsFunc pfnSwrSetDsFunc;
PFNSwrSetDepthStencilState pfnSwrSetDepthStencilState;
PFNSwrSetBackendState pfnSwrSetBackendState;
PFNSwrSetDepthBoundsState pfnSwrSetDepthBoundsState;
PFNSwrSetPixelShaderState pfnSwrSetPixelShaderState;
PFNSwrSetBlendState pfnSwrSetBlendState;
PFNSwrSetBlendFunc pfnSwrSetBlendFunc;
PFNSwrDraw pfnSwrDraw;
PFNSwrDrawInstanced pfnSwrDrawInstanced;
PFNSwrDrawIndexed pfnSwrDrawIndexed;
PFNSwrDrawIndexedInstanced pfnSwrDrawIndexedInstanced;
PFNSwrInvalidateTiles pfnSwrInvalidateTiles;
PFNSwrDiscardRect pfnSwrDiscardRect;
PFNSwrDispatch pfnSwrDispatch;
PFNSwrStoreTiles pfnSwrStoreTiles;
PFNSwrClearRenderTarget pfnSwrClearRenderTarget;
PFNSwrSetRastState pfnSwrSetRastState;
PFNSwrSetViewports pfnSwrSetViewports;
PFNSwrSetScissorRects pfnSwrSetScissorRects;
PFNSwrGetPrivateContextState pfnSwrGetPrivateContextState;
PFNSwrAllocDrawContextMemory pfnSwrAllocDrawContextMemory;
PFNSwrEnableStatsFE pfnSwrEnableStatsFE;
PFNSwrEnableStatsBE pfnSwrEnableStatsBE;
PFNSwrEndFrame pfnSwrEndFrame;
PFNSwrInit pfnSwrInit;
};
extern "C" {
typedef void(SWR_API* PFNSwrGetInterface)(SWR_INTERFACE& out_funcs);
SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE& out_funcs);
}
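//////////////////////////////////////////////////////////////////////////
/// Illustrative loader sketch (hypothetical library name; the host selects
/// an architecture-specific backend at runtime and resolves SwrGetInterface
/// dynamically):
///
///     void* hLib = dlopen("libswrAVX2.so", RTLD_LOCAL | RTLD_LAZY);
///     PFNSwrGetInterface pfnGetInterface =
///         (PFNSwrGetInterface)dlsym(hLib, "SwrGetInterface");
///     SWR_INTERFACE swr;
///     pfnGetInterface(swr);
///     swr.pfnSwrInit();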
#endif

View file

@ -1,490 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file arena.h
*
* @brief Arena memory manager
*        The arena is convenient and fast for managing allocations that are
*        associated with an operation and can all be released together once
*        that operation has completed. Allocations are cheap since most of
*        the time it's simply an increment of an offset, and there is no
*        need to free individual allocations: all of the arena memory can be
*        freed at once.
*
******************************************************************************/
#pragma once
#include <mutex>
#include <algorithm>
#include <atomic>
#include "core/utils.h"
static const size_t ARENA_BLOCK_ALIGN = 64;
struct ArenaBlock
{
size_t blockSize = 0;
ArenaBlock* pNext = nullptr;
};
static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
class DefaultAllocator
{
public:
ArenaBlock* AllocateAligned(size_t size, size_t align)
{
SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock));
ArenaBlock* p = new (AlignedMalloc(size, align)) ArenaBlock();
p->blockSize = size;
return p;
}
void Free(ArenaBlock* pMem)
{
if (pMem)
{
SWR_ASSUME_ASSERT(pMem->blockSize < size_t(0xdddddddd));
AlignedFree(pMem);
}
}
};
// Caching Allocator for Arena
template <uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12>
struct CachingAllocatorT : DefaultAllocator
{
ArenaBlock* AllocateAligned(size_t size, size_t align)
{
SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock));
SWR_ASSUME_ASSERT(size <= uint32_t(-1));
uint32_t bucket = GetBucketId(size);
{
// search cached blocks
std::lock_guard<std::mutex> l(m_mutex);
ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket];
ArenaBlock* pBlock = SearchBlocks(pPrevBlock, size, align);
if (pBlock)
{
m_cachedSize -= pBlock->blockSize;
if (pBlock == m_pLastCachedBlocks[bucket])
{
m_pLastCachedBlocks[bucket] = pPrevBlock;
}
}
else
{
pPrevBlock = &m_oldCachedBlocks[bucket];
pBlock = SearchBlocks(pPrevBlock, size, align);
if (pBlock)
{
m_oldCachedSize -= pBlock->blockSize;
if (pBlock == m_pOldLastCachedBlocks[bucket])
{
m_pOldLastCachedBlocks[bucket] = pPrevBlock;
}
}
}
if (pBlock)
{
assert(pPrevBlock && pPrevBlock->pNext == pBlock);
pPrevBlock->pNext = pBlock->pNext;
pBlock->pNext = nullptr;
return pBlock;
}
m_totalAllocated += size;
#if 0
{
static uint32_t count = 0;
char buf[128];
sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated));
OutputDebugStringA(buf);
}
#endif
}
if (bucket && bucket < (CACHE_NUM_BUCKETS - 1))
{
// Make all blocks in this bucket the same size
size = size_t(1) << (bucket + 1 + CACHE_START_BUCKET_BIT);
}
return this->DefaultAllocator::AllocateAligned(size, align);
}
void Free(ArenaBlock* pMem)
{
if (pMem)
{
std::unique_lock<std::mutex> l(m_mutex);
InsertCachedBlock(GetBucketId(pMem->blockSize), pMem);
}
}
void FreeOldBlocks()
{
if (!m_cachedSize)
{
return;
}
std::lock_guard<std::mutex> l(m_mutex);
bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE);
for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
{
if (doFree)
{
ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext;
while (pBlock)
{
ArenaBlock* pNext = pBlock->pNext;
m_oldCachedSize -= pBlock->blockSize;
m_totalAllocated -= pBlock->blockSize;
this->DefaultAllocator::Free(pBlock);
pBlock = pNext;
}
m_oldCachedBlocks[i].pNext = nullptr;
m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
}
if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i])
{
if (i && i < (CACHE_NUM_BUCKETS - 1))
{
// We know that all blocks are the same size.
// Just move the list over.
m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext;
m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext;
m_cachedBlocks[i].pNext = nullptr;
if (m_pOldLastCachedBlocks[i]->pNext)
{
m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i];
}
m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
}
else
{
// The end buckets can have variable sized lists.
// Insert each block based on size
ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
while (pBlock)
{
ArenaBlock* pNext = pBlock->pNext;
pBlock->pNext = nullptr;
m_cachedSize -= pBlock->blockSize;
InsertCachedBlock<true>(i, pBlock);
pBlock = pNext;
}
m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
m_cachedBlocks[i].pNext = nullptr;
}
}
}
m_oldCachedSize += m_cachedSize;
m_cachedSize = 0;
}
CachingAllocatorT()
{
for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
{
m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
}
}
~CachingAllocatorT()
{
// Free all cached blocks
for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
{
ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
while (pBlock)
{
ArenaBlock* pNext = pBlock->pNext;
this->DefaultAllocator::Free(pBlock);
pBlock = pNext;
}
pBlock = m_oldCachedBlocks[i].pNext;
while (pBlock)
{
ArenaBlock* pNext = pBlock->pNext;
this->DefaultAllocator::Free(pBlock);
pBlock = pNext;
}
}
}
private:
static uint32_t GetBucketId(size_t blockSize)
{
uint32_t bucketId = 0;
#if defined(BitScanReverseSizeT)
BitScanReverseSizeT((unsigned long*)&bucketId, (blockSize - 1) >> CACHE_START_BUCKET_BIT);
bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
#endif
return bucketId;
}
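    // Worked example (illustrative): with the default StartBucketBitT = 12,
    // (blockSize - 1) >> 12 maps requests up to 8 KiB to bucket 0, up to
    // 16 KiB to bucket 1, and so on, with everything past the last bucket
    // clamped to CACHE_NUM_BUCKETS - 1. This matches the rounding in
    // AllocateAligned, which sizes blocks in the middle buckets at
    // 1 << (bucket + 1 + CACHE_START_BUCKET_BIT).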
template <bool OldBlockT = false>
void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock)
{
SWR_ASSUME_ASSERT(bucketId < CACHE_NUM_BUCKETS);
ArenaBlock* pPrevBlock =
OldBlockT ? &m_oldCachedBlocks[bucketId] : &m_cachedBlocks[bucketId];
ArenaBlock* pBlock = pPrevBlock->pNext;
while (pBlock)
{
if (pNewBlock->blockSize >= pBlock->blockSize)
{
// Insert here
break;
}
pPrevBlock = pBlock;
pBlock = pBlock->pNext;
}
// Insert into list
SWR_ASSUME_ASSERT(pPrevBlock);
pPrevBlock->pNext = pNewBlock;
pNewBlock->pNext = pBlock;
if (OldBlockT)
{
if (m_pOldLastCachedBlocks[bucketId] == pPrevBlock)
{
m_pOldLastCachedBlocks[bucketId] = pNewBlock;
}
m_oldCachedSize += pNewBlock->blockSize;
}
else
{
if (m_pLastCachedBlocks[bucketId] == pPrevBlock)
{
m_pLastCachedBlocks[bucketId] = pNewBlock;
}
m_cachedSize += pNewBlock->blockSize;
}
}
static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align)
{
ArenaBlock* pBlock = pPrevBlock->pNext;
ArenaBlock* pPotentialBlock = nullptr;
ArenaBlock* pPotentialPrev = nullptr;
while (pBlock)
{
if (pBlock->blockSize >= blockSize)
{
if (pBlock == AlignUp(pBlock, align))
{
if (pBlock->blockSize == blockSize)
{
// Won't find a better match
break;
}
// We could use this as it is larger than we wanted, but
// continue to search for a better match
pPotentialBlock = pBlock;
pPotentialPrev = pPrevBlock;
}
}
else
{
// Blocks are sorted by size (biggest first)
// So, if we get here, there are no blocks
// large enough, fall through to allocation.
pBlock = nullptr;
break;
}
pPrevBlock = pBlock;
pBlock = pBlock->pNext;
}
if (!pBlock)
{
// Couldn't find an exact match, use next biggest size
pBlock = pPotentialBlock;
pPrevBlock = pPotentialPrev;
}
return pBlock;
}
// buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT;
static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
static const size_t MAX_UNUSED_SIZE = sizeof(MEGABYTE);
ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS];
ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
ArenaBlock m_oldCachedBlocks[CACHE_NUM_BUCKETS];
ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS];
std::mutex m_mutex;
size_t m_totalAllocated = 0;
size_t m_cachedSize = 0;
size_t m_oldCachedSize = 0;
};
typedef CachingAllocatorT<> CachingAllocator;
template <typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)>
class TArena
{
public:
TArena(T& in_allocator) : m_allocator(in_allocator) {}
TArena() : m_allocator(m_defAllocator) {}
~TArena() { Reset(true); }
void* AllocAligned(size_t size, size_t align)
{
if (0 == size)
{
return nullptr;
}
SWR_ASSERT(align <= ARENA_BLOCK_ALIGN);
if (m_pCurBlock)
{
ArenaBlock* pCurBlock = m_pCurBlock;
size_t offset = AlignUp(m_offset, align);
if ((offset + size) <= pCurBlock->blockSize)
{
void* pMem = PtrAdd(pCurBlock, offset);
m_offset = offset + size;
return pMem;
}
// Not enough memory in this block, fall through to allocate
// a new block
}
static const size_t ArenaBlockSize = BlockSizeT;
size_t blockSize = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize);
// Add in one BLOCK_ALIGN unit to store ArenaBlock in.
blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
ArenaBlock* pNewBlock = m_allocator.AllocateAligned(
blockSize, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned.
SWR_ASSERT(pNewBlock != nullptr);
if (pNewBlock != nullptr)
{
m_offset = ARENA_BLOCK_ALIGN;
pNewBlock->pNext = m_pCurBlock;
m_pCurBlock = pNewBlock;
}
return AllocAligned(size, align);
}
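   // Worked example (illustrative): a fresh block reserves the first
   // ARENA_BLOCK_ALIGN (64) bytes for the ArenaBlock header, so the first
   // allocation lands at offset 64. Allocating 100 bytes at align 16 then
   // advances m_offset to 164, and a following request at align 32 is
   // placed at AlignUp(164, 32) = 192.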
void* Alloc(size_t size) { return AllocAligned(size, 1); }
void* AllocAlignedSync(size_t size, size_t align)
{
void* pAlloc = nullptr;
m_mutex.lock();
pAlloc = AllocAligned(size, align);
m_mutex.unlock();
return pAlloc;
}
void* AllocSync(size_t size)
{
void* pAlloc = nullptr;
m_mutex.lock();
pAlloc = Alloc(size);
m_mutex.unlock();
return pAlloc;
}
void Reset(bool removeAll = false)
{
m_offset = ARENA_BLOCK_ALIGN;
if (m_pCurBlock)
{
ArenaBlock* pUsedBlocks = m_pCurBlock->pNext;
m_pCurBlock->pNext = nullptr;
while (pUsedBlocks)
{
ArenaBlock* pBlock = pUsedBlocks;
pUsedBlocks = pBlock->pNext;
m_allocator.Free(pBlock);
}
if (removeAll)
{
m_allocator.Free(m_pCurBlock);
m_pCurBlock = nullptr;
}
}
}
bool IsEmpty()
{
return (m_pCurBlock == nullptr) ||
(m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr);
}
private:
ArenaBlock* m_pCurBlock = nullptr;
size_t m_offset = ARENA_BLOCK_ALIGN;
/// @note Mutex is only used by sync allocation functions.
std::mutex m_mutex;
DefaultAllocator m_defAllocator;
T& m_allocator;
};
using StdArena = TArena<DefaultAllocator>;
using CachingArena = TArena<CachingAllocator>;
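// Illustrative usage sketch (not part of the original file): allocations are
// bump-pointer cheap and are all released at once when the owning operation
// retires.
//
//     StdArena arena;
//     void*    pTable = arena.AllocAligned(256, 16);
//     void*    pTemp  = arena.Alloc(64);
//     arena.Reset();   // releases every allocation above in one shot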

View file

@ -1,420 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include "backends/gen_BackendPixelRate.hpp"
#include <algorithm>
//////////////////////////////////////////////////////////////////////////
/// @brief Process compute work.
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param threadGroupId - the linear index for the thread group within the dispatch.
void ProcessComputeBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t threadGroupId,
void*& pSpillFillBuffer,
void*& pScratchSpace)
{
SWR_CONTEXT* pContext = pDC->pContext;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEDispatch, pDC->drawId);
const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
SWR_ASSERT(pTaskData != nullptr);
// Ensure spill fill memory has been allocated.
size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
if (spillFillSize && pSpillFillBuffer == nullptr)
{
pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD16_BYTES);
}
size_t scratchSpaceSize =
pDC->pState->state.scratchSpaceSizePerWarp * pDC->pState->state.scratchSpaceNumWarps;
if (scratchSpaceSize && pScratchSpace == nullptr)
{
pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD16_BYTES);
}
const API_STATE& state = GetApiState(pDC);
SWR_CS_CONTEXT csContext{0};
csContext.tileCounter = threadGroupId;
csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
csContext.pTGSM = pContext->ppScratch[workerId];
csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
csContext.pScratchSpace = (uint8_t*)pScratchSpace;
csContext.scratchSpacePerWarp = pDC->pState->state.scratchSpaceSizePerWarp;
state.pfnCsFunc(GetPrivateState(pDC),
pContext->threadPool.pThreadData[workerId].pWorkerPrivateData,
&csContext);
UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
AR_EVENT(CSStats((HANDLE)&csContext.stats));
RDTSC_END(pDC->pContext->pBucketMgr, BEDispatch, 1);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Process shutdown.
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - Macro tile id (unused for shutdown).
/// @param pUserData - Pointer to user data (unused for shutdown).
void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
// Dummy function
}
void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
uint32_t x, y;
MacroTileMgr::getTileIndices(macroTile, x, y);
SWR_ASSERT(x == 0 && y == 0);
}
void ProcessStoreTileBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroTile,
STORE_TILES_DESC* pDesc,
SWR_RENDERTARGET_ATTACHMENT attachment)
{
SWR_CONTEXT* pContext = pDC->pContext;
HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStoreTiles, pDC->drawId);
SWR_FORMAT srcFormat;
switch (attachment)
{
case SWR_ATTACHMENT_COLOR0:
case SWR_ATTACHMENT_COLOR1:
case SWR_ATTACHMENT_COLOR2:
case SWR_ATTACHMENT_COLOR3:
case SWR_ATTACHMENT_COLOR4:
case SWR_ATTACHMENT_COLOR5:
case SWR_ATTACHMENT_COLOR6:
case SWR_ATTACHMENT_COLOR7:
srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
break;
case SWR_ATTACHMENT_DEPTH:
srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
break;
case SWR_ATTACHMENT_STENCIL:
srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
break;
default:
SWR_INVALID("Unknown attachment: %d", attachment);
srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
break;
}
uint32_t x, y;
MacroTileMgr::getTileIndices(macroTile, x, y);
// Only need to store the hottile if it's been rendered to...
HOTTILE* pHotTile =
pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
if (pHotTile)
{
// clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
if (pHotTile->state == HOTTILE_CLEAR)
{
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
SWR_ASSERT(pfnClearTiles != nullptr);
pfnClearTiles(pDC,
hWorkerPrivateData,
attachment,
macroTile,
pHotTile->renderTargetArrayIndex,
pHotTile->clearData,
pDesc->rect);
}
if (pHotTile->state == HOTTILE_DIRTY ||
pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
{
int32_t destX = KNOB_MACROTILE_X_DIM * x;
int32_t destY = KNOB_MACROTILE_Y_DIM * y;
pContext->pfnStoreTile(pDC,
hWorkerPrivateData,
srcFormat,
attachment,
destX,
destY,
pHotTile->renderTargetArrayIndex,
pHotTile->pBuffer);
}
if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
{
if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY &&
pHotTile->state == HOTTILE_RESOLVED))
{
pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
}
}
}
RDTSC_END(pDC->pContext->pBucketMgr, BEStoreTiles, 1);
}
void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pData;
unsigned long rt = 0;
uint32_t mask = pDesc->attachmentMask;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
}
}
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroTile,
void* pData)
{
DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pData;
SWR_CONTEXT* pContext = pDC->pContext;
const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
{
if (pDesc->attachmentMask & (1 << i))
{
HOTTILE* pHotTile =
pContext->pHotTileMgr->GetHotTileNoLoad(pContext,
pDC,
macroTile,
(SWR_RENDERTARGET_ATTACHMENT)i,
pDesc->createNewTiles,
numSamples);
if (pHotTile)
{
HOTTILE_STATE newState = (HOTTILE_STATE)pDesc->newTileState;
if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_CLEAR)
{
if (newState == HOTTILE_INVALID)
{
// This is OK for APIs that explicitly allow discards
// (for e.g. depth / stencil data)
//SWR_INVALID("Discarding valid data!");
}
}
pHotTile->state = newState;
}
}
}
}
template <uint32_t sampleCountT>
void BackendNullPS(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t x,
uint32_t y,
SWR_TRIANGLE_DESC& work,
RenderOutputBuffers& renderBuffers)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BENullBackend, pDC->drawId);
///@todo: handle center multisample pattern
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
const API_STATE& state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
uint8_t *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
SWR_PS_CONTEXT psContext;
// No pixel shader is bound, so there is no SetupPixelShaderContext call here.
RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
// iterate over active samples
unsigned long sample = 0;
uint32_t sampleMask = state.blendState.sampleMask;
while (_BitScanForward(&sample, sampleMask))
{
sampleMask &= ~(1 << sample);
simdmask coverageMask = work.coverageMask[sample] & MASK;
if (coverageMask)
{
// offset depth/stencil buffers current sample
uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
"Unsupported depth hot tile format");
const simdscalar z =
_simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
// calculate per sample positions
psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));
CalcSampleBarycentrics(coeffs, psContext);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa,
coeffs.vZb,
coeffs.vZc,
psContext.vI.sample,
psContext.vJ.sample);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
// interpolate user clip distance if available
if (state.backendState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
work.pUserClipBuffer,
psContext.vI.sample,
psContext.vJ.sample);
}
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
simdscalar stencilPassMask = vCoverageMask;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
simdscalar depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthSample,
vCoverageMask,
pStencilSample,
&stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthSample,
depthPassMask,
vCoverageMask,
pStencilSample,
stencilPassMask);
RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
}
Endtile:
ATTR_UNUSED;
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
}
vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
}
RDTSC_END(pDC->pContext->pBucketMgr, BENullBackend, 0);
}
PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2] // canEarlyZ
= {};
PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2] // forcedSampleCount
[2] // canEarlyZ
= {};
PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT]
[2] // centroid
[2] // canEarlyZ
= {};
void InitBackendFuncTables()
{
InitBackendPixelRate();
InitBackendSingleFuncTable(gBackendSingleSample);
InitBackendSampleFuncTable(gBackendSampleRateTable);
gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS<SWR_MULTISAMPLE_1X>;
gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS<SWR_MULTISAMPLE_2X>;
gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS<SWR_MULTISAMPLE_4X>;
gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS<SWR_MULTISAMPLE_8X>;
gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>;
}

View file

@ -1,70 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.h
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "core/context.h"
#include "core/multisample.h"
#include "depthstencil.h"
#include "rdtsc_core.h"
void ProcessComputeBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t threadGroupId,
void*& pSpillFillBuffer,
void*& pScratchSpace);
void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroTile,
void* pData);
void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
typedef void (*PFN_CLEAR_TILES)(DRAW_CONTEXT*,
HANDLE hWorkerData,
SWR_RENDERTARGET_ATTACHMENT rt,
uint32_t,
uint32_t,
uint32_t[4],
const SWR_RECT& rect);
extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS];
extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2]; // canEarlyZ
extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2] // forcedSampleCount
[2] // canEarlyZ
;
extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2]; // canEarlyZ

View file

@ -1,308 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include <algorithm>
template <SWR_FORMAT format>
void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value)
{
auto lambda = [&](int32_t comp)
{
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
};
const uint32_t numIter =
(KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
for (uint32_t i = 0; i < numIter; ++i)
{
UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
}
}
template <SWR_FORMAT format>
INLINE void ClearMacroTile(DRAW_CONTEXT* pDC,
HANDLE hWorkerPrivateData,
SWR_RENDERTARGET_ATTACHMENT rt,
uint32_t macroTile,
uint32_t renderTargetArrayIndex,
uint32_t clear[4],
const SWR_RECT& rect)
{
// convert clear color to hottile format
// clear color is in RGBA float/uint32
simd16vector vClear;
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
{
simd16scalar vComp = _simd16_load1_ps((const float*)&clear[comp]);
if (FormatTraits<format>::isNormalized(comp))
{
vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
}
vComp = FormatTraits<format>::pack(comp, vComp);
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
}
uint32_t tileX, tileY;
MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
// Init to full macrotile
SWR_RECT clearTile = {
KNOB_MACROTILE_X_DIM * int32_t(tileX),
KNOB_MACROTILE_Y_DIM * int32_t(tileY),
KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
};
// intersect with clear rect
clearTile &= rect;
// translate to local hottile origin
clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM,
-int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
// Make maximums inclusive (needed for convert to raster tiles)
clearTile.xmax -= 1;
clearTile.ymax -= 1;
// convert to raster tiles
clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
// compute steps between raster tile samples / raster tiles / macro tile rows
const uint32_t rasterTileSampleStep =
KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
const uint32_t rasterTileStep =
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
HOTTILE* pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext,
pDC,
hWorkerPrivateData,
macroTile,
rt,
true,
numSamples,
renderTargetArrayIndex);
uint32_t rasterTileStartOffset =
(ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp>>(
pitch, clearTile.xmin, clearTile.ymin)) *
numSamples;
uint8_t* pRasterTileRow =
pHotTile->pBuffer +
rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ,
// FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
// loop over all raster tiles in the current hot tile
for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
{
uint8_t* pRasterTile = pRasterTileRow;
for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
{
for (int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
{
ClearRasterTile<format>(pRasterTile, vClear);
pRasterTile += rasterTileSampleStep;
}
}
pRasterTileRow += macroTileRowStep;
}
pHotTile->state = HOTTILE_DIRTY;
}
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
SWR_CONTEXT* pContext = pDC->pContext;
HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
if (KNOB_FAST_CLEAR)
{
CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
uint32_t numSamples = GetNumSamples(sampleCount);
SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId);
if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
{
unsigned long rt = 0;
uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
HOTTILE* pHotTile =
pContext->pHotTileMgr->GetHotTile(pContext,
pDC,
hWorkerPrivateData,
macroTile,
(SWR_RENDERTARGET_ATTACHMENT)rt,
true,
numSamples,
pClear->renderTargetArrayIndex);
// All we want to do here is to mark the hot tile as being in a "needs clear" state.
pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
pHotTile->state = HOTTILE_CLEAR;
}
}
if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
{
HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
pDC,
hWorkerPrivateData,
macroTile,
SWR_ATTACHMENT_DEPTH,
true,
numSamples,
pClear->renderTargetArrayIndex);
pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth;
pHotTile->state = HOTTILE_CLEAR;
}
if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
{
HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
pDC,
hWorkerPrivateData,
macroTile,
SWR_ATTACHMENT_STENCIL,
true,
numSamples,
pClear->renderTargetArrayIndex);
pHotTile->clearData[0] = pClear->clearStencil;
pHotTile->state = HOTTILE_CLEAR;
}
RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1);
}
else
{
// Legacy clear
CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId);
if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
{
uint32_t clearData[4];
clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
SWR_ASSERT(pfnClearTiles != nullptr);
unsigned long rt = 0;
uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
pfnClearTiles(pDC,
hWorkerPrivateData,
(SWR_RENDERTARGET_ATTACHMENT)rt,
macroTile,
pClear->renderTargetArrayIndex,
clearData,
pClear->rect);
}
}
if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
{
uint32_t clearData[4];
clearData[0] = *(uint32_t*)&pClear->clearDepth;
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
SWR_ASSERT(pfnClearTiles != nullptr);
pfnClearTiles(pDC,
hWorkerPrivateData,
SWR_ATTACHMENT_DEPTH,
macroTile,
pClear->renderTargetArrayIndex,
clearData,
pClear->rect);
}
if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
{
uint32_t clearData[4];
clearData[0] = pClear->clearStencil;
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
pfnClearTiles(pDC,
hWorkerPrivateData,
SWR_ATTACHMENT_STENCIL,
macroTile,
pClear->renderTargetArrayIndex,
clearData,
pClear->rect);
}
RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1);
}
}
void InitClearTilesTable()
{
memset(gClearTilesTable, 0, sizeof(gClearTilesTable));
gClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
gClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
gClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
gClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
}

File diff suppressed because it is too large

View file

@ -1,454 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include <algorithm>
template <typename T>
void BackendSampleRate(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t x,
uint32_t y,
SWR_TRIANGLE_DESC& work,
RenderOutputBuffers& renderBuffers)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESampleRateBackend, pDC->drawId);
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
const API_STATE& state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
SWR_PS_CONTEXT psContext;
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
SetupPixelShaderContext<T>(&psContext, samplePos, work);
uint8_t *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(psContext.pColorBuffer,
&pDepthBuffer,
&pStencilBuffer,
state.colorHottileEnable,
renderBuffers);
bool isTileDirty = false;
RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask =
(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
? &work.innerCoverageMask
: &work.coverageMask[0];
generateInputCoverage<T, T::InputCoverage>(
pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
}
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
CalcPixelBarycentrics(coeffs, psContext);
CalcCentroid<T, false>(
&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
{
simdmask coverageMask = work.coverageMask[sample] & MASK;
if (coverageMask)
{
// offset depth/stencil buffers current sample
uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
"Unsupported depth hot tile format");
const simdscalar z =
_simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
// calculate per sample positions
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
CalcSampleBarycentrics(coeffs, psContext);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa,
coeffs.vZb,
coeffs.vZc,
psContext.vI.sample,
psContext.vJ.sample);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
// interpolate user clip distance if available
if (state.backendState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
work.pUserClipBuffer,
psContext.vI.sample,
psContext.vJ.sample);
}
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
simdscalar depthPassMask = vCoverageMask;
simdscalar stencilPassMask = vCoverageMask;
// Early-Z?
if (T::bCanEarlyZ)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthSample,
vCoverageMask,
pStencilSample,
&stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
// write depth/stencil now if earlyZ is forced on or no samples passed; skip shading when nothing passed depth
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthSample,
depthPassMask,
vCoverageMask,
pStencilSample,
stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
{
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
continue;
}
}
}
psContext.sampleIndex = sample;
psContext.activeMask = _simd_castps_si(vCoverageMask);
// execute pixel shader
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
// update stats
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
AR_EVENT(PSStats((HANDLE)&psContext.stats));
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
if (_simd_movemask_ps(vCoverageMask))
{
isTileDirty = true;
}
// late-Z
if (!T::bCanEarlyZ)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthSample,
vCoverageMask,
pStencilSample,
&stencilPassMask);
AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
if (!_simd_movemask_ps(depthPassMask))
{
// depth/stencil write still needs to run so stencil writes take effect
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthSample,
depthPassMask,
vCoverageMask,
pStencilSample,
stencilPassMask);
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
continue;
}
}
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
// output merger
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
OutputMerger8x2(pDC,
psContext,
psContext.pColorBuffer,
sample,
&state.blendState,
state.pfnBlendFunc,
vCoverageMask,
depthPassMask,
state.psState.renderTargetMask,
useAlternateOffset,
workerId);
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthSample,
depthPassMask,
vCoverageMask,
pStencilSample,
stencilPassMask);
}
RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
}
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
Endtile:
ATTR_UNUSED;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
{
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
if (useAlternateOffset)
{
unsigned long rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{
rtMask &= ~(1 << rt);
psContext.pColorBuffer[rt] +=
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
}
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
}
if (isTileDirty)
{
SetRenderHotTilesDirty(pDC, renderBuffers);
}
RDTSC_END(pDC->pContext->pBucketMgr, BESampleRateBackend, 0);
}
// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
template <uint32_t... ArgsT>
struct BEChooserSampleRate
{
// Last Arg Terminator
static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
{
switch (tArg)
{
case SWR_BACKEND_MSAA_SAMPLE_RATE:
return BackendSampleRate<SwrBackendTraits<ArgsT...>>;
break;
case SWR_BACKEND_SINGLE_SAMPLE:
case SWR_BACKEND_MSAA_PIXEL_RATE:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
default:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_INPUT_COVERAGE_NONE:
return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
remainingArgs...);
break;
case SWR_INPUT_COVERAGE_NORMAL:
return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
remainingArgs...);
break;
case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
remainingArgs...);
break;
default:
SWR_ASSERT(0 && "Invalid sample pattern\n");
return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_MULTISAMPLE_1X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_2X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_4X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_8X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_16X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
break;
default:
SWR_ASSERT(0 && "Invalid sample count\n");
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
{
if (tArg == true)
{
return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
}
return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
}
};
void InitBackendSampleFuncTable(
PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
{
for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT;
sampleCount++)
{
for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
{
for (uint32_t centroid = 0; centroid < 2; centroid++)
{
for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
{
table[sampleCount][inputCoverage][centroid][canEarlyZ] =
BEChooserSampleRate<>::GetFunc(
(SWR_MULTISAMPLE_COUNT)sampleCount,
false,
(SWR_INPUT_COVERAGE)inputCoverage,
(centroid > 0),
false,
(canEarlyZ > 0),
(SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
}
}
}
}
}
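// A minimal standalone sketch of the chooser pattern above. The
// Coverage/Traits/Chooser names are hypothetical stand-ins for the SWR types
// (not part of the original source); each GetFunc overload peels one runtime
// argument off the front and recurses with it appended as a compile-time
// template argument until only the statically specialized function remains.
#include <cstdio>

enum Coverage : unsigned { COV_NONE = 0, COV_NORMAL = 1 };

using PFN = void (*)();

// Stand-in for SwrBackendTraits: a distinct Func per compile-time arg pack.
template <unsigned... Args>
struct Traits
{
    static void Func() { std::puts("statically specialized backend"); }
};

template <unsigned... Args>
struct Chooser
{
    // Last-arg terminator: every runtime value is now in Args...
    static PFN GetFunc() { return &Traits<Args...>::Func; }

    // Convert one runtime enum to a template argument and recurse.
    template <typename... Rest>
    static PFN GetFunc(Coverage c, Rest... rest)
    {
        switch (c)
        {
        case COV_NORMAL:
            return Chooser<Args..., COV_NORMAL>::GetFunc(rest...);
        default:
            return Chooser<Args..., COV_NONE>::GetFunc(rest...);
        }
    }
};

int main()
{
    PFN pfn = Chooser<>::GetFunc(COV_NORMAL); // runtime value in...
    pfn(); // ...compile-time specialization out: Traits<COV_NORMAL>::Func
}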


@@ -1,428 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include <algorithm>
template <typename T>
void BackendSingleSample(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t x,
uint32_t y,
SWR_TRIANGLE_DESC& work,
RenderOutputBuffers& renderBuffers)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESingleSampleBackend, pDC->drawId);
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
const API_STATE& state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
SWR_PS_CONTEXT psContext;
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
SetupPixelShaderContext<T>(&psContext, samplePos, work);
uint8_t *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(psContext.pColorBuffer,
&pDepthBuffer,
&pStencilBuffer,
state.colorHottileEnable,
renderBuffers);
// Indicates backend rendered something to the color buffer
bool isTileDirty = false;
RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 1);
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
simdmask coverageMask = work.coverageMask[0] & MASK;
if (coverageMask)
{
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
"Unsupported depth hot tile format");
const simdscalar z =
_simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask =
(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
? &work.innerCoverageMask
: &work.coverageMask[0];
generateInputCoverage<T, T::InputCoverage>(
pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
}
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
CalcPixelBarycentrics(coeffs, psContext);
CalcCentroid<T, true>(
&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
// interpolate and quantize z
psContext.vZ = vplaneps(
coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 1);
// interpolate user clip distance if available
if (state.backendState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
work.pUserClipBuffer,
psContext.vI.center,
psContext.vJ.center);
}
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
simdscalar depthPassMask = vCoverageMask;
simdscalar stencilPassMask = vCoverageMask;
// Early-Z?
if (T::bCanEarlyZ)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthBuffer,
vCoverageMask,
pStencilBuffer,
&stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
// write depth/stencil now if earlyZ is forced on or no pixels passed; skip shading when nothing passed depth
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthBuffer,
depthPassMask,
vCoverageMask,
pStencilBuffer,
stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
{
goto Endtile;
}
}
}
psContext.sampleIndex = 0;
psContext.activeMask = _simd_castps_si(vCoverageMask);
// execute pixel shader
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
// update stats
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
AR_EVENT(PSStats((HANDLE)&psContext.stats));
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
if (_simd_movemask_ps(vCoverageMask))
{
isTileDirty = true;
}
// late-Z
if (!T::bCanEarlyZ)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthBuffer,
vCoverageMask,
pStencilBuffer,
&stencilPassMask);
AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
if (!_simd_movemask_ps(depthPassMask))
{
// depth/stencil write still needs to run so stencil writes take effect
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthBuffer,
depthPassMask,
vCoverageMask,
pStencilBuffer,
stencilPassMask);
goto Endtile;
}
}
else
{
// for early z, consolidate discards from shader
// into depthPassMask
depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
}
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
// output merger
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
OutputMerger8x2(pDC,
psContext,
psContext.pColorBuffer,
0,
&state.blendState,
state.pfnBlendFunc,
vCoverageMask,
depthPassMask,
state.psState.renderTargetMask,
useAlternateOffset,
workerId);
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthBuffer,
depthPassMask,
vCoverageMask,
pStencilBuffer,
stencilPassMask);
}
RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
}
Endtile:
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
{
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
if (useAlternateOffset)
{
unsigned long rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{
rtMask &= ~(1 << rt);
psContext.pColorBuffer[rt] +=
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
}
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
}
if (isTileDirty)
{
SetRenderHotTilesDirty(pDC, renderBuffers);
}
RDTSC_END(pDC->pContext->pBucketMgr, BESingleSampleBackend, 0);
}
// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
template <uint32_t... ArgsT>
struct BEChooserSingleSample
{
// Last Arg Terminator
static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
{
switch (tArg)
{
case SWR_BACKEND_SINGLE_SAMPLE:
return BackendSingleSample<SwrBackendTraits<ArgsT...>>;
break;
case SWR_BACKEND_MSAA_PIXEL_RATE:
case SWR_BACKEND_MSAA_SAMPLE_RATE:
default:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_INPUT_COVERAGE_NONE:
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
remainingArgs...);
break;
case SWR_INPUT_COVERAGE_NORMAL:
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
remainingArgs...);
break;
case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
remainingArgs...);
break;
default:
SWR_ASSERT(0 && "Invalid sample pattern\n");
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_MULTISAMPLE_1X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_2X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_4X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_8X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_16X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
break;
default:
SWR_ASSERT(0 && "Invalid sample count\n");
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
{
if (tArg == true)
{
return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
}
return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
}
};
void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
{
for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
{
for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
{
for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
{
table[inputCoverage][isCentroid][canEarlyZ] =
BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X,
false,
(SWR_INPUT_COVERAGE)inputCoverage,
(isCentroid > 0),
false,
(canEarlyZ > 0),
SWR_BACKEND_SINGLE_SAMPLE);
}
}
}
}
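// At draw time the table built above is indexed with current state to fetch
// a pre-specialized backend. A hedged sketch of that lookup; the helper name
// is hypothetical and the real SWR state plumbing differs:
static PFN_BACKEND_FUNC SelectSingleSampleBackend(
    PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2],
    SWR_INPUT_COVERAGE coverage,
    bool usesCentroid,
    bool canEarlyZ)
{
    return table[coverage][usesCentroid ? 1 : 0][canEarlyZ ? 1 : 0];
}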


@@ -1,57 +0,0 @@
# Copyright © 2017-2018 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
files_swr_common += custom_target(
'gen_backend_pixel',
input : swr_gen_backends_py,
output : [
'gen_BackendPixelRate0.cpp', 'gen_BackendPixelRate1.cpp',
'gen_BackendPixelRate2.cpp', 'gen_BackendPixelRate3.cpp',
'gen_BackendPixelRate.hpp',
],
command : [
prog_python, '@INPUT@',
'--outdir', '@OUTDIR@',
'--dim', '5', '2', '3', '2', '2', '2',
'--numfiles', '4',
'--cpp', '--hpp',
],
depend_files : [ swr_gen_backend_files, swr_gen_header_init_files ],
)
files_swr_common += custom_target(
'gen_backend_raster',
input : swr_gen_backends_py,
output : [
'gen_rasterizer0.cpp', 'gen_rasterizer1.cpp',
'gen_rasterizer2.cpp', 'gen_rasterizer3.cpp',
'gen_rasterizer.hpp',
],
command : [
prog_python, '@INPUT@',
'--outdir', '@OUTDIR@',
'--rast',
'--dim', '5', '2', '2', '3', '5', '2',
'--numfiles', '4',
'--cpp', '--hpp',
],
depend_files : [ swr_gen_rasterizer_files, swr_gen_header_init_files ],
)

File diff suppressed because it is too large


@@ -1,254 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file binner.h
*
* @brief Declaration for the macrotile binner
*
******************************************************************************/
#include "state.h"
#include "conservativeRast.h"
#include "utils.h"
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
/// raster state.
///
/// Can't use a variable template because we must stick to C++11 features;
/// variable templates were introduced in C++14.
template <typename SIMD_T>
struct SwrPixelOffsets
{
public:
INLINE static Float<SIMD_T> GetOffset(uint32_t loc)
{
SWR_ASSERT(loc <= 1);
return SIMD_T::set1_ps(loc ? 0.5f : 0.0f);
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Convert the X,Y coords of a triangle to the requested Fixed
/// Point precision from FP32.
template <typename SIMD_T, typename PT = FixedPointTraits<Fixed_16_8>>
INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T>& vIn)
{
return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value)));
}
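// A scalar worked example of the conversion above, assuming the default
// Fixed_16_8 traits (ScaleT = 256): 1.5f maps to 1.5 * 256 = 384 (0x180).
// cvtps_epi32 rounds to nearest; lrintf does the same in the default FP
// environment. Sketch only, not part of the original source.
#include <cassert>
#include <cmath>
#include <cstdint>

static int32_t fpToFixed_16_8(float v)
{
    return static_cast<int32_t>(std::lrintf(v * 256.0f));
}

int main()
{
    assert(fpToFixed_16_8(1.5f) == 384);   //  1.5  * 256
    assert(fpToFixed_16_8(-0.25f) == -64); // -0.25 * 256
}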
//////////////////////////////////////////////////////////////////////////
/// @brief Helper function to set the X,Y coords of a triangle to the
/// requested Fixed Point precision from FP32.
/// @param tri: simdvector[3] of FP triangle verts
/// @param vXi: fixed point X coords of tri verts
/// @param vYi: fixed point Y coords of tri verts
template <typename SIMD_T>
INLINE static void
FPToFixedPoint(const Vec4<SIMD_T>* const tri, Integer<SIMD_T> (&vXi)[3], Integer<SIMD_T> (&vYi)[3])
{
vXi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].x);
vYi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].y);
vXi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].x);
vYi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].y);
vXi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].x);
vYi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].y);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Calculate bounding box for current triangle
/// @tparam CT: ConservativeRastFETraits type
/// @param vX: fixed point X position for triangle verts
/// @param vY: fixed point Y position for triangle verts
/// @param bbox: fixed point bbox
/// *Note*: expects vX, vY to be in the correct precision for the type
/// of rasterization. This avoids unnecessary FP->fixed conversions.
template <typename SIMD_T, typename CT>
INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T> (&vX)[3],
const Integer<SIMD_T> (&vY)[3],
SIMDBBOX_T<SIMD_T>& bbox)
{
Integer<SIMD_T> vMinX = vX[0];
vMinX = SIMD_T::min_epi32(vMinX, vX[1]);
vMinX = SIMD_T::min_epi32(vMinX, vX[2]);
Integer<SIMD_T> vMaxX = vX[0];
vMaxX = SIMD_T::max_epi32(vMaxX, vX[1]);
vMaxX = SIMD_T::max_epi32(vMaxX, vX[2]);
Integer<SIMD_T> vMinY = vY[0];
vMinY = SIMD_T::min_epi32(vMinY, vY[1]);
vMinY = SIMD_T::min_epi32(vMinY, vY[2]);
Integer<SIMD_T> vMaxY = vY[0];
vMaxY = SIMD_T::max_epi32(vMaxY, vY[1]);
vMaxY = SIMD_T::max_epi32(vMaxY, vY[2]);
if (CT::BoundingBoxOffsetT::value != 0)
{
/// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for
/// conservative rasterization. Expand bbox by 1/256; coverage will be
/// correctly handled in the rasterizer.
const Integer<SIMD_T> value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value);
vMinX = SIMD_T::sub_epi32(vMinX, value);
vMaxX = SIMD_T::add_epi32(vMaxX, value);
vMinY = SIMD_T::sub_epi32(vMinY, value);
vMaxY = SIMD_T::add_epi32(vMaxY, value);
}
bbox.xmin = vMinX;
bbox.xmax = vMaxX;
bbox.ymin = vMinY;
bbox.ymax = vMaxY;
}
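// A scalar sketch of the expansion branch above (illustration only): when
// CT::BoundingBoxOffsetT::value is nonzero (1 for the conservative traits in
// conservativeRast.h), every bbox edge moves outward by that many fixed-point
// units, and the whole branch folds away for standard rasterization.
struct BBoxScalar { int32_t xmin, ymin, xmax, ymax; };

template <int32_t Offset>
BBoxScalar ExpandBBox(BBoxScalar b)
{
    if (Offset != 0) // compile-time constant: dead code when Offset == 0
    {
        b.xmin -= Offset; b.ymin -= Offset;
        b.xmax += Offset; b.ymax += Offset;
    }
    return b;
}
// ExpandBBox<1>({0, 0, 256, 256}) yields {-1, -1, 257, 257}.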
//////////////////////////////////////////////////////////////////////////
/// @brief Gather scissor rect data based on per-prim viewport indices.
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
//
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
const uint32_t* pViewportIndex,
simdscalari& scisXmin,
simdscalari& scisYmin,
simdscalari& scisXmax,
simdscalari& scisYmax)
{
scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmin,
pScissorsInFixedPoint[pViewportIndex[6]].xmin,
pScissorsInFixedPoint[pViewportIndex[5]].xmin,
pScissorsInFixedPoint[pViewportIndex[4]].xmin,
pScissorsInFixedPoint[pViewportIndex[3]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[0]].xmin);
scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymin,
pScissorsInFixedPoint[pViewportIndex[6]].ymin,
pScissorsInFixedPoint[pViewportIndex[5]].ymin,
pScissorsInFixedPoint[pViewportIndex[4]].ymin,
pScissorsInFixedPoint[pViewportIndex[3]].ymin,
pScissorsInFixedPoint[pViewportIndex[2]].ymin,
pScissorsInFixedPoint[pViewportIndex[1]].ymin,
pScissorsInFixedPoint[pViewportIndex[0]].ymin);
scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmax,
pScissorsInFixedPoint[pViewportIndex[6]].xmax,
pScissorsInFixedPoint[pViewportIndex[5]].xmax,
pScissorsInFixedPoint[pViewportIndex[4]].xmax,
pScissorsInFixedPoint[pViewportIndex[3]].xmax,
pScissorsInFixedPoint[pViewportIndex[2]].xmax,
pScissorsInFixedPoint[pViewportIndex[1]].xmax,
pScissorsInFixedPoint[pViewportIndex[0]].xmax);
scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymax,
pScissorsInFixedPoint[pViewportIndex[6]].ymax,
pScissorsInFixedPoint[pViewportIndex[5]].ymax,
pScissorsInFixedPoint[pViewportIndex[4]].ymax,
pScissorsInFixedPoint[pViewportIndex[3]].ymax,
pScissorsInFixedPoint[pViewportIndex[2]].ymax,
pScissorsInFixedPoint[pViewportIndex[1]].ymax,
pScissorsInFixedPoint[pViewportIndex[0]].ymax);
}
static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
const uint32_t* pViewportIndex,
simd16scalari& scisXmin,
simd16scalari& scisYmin,
simd16scalari& scisXmax,
simd16scalari& scisYmax)
{
scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmin,
pScissorsInFixedPoint[pViewportIndex[14]].xmin,
pScissorsInFixedPoint[pViewportIndex[13]].xmin,
pScissorsInFixedPoint[pViewportIndex[12]].xmin,
pScissorsInFixedPoint[pViewportIndex[11]].xmin,
pScissorsInFixedPoint[pViewportIndex[10]].xmin,
pScissorsInFixedPoint[pViewportIndex[9]].xmin,
pScissorsInFixedPoint[pViewportIndex[8]].xmin,
pScissorsInFixedPoint[pViewportIndex[7]].xmin,
pScissorsInFixedPoint[pViewportIndex[6]].xmin,
pScissorsInFixedPoint[pViewportIndex[5]].xmin,
pScissorsInFixedPoint[pViewportIndex[4]].xmin,
pScissorsInFixedPoint[pViewportIndex[3]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[0]].xmin);
scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymin,
pScissorsInFixedPoint[pViewportIndex[14]].ymin,
pScissorsInFixedPoint[pViewportIndex[13]].ymin,
pScissorsInFixedPoint[pViewportIndex[12]].ymin,
pScissorsInFixedPoint[pViewportIndex[11]].ymin,
pScissorsInFixedPoint[pViewportIndex[10]].ymin,
pScissorsInFixedPoint[pViewportIndex[9]].ymin,
pScissorsInFixedPoint[pViewportIndex[8]].ymin,
pScissorsInFixedPoint[pViewportIndex[7]].ymin,
pScissorsInFixedPoint[pViewportIndex[6]].ymin,
pScissorsInFixedPoint[pViewportIndex[5]].ymin,
pScissorsInFixedPoint[pViewportIndex[4]].ymin,
pScissorsInFixedPoint[pViewportIndex[3]].ymin,
pScissorsInFixedPoint[pViewportIndex[2]].ymin,
pScissorsInFixedPoint[pViewportIndex[1]].ymin,
pScissorsInFixedPoint[pViewportIndex[0]].ymin);
scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmax,
pScissorsInFixedPoint[pViewportIndex[14]].xmax,
pScissorsInFixedPoint[pViewportIndex[13]].xmax,
pScissorsInFixedPoint[pViewportIndex[12]].xmax,
pScissorsInFixedPoint[pViewportIndex[11]].xmax,
pScissorsInFixedPoint[pViewportIndex[10]].xmax,
pScissorsInFixedPoint[pViewportIndex[9]].xmax,
pScissorsInFixedPoint[pViewportIndex[8]].xmax,
pScissorsInFixedPoint[pViewportIndex[7]].xmax,
pScissorsInFixedPoint[pViewportIndex[6]].xmax,
pScissorsInFixedPoint[pViewportIndex[5]].xmax,
pScissorsInFixedPoint[pViewportIndex[4]].xmax,
pScissorsInFixedPoint[pViewportIndex[3]].xmax,
pScissorsInFixedPoint[pViewportIndex[2]].xmax,
pScissorsInFixedPoint[pViewportIndex[1]].xmax,
pScissorsInFixedPoint[pViewportIndex[0]].xmax);
scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymax,
pScissorsInFixedPoint[pViewportIndex[14]].ymax,
pScissorsInFixedPoint[pViewportIndex[13]].ymax,
pScissorsInFixedPoint[pViewportIndex[12]].ymax,
pScissorsInFixedPoint[pViewportIndex[11]].ymax,
pScissorsInFixedPoint[pViewportIndex[10]].ymax,
pScissorsInFixedPoint[pViewportIndex[9]].ymax,
pScissorsInFixedPoint[pViewportIndex[8]].ymax,
pScissorsInFixedPoint[pViewportIndex[7]].ymax,
pScissorsInFixedPoint[pViewportIndex[6]].ymax,
pScissorsInFixedPoint[pViewportIndex[5]].ymax,
pScissorsInFixedPoint[pViewportIndex[4]].ymax,
pScissorsInFixedPoint[pViewportIndex[3]].ymax,
pScissorsInFixedPoint[pViewportIndex[2]].ymax,
pScissorsInFixedPoint[pViewportIndex[1]].ymax,
pScissorsInFixedPoint[pViewportIndex[0]].ymax);
}
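// Both overloads above are hand-unrolled SIMD gathers; per lane the work is
// the same indexed load. A scalar sketch (xmin shown; the other three fields
// are identical; not part of the original source):
static void GatherScissorXminScalar(const SWR_RECT* pScissorsInFixedPoint,
                                    const uint32_t* pViewportIndex,
                                    int32_t* pScisXmin, // one element per lane
                                    uint32_t numLanes)
{
    for (uint32_t lane = 0; lane < numLanes; ++lane)
    {
        pScisXmin[lane] = pScissorsInFixedPoint[pViewportIndex[lane]].xmin;
    }
}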


@@ -1,348 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file blend.cpp
*
* @brief Implementation for blending operations.
*
******************************************************************************/
#include "state.h"
template <bool Color, bool Alpha>
INLINE void GenerateBlendFactor(SWR_BLEND_FACTOR func,
simdvector& constantColor,
simdvector& src,
simdvector& src1,
simdvector& dst,
simdvector& out)
{
simdvector result;
switch (func)
{
case BLENDFACTOR_ZERO:
result.x = _simd_setzero_ps();
result.y = _simd_setzero_ps();
result.z = _simd_setzero_ps();
result.w = _simd_setzero_ps();
break;
case BLENDFACTOR_ONE:
result.x = _simd_set1_ps(1.0);
result.y = _simd_set1_ps(1.0);
result.z = _simd_set1_ps(1.0);
result.w = _simd_set1_ps(1.0);
break;
case BLENDFACTOR_SRC_COLOR:
result = src;
break;
case BLENDFACTOR_DST_COLOR:
result = dst;
break;
case BLENDFACTOR_INV_SRC_COLOR:
result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x);
result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y);
result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z);
result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
break;
case BLENDFACTOR_INV_DST_COLOR:
result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x);
result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y);
result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z);
result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
break;
case BLENDFACTOR_SRC_ALPHA:
result.x = src.w;
result.y = src.w;
result.z = src.w;
result.w = src.w;
break;
case BLENDFACTOR_INV_SRC_ALPHA:
{
simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
result.x = oneMinusSrcA;
result.y = oneMinusSrcA;
result.z = oneMinusSrcA;
result.w = oneMinusSrcA;
break;
}
case BLENDFACTOR_DST_ALPHA:
result.x = dst.w;
result.y = dst.w;
result.z = dst.w;
result.w = dst.w;
break;
case BLENDFACTOR_INV_DST_ALPHA:
{
simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
result.x = oneMinusDstA;
result.y = oneMinusDstA;
result.z = oneMinusDstA;
result.w = oneMinusDstA;
break;
}
case BLENDFACTOR_SRC_ALPHA_SATURATE:
{
simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w));
result.x = sat;
result.y = sat;
result.z = sat;
result.w = _simd_set1_ps(1.0);
break;
}
case BLENDFACTOR_CONST_COLOR:
result.x = constantColor[0];
result.y = constantColor[1];
result.z = constantColor[2];
result.w = constantColor[3];
break;
case BLENDFACTOR_CONST_ALPHA:
result.x = result.y = result.z = result.w = constantColor[3];
break;
case BLENDFACTOR_INV_CONST_COLOR:
{
result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]);
result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]);
result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]);
result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
break;
}
case BLENDFACTOR_INV_CONST_ALPHA:
{
result.x = result.y = result.z = result.w =
_simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
break;
}
case BLENDFACTOR_SRC1_COLOR:
result.x = src1.x;
result.y = src1.y;
result.z = src1.z;
result.w = src1.w;
break;
case BLENDFACTOR_SRC1_ALPHA:
result.x = result.y = result.z = result.w = src1.w;
break;
case BLENDFACTOR_INV_SRC1_COLOR:
result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x);
result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y);
result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z);
result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
break;
case BLENDFACTOR_INV_SRC1_ALPHA:
result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
break;
default:
SWR_INVALID("Unimplemented blend factor: %d", func);
}
if (Color)
{
out.x = result.x;
out.y = result.y;
out.z = result.z;
}
if (Alpha)
{
out.w = result.w;
}
}
template <bool Color, bool Alpha>
INLINE void BlendFunc(SWR_BLEND_OP blendOp,
simdvector& src,
simdvector& srcFactor,
simdvector& dst,
simdvector& dstFactor,
simdvector& out)
{
simdvector result;
switch (blendOp)
{
case BLENDOP_ADD:
result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
break;
case BLENDOP_SUBTRACT:
result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
break;
case BLENDOP_REVSUBTRACT:
result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x));
result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y));
result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z));
result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w));
break;
case BLENDOP_MIN:
result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
break;
case BLENDOP_MAX:
result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
break;
default:
SWR_INVALID("Unimplemented blend function: %d", blendOp);
}
if (Color)
{
out.x = result.x;
out.y = result.y;
out.z = result.z;
}
if (Alpha)
{
out.w = result.w;
}
}
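// The classic src-alpha "over" blend is one concrete instance of the two
// routines above: BLENDFACTOR_SRC_ALPHA and BLENDFACTOR_INV_SRC_ALPHA fed
// into BLENDOP_ADD give out = src * srcA + dst * (1 - srcA). A scalar
// per-channel sketch (illustration only):
static float BlendOverChannel(float src, float dst, float srcAlpha)
{
    float srcFactor = srcAlpha;               // BLENDFACTOR_SRC_ALPHA
    float dstFactor = 1.0f - srcAlpha;        // BLENDFACTOR_INV_SRC_ALPHA
    return srcFactor * src + dstFactor * dst; // BLENDOP_ADD
}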
template <SWR_TYPE type>
INLINE void Clamp(simdvector& src)
{
switch (type)
{
case SWR_TYPE_FLOAT:
break;
case SWR_TYPE_UNORM:
src.x = _simd_max_ps(src.x, _simd_setzero_ps());
src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
src.y = _simd_max_ps(src.y, _simd_setzero_ps());
src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
src.z = _simd_max_ps(src.z, _simd_setzero_ps());
src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
src.w = _simd_max_ps(src.w, _simd_setzero_ps());
src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
break;
case SWR_TYPE_SNORM:
src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f));
src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
src.y = _simd_max_ps(src.y, _simd_set1_ps(-1.0f));
src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f));
src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f));
src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
break;
default:
SWR_INVALID("Unimplemented clamp: %d", type);
break;
}
}
template <SWR_TYPE type>
void Blend(const SWR_BLEND_STATE* pBlendState,
const SWR_RENDER_TARGET_BLEND_STATE* pState,
simdvector& src,
simdvector& src1,
uint8_t* pDst,
simdvector& result)
{
// load render target
simdvector dst;
LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dst);
simdvector constColor;
constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]);
constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]);
constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]);
constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]);
// clamp src/dst/constant
Clamp<type>(src);
Clamp<type>(src1);
Clamp<type>(dst);
Clamp<type>(constColor);
simdvector srcFactor, dstFactor;
if (pBlendState->independentAlphaBlendEnable)
{
GenerateBlendFactor<true, false>(
(SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor,
constColor,
src,
src1,
dst,
srcFactor);
GenerateBlendFactor<true, false>(
(SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
GenerateBlendFactor<false, true>(
(SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor);
BlendFunc<true, false>(
(SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
BlendFunc<false, true>(
(SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
}
else
{
GenerateBlendFactor<true, true>(
(SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
GenerateBlendFactor<true, true>(
(SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
BlendFunc<true, true>(
(SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
}
}


@@ -1,336 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file clip.cpp
*
* @brief Implementation for clipping
*
******************************************************************************/
#include <assert.h>
#include "common/os.h"
#include "core/clip.h"
float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
{
return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
}
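// Worked example for the near plane, where the boundary coordinate is simply
// z: an edge running from z0 = -1.0 (outside, since z < 0 is clipped) to
// z1 = 3.0 (inside) intersects a quarter of the way along the edge.
// Illustration only, using ComputeInterpFactor as defined above:
static void InterpFactorExample()
{
    // -1 / (-1 - 3) == 0.25, exactly representable in binary floating point.
    assert(ComputeInterpFactor(-1.0f, 3.0f) == 0.25f);
}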
template <SWR_CLIPCODES ClippingPlane>
inline void intersect(
int s, // index to first edge vertex v0 in pInPts.
int p, // index to second edge vertex v1 in pInPts.
const float* pInPts, // array of all the input positions.
const float* pInAttribs, // array of all attributes for all vertex. All the attributes for each
// vertex is contiguous.
int numInAttribs, // number of attributes per vertex.
int i, // output index.
float* pOutPts, // array of output positions. We'll write our new intersection point at i*4.
float* pOutAttribs) // array of output attributes. We'll write our new attributes at
// i*numInAttribs.
{
float t;
// Find the parameter of the intersection.
// t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc.
const float* v1 = &pInPts[s * 4];
const float* v2 = &pInPts[p * 4];
switch (ClippingPlane)
{
case FRUSTUM_LEFT:
t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]);
break;
case FRUSTUM_RIGHT:
t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]);
break;
case FRUSTUM_TOP:
t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]);
break;
case FRUSTUM_BOTTOM:
t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]);
break;
case FRUSTUM_NEAR:
t = ComputeInterpFactor(v1[2], v2[2]);
break;
case FRUSTUM_FAR:
t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]);
break;
default:
SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
};
const float* a1 = &pInAttribs[s * numInAttribs];
const float* a2 = &pInAttribs[p * numInAttribs];
float* pOutP = &pOutPts[i * 4];
float* pOutA = &pOutAttribs[i * numInAttribs];
// Interpolate new position.
for (int j = 0; j < 4; ++j)
{
pOutP[j] = v1[j] + (v2[j] - v1[j]) * t;
}
// Interpolate Attributes
for (int attr = 0; attr < numInAttribs; ++attr)
{
pOutA[attr] = a1[attr] + (a2[attr] - a1[attr]) * t;
}
}
// Checks whether vertex v lies inside the clipping plane;
// in homogeneous coords, check -w <= {x,y,z} <= w.
//
template <SWR_CLIPCODES ClippingPlane>
inline int inside(const float v[4])
{
switch (ClippingPlane)
{
case FRUSTUM_LEFT:
return (v[0] >= -v[3]);
case FRUSTUM_RIGHT:
return (v[0] <= v[3]);
case FRUSTUM_TOP:
return (v[1] >= -v[3]);
case FRUSTUM_BOTTOM:
return (v[1] <= v[3]);
case FRUSTUM_NEAR:
return (v[2] >= 0.0f);
case FRUSTUM_FAR:
return (v[2] <= v[3]);
default:
SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
return 0;
}
}
// Clips a polygon in homogeneous coordinates to a particular clipping plane.
// Takes in vertices of the polygon (pInPts) and the clipping plane.
// Puts the vertices of the clipped polygon in pOutPts.
// Returns the number of vertices in the clipped polygon.
//
template <SWR_CLIPCODES ClippingPlane>
int ClipTriToPlane(const float* pInPts,
int numInPts,
const float* pInAttribs,
int numInAttribs,
float* pOutPts,
float* pOutAttribs)
{
int i = 0; // number of vertices written to pOutPts so far; each vertex occupies 4 floats
for (int j = 0; j < numInPts; ++j)
{
int s = j;
int p = (j + 1) % numInPts;
int s_in = inside<ClippingPlane>(&pInPts[s * 4]);
int p_in = inside<ClippingPlane>(&pInPts[p * 4]);
// test if vertex is to be added to output vertices
if (s_in != p_in) // edge crosses clipping plane
{
// find point of intersection
intersect<ClippingPlane>(
s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs);
i++;
}
if (p_in) // 2nd vertex is inside clipping volume, add it to output
{
// Copy 2nd vertex position of edge over to output.
for (int k = 0; k < 4; ++k)
{
pOutPts[i * 4 + k] = pInPts[p * 4 + k];
}
// Copy 2nd vertex attributes of edge over to output.
for (int attr = 0; attr < numInAttribs; ++attr)
{
pOutAttribs[i * numInAttribs + attr] = pInAttribs[p * numInAttribs + attr];
}
i++;
}
// edge does not cross clipping plane and vertex outside clipping volume
// => do not add vertex
}
return i;
}
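// A usage sketch for the routine above (hypothetical test, not part of the
// original source): a triangle with one vertex behind the near plane clips
// to a quad, since the outside vertex is replaced by two intersections.
static int ClipTriExample()
{
    float inPts[3 * 4] = {
        0.0f, 0.0f, -1.0f, 1.0f, // v0: z < 0, outside the near plane
        1.0f, 0.0f,  1.0f, 1.0f, // v1: inside
        0.0f, 1.0f,  1.0f, 1.0f, // v2: inside
    };
    float inAttribs[3] = { 0.0f, 0.5f, 1.0f }; // one attribute per vertex
    float outPts[4 * 4];   // worst case for one plane: numInPts + 1 vertices
    float outAttribs[4];
    // Returns 4: intersection(v0,v1), v1, v2, intersection(v2,v0).
    return ClipTriToPlane<FRUSTUM_NEAR>(inPts, 3, inAttribs, 1, outPts, outAttribs);
}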
void ClipRectangles(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primId,
simdscalari const& viewportIdx,
simdscalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId);
Clipper<SIMD256, 3> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1);
}
void ClipTriangles(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primId,
simdscalari const& viewportIdx,
simdscalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId);
Clipper<SIMD256, 3> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1);
}
void ClipLines(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primId,
simdscalari const& viewportIdx,
simdscalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId);
Clipper<SIMD256, 2> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1);
}
void ClipPoints(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primId,
simdscalari const& viewportIdx,
simdscalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId);
Clipper<SIMD256, 1> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1);
}
#if USE_SIMD16_FRONTEND
void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primId,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId);
enum
{
VERTS_PER_PRIM = 3
};
Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
pa.useAlternateOffset = false;
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1);
}
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primId,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId);
enum
{
VERTS_PER_PRIM = 3
};
Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
pa.useAlternateOffset = false;
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1);
}
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primId,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId);
enum
{
VERTS_PER_PRIM = 2
};
Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
pa.useAlternateOffset = false;
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1);
}
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primId,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId);
enum
{
VERTS_PER_PRIM = 1
};
Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
pa.useAlternateOffset = false;
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1);
}
#endif

File diff suppressed because it is too large


@@ -1,229 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file conservativerast.h
*
******************************************************************************/
#pragma once
#include <type_traits>
#include "common/simdintrin.h"
enum FixedPointFmt
{
FP_UNINIT,
_16_8,
_16_9,
_X_16,
};
//////////////////////////////////////////////////////////////////////////
/// @brief convenience typedefs for supported Fixed Point precisions
typedef std::integral_constant<uint32_t, FP_UNINIT> Fixed_Uninit;
typedef std::integral_constant<uint32_t, _16_8> Fixed_16_8;
typedef std::integral_constant<uint32_t, _16_9> Fixed_16_9;
typedef std::integral_constant<uint32_t, _X_16> Fixed_X_16;
//////////////////////////////////////////////////////////////////////////
/// @struct FixedPointTraits
/// @brief holds constants relating to converting between FP and Fixed point
/// @tparam FT: fixed precision type
template <typename FT>
struct FixedPointTraits
{
};
//////////////////////////////////////////////////////////////////////////
/// @brief Fixed_16_8 specialization of FixedPointTraits
template <>
struct FixedPointTraits<Fixed_16_8>
{
/// multiplier to go from FP32 to Fixed Point 16.8
typedef std::integral_constant<uint32_t, 256> ScaleT;
/// number of bits to shift to go from 16.8 fixed => int32
typedef std::integral_constant<uint32_t, 8> BitsT;
typedef Fixed_16_8 TypeT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Fixed_16_9 specialization of FixedPointTraits
template <>
struct FixedPointTraits<Fixed_16_9>
{
/// multiplier to go from FP32 to Fixed Point 16.9
typedef std::integral_constant<uint32_t, 512> ScaleT;
/// number of bits to shift to go from 16.9 fixed => int32
typedef std::integral_constant<uint32_t, 9> BitsT;
typedef Fixed_16_9 TypeT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Fixed_X_16 specialization of FixedPointTraits
template <>
struct FixedPointTraits<Fixed_X_16>
{
/// multiplier to go from FP32 to Fixed Point X.16
typedef std::integral_constant<uint32_t, 65536> ScaleT;
/// number of bits to shift to go from X.16 fixed => int32
typedef std::integral_constant<uint32_t, 16> BitsT;
typedef Fixed_X_16 TypeT;
};
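//////////////////////////////////////////////////////////////////////////
/// Illustrative sketch (not part of the original header): how the ScaleT and
/// BitsT constants above are typically used to move between FP32 and fixed
/// point. The helper names ToFixed/FixedToInt are hypothetical.
template <typename FT>
INLINE int32_t ToFixed(float v)
{
    // e.g. ToFixed<Fixed_16_8>(1.5f) == 384, since 1.5 * 256 == 384
    return static_cast<int32_t>(v * FixedPointTraits<FT>::ScaleT::value);
}
template <typename FT>
INLINE int32_t FixedToInt(int32_t fixed)
{
    // drop the fractional bits: FixedToInt<Fixed_16_8>(384) == 1
    return fixed >> FixedPointTraits<FT>::BitsT::value;
}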
//////////////////////////////////////////////////////////////////////////
/// @brief convenience typedefs for conservative rasterization modes
typedef std::false_type StandardRastT;
typedef std::true_type ConservativeRastT;
//////////////////////////////////////////////////////////////////////////
/// @brief convenience typedefs for Input Coverage rasterization modes
typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NONE> NoInputCoverageT;
typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NORMAL> OuterConservativeCoverageT;
typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
InnerConservativeCoverageT;
//////////////////////////////////////////////////////////////////////////
/// @struct ConservativeRastFETraits
/// @brief primary ConservativeRastFETraits template. Shouldn't be instantiated
/// @tparam ConservativeT: type of conservative rasterization
template <typename ConservativeT>
struct ConservativeRastFETraits
{
};
//////////////////////////////////////////////////////////////////////////
/// @brief StandardRast specialization of ConservativeRastTraits
template <>
struct ConservativeRastFETraits<StandardRastT>
{
typedef std::false_type IsConservativeT;
typedef std::integral_constant<uint32_t, 0> BoundingBoxOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief ConservativeRastT specialization of ConservativeRastTraits
template <>
struct ConservativeRastFETraits<ConservativeRastT>
{
typedef std::true_type IsConservativeT;
typedef std::integral_constant<uint32_t, 1> BoundingBoxOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief convenience typedefs for ConservativeRastFETraits
typedef ConservativeRastFETraits<StandardRastT> FEStandardRastT;
typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT;
//////////////////////////////////////////////////////////////////////////
/// @struct ConservativeRastBETraits
/// @brief primary ConservativeRastBETraits template; when no specialization
/// matches, it defaults to standard rasterization behavior
/// @tparam ConservativeT: type of conservative rasterization
/// @tparam InputCoverageT: type of input coverage requested, if any
template <typename ConservativeT, typename _InputCoverageT>
struct ConservativeRastBETraits
{
typedef std::false_type IsConservativeT;
typedef _InputCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief StandardRastT specialization of ConservativeRastBETraits
template <typename _InputCoverageT>
struct ConservativeRastBETraits<StandardRastT, _InputCoverageT>
{
typedef std::false_type IsConservativeT;
typedef _InputCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief ConservativeRastT specialization of ConservativeRastBETraits
/// with no input coverage
template <>
struct ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT>
{
typedef std::true_type IsConservativeT;
typedef NoInputCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
/// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
/// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
/// of having to compare individual edges to pixel corners to check if any part of the
/// triangle intersects a pixel
typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
ConservativeEdgeOffsetT;
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
};
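// Worked example (not in the original source): for Fixed_16_9, ScaleT is 512,
// so the offset above evaluates to 512 / 2 + 1 == 257, i.e. half a pixel plus
// 1/512th of a pixel in 16.9 fixed point.
static_assert(ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT>::
                  ConservativeEdgeOffsetT::value == 257,
              "16.9 conservative edge offset should be half a pixel plus one ULP");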
//////////////////////////////////////////////////////////////////////////
/// @brief ConservativeRastT specialization of ConservativeRastBETraits
/// with OuterConservativeCoverage
template <>
struct ConservativeRastBETraits<ConservativeRastT, OuterConservativeCoverageT>
{
typedef std::true_type IsConservativeT;
typedef OuterConservativeCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
/// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
/// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
/// of having to compare individual edges to pixel corners to check if any part of the
/// triangle intersects a pixel
typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
ConservativeEdgeOffsetT;
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief ConservativeRastT specialization of ConservativeRastBETraits
/// with InnerConservativeCoverage
template <>
struct ConservativeRastBETraits<ConservativeRastT, InnerConservativeCoverageT>
{
typedef std::true_type IsConservativeT;
typedef InnerConservativeCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
/// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
/// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
/// of having to compare individual edges to pixel corners to check if any part of the
/// triangle intersects a pixel
typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
ConservativeEdgeOffsetT;
/// undo the outer conservative offset and offset the edge towards the pixel center by 1/2 pixel
/// + 1/512, in Fixed 16.9 precision. This allows the rasterizer to do the 3 edge coverage tests
/// against a single point, instead of having to compare individual edges to pixel corners to
/// check if a pixel is fully covered by a triangle
typedef std::integral_constant<int32_t,
static_cast<int32_t>(
-((ConservativePrecisionT::ScaleT::value / 2) + 1) -
ConservativeEdgeOffsetT::value)>
InnerConservativeEdgeOffsetT;
};

View file

@ -1,608 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file context.h
*
* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
* The SWR_CONTEXT is our global context and contains the DC ring,
* thread state, etc.
*
* The DRAW_CONTEXT contains all state associated with a draw operation.
*
******************************************************************************/
#pragma once
#include <condition_variable>
#include <algorithm>
#include "core/api.h"
#include "core/utils.h"
#include "core/arena.h"
#include "core/fifo.hpp"
#include "core/knobs.h"
#include "common/intrin.h"
#include "common/rdtsc_buckets.h"
#include "core/threads.h"
#include "ringbuffer.h"
#include "archrast/archrast.h"
// x.8 fixed point precision values
#define FIXED_POINT_SHIFT 8
#define FIXED_POINT_SCALE 256
// x.16 fixed point precision values
#define FIXED_POINT16_SHIFT 16
#define FIXED_POINT16_SCALE 65536
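// Illustrative helpers (not part of the original header) showing the intended
// use of the fixed point constants above; the names are hypothetical.
INLINE int32_t ToFixedPoint(float v)
{
    return static_cast<int32_t>(v * FIXED_POINT_SCALE); // 1.5f -> 384 in x.8
}
INLINE float FromFixedPoint(int32_t v)
{
    return static_cast<float>(v) / FIXED_POINT_SCALE; // 384 -> 1.5f
}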
struct SWR_CONTEXT;
struct DRAW_CONTEXT;
struct TRI_FLAGS
{
uint32_t frontFacing : 1;
uint32_t yMajor : 1;
uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
float pointSize;
uint32_t renderTargetArrayIndex;
uint32_t viewportIndex;
};
//////////////////////////////////////////////////////////////////////////
/// SWR_TRIANGLE_DESC
/////////////////////////////////////////////////////////////////////////
struct SWR_TRIANGLE_DESC
{
float I[3];
float J[3];
float Z[3];
float OneOverW[3];
float recipDet;
float* pRecipW;
float* pAttribs;
float* pPerspAttribs;
float* pSamplePos;
float* pUserClipBuffer;
uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if
// entire pixel is covered
uint64_t anyCoveredSamples;
TRI_FLAGS triFlags;
};
struct TRIANGLE_WORK_DESC
{
float* pTriBuffer;
float* pAttribs;
float* pUserClipBuffer;
uint32_t numAttribs;
TRI_FLAGS triFlags;
};
struct CLEAR_DESC
{
SWR_RECT rect;
uint32_t attachmentMask;
uint32_t renderTargetArrayIndex;
float clearRTColor[4]; // RGBA_32F
float clearDepth; // [0..1]
uint8_t clearStencil;
};
struct DISCARD_INVALIDATE_TILES_DESC
{
uint32_t attachmentMask;
SWR_RECT rect;
SWR_TILE_STATE newTileState;
bool createNewTiles;
bool fullTilesOnly;
};
struct SYNC_DESC
{
PFN_CALLBACK_FUNC pfnCallbackFunc;
uint64_t userData;
uint64_t userData2;
uint64_t userData3;
};
struct STORE_TILES_DESC
{
uint32_t attachmentMask;
SWR_TILE_STATE postStoreTileState;
SWR_RECT rect;
};
struct COMPUTE_DESC
{
uint32_t threadGroupCountX;
uint32_t threadGroupCountY;
uint32_t threadGroupCountZ;
bool enableThreadDispatch;
};
typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroTile,
void* pDesc);
enum WORK_TYPE
{
SYNC,
DRAW,
CLEAR,
DISCARDINVALIDATETILES,
STORETILES,
SHUTDOWN,
};
OSALIGNSIMD(struct) BE_WORK
{
WORK_TYPE type;
PFN_WORK_FUNC pfnWork;
union
{
SYNC_DESC sync;
TRIANGLE_WORK_DESC tri;
CLEAR_DESC clear;
DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
STORE_TILES_DESC storeTiles;
} desc;
};
struct DRAW_WORK
{
DRAW_CONTEXT* pDC;
union
{
uint32_t numIndices; // DrawIndexed: Number of indices for draw.
uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc)
};
union
{
gfxptr_t xpIB; // DrawIndexed: App supplied int32 indices
uint32_t startVertex; // Draw: Starting vertex in VB to render from.
};
int32_t baseVertex;
uint32_t numInstances; // Number of instances
uint32_t startInstance; // Instance offset
uint32_t startPrimID; // starting primitiveID for this draw batch
uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
SWR_FORMAT type; // index buffer type
};
typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext,
DRAW_CONTEXT* pDC,
uint32_t workerId,
void* pDesc);
struct FE_WORK
{
WORK_TYPE type;
PFN_FE_WORK_FUNC pfnWork;
union
{
SYNC_DESC sync;
DRAW_WORK draw;
CLEAR_DESC clear;
DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
STORE_TILES_DESC storeTiles;
} desc;
};
struct GUARDBANDS
{
float left[KNOB_NUM_VIEWPORTS_SCISSORS];
float right[KNOB_NUM_VIEWPORTS_SCISSORS];
float top[KNOB_NUM_VIEWPORTS_SCISSORS];
float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
};
struct PA_STATE;
// function signature for pipeline stages that execute after primitive assembly
typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primID,
simdscalari const& viewportIdx,
simdscalari const& rtIdx);
// function signature for pipeline stages that execute after primitive assembly
typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primID,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx);
OSALIGNLINE(struct) API_STATE
{
// Vertex Buffers
SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
// GS - Geometry Shader State
SWR_GS_STATE gsState;
PFN_GS_FUNC pfnGsFunc;
// FS - Fetch Shader State
PFN_FETCH_FUNC pfnFetchFunc;
// VS - Vertex Shader State
PFN_VERTEX_FUNC pfnVertexFunc;
// Index Buffer
SWR_INDEX_BUFFER_STATE indexBuffer;
// CS - Compute Shader
PFN_CS_FUNC pfnCsFunc;
uint32_t totalThreadsInGroup;
uint32_t totalSpillFillSize;
uint32_t scratchSpaceSizePerWarp;
uint32_t scratchSpaceNumWarps;
// FE - Frontend State
SWR_FRONTEND_STATE frontendState;
// SOS - Streamout Shader State
PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
// Streamout state
SWR_STREAMOUT_STATE soState;
mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
mutable SWR_STREAMOUT_BUFFER soPausedBuffer[MAX_SO_STREAMS];
// Tessellation State
PFN_HS_FUNC pfnHsFunc;
PFN_DS_FUNC pfnDsFunc;
SWR_TS_STATE tsState;
// Number of attributes used by the frontend (vs, so, gs)
uint32_t feNumAttributes;
// RS - Rasterizer State
SWR_RASTSTATE rastState;
// floating point multisample offsets
float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
GUARDBANDS gbState;
SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
SWR_VIEWPORT_MATRICES vpMatrices;
SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
bool scissorsTileAligned;
bool forceFront;
PRIMITIVE_TOPOLOGY topology;
// Backend state
OSALIGNLINE(SWR_BACKEND_STATE) backendState;
SWR_DEPTH_BOUNDS_STATE depthBoundsState;
// PS - Pixel shader state
SWR_PS_STATE psState;
SWR_DEPTH_STENCIL_STATE depthStencilState;
// OM - Output Merger State
SWR_BLEND_STATE blendState;
PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
struct
{
uint32_t enableStatsFE : 1; // Enable frontend pipeline stats
uint32_t enableStatsBE : 1; // Enable backend pipeline stats
uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles
uint32_t depthHottileEnable : 1; // Enable depth buffer hottile
uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile
};
PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
};
class MacroTileMgr;
class DispatchQueue;
class HOTTILE;
struct RenderOutputBuffers
{
uint8_t* pColor[SWR_NUM_RENDERTARGETS];
uint8_t* pDepth;
uint8_t* pStencil;
HOTTILE* pColorHotTile[SWR_NUM_RENDERTARGETS];
HOTTILE* pDepthHotTile;
HOTTILE* pStencilHotTile;
};
// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
struct BarycentricCoeffs
{
simdscalar vIa;
simdscalar vIb;
simdscalar vIc;
simdscalar vJa;
simdscalar vJb;
simdscalar vJc;
simdscalar vZa;
simdscalar vZb;
simdscalar vZc;
simdscalar vRecipDet;
simdscalar vAOneOverW;
simdscalar vBOneOverW;
simdscalar vCOneOverW;
};
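// Illustrative sketch (not in the original header): the coefficients above
// define plane equations of the form I(x, y) = vIa * x + vIb * y + vIc. A
// hypothetical SIMD evaluation at a batch of sample positions would look like:
//   simdscalar vI = _simd_fmadd_ps(coeffs.vIa, vX, _simd_fmadd_ps(coeffs.vIb, vY, coeffs.vIc));
//   vI = _simd_mul_ps(vI, coeffs.vRecipDet); // normalize by the triangle determinant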
// pipeline function pointer types
typedef void (*PFN_BACKEND_FUNC)(
DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&,
uint8_t* (&)[SWR_NUM_RENDERTARGETS],
uint32_t,
const SWR_BLEND_STATE*,
const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS],
simdscalar&,
simdscalar const&);
typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&,
SWR_PS_CONTEXT&,
const uint64_t* const,
const uint32_t,
simdscalar const&,
simdscalar const&);
struct BACKEND_FUNCS
{
PFN_BACKEND_FUNC pfnBackend;
};
// Draw State
struct DRAW_STATE
{
API_STATE state;
void* pPrivateState; // It's required that the driver sets this up for each draw.
// pipeline function pointers, filled in by API thread when setting up the draw
BACKEND_FUNCS backendFuncs;
PFN_PROCESS_PRIMS pfnProcessPrims;
#if USE_SIMD16_FRONTEND
PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
#endif
CachingArena* pArena; // This should only be used by API thread.
};
struct DRAW_DYNAMIC_STATE
{
void Reset(uint32_t numThreads)
{
SWR_STATS* pSavePtr = pStats;
memset(this, 0, sizeof(*this));
pStats = pSavePtr;
memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
}
///@todo Currently assumes only a single FE can do stream output for a draw.
uint32_t SoWriteOffset[4];
bool SoWriteOffsetDirty[4];
SWR_STATS_FE statsFE; // Only one FE thread per DC.
SWR_STATS* pStats;
uint64_t soPrims; // number of primitives written to StreamOut buffer
};
// Draw Context
// The api thread sets up a draw context that exists for the life of the draw.
// This draw context maintains all of the state needed for the draw operation.
struct DRAW_CONTEXT
{
SWR_CONTEXT* pContext;
union
{
MacroTileMgr* pTileMgr;
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
};
DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread.
CachingArena* pArena;
uint32_t drawId;
bool dependentFE; // Frontend work is dependent on all previous FE
bool dependent; // Backend work is dependent on all previous BE
bool isCompute; // Is this DC a compute context?
bool cleanupState; // True if this is the last draw using an entry in the state ring.
FE_WORK FeWork;
SYNC_DESC retireCallback; // Call this func when this DC is retired.
DRAW_DYNAMIC_STATE dynState;
volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
volatile OSALIGNLINE(uint32_t) FeLock;
volatile OSALIGNLINE(uint32_t) threadsDone;
};
static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
{
SWR_ASSERT(pDC != nullptr);
SWR_ASSERT(pDC->pState != nullptr);
return pDC->pState->state;
}
INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
{
SWR_ASSERT(pDC != nullptr);
SWR_ASSERT(pDC->pState != nullptr);
return pDC->pState->pPrivateState;
}
class HotTileMgr;
struct SWR_CONTEXT
{
// Draw Context Ring
// Each draw needs its own state in order to support multiple draws in flight across multiple
// threads. We maintain N draw contexts configured as a ring. The size of the ring limits the
// maximum number of draws that can be in flight at any given time.
//
// Description:
// 1. State - When an application first sets state we'll request a new draw context to use.
//    a. If there are no available draw contexts then we'll have to wait until one becomes free.
//    b. If one is available then set pCurDrawContext to point to it and mark it in use.
//    c. All state calls set state on pCurDrawContext.
// 2. Draw - Creates and submits a work item that is associated with the current draw context.
//    a. Set pPrevDrawContext = pCurDrawContext
//    b. Set pCurDrawContext to NULL.
// 3. State - When an application sets state after a draw
//    a. Same as step 1.
//    b. State is copied from the prev draw context to the current one.
// (See the illustrative lifecycle sketch after this struct.)
RingBuffer<DRAW_CONTEXT> dcRing;
DRAW_CONTEXT* pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted
// that we can copy state from.
MacroTileMgr* pMacroTileManagerArray;
DispatchQueue* pDispatchQueueArray;
// Draw State Ring
// When draws are very large (lots of primitives) the API thread will break them up.
// These split draws all have identical state. So instead of storing the state directly
// in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
// to reference a single entry in the DS ring.
RingBuffer<DRAW_STATE> dsRing;
uint32_t curStateId; // Current index to the next available entry in the DS ring.
uint32_t NumWorkerThreads;
uint32_t NumFEThreads;
uint32_t NumBEThreads;
THREAD_POOL threadPool; // Thread pool associated with this context
SWR_THREADING_INFO threadInfo;
SWR_API_THREADING_INFO apiThreadInfo;
SWR_WORKER_PRIVATE_STATE workerPrivateState;
uint32_t MAX_DRAWS_IN_FLIGHT;
std::condition_variable FifosNotEmpty;
std::mutex WaitLock;
uint32_t privateStateSize;
HotTileMgr* pHotTileMgr;
// Callback functions, passed in at create context time
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
PFN_MAKE_GFXPTR pfnMakeGfxPtr;
PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
PFN_UPDATE_STREAMOUT pfnUpdateStreamOut;
// Global Stats
SWR_STATS* pStats;
// Scratch space for workers.
uint8_t** ppScratch;
volatile OSALIGNLINE(uint32_t) drawsOutstandingFE;
OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
uint32_t frameCount;
uint32_t lastFrameChecked;
uint64_t lastDrawChecked;
TileSet* pSingleThreadLockedTiles;
// ArchRast thread contexts.
HANDLE* pArContext;
// handle to external memory for worker data to create memory contexts
HANDLE hExternalMemory;
BucketManager *pBucketMgr;
};
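// Illustrative sketch of the draw context ring lifecycle described at the top
// of SWR_CONTEXT (pseudo-code; GetDrawContext/SubmitDraw are hypothetical
// names standing in for the real API-thread entry points):
//   DRAW_CONTEXT* pDC = GetDrawContext(pContext); // 1. may block until a DC frees up
//   /* state setters write through pContext->pCurDrawContext */
//   SubmitDraw(pContext, pDC);                    // 2. pPrevDrawContext = pDC,
//                                                 //    pCurDrawContext  = NULL
//   pDC = GetDrawContext(pContext);               // 3. new DC copies state from prev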
#define UPDATE_STAT_BE(name, count) \
if (GetApiState(pDC).enableStatsBE) \
{ \
pDC->dynState.pStats[workerId].name += count; \
}
#define UPDATE_STAT_FE(name, count) \
if (GetApiState(pDC).enableStatsFE) \
{ \
pDC->dynState.statsFE.name += count; \
}
// ArchRast instrumentation framework
#define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
#define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads]
#ifdef KNOB_ENABLE_RDTSC
#define RDTSC_BEGIN(pBucketMgr, type, drawid) RDTSC_START(pBucketMgr, type)
#define RDTSC_END(pBucketMgr, type, count) RDTSC_STOP(pBucketMgr, type, count, 0)
#else
#define RDTSC_BEGIN(pBucketMgr, type, drawid)
#define RDTSC_END(pBucketMgr, type, count)
#endif
#ifdef KNOB_ENABLE_AR
#define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event)
#define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
#else
#define _AR_EVENT(ctx, event)
#define _AR_FLUSH(ctx, id)
#endif
// Use these macros for api thread.
#define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
// Use these macros for worker threads.
#define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
#define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)

View file

@ -1,335 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file depthstencil.h
*
* @brief Implements depth/stencil functionality
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "format_conversion.h"
INLINE
void StencilOp(SWR_STENCILOP op,
simdscalar const& mask,
simdscalar const& stencilRefps,
simdscalar& stencilps)
{
simdscalari stencil = _simd_castps_si(stencilps);
switch (op)
{
case STENCILOP_KEEP:
break;
case STENCILOP_ZERO:
stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask);
break;
case STENCILOP_REPLACE:
stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask);
break;
case STENCILOP_INCRSAT:
{
simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
break;
}
case STENCILOP_DECRSAT:
{
simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
break;
}
case STENCILOP_INCR:
{
simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
break;
}
case STENCILOP_DECR:
{
simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
break;
}
case STENCILOP_INVERT:
{
simdscalar stencilinvert =
_simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps()));
stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask);
break;
}
default:
break;
}
}
template <SWR_FORMAT depthFormatT>
simdscalar QuantizeDepth(simdscalar const& depth)
{
SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);
uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0);
if (depthType == SWR_TYPE_FLOAT)
{
// assume only 32bit float depth supported
SWR_ASSERT(depthBpc == 32);
// matches shader precision, no quantizing needed
return depth;
}
// should be unorm depth if not float
SWR_ASSERT(depthType == SWR_TYPE_UNORM);
float quantize = (float)((1 << depthBpc) - 1);
simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize));
result = _simd_add_ps(result, _simd_set1_ps(0.5f));
result = _simd_round_ps(result, _MM_FROUND_TO_ZERO);
if (depthBpc > 16)
{
result = _simd_div_ps(result, _simd_set1_ps(quantize));
}
else
{
result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize));
}
return result;
}
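// Worked example (not in the original source): for a 16-bit UNORM depth
// buffer, quantize == 65535, so an input depth of 0.25f becomes
// trunc(0.25 * 65535 + 0.5) / 65535 == 16384 / 65535 ~= 0.2500038f,
// matching the value a fixed function depth unit would store and read back.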
INLINE
simdscalar DepthStencilTest(const API_STATE* pState,
bool frontFacing,
uint32_t viewportIndex,
simdscalar const& iZ,
uint8_t* pDepthBase,
simdscalar const& coverageMask,
uint8_t* pStencilBase,
simdscalar* pStencilMask)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState;
const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex];
simdscalar depthResult = _simd_set1_ps(-1.0f);
simdscalar zbuf;
// clamp Z to viewport [minZ..maxZ]
simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
if (pDSState->depthTestEnable)
{
switch (pDSState->depthTestFunc)
{
case ZFUNC_NEVER:
depthResult = _simd_setzero_ps();
break;
case ZFUNC_ALWAYS:
break;
default:
zbuf = _simd_load_ps((const float*)pDepthBase);
}
switch (pDSState->depthTestFunc)
{
case ZFUNC_LE:
depthResult = _simd_cmple_ps(interpZ, zbuf);
break;
case ZFUNC_LT:
depthResult = _simd_cmplt_ps(interpZ, zbuf);
break;
case ZFUNC_GT:
depthResult = _simd_cmpgt_ps(interpZ, zbuf);
break;
case ZFUNC_GE:
depthResult = _simd_cmpge_ps(interpZ, zbuf);
break;
case ZFUNC_EQ:
depthResult = _simd_cmpeq_ps(interpZ, zbuf);
break;
case ZFUNC_NE:
depthResult = _simd_cmpneq_ps(interpZ, zbuf);
break;
}
}
simdscalar stencilMask = _simd_set1_ps(-1.0f);
if (pDSState->stencilTestEnable)
{
uint8_t stencilRefValue;
uint32_t stencilTestFunc;
uint8_t stencilTestMask;
if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
{
stencilRefValue = pDSState->stencilRefValue;
stencilTestFunc = pDSState->stencilTestFunc;
stencilTestMask = pDSState->stencilTestMask;
}
else
{
stencilRefValue = pDSState->backfaceStencilRefValue;
stencilTestFunc = pDSState->backfaceStencilTestFunc;
stencilTestMask = pDSState->backfaceStencilTestMask;
}
simdvector sbuf;
simdscalar stencilWithMask;
simdscalar stencilRef;
switch (stencilTestFunc)
{
case ZFUNC_NEVER:
stencilMask = _simd_setzero_ps();
break;
case ZFUNC_ALWAYS:
break;
default:
LoadSOA<R8_UINT>(pStencilBase, sbuf);
// apply stencil read mask
stencilWithMask = _simd_castsi_ps(
_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask)));
// do stencil compare in float to avoid simd integer emulation in AVX1
stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask));
stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask));
break;
}
switch (stencilTestFunc)
{
case ZFUNC_LE:
stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_LT:
stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_GT:
stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_GE:
stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_EQ:
stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_NE:
stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask);
break;
}
}
simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask);
depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask);
*pStencilMask = stencilMask;
return depthWriteMask;
}
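// Sketch of how the test above pairs with DepthStencilWrite below in a
// backend (hypothetical surrounding code, not in the original header):
//   simdscalar stencilMask;
//   simdscalar depthMask = DepthStencilTest(&apiState, frontFacing, viewportIdx, vZ,
//                                           pDepthBase, vCoverageMask, pStencilBase,
//                                           &stencilMask);
//   DepthStencilWrite(&apiState.vp[viewportIdx], &apiState.depthStencilState, frontFacing,
//                     vZ, pDepthBase, depthMask, vCoverageMask, pStencilBase, stencilMask);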
INLINE
void DepthStencilWrite(const SWR_VIEWPORT* pViewport,
const SWR_DEPTH_STENCIL_STATE* pDSState,
bool frontFacing,
simdscalar const& iZ,
uint8_t* pDepthBase,
const simdscalar& depthMask,
const simdscalar& coverageMask,
uint8_t* pStencilBase,
const simdscalar& stencilMask)
{
if (pDSState->depthWriteEnable)
{
// clamp Z to viewport [minZ..maxZ]
simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
simdscalar vMask = _simd_and_ps(depthMask, coverageMask);
_simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ);
}
if (pDSState->stencilWriteEnable)
{
simdvector sbuf;
LoadSOA<R8_UINT>(pStencilBase, sbuf);
simdscalar stencilbuf = sbuf.v[0];
uint8_t stencilRefValue;
uint32_t stencilFailOp;
uint32_t stencilPassDepthPassOp;
uint32_t stencilPassDepthFailOp;
uint8_t stencilWriteMask;
if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
{
stencilRefValue = pDSState->stencilRefValue;
stencilFailOp = pDSState->stencilFailOp;
stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp;
stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp;
stencilWriteMask = pDSState->stencilWriteMask;
}
else
{
stencilRefValue = pDSState->backfaceStencilRefValue;
stencilFailOp = pDSState->backfaceStencilFailOp;
stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp;
stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp;
stencilWriteMask = pDSState->backfaceStencilWriteMask;
}
simdscalar stencilps = stencilbuf;
simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue));
simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, coverageMask);
simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask);
simdscalar stencilPassDepthFailMask =
_simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1)));
simdscalar origStencil = stencilps;
StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps);
StencilOp((SWR_STENCILOP)stencilPassDepthFailOp,
stencilPassDepthFailMask,
stencilRefps,
stencilps);
StencilOp((SWR_STENCILOP)stencilPassDepthPassOp,
stencilPassDepthPassMask,
stencilRefps,
stencilps);
// apply stencil write mask
simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask);
stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask));
stencilps =
_simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps);
simdvector stencilResult;
stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask);
StoreSOA<R8_UINT>(stencilResult, pStencilBase);
}
}

View file

@ -1,138 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file fifo.hpp
*
* @brief Definitions for our fifos used for thread communication.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "arena.h"
#include <vector>
#include <cassert>
template <class T>
struct QUEUE
{
OSALIGNLINE(volatile uint32_t) mLock{0};
OSALIGNLINE(volatile uint32_t) mNumEntries{0};
std::vector<T*> mBlocks;
T* mCurBlock{nullptr};
uint32_t mHead{0};
uint32_t mTail{0};
uint32_t mCurBlockIdx{0};
// power of 2
static const uint32_t mBlockSizeShift = 6;
static const uint32_t mBlockSize = 1 << mBlockSizeShift;
template <typename ArenaT>
void clear(ArenaT& arena)
{
mHead = 0;
mTail = 0;
mBlocks.clear();
T* pNewBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
mBlocks.push_back(pNewBlock);
mCurBlock = pNewBlock;
mCurBlockIdx = 0;
mNumEntries = 0;
mLock = 0;
}
uint32_t getNumQueued() { return mNumEntries; }
bool tryLock()
{
if (mLock)
{
return false;
}
// try to lock the FIFO
long initial = InterlockedCompareExchange(&mLock, 1, 0);
return (initial == 0);
}
void unlock() { mLock = 0; }
T* peek()
{
if (mNumEntries == 0)
{
return nullptr;
}
uint32_t block = mHead >> mBlockSizeShift;
return &mBlocks[block][mHead & (mBlockSize - 1)];
}
void dequeue_noinc()
{
mHead++;
mNumEntries--;
}
template <typename ArenaT>
bool enqueue_try_nosync(ArenaT& arena, const T* entry)
{
const float* pSrc = (const float*)entry;
float* pDst = (float*)&mCurBlock[mTail];
auto lambda = [&](int32_t i) {
__m256 vSrc = _mm256_load_ps(pSrc + i * KNOB_SIMD_WIDTH);
_mm256_stream_ps(pDst + i * KNOB_SIMD_WIDTH, vSrc);
};
const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH * 4);
static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
"FIFO element size should be multiple of SIMD width.");
UnrollerL<0, numSimdLines, 1>::step(lambda);
mTail++;
if (mTail == mBlockSize)
{
if (++mCurBlockIdx < mBlocks.size())
{
mCurBlock = mBlocks[mCurBlockIdx];
}
else
{
T* newBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
SWR_ASSERT(newBlock);
mBlocks.push_back(newBlock);
mCurBlock = newBlock;
}
mTail = 0;
}
mNumEntries++;
return true;
}
void destroy() {}
};
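// Illustrative producer/consumer use of the QUEUE above (not part of the
// original header; WORK and arena are hypothetical stand-ins):
//   QUEUE<WORK> fifo;
//   fifo.clear(arena);                      // reset and allocate the first block
//   fifo.enqueue_try_nosync(arena, &work);  // producer side; caller provides sync
//   if (fifo.tryLock())                     // consumer side
//   {
//       while (WORK* pWork = fifo.peek())
//       {
//           Process(*pWork);
//           fifo.dequeue_noinc();
//       }
//       fifo.unlock();
//   }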

View file

@ -1,262 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file format_conversion.h
*
* @brief SOA pixel load/store and format conversion helpers
*
******************************************************************************/
#include "format_types.h"
#include "format_traits.h"
//////////////////////////////////////////////////////////////////////////
/// @brief Load SIMD packed pixels in SOA format and converts to
/// SOA RGBA32_FLOAT format.
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template <typename SIMD_T, SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, Vec4<SIMD_T>& dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
(FormatTraits<SrcFormat>::GetBPC(0) == 32))
{
auto lambda = [&](int comp)
{
Float<SIMD_T> vComp =
SIMD_T::load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(Float<SIMD_T>)));
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
return;
}
auto lambda = [&](int comp)
{
// load SIMD components
Float<SIMD_T> vComp;
FormatTraits<SrcFormat>::loadSOA(comp, pSrc, vComp);
// unpack
vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
// convert
if (FormatTraits<SrcFormat>::isNormalized(comp))
{
vComp = SIMD_T::cvtepi32_ps(SIMD_T::castps_si(vComp));
vComp = SIMD_T::mul_ps(vComp, SIMD_T::set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
}
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
// is there a better way to get this from the SIMD traits?
const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
}
template <SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simdvector& dst)
{
LoadSOA<SIMD256, SrcFormat>(pSrc, dst);
}
template <SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst)
{
LoadSOA<SIMD512, SrcFormat>(pSrc, dst);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Clamps the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
template <typename SIMD_T, SWR_FORMAT Format>
INLINE Float<SIMD_T> SIMDCALL Clamp(Float<SIMD_T> const& v, uint32_t Component)
{
Float<SIMD_T> vComp = v;
// Component is unsigned, so only the upper bound can actually be violated
if (Component >= 4)
{
    // Component shouldn't be outside the [0..3] range
assert(false);
return vComp;
}
if (FormatTraits<Format>::isNormalized(Component))
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
{
vComp = SIMD_T::max_ps(vComp, SIMD_T::setzero_ps());
}
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
{
vComp = SIMD_T::max_ps(vComp, SIMD_T::set1_ps(-1.0f));
}
vComp = SIMD_T::min_ps(vComp, SIMD_T::set1_ps(1.0f));
}
else if (FormatTraits<Format>::GetBPC(Component) < 32)
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
{
int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
int iMin = 0;
Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
vCompi = SIMD_T::max_epu32(vCompi, SIMD_T::set1_epi32(iMin));
vCompi = SIMD_T::min_epu32(vCompi, SIMD_T::set1_epi32(iMax));
vComp = SIMD_T::castsi_ps(vCompi);
}
else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
{
int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
int iMin = -1 - iMax;
Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
vCompi = SIMD_T::max_epi32(vCompi, SIMD_T::set1_epi32(iMin));
vCompi = SIMD_T::min_epi32(vCompi, SIMD_T::set1_epi32(iMax));
vComp = SIMD_T::castsi_ps(vCompi);
}
}
return vComp;
}
template <SWR_FORMAT Format>
INLINE simdscalar SIMDCALL Clamp(simdscalar const& v, uint32_t Component)
{
return Clamp<SIMD256, Format>(v, Component);
}
template <SWR_FORMAT Format>
INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component)
{
return Clamp<SIMD512, Format>(v, Component);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Normalize the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
template <typename SIMD_T, SWR_FORMAT Format>
INLINE Float<SIMD_T> SIMDCALL Normalize(Float<SIMD_T> const& vComp, uint32_t Component)
{
Float<SIMD_T> r = vComp;
if (FormatTraits<Format>::isNormalized(Component))
{
r = SIMD_T::mul_ps(r, SIMD_T::set1_ps(FormatTraits<Format>::fromFloat(Component)));
r = SIMD_T::castsi_ps(SIMD_T::cvtps_epi32(r));
}
return r;
}
template <SWR_FORMAT Format>
INLINE simdscalar SIMDCALL Normalize(simdscalar const& vComp, uint32_t Component)
{
return Normalize<SIMD256, Format>(vComp, Component);
}
template <SWR_FORMAT Format>
INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component)
{
return Normalize<SIMD512, Format>(vComp, Component);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Convert and store simdvector of pixels in SOA
/// RGBA32_FLOAT to SOA format
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template <typename SIMD_T, SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const Vec4<SIMD_T>& src, uint8_t* pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
(FormatTraits<DstFormat>::GetBPC(0) == 32))
{
for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
{
if (comp < 3) // Input format is always RGBA32_FLOAT.
{
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
}
}
SIMD_T::store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(Float<SIMD_T>)), vComp); // stride by this SIMD's register size, matching LoadSOA
}
return;
}
auto lambda = [&](int comp) {
Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
{
if (comp < 3) // Input format is always RGBA32_FLOAT.
{
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
}
}
// clamp
vComp = Clamp<SIMD_T, DstFormat>(vComp, comp);
// normalize
vComp = Normalize<SIMD_T, DstFormat>(vComp, comp);
// pack
vComp = FormatTraits<DstFormat>::pack(comp, vComp);
// store
FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
// is there a better way to get this from the SIMD traits?
const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
pDst += (FormatTraits<DstFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
}
template <SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const simdvector& src, uint8_t* pDst)
{
StoreSOA<SIMD256, DstFormat>(src, pDst);
}
template <SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
{
StoreSOA<SIMD512, DstFormat>(src, pDst);
}
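// Illustrative round trip through the helpers above (not in the original
// header): expand an 8-bit UNORM SOA tile to RGBA32_FLOAT, then clamp,
// renormalize, and pack it back on store.
//   simdvector pixels;
//   LoadSOA<R8G8B8A8_UNORM>(pSrcTile, pixels);  // unpack + normalize to [0..1]
//   /* blend or shade pixels */
//   StoreSOA<R8G8B8A8_UNORM>(pixels, pDstTile); // Clamp + Normalize + pack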

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -1,939 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file utils.h
*
* @brief Utilities used by SWR core related to pixel formats.
*
******************************************************************************/
#pragma once
#include "core/utils.h"
#include "common/simdintrin.h"
INLINE
void vTranspose(simd4scalar& row0, simd4scalar& row1, simd4scalar& row2, simd4scalar& row3)
{
simd4scalari row0i = SIMD128::castps_si(row0);
simd4scalari row1i = SIMD128::castps_si(row1);
simd4scalari row2i = SIMD128::castps_si(row2);
simd4scalari row3i = SIMD128::castps_si(row3);
simd4scalari vTemp = row2i;
row2i = SIMD128::unpacklo_epi32(row2i, row3i);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
row3i = row0i;
row0i = SIMD128::unpacklo_epi32(row0i, row1i);
row3i = SIMD128::unpackhi_epi32(row3i, row1i);
row1i = row0i;
row0i = SIMD128::unpacklo_epi64(row0i, row2i);
row1i = SIMD128::unpackhi_epi64(row1i, row2i);
row2i = row3i;
row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
row0 = SIMD128::castsi_ps(row0i);
row1 = SIMD128::castsi_ps(row1i);
row2 = SIMD128::castsi_ps(row2i);
row3 = SIMD128::castsi_ps(row3i);
}
INLINE
void vTranspose(simd4scalari& row0, simd4scalari& row1, simd4scalari& row2, simd4scalari& row3)
{
simd4scalari vTemp = row2;
row2 = SIMD128::unpacklo_epi32(row2, row3);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
row3 = row0;
row0 = SIMD128::unpacklo_epi32(row0, row1);
row3 = SIMD128::unpackhi_epi32(row3, row1);
row1 = row0;
row0 = SIMD128::unpacklo_epi64(row0, row2);
row1 = SIMD128::unpackhi_epi64(row1, row2);
row2 = row3;
row2 = SIMD128::unpacklo_epi64(row2, vTemp);
row3 = SIMD128::unpackhi_epi64(row3, vTemp);
}
#if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(simd4scalar (&vDst)[8],
const simdscalar& vSrc0,
const simdscalar& vSrc1,
const simdscalar& vSrc2)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5
simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4
simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5
r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); // y2w2y3w3 y6w6y7w7
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}
INLINE
void vTranspose4x8(simd4scalar (&vDst)[8],
const simdscalar& vSrc0,
const simdscalar& vSrc1,
const simdscalar& vSrc2,
const simdscalar& vSrc3)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5
simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4
simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5
r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); // y2w2y3w3 y6w6y7w7
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}
INLINE
void vTranspose4x16(simd16scalar (&dst)[4],
const simd16scalar& src0,
const simd16scalar& src1,
const simd16scalar& src2,
const simd16scalar& src3)
{
const simd16scalari perm =
_simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
// pre-permute input to setup the right order after all the unpacking
simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
dst[0] = _simd16_unpacklo_ps(rblo, galo);
dst[1] = _simd16_unpackhi_ps(rblo, galo);
dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
}
INLINE
void vTranspose8x8(simdscalar (&vDst)[8],
const simdscalar& vMask0,
const simdscalar& vMask1,
const simdscalar& vMask2,
const simdscalar& vMask3,
const simdscalar& vMask4,
const simdscalar& vMask5,
const simdscalar& vMask6,
const simdscalar& vMask7)
{
simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
simdscalar __tt0 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0));
simdscalar __tt1 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2));
simdscalar __tt2 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0));
simdscalar __tt3 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2));
simdscalar __tt4 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0));
simdscalar __tt5 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2));
simdscalar __tt6 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0));
simdscalar __tt7 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2));
vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
}
INLINE
void vTranspose8x8(simdscalar (&vDst)[8],
const simdscalari& vMask0,
const simdscalari& vMask1,
const simdscalari& vMask2,
const simdscalari& vMask3,
const simdscalari& vMask4,
const simdscalari& vMask5,
const simdscalari& vMask6,
const simdscalari& vMask7)
{
vTranspose8x8(vDst,
_simd_castsi_ps(vMask0),
_simd_castsi_ps(vMask1),
_simd_castsi_ps(vMask2),
_simd_castsi_ps(vMask3),
_simd_castsi_ps(vMask4),
_simd_castsi_ps(vMask5),
_simd_castsi_ps(vMask6),
_simd_castsi_ps(vMask7));
}
#endif
//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
//////////////////////////////////////////////////////////////////////////
template <uint32_t bpp>
struct TransposeSingleComponent
{
//////////////////////////////////////////////////////////////////////////
/// @brief Pass-thru for single component.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH <= KNOB_ARCH_AVX
simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
simd4scalari c2c3 =
SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa
simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba
simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba
SIMD128::store_si((simd4scalari*)pDst, c0123lo);
SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
#else
simdscalari dst01 = _simd_shuffle_epi8(src,
_simd_set_epi32(0x0f078080,
0x0e068080,
0x0d058080,
0x0c048080,
0x80800b03,
0x80800a02,
0x80800901,
0x80800800));
simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
dst23 = _simd_shuffle_epi8(dst23,
_simd_set_epi32(0x80800f07,
0x80800e06,
0x80800d05,
0x80800c04,
0x0b038080,
0x0a028080,
0x09018080,
0x08008080));
simdscalari dst = _simd_or_si(dst01, dst23);
_simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
_simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg
simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = SIMD128::unpacklo_epi8(rg, g);
SIMD128::store_si((simd4scalari*)pDst, rg);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
simdscalari dst = _simd_or_si(cvt0, shl1);
_simd_store_si(reinterpret_cast<simdscalari*>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
simd4scalar vDst[8];
vTranspose4x8(vDst, src0, src1, src2, src3);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst + 4, vDst[1]);
SIMD128::store_ps((float*)pDst + 8, vDst[2]);
SIMD128::store_ps((float*)pDst + 12, vDst[3]);
SIMD128::store_ps((float*)pDst + 16, vDst[4]);
SIMD128::store_ps((float*)pDst + 20, vDst[5]);
SIMD128::store_ps((float*)pDst + 24, vDst[6]);
SIMD128::store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
simd16scalar dst[4];
vTranspose4x16(dst, src0, src1, src2, src3);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
simd4scalar vDst[8];
vTranspose3x8(vDst, src0, src1, src2);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst + 4, vDst[1]);
SIMD128::store_ps((float*)pDst + 8, vDst[2]);
SIMD128::store_ps((float*)pDst + 12, vDst[3]);
SIMD128::store_ps((float*)pDst + 16, vDst[4]);
SIMD128::store_ps((float*)pDst + 20, vDst[5]);
SIMD128::store_ps((float*)pDst + 24, vDst[6]);
SIMD128::store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
simd16scalar src3 = _simd16_setzero_ps();
simd16scalar dst[4];
vTranspose4x16(dst, src0, src1, src2, src3);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
const float* pfSrc = (const float*)pSrc;
simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
float* pfDst = (float*)pDst;
SIMD128::store_ps(pfDst + 0, dst0);
SIMD128::store_ps(pfDst + 4, dst1);
SIMD128::store_ps(pfDst + 8, dst2);
SIMD128::store_ps(pfDst + 12, dst3);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg
simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF
simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rgba0 rgba1 rgba2 rgba3
simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rgba4 rgba5 rgba6 rgba7
simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rgba8 rgba9 rgbaA rgbaB
simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rgbaC rgbaD rgbaE rgbaF
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
simd4scalari src_a = SIMD128::setzero_si();
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa
simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF
simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rgba0 rgba1 rgba2 rgba3
simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rgba4 rgba5 rgba6 rgba7
simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rgba8 rgba9 rgbaA rgbaB
simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rgbaC rgbaD rgbaE rgbaF
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src = _simd_load_ps((const float*)pSrc);
simd4scalar comp0 = _simd_extractf128_ps(src, 0);
simd4scalar comp1 = _simd_extractf128_ps(src, 1);
simd4scalari comp0i = SIMD128::castps_si(comp0);
simd4scalari comp1i = SIMD128::castps_si(comp1);
simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
SIMD128::store_si((simd4scalari*)pDst, resLo);
SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 24_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64
//////////////////////////////////////////////////////////////////////////
struct Transpose64
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 64 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 64_64 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 64_64_64 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 64_64_64_64 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};

File diff suppressed because it is too large

View file

@ -1,448 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file frontend.h
*
* @brief Definitions for Frontend which handles vertex processing,
* primitive assembly, clipping, binning, etc.
*
******************************************************************************/
#pragma once
#include "context.h"
#include "common/simdintrin.h"
#include <type_traits>
//////////////////////////////////////////////////////////////////////////
/// @brief Helper function to generate a bitmask of the low numBits bits
static INLINE uint32_t
GenMask(uint32_t numBits)
{
    SWR_ASSERT(
        numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
    // Shifting a 32-bit value by 32 is undefined behavior; handle the
    // full-width case explicitly.
    return (numBits == 32) ? 0xFFFFFFFFu : ((1U << numBits) - 1);
}
// Calculates the A and B coefficients for the 3 edges of the triangle
//
// maths for edge equations:
// standard form of a line in 2d
// Ax + By + C = 0
// A = y0 - y1
// B = x1 - x0
// C = x0y1 - x1y0
INLINE
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
{
// vYsub = y1 y2 y0 dc
__m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
// vY = y0 y1 y2 dc
vA = _mm_sub_ps(vY, vYsub);
// Result:
// A[0] = y0 - y1
// A[1] = y1 - y2
// A[2] = y2 - y0
// vXsub = x1 x2 x0 dc
__m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
// vX = x0 x1 x2 dc
vB = _mm_sub_ps(vXsub, vX);
// Result:
// B[0] = x1 - x0
// B[1] = x2 - x1
// B[2] = x0 - x2
}
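// Sanity check on the coefficients above (a worked substitution, added for
// clarity): plugging either endpoint into Ax + By + C gives
//   (y0 - y1)*x0 + (x1 - x0)*y0 + (x0*y1 - x1*y0)
//     = x0*y0 - x0*y1 + x1*y0 - x0*y0 + x0*y1 - x1*y0 = 0
// and likewise for (x1, y1), so each (A, B, C) triple really is the line
// through consecutive vertices.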
INLINE
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
{
// generate edge equations
// A = y0 - y1
// B = x1 - x0
// C = x0y1 - x1y0
__m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
vA = _mm_sub_epi32(vY, vYsub);
__m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
vB = _mm_sub_epi32(vXsub, vX);
}
INLINE
void triangleSetupABIntVertical(const simdscalari vX[3],
const simdscalari vY[3],
simdscalari (&vA)[3],
simdscalari (&vB)[3])
{
// A = y0 - y1
// B = x1 - x0
vA[0] = _simd_sub_epi32(vY[0], vY[1]);
vA[1] = _simd_sub_epi32(vY[1], vY[2]);
vA[2] = _simd_sub_epi32(vY[2], vY[0]);
vB[0] = _simd_sub_epi32(vX[1], vX[0]);
vB[1] = _simd_sub_epi32(vX[2], vX[1]);
vB[2] = _simd_sub_epi32(vX[0], vX[2]);
}
#if ENABLE_AVX512_SIMD16
INLINE
void triangleSetupABIntVertical(const simd16scalari vX[3],
const simd16scalari vY[3],
simd16scalari (&vA)[3],
simd16scalari (&vB)[3])
{
// A = y0 - y1
// B = x1 - x0
vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
vA[2] = _simd16_sub_epi32(vY[2], vY[0]);
vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
}
#endif
// Calculate the determinant of the triangle
// 2 vectors between the 3 points: P, Q
// Px = x0-x2, Py = y0-y2
// Qx = x1-x2, Qy = y1-y2
// |Px Qx|
// det = | | = PxQy - PyQx
// |Py Qy|
// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
// try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
// : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
// : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
// : B[2]*A[1] - A[2]*B[1]
INLINE
float calcDeterminantInt(const __m128i vA, const __m128i vB)
{
// vAShuf = [A1, A0, A2, A0]
__m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
// vBShuf = [B2, B0, B1, B0]
__m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
// vMul = [A1*B2, B1*A2]
__m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
// shuffle upper to lower
// vMul2 = [B1*A2, B1*A2]
__m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
// vMul = [A1*B2 - B1*A2]
vMul = _mm_sub_epi64(vMul, vMul2);
int64_t result;
_mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
double dResult = (double)result;
dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
return (float)dResult;
}
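// Scalar sketch of the reduction above (illustrative only; the function name
// and array layout are hypothetical, while FIXED_POINT16_SCALE is the same
// fixed-point scale factor used by calcDeterminantInt):
INLINE float calcDeterminantScalarRef(const int32_t A[3], const int32_t B[3])
{
    // B[2]*A[1] - A[2]*B[1], widened to 64 bits as in the SIMD path
    int64_t det = (int64_t)A[1] * B[2] - (int64_t)A[2] * B[1];
    return (float)((double)det * (1.0 / FIXED_POINT16_SCALE));
}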
INLINE
void calcDeterminantIntVertical(const simdscalari vA[3],
const simdscalari vB[3],
simdscalari* pvDet)
{
// refer to calcDeterminantInt comment for calculation explanation
// A1*B2
simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
// B1*A2
simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
// A1*B2 - A2*B1
simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
// shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3
simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20);
// shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7
simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31);
pvDet[0] = vResultLo;
pvDet[1] = vResultHi;
}
#if ENABLE_AVX512_SIMD16
INLINE
void calcDeterminantIntVertical(const simd16scalari vA[3],
const simd16scalari vB[3],
simd16scalari* pvDet)
{
// refer to calcDeterminantInt comment for calculation explanation
// A1*B2
simd16scalari vA1_lo =
_simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F
simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]);
simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b)
simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F
// B1*A2
simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]);
simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo);
simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi);
// A1*B2 - A2*B1
simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b)
simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F
// (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b)
simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F
// (3, 1, 2, 0) = 11 01 10 00 = 0xD8
pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b)
pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F
}
#endif
INLINE
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC)
{
// C = -Ax - By
vC = _mm_mul_ps(vA, vX);
__m128 vCy = _mm_mul_ps(vB, vY);
vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
vC = _mm_sub_ps(vC, vCy);
}
template <uint32_t NumVerts>
INLINE void viewportTransform(simdvector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]);
simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]);
simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]);
simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
}
}
#if USE_SIMD16_FRONTEND
template <uint32_t NumVerts>
INLINE void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
}
}
#endif
template <uint32_t NumVerts>
INLINE void viewportTransform(simdvector* v,
const SWR_VIEWPORT_MATRICES& vpMatrices,
simdscalari const& vViewportIdx)
{
// perform a gather of each matrix element based on the viewport array indexes
simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
}
}
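// Per-lane scalar equivalent of the gather variant above (an illustrative
// sketch, not the shipped path): each lane is transformed by the viewport
// matrix selected by its own viewport index.
INLINE void viewportTransformScalarRef(float& x, float& y, float& z,
                                       const SWR_VIEWPORT_MATRICES& vpMatrices,
                                       uint32_t viewportIdx)
{
    x = x * vpMatrices.m00[viewportIdx] + vpMatrices.m30[viewportIdx];
    y = y * vpMatrices.m11[viewportIdx] + vpMatrices.m31[viewportIdx];
    z = z * vpMatrices.m22[viewportIdx] + vpMatrices.m32[viewportIdx];
}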
#if USE_SIMD16_FRONTEND
template <uint32_t NumVerts>
INLINE void viewportTransform(simd16vector* v,
const SWR_VIEWPORT_MATRICES& vpMatrices,
simd16scalari const& vViewportIdx)
{
// perform a gather of each matrix element based on the viewport array indexes
const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
}
}
#endif
INLINE
void calcBoundingBoxInt(const __m128i& vX, const __m128i& vY, SWR_RECT& bbox)
{
// Need a horizontal integer min/max across the 3 vertices here
__m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
__m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));
__m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
__m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
__m128i vMinX = _mm_min_epi32(vX, vX1);
vMinX = _mm_min_epi32(vMinX, vX2);
__m128i vMaxX = _mm_max_epi32(vX, vX1);
vMaxX = _mm_max_epi32(vMaxX, vX2);
__m128i vMinY = _mm_min_epi32(vY, vY1);
vMinY = _mm_min_epi32(vMinY, vY2);
__m128i vMaxY = _mm_max_epi32(vY, vY1);
vMaxY = _mm_max_epi32(vMaxY, vY2);
bbox.xmin = _mm_extract_epi32(vMinX, 0);
bbox.xmax = _mm_extract_epi32(vMaxX, 0);
bbox.ymin = _mm_extract_epi32(vMinY, 0);
bbox.ymax = _mm_extract_epi32(vMaxY, 0);
}
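// Scalar equivalent of the shuffle-based reduction above (illustrative
// sketch; requires <algorithm>, and the name is hypothetical):
INLINE void calcBoundingBoxIntScalarRef(const int32_t x[3], const int32_t y[3], SWR_RECT& bbox)
{
    bbox.xmin = std::min(x[0], std::min(x[1], x[2]));
    bbox.xmax = std::max(x[0], std::max(x[1], x[2]));
    bbox.ymin = std::min(y[0], std::min(y[1], y[2]));
    bbox.ymax = std::max(y[0], std::max(y[1], y[2]));
}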
INLINE
bool CanUseSimplePoints(DRAW_CONTEXT* pDC)
{
const API_STATE& state = GetApiState(pDC);
return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
state.rastState.pointSize == 1.0f && !state.rastState.pointParam &&
!state.rastState.pointSpriteEnable && !state.backendState.clipDistanceMask);
}
INLINE
bool vHasNaN(const __m128& vec)
{
const __m128 result = _mm_cmpunord_ps(vec, vec);
const int32_t mask = _mm_movemask_ps(result);
return (mask != 0);
}
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
// Returns the ProcessDraw front-end work function; all combinations of the parameter values are available
PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
bool IsCutIndexEnabled,
bool HasTessellation,
bool HasGeometryShader,
bool HasStreamOut,
bool HasRasterization);
void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
void ProcessStoreTiles(SWR_CONTEXT* pContext,
DRAW_CONTEXT* pDC,
uint32_t workerId,
void* pUserData);
void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
DRAW_CONTEXT* pDC,
uint32_t workerId,
void* pUserData);
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
#if USE_SIMD16_FRONTEND
PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
#endif
struct PA_STATE_BASE; // forward decl
void BinPoints(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[3],
uint32_t primMask,
simdscalari const& primID,
simdscalari const& viewportIdx,
simdscalari const& rtIdx);
void BinLines(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[3],
uint32_t primMask,
simdscalari const& primID,
simdscalari const& viewportIdx,
simdscalari const& rtIdx);
#if USE_SIMD16_FRONTEND
void SIMDCALL BinPoints_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[3],
uint32_t primMask,
simd16scalari const& primID,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx);
void SIMDCALL BinLines_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[3],
uint32_t primMask,
simd16scalari const& primID,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx);
#endif

View file

@ -1,175 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file knobs.h
*
* @brief Static (Compile-Time) Knobs for Core.
*
******************************************************************************/
#pragma once
#include <stdint.h>
#include <gen_knobs.h>
#define KNOB_ARCH_AVX 0
#define KNOB_ARCH_AVX2 1
#define KNOB_ARCH_AVX512 2
///////////////////////////////////////////////////////////////////////////////
// AVX512 Support
///////////////////////////////////////////////////////////////////////////////
#define ENABLE_AVX512_SIMD16 1
#define USE_SIMD16_FRONTEND 1
#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND
#define USE_SIMD16_VS 1 // requires USE_SIMD16_SHADERS
///////////////////////////////////////////////////////////////////////////////
// Architecture validation
///////////////////////////////////////////////////////////////////////////////
#if !defined(KNOB_ARCH)
#define KNOB_ARCH KNOB_ARCH_AVX
#endif
#if (KNOB_ARCH == KNOB_ARCH_AVX)
#define KNOB_ARCH_ISA AVX
#define KNOB_ARCH_STR "AVX"
#elif (KNOB_ARCH == KNOB_ARCH_AVX2)
#define KNOB_ARCH_ISA AVX2
#define KNOB_ARCH_STR "AVX2"
#elif (KNOB_ARCH == KNOB_ARCH_AVX512)
#define KNOB_ARCH_ISA AVX512F
#define KNOB_ARCH_STR "AVX512"
#else
#error "Unknown architecture"
#endif
#define KNOB_SIMD_WIDTH 8
#define KNOB_SIMD_BYTES 32
#define KNOB_SIMD16_WIDTH 16
#define KNOB_SIMD16_BYTES 64
#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING")
///////////////////////////////////////////////////////////////////////////////
// Configuration knobs
///////////////////////////////////////////////////////////////////////////////
// Maximum supported number of active vertex buffer streams
#define KNOB_NUM_STREAMS 32
// Maximum supported active viewports and scissors
#define KNOB_NUM_VIEWPORTS_SCISSORS 16
// Guardband range used by the clipper
#define KNOB_GUARDBAND_WIDTH 32768.0f
#define KNOB_GUARDBAND_HEIGHT 32768.0f
// Scratch space requirements per worker. Currently only used for TGSM sizing for some stages
#define KNOB_WORKER_SCRATCH_SPACE_SIZE (32 * 1024)
///////////////////////////////
// Macro tile configuration
///////////////////////////////
// raster tile dimensions
#define KNOB_TILE_X_DIM 8
#define KNOB_TILE_X_DIM_SHIFT 3
#define KNOB_TILE_Y_DIM 8
#define KNOB_TILE_Y_DIM_SHIFT 3
// fixed macrotile pixel dimension for now, eventually will be
// dynamically set based on tile format and pixel size
#define KNOB_MACROTILE_X_DIM 32
#define KNOB_MACROTILE_Y_DIM 32
#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 13
#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 13
#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8)
#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8)
#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT)
#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT)
// total # of hot tiles available. This should be enough to
// fully render a 16kx16k 128bpp render target
#define KNOB_NUM_HOT_TILES_X 512
#define KNOB_NUM_HOT_TILES_Y 512
#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT
#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT
#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT
// Max scissor rectangle
#define KNOB_MAX_SCISSOR_X (KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM)
#define KNOB_MAX_SCISSOR_Y (KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM)
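// Consistency checks on the fixed-point macrotile arithmetic (illustrative
// additions, not in the original header): 32 << 8 == 8192 == 1 << 13, and a
// 32-pixel macrotile spans four 8-pixel raster tiles.
static_assert(KNOB_MACROTILE_X_DIM_FIXED == (1 << KNOB_MACROTILE_X_DIM_FIXED_SHIFT),
              "macrotile fixed-point width disagrees with its shift");
static_assert(KNOB_MACROTILE_Y_DIM_FIXED == (1 << KNOB_MACROTILE_Y_DIM_FIXED_SHIFT),
              "macrotile fixed-point height disagrees with its shift");
static_assert(KNOB_MACROTILE_X_DIM_IN_TILES == 4, "expected 32 / 8 raster tiles per row");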
#if KNOB_SIMD_WIDTH == 8 && KNOB_TILE_X_DIM < 4
#error "incompatible width/tile dimensions"
#endif
#if ENABLE_AVX512_SIMD16
#if KNOB_SIMD16_WIDTH == 16 && KNOB_TILE_X_DIM < 8
#error "incompatible width/tile dimensions"
#endif
#endif
#if KNOB_SIMD_WIDTH == 8
#define SIMD_TILE_X_DIM 4
#define SIMD_TILE_Y_DIM 2
#else
#error "Invalid simd width"
#endif
#if ENABLE_AVX512_SIMD16
#if KNOB_SIMD16_WIDTH == 16
#define SIMD16_TILE_X_DIM 8
#define SIMD16_TILE_Y_DIM 2
#else
#error "Invalid simd width"
#endif
#endif
///////////////////////////////////////////////////////////////////////////////
// Optimization knobs
///////////////////////////////////////////////////////////////////////////////
#define KNOB_USE_FAST_SRGB TRUE
// enables cut-aware primitive assembler
#define KNOB_ENABLE_CUT_AWARE_PA TRUE
// enables early rasterization (useful for small triangles)
#if !defined(KNOB_ENABLE_EARLY_RAST)
#define KNOB_ENABLE_EARLY_RAST 1
#endif
#if KNOB_ENABLE_EARLY_RAST
#define ER_SIMD_TILE_X_SHIFT 2
#define ER_SIMD_TILE_Y_SHIFT 2
#endif
///////////////////////////////////////////////////////////////////////////////
// Debug knobs
///////////////////////////////////////////////////////////////////////////////
//#define KNOB_ENABLE_RDTSC
// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs.
#if !defined(KNOB_ENABLE_TOSS_POINTS)
#define KNOB_ENABLE_TOSS_POINTS 0
#endif

View file

@ -1,108 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file knobs_init.h
*
* @brief Dynamic Knobs Initialization for Core.
*
******************************************************************************/
#pragma once
#include <core/knobs.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>
// Assume the type is compatible with a 32-bit integer
template <typename T>
static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
{
uint32_t value = 0;
char* pStopped = nullptr;
value = strtoul(pOverride, &pStopped, 0);
if (pStopped != pOverride)
{
knobValue = static_cast<T>(value);
}
}
static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue)
{
size_t len = strlen(pOverride);
if (len == 1)
{
auto c = tolower(pOverride[0]);
if (c == 'y' || c == 't' || c == '1')
{
knobValue = true;
return;
}
if (c == 'n' || c == 'f' || c == '0')
{
knobValue = false;
return;
}
}
// Try converting to a number and casting to bool
uint32_t value = 0;
char* pStopped = nullptr;
value = strtoul(pOverride, &pStopped, 0);
if (pStopped != pOverride)
{
knobValue = value != 0;
}
}
static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
{
float value = knobValue;
if (sscanf(pOverride, "%f", &value))
{
knobValue = value;
}
}
static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue)
{
knobValue = pOverride;
}
template <typename T>
static inline void InitKnob(T& knob)
{
// Read environment variables
const char* pOverride = getenv(knob.Name());
if (pOverride)
{
auto knobValue = knob.DefaultValue();
ConvertEnvToKnob(pOverride, knobValue);
knob.Value(knobValue);
}
else
{
// Set default value
knob.Value(knob.DefaultValue());
}
}
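// Sketch of the interface InitKnob expects (a hypothetical knob type for
// illustration only; the real knob types live elsewhere in the driver):
struct ExampleKnob
{
    const char* Name() const { return "KNOB_EXAMPLE"; } // env var to read
    uint32_t    DefaultValue() const { return 4; }
    void        Value(uint32_t v) { m_value = v; }
    uint32_t    m_value = 0;
};
// With this, InitKnob(knob) would honor e.g. KNOB_EXAMPLE=8 or
// KNOB_EXAMPLE=0x10 in the environment (strtoul with base 0 accepts both
// decimal and hex) and fall back to DefaultValue() otherwise.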

View file

@ -1,459 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file multisample.h
*
******************************************************************************/
#pragma once
#include "context.h"
#include "format_traits.h"
//////////////////////////////////////////////////////////////////////////
/// @brief Convenience typedef for testing for the single-sample case
typedef std::integral_constant<int, 1> SingleSampleT;
INLINE
SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples)
{
switch (numSamples)
{
case 1:
return SWR_MULTISAMPLE_1X;
case 2:
return SWR_MULTISAMPLE_2X;
case 4:
return SWR_MULTISAMPLE_4X;
case 8:
return SWR_MULTISAMPLE_8X;
case 16:
return SWR_MULTISAMPLE_16X;
default:
assert(0);
return SWR_MULTISAMPLE_1X;
}
}
// hardcoded offsets based on Direct3D standard multisample positions
// 16 x 16 subpixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner
// coords are 0.8 fixed-point offsets from (0, 0)
template <SWR_MULTISAMPLE_COUNT sampleCount, bool isCenter = false>
struct MultisampleTraits
{
INLINE static float X(uint32_t sampleNum) = delete;
INLINE static float Y(uint32_t sampleNum) = delete;
INLINE static simdscalari FullSampleMask() = delete;
static const uint32_t numSamples = 0;
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_1X, false>
{
INLINE static float X(uint32_t sampleNum) { return samplePosX[sampleNum]; };
INLINE static float Y(uint32_t sampleNum) { return samplePosY[sampleNum]; };
INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); };
static const uint32_t numSamples = 1;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X;
static constexpr uint32_t samplePosXi[1] = {0x80};
static constexpr uint32_t samplePosYi[1] = {0x80};
static constexpr float samplePosX[1] = {0.5f};
static constexpr float samplePosY[1] = {0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_1X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); };
static const uint32_t numSamples = 1;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X;
static constexpr uint32_t samplePosXi[1] = {0x80};
static constexpr uint32_t samplePosYi[1] = {0x80};
static constexpr float samplePosX[1] = {0.5f};
static constexpr float samplePosY[1] = {0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_2X, false>
{
INLINE static float X(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosX[sampleNum];
};
INLINE static float Y(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosY[sampleNum];
};
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0x3);
return mask;
}
static const uint32_t numSamples = 2;
static const uint32_t numCoverageSamples = 2;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X;
static constexpr uint32_t samplePosXi[2] = {0xC0, 0x40};
static constexpr uint32_t samplePosYi[2] = {0xC0, 0x40};
static constexpr float samplePosX[2] = {0.75f, 0.25f};
static constexpr float samplePosY[2] = {0.75f, 0.25f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_2X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0x3);
return mask;
}
static const uint32_t numSamples = 2;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X;
static constexpr uint32_t samplePosXi[2] = {0x80, 0x80};
static constexpr uint32_t samplePosYi[2] = {0x80, 0x80};
static constexpr float samplePosX[2] = {0.5f, 0.5f};
static constexpr float samplePosY[2] = {0.5f, 0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_4X, false>
{
INLINE static float X(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosX[sampleNum];
};
INLINE static float Y(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosY[sampleNum];
};
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xF);
return mask;
}
static const uint32_t numSamples = 4;
static const uint32_t numCoverageSamples = 4;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X;
static constexpr uint32_t samplePosXi[4] = {0x60, 0xE0, 0x20, 0xA0};
static constexpr uint32_t samplePosYi[4] = {0x20, 0x60, 0xA0, 0xE0};
static constexpr float samplePosX[4] = {0.375f, 0.875f, 0.125f, 0.625f};
static constexpr float samplePosY[4] = {0.125f, 0.375f, 0.625f, 0.875f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_4X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xF);
return mask;
}
static const uint32_t numSamples = 4;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X;
static constexpr uint32_t samplePosXi[4] = {0x80, 0x80, 0x80, 0x80};
static constexpr uint32_t samplePosYi[4] = {0x80, 0x80, 0x80, 0x80};
static constexpr float samplePosX[4] = {0.5f, 0.5f, 0.5f, 0.5f};
static constexpr float samplePosY[4] = {0.5f, 0.5f, 0.5f, 0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_8X, false>
{
INLINE static float X(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosX[sampleNum];
};
INLINE static float Y(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosY[sampleNum];
};
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFF);
return mask;
}
static const uint32_t numSamples = 8;
static const uint32_t numCoverageSamples = 8;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
static constexpr uint32_t samplePosXi[8] = {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0};
static constexpr uint32_t samplePosYi[8] = {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10};
static constexpr float samplePosX[8] = {
0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f};
static constexpr float samplePosY[8] = {
0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_8X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFF);
return mask;
}
static const uint32_t numSamples = 8;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
static constexpr uint32_t samplePosXi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
static constexpr uint32_t samplePosYi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
static constexpr float samplePosX[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
static constexpr float samplePosY[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_16X, false>
{
INLINE static float X(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosX[sampleNum];
};
INLINE static float Y(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosY[sampleNum];
};
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFFFF);
return mask;
}
static const uint32_t numSamples = 16;
static const uint32_t numCoverageSamples = 16;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
static constexpr uint32_t samplePosXi[16] = {0x90,
0x70,
0x50,
0xC0,
0x30,
0xA0,
0xD0,
0xB0,
0x60,
0x80,
0x40,
0x20,
0x00,
0xF0,
0xE0,
0x10};
static constexpr uint32_t samplePosYi[16] = {0x90,
0x50,
0xA0,
0x70,
0x60,
0xD0,
0xB0,
0x30,
0xE0,
0x10,
0x20,
0xC0,
0x80,
0x40,
0xF0,
0x00};
static constexpr float samplePosX[16] = {0.5625f,
0.4375f,
0.3125f,
0.7500f,
0.1875f,
0.6250f,
0.8125f,
0.6875f,
0.3750f,
0.5000f,
0.2500f,
0.1250f,
0.0000f,
0.9375f,
0.8750f,
0.0625f};
static constexpr float samplePosY[16] = {0.5625f,
0.3125f,
0.6250f,
0.4375f,
0.3750f,
0.8125f,
0.6875f,
0.1875f,
0.8750f,
0.0625f,
0.1250f,
0.7500f,
0.5000f,
0.2500f,
0.9375f,
0.0000f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_16X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFFFF);
return mask;
}
static const uint32_t numSamples = 16;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
static constexpr uint32_t samplePosXi[16] = {0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80};
static constexpr uint32_t samplePosYi[16] = {0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80};
static constexpr float samplePosX[16] = {0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f};
static constexpr float samplePosY[16] = {0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f};
};
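// The integer sample positions above are the float positions quantized to
// 8-bit subpixel precision (1/256 pixel): e.g. 0x90 == 144/256 == 0.5625,
// and the centered patterns store 0x80 == 0.5 everywhere. A minimal check
// of that relationship (illustrative only; ToFixed8 is not part of the
// driver):
static constexpr uint32_t ToFixed8(float pos)
{
    return static_cast<uint32_t>(pos * 256.0f);
}
static_assert(ToFixed8(0.5625f) == 0x90, "8x sample 0 X position");
static_assert(ToFixed8(0.5f) == 0x80, "centered sample position");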
INLINE
bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount,
const SWR_MULTISAMPLE_POS& samplePos)
{
// detect if we're using standard or center sample patterns
    const uint32_t *standardPosX = nullptr, *standardPosY = nullptr;
switch (sampleCount)
{
case SWR_MULTISAMPLE_1X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosYi;
break;
case SWR_MULTISAMPLE_2X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi;
break;
case SWR_MULTISAMPLE_4X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi;
break;
case SWR_MULTISAMPLE_8X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi;
break;
case SWR_MULTISAMPLE_16X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi;
break;
    default:
        SWR_ASSERT(0, "Unexpected sample count");
        break;
}
// scan sample pattern for standard or center
uint32_t numSamples = GetNumSamples(sampleCount);
bool bIsStandard = true;
if (numSamples > 1)
{
for (uint32_t i = 0; i < numSamples; i++)
{
            // a standard sample must match the reference in both axes
            bIsStandard =
                (standardPosX[i] == samplePos.Xi(i)) && (standardPosY[i] == samplePos.Yi(i));
if (!bIsStandard)
break;
}
}
return !bIsStandard;
}
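// Self-contained restatement of the scan above on plain arrays
// (illustrative; MatchesReferencePattern is not part of the driver): a
// pattern is standard only when every sample matches the reference table
// in both axes.
static inline bool MatchesReferencePattern(const uint32_t* refX,
                                           const uint32_t* refY,
                                           const uint32_t* posX,
                                           const uint32_t* posY,
                                           uint32_t numSamples)
{
    for (uint32_t i = 0; i < numSamples; i++)
    {
        if (refX[i] != posX[i] || refY[i] != posY[i])
        {
            return false; // first mismatch proves the pattern is custom
        }
    }
    return true;
}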

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1,473 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rasterizer.cpp
*
* @brief Implementation for the rasterizer.
*
******************************************************************************/
#include <vector>
#include <algorithm>
#include "rasterizer.h"
#include "backends/gen_rasterizer.hpp"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "rasterizer_impl.h"
PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
[STATE_VALID_TRI_EDGE_COUNT][2];
void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pData);
#if KNOB_ENABLE_TOSS_POINTS
if (KNOB_TOSS_BIN_TRIS)
{
return;
}
#endif
// bloat line to two tris and call the triangle rasterizer twice
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, pDC->drawId);
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
// macrotile dimensioning
uint32_t macroX, macroY;
MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
const SWR_RECT& scissorInFixedPoint =
state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
// create a copy of the triangle buffer to write our adjusted vertices to
OSALIGNSIMD(float) newTriBuffer[4 * 4];
TRIANGLE_WORK_DESC newWorkDesc = workDesc;
newWorkDesc.pTriBuffer = &newTriBuffer[0];
// create a copy of the attrib buffer to write our adjusted attribs to
OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
newWorkDesc.pAttribs = &newAttribBuffer[0];
const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
__m128 vX, vY, vZ, vRecipW;
vX = _mm_load_ps(workDesc.pTriBuffer);
vY = _mm_load_ps(workDesc.pTriBuffer + 4);
vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
// triangle 0
// v0,v1 -> v0,v0,v1
__m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
__m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
__m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
__m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
__m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
__m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
if (workDesc.triFlags.yMajor)
{
vXa = _mm_add_ps(vAdjust, vXa);
}
else
{
vYa = _mm_add_ps(vAdjust, vYa);
}
// Store triangle description for rasterizer
_mm_store_ps((float*)&newTriBuffer[0], vXa);
_mm_store_ps((float*)&newTriBuffer[4], vYa);
_mm_store_ps((float*)&newTriBuffer[8], vZa);
_mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
// binner bins 3 edges for lines as v0, v1, v1
// tri0 needs v0, v0, v1
for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
{
__m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
__m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
}
// Store user clip distances for triangle 0
float newClipBuffer[3 * 8];
uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
if (numClipDist)
{
newWorkDesc.pUserClipBuffer = newClipBuffer;
float* pOldBuffer = workDesc.pUserClipBuffer;
float* pNewBuffer = newClipBuffer;
for (uint32_t i = 0; i < numClipDist; ++i)
{
// read barycentric coeffs from binner
float a = *(pOldBuffer++);
float b = *(pOldBuffer++);
// reconstruct original clip distance at vertices
float c0 = a + b;
float c1 = b;
// construct triangle barycentrics
*(pNewBuffer++) = c0 - c1;
*(pNewBuffer++) = c0 - c1;
*(pNewBuffer++) = c1;
}
}
// setup triangle rasterizer function
PFN_WORK_FUNC pfnTriRast;
// conservative rast not supported for points/lines
pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
rastState.bIsCenterPattern,
false,
SWR_INPUT_COVERAGE_NONE,
EdgeValToEdgeState(ALL_EDGES_VALID),
(pDC->pState->state.scissorsTileAligned == false));
// make sure this macrotile intersects the triangle
__m128i vXai = fpToFixedPoint(vXa);
__m128i vYai = fpToFixedPoint(vYa);
OSALIGNSIMD(SWR_RECT) bboxA;
calcBoundingBoxInt(vXai, vYai, bboxA);
if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
{
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
// triangle 1
// v0,v1 -> v1,v1,v0
vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
if (workDesc.triFlags.yMajor)
{
vXa = _mm_add_ps(vAdjust, vXa);
}
else
{
vYa = _mm_add_ps(vAdjust, vYa);
}
// Store triangle description for rasterizer
_mm_store_ps((float*)&newTriBuffer[0], vXa);
_mm_store_ps((float*)&newTriBuffer[4], vYa);
_mm_store_ps((float*)&newTriBuffer[8], vZa);
_mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
// binner bins 3 edges for lines as v0, v1, v1
// tri1 needs v1, v1, v0
for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
{
__m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
__m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
}
// store user clip distance for triangle 1
if (numClipDist)
{
float* pOldBuffer = workDesc.pUserClipBuffer;
float* pNewBuffer = newClipBuffer;
for (uint32_t i = 0; i < numClipDist; ++i)
{
// read barycentric coeffs from binner
float a = *(pOldBuffer++);
float b = *(pOldBuffer++);
// reconstruct original clip distance at vertices
float c0 = a + b;
float c1 = b;
// construct triangle barycentrics
*(pNewBuffer++) = c1 - c0;
*(pNewBuffer++) = c1 - c0;
*(pNewBuffer++) = c0;
}
}
vXai = fpToFixedPoint(vXa);
vYai = fpToFixedPoint(vYa);
calcBoundingBoxInt(vXai, vYai, bboxA);
if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
{
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
    RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeLine, 1);
}
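// Geometry sketch (illustrative, standalone; Pt and BloatLineToTris are not
// part of the driver): bloating a line of width w into two triangles,
// matching the vBloat0/vBloat1 offsets above for an x-major line (the
// offsets land on Y; y-major lines offset X instead).
struct Pt
{
    float x, y;
};
static inline void BloatLineToTris(Pt v0, Pt v1, float w, Pt tri0[3], Pt tri1[3])
{
    const float h = 0.5f * w;
    // tri0 mirrors the v0,v0,v1 shuffle with offsets {+h, -h, -h}
    tri0[0] = {v0.x, v0.y + h};
    tri0[1] = {v0.x, v0.y - h};
    tri0[2] = {v1.x, v1.y - h};
    // tri1 mirrors the v1,v1,v0 shuffle with offsets {-h, +h, +h}
    tri1[0] = {v1.x, v1.y - h};
    tri1[1] = {v1.x, v1.y + h};
    tri1[2] = {v0.x, v0.y + h};
}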
void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
#if KNOB_ENABLE_TOSS_POINTS
if (KNOB_TOSS_BIN_TRIS)
{
return;
}
#endif
const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
// map x,y relative offsets from start of raster tile to bit position in
// coverage mask for the point
static const uint32_t coverageMap[8][8] = {{0, 1, 4, 5, 8, 9, 12, 13},
{2, 3, 6, 7, 10, 11, 14, 15},
{16, 17, 20, 21, 24, 25, 28, 29},
{18, 19, 22, 23, 26, 27, 30, 31},
{32, 33, 36, 37, 40, 41, 44, 45},
{34, 35, 38, 39, 42, 43, 46, 47},
{48, 49, 52, 53, 56, 57, 60, 61},
{50, 51, 54, 55, 58, 59, 62, 63}};
OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {};
// pull point information from triangle buffer
// @todo use structs for readability
uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
float z = *(workDesc.pTriBuffer + 2);
// construct triangle descriptor for point
// no interpolation, set up i,j for constant interpolation of z and attribs
// @todo implement an optimized backend that doesn't require triangle information
    // compute coverage mask from the x,y offsets packed into the coverageMask
    // field; mask each offset to the maximum valid coverageMap index
uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i)
{
triDesc.coverageMask[i] = 1ULL << coverageMap[tY][tX];
}
triDesc.anyCoveredSamples = triDesc.coverageMask[0];
triDesc.innerCoverageMask = triDesc.coverageMask[0];
// no persp divide needed for points
triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
triDesc.triFlags = workDesc.triFlags;
triDesc.recipDet = 1.0f;
triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
RenderOutputBuffers renderBuffers;
GetRenderHotTiles(pDC,
workerId,
macroTile,
tileAlignedX >> KNOB_TILE_X_DIM_SHIFT,
tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
renderBuffers,
triDesc.triFlags.renderTargetArrayIndex);
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
}
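// Decoding sketch for the packed point coverage above (illustrative,
// standalone; PointCoverageBit is not part of the driver): the low nibble
// of the packed mask carries the x tile offset, the next nibble carries y,
// and the table swizzle reflects the 2x2-quad SIMD pixel order rather than
// row-major order.
static inline uint64_t PointCoverageBit(uint32_t packedMask, const uint32_t map[8][8])
{
    uint32_t tX = packedMask & 0x7;        // x offset within the raster tile
    uint32_t tY = (packedMask >> 4) & 0x7; // y offset within the raster tile
    return 1ULL << map[tY][tX];            // e.g. (x=3, y=2) -> bit 21
}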
void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
// load point vertex
float x = *workDesc.pTriBuffer;
float y = *(workDesc.pTriBuffer + 1);
float z = *(workDesc.pTriBuffer + 2);
// create a copy of the triangle buffer to write our adjusted vertices to
OSALIGNSIMD(float) newTriBuffer[4 * 4];
TRIANGLE_WORK_DESC newWorkDesc = workDesc;
newWorkDesc.pTriBuffer = &newTriBuffer[0];
// create a copy of the attrib buffer to write our adjusted attribs to
OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
newWorkDesc.pAttribs = &newAttribBuffer[0];
newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
newWorkDesc.numAttribs = workDesc.numAttribs;
newWorkDesc.triFlags = workDesc.triFlags;
// construct two tris by bloating point by point size
float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
float lowerX = x - halfPointSize;
float upperX = x + halfPointSize;
float lowerY = y - halfPointSize;
float upperY = y + halfPointSize;
// tri 0
float* pBuf = &newTriBuffer[0];
*pBuf++ = lowerX;
*pBuf++ = lowerX;
*pBuf++ = upperX;
pBuf++;
*pBuf++ = lowerY;
*pBuf++ = upperY;
*pBuf++ = upperY;
pBuf++;
_mm_store_ps(pBuf, _mm_set1_ps(z));
_mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
// setup triangle rasterizer function
PFN_WORK_FUNC pfnTriRast;
// conservative rast not supported for points/lines
pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
rastState.bIsCenterPattern,
false,
SWR_INPUT_COVERAGE_NONE,
EdgeValToEdgeState(ALL_EDGES_VALID),
(pDC->pState->state.scissorsTileAligned == false));
// overwrite texcoords for point sprites
if (isPointSpriteTexCoordEnabled)
{
// copy original attribs
memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
newWorkDesc.pAttribs = &newAttribBuffer[0];
// overwrite texcoord for point sprites
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
unsigned long texCoordAttrib = 0;
while (_BitScanForward(&texCoordAttrib, texCoordMask))
{
texCoordMask &= ~(1 << texCoordAttrib);
__m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
if (rastState.pointSpriteTopOrigin)
{
pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
}
else
{
pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
}
}
}
else
{
// no texcoord overwrite, can reuse the attrib buffer from frontend
newWorkDesc.pAttribs = workDesc.pAttribs;
}
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
// tri 1
pBuf = &newTriBuffer[0];
*pBuf++ = lowerX;
*pBuf++ = upperX;
*pBuf++ = upperX;
pBuf++;
*pBuf++ = lowerY;
*pBuf++ = upperY;
*pBuf++ = lowerY;
// z, w unchanged
if (isPointSpriteTexCoordEnabled)
{
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
unsigned long texCoordAttrib = 0;
while (_BitScanForward(&texCoordAttrib, texCoordMask))
{
texCoordMask &= ~(1 << texCoordAttrib);
__m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
if (rastState.pointSpriteTopOrigin)
{
pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
}
else
{
pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
}
}
}
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
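// Decoding the point-sprite texcoord writes above (illustrative): with
// _mm_set_ps(q, r, t, s), tri 0's corners for the top origin are
//   v0 (lowerX, lowerY) -> (s, t) = (0, 0)
//   v1 (lowerX, upperY) -> (s, t) = (0, 1)
//   v2 (upperX, upperY) -> (s, t) = (1, 1)
// The bottom origin simply flips t, and tri 1 covers the other half of the
// sprite quad with the complementary corners.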
void InitRasterizerFunctions()
{
InitRasterizerFuncs();
}
// Selector for correct templated RasterizeTriangle function
PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
bool IsCenter,
bool IsConservative,
SWR_INPUT_COVERAGE InputCoverage,
uint32_t EdgeEnable,
bool RasterizeScissorEdges)
{
SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage]
[EdgeEnable][RasterizeScissorEdges];
SWR_ASSERT(func);
return func;
}
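// Usage sketch (illustrative; GetDefault4xRasterizer is not part of the
// driver): fetch the specialization for 4x MSAA with a standard pattern,
// no conservative rast, no input coverage, all edges valid, and
// tile-aligned scissors.
static PFN_WORK_FUNC GetDefault4xRasterizer()
{
    return GetRasterizerFunc(SWR_MULTISAMPLE_4X,
                             false, // IsCenter
                             false, // IsConservative
                             SWR_INPUT_COVERAGE_NONE,
                             EdgeValToEdgeState(ALL_EDGES_VALID),
                             false); // RasterizeScissorEdges
}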


@ -1,237 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rasterizer.h
*
* @brief Definitions for the rasterizer.
*
******************************************************************************/
#pragma once
#include "context.h"
#include <type_traits>
#include "conservativeRast.h"
#include "multisample.h"
void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
void InitRasterizerFunctions();
INLINE
__m128i fpToFixedPoint(const __m128 vIn)
{
__m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE));
return _mm_cvtps_epi32(vFixed);
}
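// Worked example (illustrative): with 16.8 fixed point the scale is 256,
// so an x coordinate of 100.625f converts to 100.625 * 256 = 25760
// (0x64A0); _mm_cvtps_epi32 rounds to nearest, so subpixel positions on
// 1/256 boundaries survive the conversion exactly.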
enum TriEdgesStates
{
STATE_NO_VALID_EDGES = 0,
STATE_E0_E1_VALID,
STATE_E0_E2_VALID,
STATE_E1_E2_VALID,
STATE_ALL_EDGES_VALID,
STATE_VALID_TRI_EDGE_COUNT,
};
enum TriEdgesValues
{
NO_VALID_EDGES = 0,
E0_E1_VALID = 0x3,
E0_E2_VALID = 0x5,
E1_E2_VALID = 0x6,
ALL_EDGES_VALID = 0x7,
VALID_TRI_EDGE_COUNT,
};
// Selector for correct templated RasterizeTriangle function
PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
bool IsCenter,
bool IsConservative,
SWR_INPUT_COVERAGE InputCoverage,
uint32_t EdgeEnable,
bool RasterizeScissorEdges);
//////////////////////////////////////////////////////////////////////////
/// @brief ValidTriEdges convenience typedefs used for templated function
/// specialization
typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> AllEdgesValidT;
typedef std::integral_constant<uint32_t, E0_E1_VALID> E0E1ValidT;
typedef std::integral_constant<uint32_t, E0_E2_VALID> E0E2ValidT;
typedef std::integral_constant<uint32_t, E1_E2_VALID> E1E2ValidT;
typedef std::integral_constant<uint32_t, NO_VALID_EDGES> NoEdgesValidT;
typedef std::integral_constant<uint32_t, STATE_ALL_EDGES_VALID> StateAllEdgesValidT;
typedef std::integral_constant<uint32_t, STATE_E0_E1_VALID> StateE0E1ValidT;
typedef std::integral_constant<uint32_t, STATE_E0_E2_VALID> StateE0E2ValidT;
typedef std::integral_constant<uint32_t, STATE_E1_E2_VALID> StateE1E2ValidT;
typedef std::integral_constant<uint32_t, STATE_NO_VALID_EDGES> StateNoEdgesValidT;
// some specializations to convert from edge state to edge bitmask values
template <typename EdgeMask>
struct EdgeMaskVal
{
static_assert(EdgeMask::value > STATE_ALL_EDGES_VALID,
"Primary EdgeMaskVal shouldn't be instantiated");
};
template <>
struct EdgeMaskVal<StateAllEdgesValidT>
{
typedef AllEdgesValidT T;
};
template <>
struct EdgeMaskVal<StateE0E1ValidT>
{
typedef E0E1ValidT T;
};
template <>
struct EdgeMaskVal<StateE0E2ValidT>
{
typedef E0E2ValidT T;
};
template <>
struct EdgeMaskVal<StateE1E2ValidT>
{
typedef E1E2ValidT T;
};
template <>
struct EdgeMaskVal<StateNoEdgesValidT>
{
typedef NoEdgesValidT T;
};
INLINE uint32_t EdgeValToEdgeState(uint32_t val)
{
SWR_ASSERT(val < VALID_TRI_EDGE_COUNT, "Unexpected tri edge mask");
static const uint32_t edgeValToEdgeState[VALID_TRI_EDGE_COUNT] = {0, 0, 0, 1, 0, 2, 3, 4};
return edgeValToEdgeState[val];
}
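// Worked example (illustrative): the edge bitmask E0_E2_VALID == 0x5 maps
// to edgeValToEdgeState[5] == 2 == STATE_E0_E2_VALID, compressing the
// sparse 3-bit mask into the dense enum used to size gRasterizerFuncs.
static_assert(E0_E2_VALID == 0x5, "edge bitmask layout assumed by the example above");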
//////////////////////////////////////////////////////////////////////////
/// @struct RasterEdgeTraits
/// @brief Primary RasterEdgeTraits templated struct that holds compile
/// time information about the number of edges that need to be rasterized.
/// If either the scissor rect or conservative rast is enabled,
/// the scissor test is enabled and the rasterizer will test
/// 3 triangle edges + 4 scissor edges for coverage.
/// @tparam RasterScissorEdgesT: do we need to rasterize scissor edges?
/// @tparam ConservativeT: is this a conservative rasterization?
/// @tparam EdgeMaskT: which edges are valid (not degenerate)?
template <typename RasterScissorEdgesT, typename ConservativeT, typename EdgeMaskT>
struct RasterEdgeTraits
{
typedef std::true_type RasterizeScissorEdgesT;
typedef std::integral_constant<uint32_t, 7> NumEdgesT;
// typedef std::integral_constant<uint32_t, EdgeMaskT::value> ValidEdgeMaskT;
typedef typename EdgeMaskVal<EdgeMaskT>::T ValidEdgeMaskT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief specialization of RasterEdgeTraits. If neither scissor rect
/// nor conservative rast is enabled, only test 3 triangle edges
/// for coverage
template <typename EdgeMaskT>
struct RasterEdgeTraits<std::false_type, std::false_type, EdgeMaskT>
{
typedef std::false_type RasterizeScissorEdgesT;
typedef std::integral_constant<uint32_t, 3> NumEdgesT;
// no need for degenerate edge masking in non-conservative case; rasterize all triangle edges
typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> ValidEdgeMaskT;
};
//////////////////////////////////////////////////////////////////////////
/// @struct RasterizerTraits
/// @brief templated struct that holds compile time information used
/// during rasterization. Inherits RasterEdgeTraits and
/// ConservativeRastBETraits.
/// @tparam NumSamplesT: number of multisamples
/// @tparam CenterPatternT: is a centered sample pattern in use?
/// @tparam ConservativeT: is this a conservative rasterization?
/// @tparam InputCoverageT: what type of input coverage is the PS expecting?
/// (only used with conservative rasterization)
/// @tparam EdgeEnableT: which edges are valid (not degenerate)?
/// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor?
template <typename NumSamplesT,
typename CenterPatternT,
typename ConservativeT,
typename InputCoverageT,
typename EdgeEnableT,
typename RasterScissorEdgesT>
struct _RasterizerTraits : public ConservativeRastBETraits<ConservativeT, InputCoverageT>,
public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT>
{
typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value),
CenterPatternT::value>
MT;
/// Fixed point precision the rasterizer is using
typedef FixedPointTraits<Fixed_16_8> PrecisionT;
/// Fixed point precision of the edge tests used during rasterization
typedef FixedPointTraits<Fixed_X_16> EdgePrecisionT;
// If conservative rast or MSAA center pattern is enabled, only need a single sample coverage
// test, with the result copied to all samples
typedef std::integral_constant<int, ConservativeT::value ? 1 : MT::numCoverageSamples>
NumCoverageSamplesT;
static_assert(
EdgePrecisionT::BitsT::value >=
ConservativeRastBETraits<ConservativeT,
InputCoverageT>::ConservativePrecisionT::BitsT::value,
"Rasterizer edge fixed point precision < required conservative rast precision");
/// constants used to offset between different types of raster tiles
static const int colorRasterTileStep{
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) *
MT::numSamples};
static const int depthRasterTileStep{
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) *
MT::numSamples};
static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM *
(FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) *
MT::numSamples};
static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
colorRasterTileStep};
static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
depthRasterTileStep};
static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
stencilRasterTileStep};
};
template <uint32_t NumSamplesT,
uint32_t CenterPatternT,
uint32_t ConservativeT,
uint32_t InputCoverageT,
uint32_t EdgeEnableT,
uint32_t RasterScissorEdgesT>
struct RasterizerTraits final
: public _RasterizerTraits<std::integral_constant<uint32_t, NumSamplesT>,
std::integral_constant<bool, CenterPatternT != 0>,
std::integral_constant<bool, ConservativeT != 0>,
std::integral_constant<uint32_t, InputCoverageT>,
std::integral_constant<uint32_t, EdgeEnableT>,
std::integral_constant<bool, RasterScissorEdgesT != 0>>
{
};
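// Instantiation sketch (illustrative): a 4x-MSAA, standard-pattern,
// non-conservative triangle with all edges valid and tile-aligned scissors
// resolves to
//   RasterizerTraits<SWR_MULTISAMPLE_4X, 0, 0, SWR_INPUT_COVERAGE_NONE,
//                    STATE_ALL_EDGES_VALID, 0>
// which yields NumEdgesT == 3 (no scissor edges) and
// MT::numCoverageSamples == 4, all folded at compile time into the
// generated rasterizer.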

File diff suppressed because it is too large


@ -1,94 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#include "rdtsc_core.h"
#include "common/rdtsc_buckets.h"
// must match CORE_BUCKETS enum order
BUCKET_DESC gCoreBuckets[] = {
{"APIClearRenderTarget", "", true, 0xff0b8bea},
{"APIDraw", "", true, 0xff000066},
{"APIDrawWakeAllThreads", "", false, 0xffffffff},
{"APIDrawIndexed", "", true, 0xff000066},
{"APIDispatch", "", true, 0xff660000},
{"APIStoreTiles", "", true, 0xff00ffff},
{"APIGetDrawContext", "", false, 0xffffffff},
{"APISync", "", true, 0xff6666ff},
{"APIWaitForIdle", "", true, 0xff0000ff},
{"FEProcessDraw", "", true, 0xff009900},
{"FEProcessDrawIndexed", "", true, 0xff009900},
{"FEFetchShader", "", false, 0xffffffff},
{"FEVertexShader", "", false, 0xffffffff},
{"FEHullShader", "", false, 0xffffffff},
{"FETessellation", "", false, 0xffffffff},
{"FEDomainShader", "", false, 0xffffffff},
{"FEGeometryShader", "", false, 0xffffffff},
{"FEStreamout", "", false, 0xffffffff},
{"FEPAAssemble", "", false, 0xffffffff},
{"FEBinPoints", "", false, 0xff29b854},
{"FEBinLines", "", false, 0xff29b854},
{"FEBinTriangles", "", false, 0xff29b854},
{"FETriangleSetup", "", false, 0xffffffff},
{"FEViewportCull", "", false, 0xffffffff},
{"FEGuardbandClip", "", false, 0xffffffff},
{"FEClipPoints", "", false, 0xffffffff},
{"FEClipLines", "", false, 0xffffffff},
{"FEClipTriangles", "", false, 0xffffffff},
{"FEClipRectangles", "", false, 0xffffffff},
{"FECullZeroAreaAndBackface", "", false, 0xffffffff},
{"FECullBetweenCenters", "", false, 0xffffffff},
{"FEEarlyRastEnter", "", false, 0xffffffff},
{"FEEarlyRastExit", "", false, 0xffffffff},
{"FEProcessStoreTiles", "", true, 0xff39c864},
{"FEProcessInvalidateTiles", "", true, 0xffffffff},
{"WorkerWorkOnFifoBE", "", false, 0xff40261c},
{"WorkerFoundWork", "", false, 0xff573326},
{"BELoadTiles", "", true, 0xffb0e2ff},
{"BEDispatch", "", true, 0xff00a2ff},
{"BEClear", "", true, 0xff00ccbb},
{"BERasterizeLine", "", true, 0xffb26a4e},
{"BERasterizeTriangle", "", true, 0xffb26a4e},
{"BETriangleSetup", "", false, 0xffffffff},
{"BEStepSetup", "", false, 0xffffffff},
{"BECullZeroArea", "", false, 0xffffffff},
{"BEEmptyTriangle", "", false, 0xffffffff},
{"BETrivialAccept", "", false, 0xffffffff},
{"BETrivialReject", "", false, 0xffffffff},
{"BERasterizePartial", "", false, 0xffffffff},
{"BEPixelBackend", "", false, 0xffffffff},
{"BESetup", "", false, 0xffffffff},
{"BEBarycentric", "", false, 0xffffffff},
{"BEEarlyDepthTest", "", false, 0xffffffff},
{"BEPixelShader", "", false, 0xffffffff},
{"BESingleSampleBackend", "", false, 0xffffffff},
{"BEPixelRateBackend", "", false, 0xffffffff},
{"BESampleRateBackend", "", false, 0xffffffff},
{"BENullBackend", "", false, 0xffffffff},
{"BELateDepthTest", "", false, 0xffffffff},
{"BEOutputMerger", "", false, 0xffffffff},
{"BEStoreTiles", "", true, 0xff00cccc},
{"BEEndTile", "", false, 0xffffffff},
};
static_assert(NumBuckets == (sizeof(gCoreBuckets) / sizeof(gCoreBuckets[0])),
"RDTSC Bucket enum and description table size mismatched.");


@ -1,185 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include "knobs.h"
#include "common/os.h"
#include "common/rdtsc_buckets.h"
#include <vector>
///////////////////////////////////////////////////////////////////////////////
// NOTE: This enum MUST be kept in sync with gCoreBuckets in rdtsc_core.cpp
///////////////////////////////////////////////////////////////////////////////
enum CORE_BUCKETS
{
APIClearRenderTarget,
APIDraw,
APIDrawWakeAllThreads,
APIDrawIndexed,
APIDispatch,
APIStoreTiles,
APIGetDrawContext,
APISync,
APIWaitForIdle,
FEProcessDraw,
FEProcessDrawIndexed,
FEFetchShader,
FEVertexShader,
FEHullShader,
FETessellation,
FEDomainShader,
FEGeometryShader,
FEStreamout,
FEPAAssemble,
FEBinPoints,
FEBinLines,
FEBinTriangles,
FETriangleSetup,
FEViewportCull,
FEGuardbandClip,
FEClipPoints,
FEClipLines,
FEClipTriangles,
FEClipRectangles,
FECullZeroAreaAndBackface,
FECullBetweenCenters,
FEEarlyRastEnter,
FEEarlyRastExit,
FEProcessStoreTiles,
FEProcessInvalidateTiles,
WorkerWorkOnFifoBE,
WorkerFoundWork,
BELoadTiles,
BEDispatch,
BEClear,
BERasterizeLine,
BERasterizeTriangle,
BETriangleSetup,
BEStepSetup,
BECullZeroArea,
BEEmptyTriangle,
BETrivialAccept,
BETrivialReject,
BERasterizePartial,
BEPixelBackend,
BESetup,
BEBarycentric,
BEEarlyDepthTest,
BEPixelShader,
BESingleSampleBackend,
BEPixelRateBackend,
BESampleRateBackend,
BENullBackend,
BELateDepthTest,
BEOutputMerger,
BEStoreTiles,
BEEndTile,
NumBuckets
};
void rdtscReset(BucketManager* pBucketMgr);
void rdtscInit(BucketManager* pBucketMgr, int threadId);
void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId);
void rdtscStop(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count, uint64_t drawId);
void rdtscEvent(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count1, uint32_t count2);
void rdtscEndFrame(BucketManager* pBucketMgr);
#ifdef KNOB_ENABLE_RDTSC
#define RDTSC_RESET(pBucketMgr) rdtscReset(pBucketMgr)
#define RDTSC_INIT(pBucketMgr, threadId) rdtscInit(pBucketMgr,threadId)
#define RDTSC_START(pBucketMgr, bucket) rdtscStart(pBucketMgr, bucket)
#define RDTSC_STOP(pBucketMgr, bucket, count, draw) rdtscStop(pBucketMgr, bucket, count, draw)
#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2) rdtscEvent(pBucketMgr, bucket, count1, count2)
#define RDTSC_ENDFRAME(pBucketMgr) rdtscEndFrame(pBucketMgr)
#else
#define RDTSC_RESET(pBucketMgr)
#define RDTSC_INIT(pBucketMgr, threadId)
#define RDTSC_START(pBucketMgr, bucket)
#define RDTSC_STOP(pBucketMgr, bucket, count, draw)
#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2)
#define RDTSC_ENDFRAME(pBucketMgr)
#endif
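// Usage sketch (illustrative): instrumented code brackets a region with the
// macros so non-RDTSC builds compile the calls away entirely, e.g.
//   RDTSC_START(pBucketMgr, BERasterizeTriangle);
//   ... rasterize ...
//   RDTSC_STOP(pBucketMgr, BERasterizeTriangle, 1, drawId);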
extern BUCKET_DESC gCoreBuckets[];
INLINE void rdtscReset(BucketManager *pBucketMgr)
{
pBucketMgr->mCurrentFrame = 0;
pBucketMgr->ClearThreads();
}
INLINE void rdtscInit(BucketManager* pBucketMgr, int threadId)
{
// register all the buckets once
if (!pBucketMgr->mBucketsInitialized && (threadId == 0))
{
pBucketMgr->mBucketMap.resize(NumBuckets);
for (uint32_t i = 0; i < NumBuckets; ++i)
{
pBucketMgr->mBucketMap[i] = pBucketMgr->RegisterBucket(gCoreBuckets[i]);
}
pBucketMgr->mBucketsInitialized = true;
}
std::string name = threadId == 0 ? "API" : "WORKER";
pBucketMgr->RegisterThread(name);
}
INLINE void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId)
{
uint32_t id = pBucketMgr->mBucketMap[bucketId];
pBucketMgr->StartBucket(id);
}
INLINE void rdtscStop(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count, uint64_t drawId)
{
uint32_t id = pBucketMgr->mBucketMap[bucketId];
pBucketMgr->StopBucket(id);
}
INLINE void rdtscEvent(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count1, uint32_t count2)
{
uint32_t id = pBucketMgr->mBucketMap[bucketId];
pBucketMgr->AddEvent(id, count1);
}
INLINE void rdtscEndFrame(BucketManager* pBucketMgr)
{
pBucketMgr->mCurrentFrame++;
if (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_START_FRAME &&
KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
{
pBucketMgr->StartCapture();
}
if (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_END_FRAME &&
KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
{
pBucketMgr->StopCapture();
pBucketMgr->PrintReport("rdtsc.txt");
}
}


@ -1,95 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
 * @file ringbuffer.h
*
* @brief RingBuffer
* The RingBuffer class manages all aspects of the ring buffer including
* the head/tail indices, etc.
*
******************************************************************************/
#pragma once
template <typename T>
class RingBuffer
{
public:
RingBuffer() : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) {}
~RingBuffer() { Destroy(); }
void Init(uint32_t numEntries)
{
SWR_ASSERT(numEntries > 0);
SWR_ASSERT(((1ULL << 32) % numEntries) == 0,
"%d is not evenly divisible into 2 ^ 32. Wrap errors will occur!",
numEntries);
mNumEntries = numEntries;
mpRingBuffer = (T*)AlignedMalloc(sizeof(T) * numEntries, 64);
SWR_ASSERT(mpRingBuffer != nullptr);
memset((void*)mpRingBuffer, 0, sizeof(T) * numEntries);
}
void Destroy()
{
AlignedFree(mpRingBuffer);
mpRingBuffer = nullptr;
}
T& operator[](const uint32_t index)
{
SWR_ASSERT(index < mNumEntries);
return mpRingBuffer[index];
}
INLINE void Enqueue()
{
mRingHead++; // There's only one producer.
// Assert to find wrap-around cases, NEVER ENABLE DURING CHECKIN!!
// SWR_REL_ASSERT(mRingHead);
}
INLINE void Dequeue()
{
InterlockedIncrement(&mRingTail); // There are multiple consumers.
}
INLINE bool IsEmpty() { return (GetHead() == GetTail()); }
INLINE bool IsFull()
{
uint32_t numEnqueued = GetHead() - GetTail();
SWR_ASSERT(numEnqueued <= mNumEntries);
return (numEnqueued == mNumEntries);
}
INLINE uint32_t GetTail() volatile { return mRingTail; }
INLINE uint32_t GetHead() volatile { return mRingHead; }
protected:
T* mpRingBuffer;
uint32_t mNumEntries;
    OSALIGNLINE(volatile uint32_t) mRingHead; // Producer counter
    OSALIGNLINE(volatile uint32_t) mRingTail; // Consumer counter
};
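// Usage sketch (illustrative, standalone; ExampleEnqueue is not part of the
// driver, and a 256-entry ring from Init(256) is assumed): the single
// producer owns the head slot, writes it, then publishes with Enqueue().
static inline void ExampleEnqueue(RingBuffer<uint32_t>& ring, uint32_t value)
{
    if (!ring.IsFull())
    {
        ring[ring.GetHead() % 256] = value; // indices wrap modulo the size
        ring.Enqueue();
    }
}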

File diff suppressed because it is too large


@ -1,67 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file state.h
*
* @brief Definitions for API state - complex function implementation.
*
******************************************************************************/
#pragma once
#include "core/state.h"
#include "common/simdintrin.h"
template <typename MaskT>
INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* max)
{
__m128i vMin = _mm_set1_epi32(*min);
__m128i vMax = _mm_set1_epi32(*max);
return _simd_blend4_epi32<MaskT::value>(vMin, vMax);
}
INLINE void SWR_MULTISAMPLE_POS::PrecalcSampleData(int numSamples)
{
for (int i = 0; i < numSamples; i++)
{
_vXi[i] = _mm_set1_epi32(_xi[i]);
_vYi[i] = _mm_set1_epi32(_yi[i]);
_vX[i] = _simd_set1_ps(_x[i]);
_vY[i] = _simd_set1_ps(_y[i]);
}
// precalculate the raster tile BB for the rasterizer.
CalcTileSampleOffsets(numSamples);
}
INLINE void SWR_MULTISAMPLE_POS::CalcTileSampleOffsets(int numSamples)
{
auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]);
auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]);
using xMask = std::integral_constant<int, 0xA>;
// BR(max), BL(min), UR(max), UL(min)
tileSampleOffsetsX = expandThenBlend4<xMask>(minXi, maxXi);
auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]);
auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]);
using yMask = std::integral_constant<int, 0xC>;
// BR(max), BL(min), UR(max), UL(min)
tileSampleOffsetsY = expandThenBlend4<yMask>(minYi, maxYi);
};
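// Worked example (illustrative): xMask == 0xA (binary 1010) makes
// _simd_blend4_epi32 pick { min, max, min, max } across the four lanes and
// yMask == 0xC (binary 1100) picks { min, min, max, max }, so the lanes
// hold the UL(min,min), UR(max,min), BL(min,max), BR(max,max) corner
// offsets of the raster-tile bounding box.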

Some files were not shown because too many files have changed in this diff